From e3b76d76aad1af0e797951e31fddce433c5b1cad Mon Sep 17 00:00:00 2001
From: Matt Corallo
Date: Mon, 11 Jun 2018 11:24:19 -0400
Subject: [PATCH 1/3] Add POWER8 vector impl for 4-way SHA256

This speeds up 4-way SHA256 by about 3.75x over the C impl.
---
 configure.ac                 |  20 ++
 src/Makefile.am              |  10 +
 src/crypto/sha256.cpp        |  17 +-
 src/crypto/sha256_power8.cpp | 401 +++++++++++++++++++++++++++++++++++
 4 files changed, 447 insertions(+), 1 deletion(-)
 create mode 100644 src/crypto/sha256_power8.cpp

diff --git a/configure.ac b/configure.ac
index 4f71515873..390ccecc97 100644
--- a/configure.ac
+++ b/configure.ac
@@ -449,6 +449,7 @@ enable_sse42=no
 enable_sse41=no
 enable_avx2=no
 enable_x86_shani=no
+enable_power8=no
 
 dnl Check for optional instruction set support. Enabling these does _not_ imply that all code will
 dnl be compiled with them, rather that specific objects/libs may use them after checking for runtime
@@ -459,6 +460,7 @@ AX_CHECK_COMPILE_FLAG([-msse4.2], [SSE42_CXXFLAGS="-msse4.2"], [], [$CXXFLAG_WERROR])
 AX_CHECK_COMPILE_FLAG([-msse4.1], [SSE41_CXXFLAGS="-msse4.1"], [], [$CXXFLAG_WERROR])
 AX_CHECK_COMPILE_FLAG([-mavx -mavx2], [AVX2_CXXFLAGS="-mavx -mavx2"], [], [$CXXFLAG_WERROR])
 AX_CHECK_COMPILE_FLAG([-msse4 -msha], [X86_SHANI_CXXFLAGS="-msse4 -msha"], [], [$CXXFLAG_WERROR])
+AX_CHECK_COMPILE_FLAG([-mpower8-vector], [POWER8_CXXFLAGS="-mpower8-vector"], [], [$CXXFLAG_WERROR])
 
 enable_clmul=
 AX_CHECK_COMPILE_FLAG([-mpclmul], [enable_clmul=yes], [], [$CXXFLAG_WERROR], [AC_LANG_PROGRAM([
@@ -588,6 +590,22 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
 )
 CXXFLAGS="$TEMP_CXXFLAGS"
 
+TEMP_CXXFLAGS="$CXXFLAGS"
+CXXFLAGS="$TEMP_CXXFLAGS $POWER8_CXXFLAGS"
+AC_MSG_CHECKING(for POWER8 compiler support)
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+  #include <stdint.h>
+  #include <altivec.h>
+  ]], [[
+  unsigned char src[16];
+  __builtin_crypto_vshasigmaw((__vector uint32_t)vec_vsx_ld(0, src), 1, 0xf);
+  return 0;
+  ]])],
+ [ AC_MSG_RESULT(yes); enable_power8=yes; AC_DEFINE(ENABLE_POWER8, 1, [Define if compiler supports POWER8 instructions.]) ],
+ [ AC_MSG_RESULT(no) ]
+)
+CXXFLAGS="$TEMP_CXXFLAGS"
+
 CORE_CPPFLAGS="$CORE_CPPFLAGS -DHAVE_BUILD_INFO"
 
 AC_ARG_WITH([utils],
@@ -1791,6 +1809,7 @@ AM_CONDITIONAL([ENABLE_AVX2], [test "$enable_avx2" = "yes"])
 AM_CONDITIONAL([ENABLE_X86_SHANI], [test "$enable_x86_shani" = "yes"])
 AM_CONDITIONAL([ENABLE_ARM_CRC], [test "$enable_arm_crc" = "yes"])
 AM_CONDITIONAL([ENABLE_ARM_SHANI], [test "$enable_arm_shani" = "yes"])
+AM_CONDITIONAL([ENABLE_POWER8], [test "$enable_power8" = "yes"])
 AM_CONDITIONAL([WORDS_BIGENDIAN], [test "$ac_cv_c_bigendian" = "yes"])
 AM_CONDITIONAL([USE_NATPMP], [test "$use_natpmp" = "yes"])
 AM_CONDITIONAL([USE_UPNP], [test "$use_upnp" = "yes"])
@@ -1852,6 +1871,7 @@ AC_SUBST(AVX2_CXXFLAGS)
 AC_SUBST(X86_SHANI_CXXFLAGS)
 AC_SUBST(ARM_CRC_CXXFLAGS)
 AC_SUBST(ARM_SHANI_CXXFLAGS)
+AC_SUBST(POWER8_CXXFLAGS)
 AC_SUBST(LIBTOOL_APP_LDFLAGS)
 AC_SUBST(USE_SQLITE)
 AC_SUBST(USE_BDB)
diff --git a/src/Makefile.am b/src/Makefile.am
index b5d5c4652a..56bfd4b022 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -66,6 +66,10 @@ if ENABLE_ARM_SHANI
 LIBBITCOIN_CRYPTO_ARM_SHANI = crypto/libbitcoin_crypto_arm_shani.la
 LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_ARM_SHANI)
 endif
+if ENABLE_POWER8
+LIBBITCOIN_CRYPTO_POWER8 = crypto/libbitcoin_crypto_power8.a
+LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_POWER8)
+endif
 noinst_LTLIBRARIES += $(LIBBITCOIN_CRYPTO)
 
 $(LIBSECP256K1): $(wildcard secp256k1/src/*.h) $(wildcard secp256k1/src/*.c) $(wildcard secp256k1/include/*)
@@ -625,6 +629,12 @@ crypto_libbitcoin_crypto_arm_shani_la_CXXFLAGS += $(ARM_SHANI_CXXFLAGS)
 crypto_libbitcoin_crypto_arm_shani_la_CPPFLAGS += -DENABLE_ARM_SHANI
 crypto_libbitcoin_crypto_arm_shani_la_SOURCES = crypto/sha256_arm_shani.cpp
 #
+crypto_libbitcoin_crypto_power8_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
+crypto_libbitcoin_crypto_power8_a_CPPFLAGS = $(AM_CPPFLAGS)
+crypto_libbitcoin_crypto_power8_a_CXXFLAGS += $(POWER8_CXXFLAGS)
+crypto_libbitcoin_crypto_power8_a_CPPFLAGS += -DENABLE_POWER8
+crypto_libbitcoin_crypto_power8_a_SOURCES = crypto/sha256_power8.cpp
+
 # consensus
 #
 libbitcoin_consensus_a_CPPFLAGS = $(AM_CPPFLAGS) $(BITCOIN_INCLUDES)
 libbitcoin_consensus_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
diff --git a/src/crypto/sha256.cpp b/src/crypto/sha256.cpp
index 4c7bb6f20f..b00d3f222c 100644
--- a/src/crypto/sha256.cpp
+++ b/src/crypto/sha256.cpp
@@ -63,6 +63,15 @@ void Transform_2way(unsigned char* out, const unsigned char* in);
 }
 #endif // DISABLE_OPTIMIZED_SHA256
 
+#if defined(__linux__) && defined(ENABLE_POWER8)
+#include <sys/auxv.h>
+namespace sha256_power8
+{
+void Transform_4way(unsigned char* out, const unsigned char* in);
+}
+#endif
+
+
 // Internal implementation code.
 namespace
 {
@@ -652,7 +661,13 @@ std::string SHA256AutoDetect(sha256_implementation::UseImplementation use_implem
             ret += ",avx2(8way)";
         }
 #endif
-#endif // defined(HAVE_GETCPUID)
+#elif (defined(__linux__)) && defined(ENABLE_POWER8)
+    if (getauxval(AT_HWCAP2) & 0x02000000) { // PPC_FEATURE2_VEC_CRYPTO
+        TransformD64_4way = sha256_power8::Transform_4way;
+        assert(SelfTest());
+        return "power8(4way),C(1way)";
+    }
+#endif
 
 #if defined(ENABLE_ARM_SHANI)
     bool have_arm_shani = false;
diff --git a/src/crypto/sha256_power8.cpp b/src/crypto/sha256_power8.cpp
new file mode 100644
index 0000000000..c0e0f8bdba
--- /dev/null
+++ b/src/crypto/sha256_power8.cpp
@@ -0,0 +1,401 @@
+// Copyright (c) 2017 The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+//
+// This is an implementation of the 4-way SHA256 transform using the
+// POWER8 vector crypto intrinsics.
+
+#if defined(HAVE_CONFIG_H)
+#include <config/bitcoin-config.h>
+#endif
+
+#include <stdint.h>
+
+#if defined(HAVE_ENDIAN_H)
+#include <endian.h>
+#elif defined(HAVE_SYS_ENDIAN_H)
+#include <sys/endian.h>
+#endif
+
+#include <altivec.h>
+
+namespace sha256_power8
+{
+
+typedef vector uint32_t uint32x4_p8;
+typedef vector uint8_t uint8x16_p8;
+
+//! Gets the first uint32_t from a, b, c, d, converts from BE to host endian, and returns them concatenated
+template <int OFFS> static inline uint32x4_p8 pack_bytes
+    (const uint8x16_p8 a, const uint8x16_p8 b, const uint8x16_p8 c, const uint8x16_p8 d) {
+    uint8x16_p8 perm1 = {0+OFFS,1+OFFS,2+OFFS,3+OFFS, 16+OFFS,17+OFFS,18+OFFS,19+OFFS, 0,0,0,0, 0,0,0,0};
+#ifdef WORDS_BIGENDIAN
+    uint8x16_p8 perm2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+#else
+    uint8x16_p8 perm2 = {3,2,1,0, 7,6,5,4, 19,18,17,16, 23,22,21,20};
+#endif
+    return (uint32x4_p8)vec_perm(vec_perm((uint8x16_p8)a, (uint8x16_p8)b, perm1), vec_perm((uint8x16_p8)c, (uint8x16_p8)d, perm1), perm2);
+}
+
+static const __attribute__((aligned(16))) uint32_t K[] = {
+    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
+    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
+    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
+    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
+    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
+    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
+    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
+    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
+    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
+};
+
+#define Ch(x, y, z) vec_sel((z), (y), (x))
+#define Maj(x, y, z) vec_sel((y), (z), vec_xor((x), (y)))
+
+#define KRound(a, b, c, d, e, f, g, h, k, w) \
+    do { \
+        uint32x4_p8 t1 = h + Ch(e, f, g) + __builtin_crypto_vshasigmaw(e, 1, 0xf) + k + w; \
+        uint32x4_p8 t2 = Maj(a, b, c) + __builtin_crypto_vshasigmaw(a, 1, 0); \
+        d += t1; \
+        h = t1 + t2; \
+    } while(false);
+
+#define Round(a, b, c, d, e, f, g, h, k, w) \
+    do { \
+        uint32x4_p8 kay = {k, k, k, k}; \
+        uint32x4_p8 t1 = h + Ch(e, f, g) + __builtin_crypto_vshasigmaw(e, 1, 0xf) + kay + w; \
+        uint32x4_p8 t2 = Maj(a, b, c) + __builtin_crypto_vshasigmaw(a, 1, 0); \
+        d += t1; \
+        h = t1 + t2; \
+    } while(false);
+
+#define KRound2(a, b, c, d, e, f, g, h, k) \
+    do { \
+        uint32x4_p8 t1 = h + Ch(e, f, g) + __builtin_crypto_vshasigmaw(e, 1, 0xf) + k; \
+        uint32x4_p8 t2 = Maj(a, b, c) + __builtin_crypto_vshasigmaw(a, 1, 0); \
+        d += t1; \
+        h = t1 + t2; \
+    } while(false);
+
+#define Round2(a, b, c, d, e, f, g, h, k) \
+    do { \
+        uint32x4_p8 kay = {k, k, k, k}; \
+        uint32x4_p8 t1 = h + Ch(e, f, g) + __builtin_crypto_vshasigmaw(e, 1, 0xf) + kay; \
+        uint32x4_p8 t2 = Maj(a, b, c) + __builtin_crypto_vshasigmaw(a, 1, 0); \
+        d += t1; \
+        h = t1 + t2; \
+    } while(false);
+
+#define sigma0(w) __builtin_crypto_vshasigmaw(w, 0, 0)
+#define sigma1(w) __builtin_crypto_vshasigmaw(w, 0, 0xf)
+
+/** Perform 4 double-SHA-256 64-byte updates at once. */
+void Transform_4way(unsigned char* out, const unsigned char* in)
+{
+    uint32x4_p8 a = {0x6a09e667ul, 0x6a09e667ul, 0x6a09e667ul, 0x6a09e667ul};
+    uint32x4_p8 b = {0xbb67ae85ul, 0xbb67ae85ul, 0xbb67ae85ul, 0xbb67ae85ul};
+    uint32x4_p8 c = {0x3c6ef372ul, 0x3c6ef372ul, 0x3c6ef372ul, 0x3c6ef372ul};
+    uint32x4_p8 d = {0xa54ff53aul, 0xa54ff53aul, 0xa54ff53aul, 0xa54ff53aul};
+    uint32x4_p8 e = {0x510e527ful, 0x510e527ful, 0x510e527ful, 0x510e527ful};
+    uint32x4_p8 f = {0x9b05688cul, 0x9b05688cul, 0x9b05688cul, 0x9b05688cul};
+    uint32x4_p8 g = {0x1f83d9abul, 0x1f83d9abul, 0x1f83d9abul, 0x1f83d9abul};
+    uint32x4_p8 h = {0x5be0cd19ul, 0x5be0cd19ul, 0x5be0cd19ul, 0x5be0cd19ul};
+
+    uint8x16_p8 w0123_0 = vec_vsx_ld(0 *16 + 0 , in);
+    uint8x16_p8 w4567_0 = vec_vsx_ld(1 *16 + 0 , in);
+    uint8x16_p8 w8901_0 = vec_vsx_ld(2 *16 + 0 , in);
+    uint8x16_p8 w2345_0 = vec_vsx_ld(3 *16 + 0 , in);
+
+    uint8x16_p8 w0123_1 = vec_vsx_ld(0 *16 + 64 , in);
+    uint8x16_p8 w4567_1 = vec_vsx_ld(1 *16 + 64 , in);
+    uint8x16_p8 w8901_1 = vec_vsx_ld(2 *16 + 64 , in);
+    uint8x16_p8 w2345_1 = vec_vsx_ld(3 *16 + 64 , in);
+
+    uint8x16_p8 w0123_2 = vec_vsx_ld(0 *16 + 128, in);
+    uint8x16_p8 w4567_2 = vec_vsx_ld(1 *16 + 128, in);
+    uint8x16_p8 w8901_2 = vec_vsx_ld(2 *16 + 128, in);
+    uint8x16_p8 w2345_2 = vec_vsx_ld(3 *16 + 128, in);
+
+    uint8x16_p8 w0123_3 = vec_vsx_ld(0 *16 + 192, in);
+    uint8x16_p8 w4567_3 = vec_vsx_ld(1 *16 + 192, in);
+    uint8x16_p8 w8901_3 = vec_vsx_ld(2 *16 + 192, in);
+    uint8x16_p8 w2345_3 = vec_vsx_ld(3 *16 + 192, in);
+
+    uint32x4_p8 w0 = pack_bytes<0 >(w0123_0, w0123_1, w0123_2, w0123_3);
+    uint32x4_p8 w1 = pack_bytes<4 >(w0123_0, w0123_1, w0123_2, w0123_3);
+    uint32x4_p8 w2 = pack_bytes<8 >(w0123_0, w0123_1, w0123_2, w0123_3);
+    uint32x4_p8 w3 = pack_bytes<12>(w0123_0, w0123_1, w0123_2, w0123_3);
+
+    uint32x4_p8 w4 = pack_bytes<0 >(w4567_0, w4567_1, w4567_2, w4567_3);
+    uint32x4_p8 w5 = pack_bytes<4 >(w4567_0, w4567_1, w4567_2, w4567_3);
+    uint32x4_p8 w6 = pack_bytes<8 >(w4567_0, w4567_1, w4567_2, w4567_3);
+    uint32x4_p8 w7 = pack_bytes<12>(w4567_0, w4567_1, w4567_2, w4567_3);
+
+    uint32x4_p8 w8 = pack_bytes<0 >(w8901_0, w8901_1, w8901_2, w8901_3);
+    uint32x4_p8 w9 = pack_bytes<4 >(w8901_0, w8901_1, w8901_2, w8901_3);
+    uint32x4_p8 w10 = pack_bytes<8 >(w8901_0, w8901_1, w8901_2, w8901_3);
+    uint32x4_p8 w11 = pack_bytes<12>(w8901_0, w8901_1, w8901_2, w8901_3);
+
+    uint32x4_p8 w12 = pack_bytes<0 >(w2345_0, w2345_1, w2345_2, w2345_3);
+    uint32x4_p8 w13 = pack_bytes<4 >(w2345_0, w2345_1, w2345_2, w2345_3);
+    uint32x4_p8 w14 = pack_bytes<8 >(w2345_0, w2345_1, w2345_2, w2345_3);
+    uint32x4_p8 w15 = pack_bytes<12>(w2345_0, w2345_1, w2345_2, w2345_3);
+
+    uint32x4_p8 k = (uint32x4_p8)vec_ld(0, K);
+    KRound(a, b, c, d, e, f, g, h, vec_splat(k, 0), w0);
+    KRound(h, a, b, c, d, e, f, g, vec_splat(k, 1), w1);
+    KRound(g, h, a, b, c, d, e, f, vec_splat(k, 2), w2);
+    KRound(f, g, h, a, b, c, d, e, vec_splat(k, 3), w3);
+    k = (uint32x4_p8)vec_ld(1*16, K);
+    KRound(e, f, g, h, a, b, c, d, vec_splat(k, 0), w4);
+    KRound(d, e, f, g, h, a, b, c, vec_splat(k, 1), w5);
+    KRound(c, d, e, f, g, h, a, b, vec_splat(k, 2), w6);
+    KRound(b, c, d, e, f, g, h, a, vec_splat(k, 3), w7);
+    k = (uint32x4_p8)vec_ld(2*16, K);
+    KRound(a, b, c, d, e, f, g, h, vec_splat(k, 0), w8);
+    KRound(h, a, b, c, d, e, f, g, vec_splat(k, 1), w9);
+    KRound(g, h, a, b, c, d, e, f, vec_splat(k, 2), w10);
+    KRound(f, g, h, a, b, c, d, e, vec_splat(k, 3), w11);
+    k = (uint32x4_p8)vec_ld(3*16, K);
+    KRound(e, f, g, h, a, b, c, d, vec_splat(k, 0), w12);
+    KRound(d, e, f, g, h, a, b, c, vec_splat(k, 1), w13);
+    KRound(c, d, e, f, g, h, a, b, vec_splat(k, 2), w14);
+    KRound(b, c, d, e, f, g, h, a, vec_splat(k, 3), w15);
+
+    for (int i = 0; i < 3; i++) {
+        k = (uint32x4_p8)vec_ld((4+4*i)*16, K);
+        KRound(a, b, c, d, e, f, g, h, vec_splat(k, 0), (w0 += sigma1(w14) + w9 + sigma0(w1)));
+        KRound(h, a, b, c, d, e, f, g, vec_splat(k, 1), (w1 += sigma1(w15) + w10 + sigma0(w2)));
+        KRound(g, h, a, b, c, d, e, f, vec_splat(k, 2), (w2 += sigma1(w0) + w11 + sigma0(w3)));
+        KRound(f, g, h, a, b, c, d, e, vec_splat(k, 3), (w3 += sigma1(w1) + w12 + sigma0(w4)));
+        k = (uint32x4_p8)vec_ld((5+4*i)*16, K);
+        KRound(e, f, g, h, a, b, c, d, vec_splat(k, 0), (w4 += sigma1(w2) + w13 + sigma0(w5)));
+        KRound(d, e, f, g, h, a, b, c, vec_splat(k, 1), (w5 += sigma1(w3) + w14 + sigma0(w6)));
+        KRound(c, d, e, f, g, h, a, b, vec_splat(k, 2), (w6 += sigma1(w4) + w15 + sigma0(w7)));
+        KRound(b, c, d, e, f, g, h, a, vec_splat(k, 3), (w7 += sigma1(w5) + w0 + sigma0(w8)));
+        k = (uint32x4_p8)vec_ld((6+4*i)*16, K);
+        KRound(a, b, c, d, e, f, g, h, vec_splat(k, 0), (w8 += sigma1(w6) + w1 + sigma0(w9)));
+        KRound(h, a, b, c, d, e, f, g, vec_splat(k, 1), (w9 += sigma1(w7) + w2 + sigma0(w10)));
+        KRound(g, h, a, b, c, d, e, f, vec_splat(k, 2), (w10 += sigma1(w8) + w3 + sigma0(w11)));
+        KRound(f, g, h, a, b, c, d, e, vec_splat(k, 3), (w11 += sigma1(w9) + w4 + sigma0(w12)));
+        k = (uint32x4_p8)vec_ld((7+4*i)*16, K);
+        KRound(e, f, g, h, a, b, c, d, vec_splat(k, 0), (w12 += sigma1(w10) + w5 + sigma0(w13)));
+        KRound(d, e, f, g, h, a, b, c, vec_splat(k, 1), (w13 += sigma1(w11) + w6 + sigma0(w14)));
+        KRound(c, d, e, f, g, h, a, b, vec_splat(k, 2), (w14 += sigma1(w12) + w7 + sigma0(w15)));
+        KRound(b, c, d, e, f, g, h, a, vec_splat(k, 3), (w15 += sigma1(w13) + w8 + sigma0(w0)));
+    }
+
+    a += uint32x4_p8{0x6a09e667ul, 0x6a09e667ul, 0x6a09e667ul, 0x6a09e667ul};
+    b += uint32x4_p8{0xbb67ae85ul, 0xbb67ae85ul, 0xbb67ae85ul, 0xbb67ae85ul};
+    c += uint32x4_p8{0x3c6ef372ul, 0x3c6ef372ul, 0x3c6ef372ul, 0x3c6ef372ul};
+    d += uint32x4_p8{0xa54ff53aul, 0xa54ff53aul, 0xa54ff53aul, 0xa54ff53aul};
+    e += uint32x4_p8{0x510e527ful, 0x510e527ful, 0x510e527ful, 0x510e527ful};
+    f += uint32x4_p8{0x9b05688cul, 0x9b05688cul, 0x9b05688cul, 0x9b05688cul};
+    g += uint32x4_p8{0x1f83d9abul, 0x1f83d9abul, 0x1f83d9abul, 0x1f83d9abul};
+    h += uint32x4_p8{0x5be0cd19ul, 0x5be0cd19ul, 0x5be0cd19ul, 0x5be0cd19ul};
+
+    uint32x4_p8 t0 = a;
+    uint32x4_p8 t1 = b;
+    uint32x4_p8 t2 = c;
+    uint32x4_p8 t3 = d;
+    uint32x4_p8 t4 = e;
+    uint32x4_p8 t5 = f;
+    uint32x4_p8 t6 = g;
+    uint32x4_p8 t7 = h;
+
+    KRound2(a, b, c, d, e, f, g, h, 0xc28a2f98);
+    KRound2(h, a, b, c, d, e, f, g, 0x71374491);
+    KRound2(g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+    KRound2(f, g, h, a, b, c, d, e, 0xe9b5dba5);
+    KRound2(e, f, g, h, a, b, c, d, 0x3956c25b);
+    KRound2(d, e, f, g, h, a, b, c, 0x59f111f1);
+    KRound2(c, d, e, f, g, h, a, b, 0x923f82a4);
+    KRound2(b, c, d, e, f, g, h, a, 0xab1c5ed5);
+    KRound2(a, b, c, d, e, f, g, h, 0xd807aa98);
+    KRound2(h, a, b, c, d, e, f, g, 0x12835b01);
+    KRound2(g, h, a, b, c, d, e, f, 0x243185be);
+    KRound2(f, g, h, a, b, c, d, e, 0x550c7dc3);
+    KRound2(e, f, g, h, a, b, c, d, 0x72be5d74);
+    KRound2(d, e, f, g, h, a, b, c, 0x80deb1fe);
+    KRound2(c, d, e, f, g, h, a, b, 0x9bdc06a7);
+    KRound2(b, c, d, e, f, g, h, a, 0xc19bf374);
+    KRound2(a, b, c, d, e, f, g, h, 0x649b69c1);
+    KRound2(h, a, b, c, d, e, f, g, 0xf0fe4786);
+    KRound2(g, h, a, b, c, d, e, f, 0x0fe1edc6);
+    KRound2(f, g, h, a, b, c, d, e, 0x240cf254);
+    KRound2(e, f, g, h, a, b, c, d, 0x4fe9346f);
+    KRound2(d, e, f, g, h, a, b, c, 0x6cc984be);
+    KRound2(c, d, e, f, g, h, a, b, 0x61b9411e);
+    KRound2(b, c, d, e, f, g, h, a, 0x16f988fa);
+    KRound2(a, b, c, d, e, f, g, h, 0xf2c65152);
+    KRound2(h, a, b, c, d, e, f, g, 0xa88e5a6d);
+    KRound2(g, h, a, b, c, d, e, f, 0xb019fc65);
+    KRound2(f, g, h, a, b, c, d, e, 0xb9d99ec7);
+    KRound2(e, f, g, h, a, b, c, d, 0x9a1231c3);
+    KRound2(d, e, f, g, h, a, b, c, 0xe70eeaa0);
+    KRound2(c, d, e, f, g, h, a, b, 0xfdb1232b);
+    KRound2(b, c, d, e, f, g, h, a, 0xc7353eb0);
+    KRound2(a, b, c, d, e, f, g, h, 0x3069bad5);
+    KRound2(h, a, b, c, d, e, f, g, 0xcb976d5f);
+    KRound2(g, h, a, b, c, d, e, f, 0x5a0f118f);
+    KRound2(f, g, h, a, b, c, d, e, 0xdc1eeefd);
+    KRound2(e, f, g, h, a, b, c, d, 0x0a35b689);
+    KRound2(d, e, f, g, h, a, b, c, 0xde0b7a04);
+    KRound2(c, d, e, f, g, h, a, b, 0x58f4ca9d);
+    KRound2(b, c, d, e, f, g, h, a, 0xe15d5b16);
+    KRound2(a, b, c, d, e, f, g, h, 0x007f3e86);
+    KRound2(h, a, b, c, d, e, f, g, 0x37088980);
+    KRound2(g, h, a, b, c, d, e, f, 0xa507ea32);
+    KRound2(f, g, h, a, b, c, d, e, 0x6fab9537);
+    KRound2(e, f, g, h, a, b, c, d, 0x17406110);
+    KRound2(d, e, f, g, h, a, b, c, 0x0d8cd6f1);
+    KRound2(c, d, e, f, g, h, a, b, 0xcdaa3b6d);
+    KRound2(b, c, d, e, f, g, h, a, 0xc0bbbe37);
+    KRound2(a, b, c, d, e, f, g, h, 0x83613bda);
+    KRound2(h, a, b, c, d, e, f, g, 0xdb48a363);
+    KRound2(g, h, a, b, c, d, e, f, 0x0b02e931);
+    KRound2(f, g, h, a, b, c, d, e, 0x6fd15ca7);
+    KRound2(e, f, g, h, a, b, c, d, 0x521afaca);
+    KRound2(d, e, f, g, h, a, b, c, 0x31338431);
+    KRound2(c, d, e, f, g, h, a, b, 0x6ed41a95);
+    KRound2(b, c, d, e, f, g, h, a, 0x6d437890);
+    KRound2(a, b, c, d, e, f, g, h, 0xc39c91f2);
+    KRound2(h, a, b, c, d, e, f, g, 0x9eccabbd);
+    KRound2(g, h, a, b, c, d, e, f, 0xb5c9a0e6);
+    KRound2(f, g, h, a, b, c, d, e, 0x532fb63c);
+    KRound2(e, f, g, h, a, b, c, d, 0xd2c741c6);
+    KRound2(d, e, f, g, h, a, b, c, 0x07237ea3);
+    KRound2(c, d, e, f, g, h, a, b, 0xa4954b68);
+    KRound2(b, c, d, e, f, g, h, a, 0x4c191d76);
+
+
+    w0 = t0 + a;
+    w1 = t1 + b;
+    w2 = t2 + c;
+    w3 = t3 + d;
+    w4 = t4 + e;
+    w5 = t5 + f;
+    w6 = t6 + g;
+    w7 = t7 + h;
+
+    a = uint32x4_p8{0x6a09e667ul, 0x6a09e667ul, 0x6a09e667ul, 0x6a09e667ul};
+    b = uint32x4_p8{0xbb67ae85ul, 0xbb67ae85ul, 0xbb67ae85ul, 0xbb67ae85ul};
+    c = uint32x4_p8{0x3c6ef372ul, 0x3c6ef372ul, 0x3c6ef372ul, 0x3c6ef372ul};
+    d = uint32x4_p8{0xa54ff53aul, 0xa54ff53aul, 0xa54ff53aul, 0xa54ff53aul};
+    e = uint32x4_p8{0x510e527ful, 0x510e527ful, 0x510e527ful, 0x510e527ful};
+    f = uint32x4_p8{0x9b05688cul, 0x9b05688cul, 0x9b05688cul, 0x9b05688cul};
+    g = uint32x4_p8{0x1f83d9abul, 0x1f83d9abul, 0x1f83d9abul, 0x1f83d9abul};
+    h = uint32x4_p8{0x5be0cd19ul, 0x5be0cd19ul, 0x5be0cd19ul, 0x5be0cd19ul};
+
+    Round(a, b, c, d, e, f, g, h, 0x428a2f98, w0);
+    Round(h, a, b, c, d, e, f, g, 0x71374491, w1);
+    Round(g, h, a, b, c, d, e, f, 0xb5c0fbcf, w2);
+    Round(f, g, h, a, b, c, d, e, 0xe9b5dba5, w3);
+    Round(e, f, g, h, a, b, c, d, 0x3956c25b, w4);
+    Round(d, e, f, g, h, a, b, c, 0x59f111f1, w5);
+    Round(c, d, e, f, g, h, a, b, 0x923f82a4, w6);
+    Round(b, c, d, e, f, g, h, a, 0xab1c5ed5, w7);
+    Round2(a, b, c, d, e, f, g, h, 0x5807aa98);
+    Round2(h, a, b, c, d, e, f, g, 0x12835b01);
+    Round2(g, h, a, b, c, d, e, f, 0x243185be);
+    Round2(f, g, h, a, b, c, d, e, 0x550c7dc3);
+    Round2(e, f, g, h, a, b, c, d, 0x72be5d74);
+    Round2(d, e, f, g, h, a, b, c, 0x80deb1fe);
+    Round2(c, d, e, f, g, h, a, b, 0x9bdc06a7);
+    Round2(b, c, d, e, f, g, h, a, 0xc19bf274);
+    Round(a, b, c, d, e, f, g, h, 0xe49b69c1, (w0 += sigma0(w1)));
+    w1 += uint32x4_p8{0xa00000, 0xa00000, 0xa00000, 0xa00000};
+    Round(h, a, b, c, d, e, f, g, 0xefbe4786, (w1 += sigma0(w2)));
+    Round(g, h, a, b, c, d, e, f, 0x0fc19dc6, (w2 += sigma1(w0) + sigma0(w3)));
+    Round(f, g, h, a, b, c, d, e, 0x240ca1cc, (w3 += sigma1(w1) + sigma0(w4)));
+    Round(e, f, g, h, a, b, c, d, 0x2de92c6f, (w4 += sigma1(w2) + sigma0(w5)));
+    Round(d, e, f, g, h, a, b, c, 0x4a7484aa, (w5 += sigma1(w3) + sigma0(w6)));
+    w6 += uint32x4_p8{0x100, 0x100, 0x100, 0x100};
+    Round(c, d, e, f, g, h, a, b, 0x5cb0a9dc, (w6 += sigma1(w4) + sigma0(w7)));
+    w7 += uint32x4_p8{0x11002000, 0x11002000, 0x11002000, 0x11002000};
+    Round(b, c, d, e, f, g, h, a, 0x76f988da, (w7 += sigma1(w5) + w0));
+    w8 = uint32x4_p8{0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    Round(a, b, c, d, e, f, g, h, 0x983e5152, (w8 += sigma1(w6) + w1));
+    Round(h, a, b, c, d, e, f, g, 0xa831c66d, (w9 = sigma1(w7) + w2));
+    Round(g, h, a, b, c, d, e, f, 0xb00327c8, (w10 = sigma1(w8) + w3));
+    Round(f, g, h, a, b, c, d, e, 0xbf597fc7, (w11 = sigma1(w9) + w4));
+    Round(e, f, g, h, a, b, c, d, 0xc6e00bf3, (w12 = sigma1(w10) + w5));
+    Round(d, e, f, g, h, a, b, c, 0xd5a79147, (w13 = sigma1(w11) + w6));
+    w14 = uint32x4_p8{0x400022, 0x400022, 0x400022, 0x400022};
+    Round(c, d, e, f, g, h, a, b, 0x06ca6351, (w14 += sigma1(w12) + w7));
+    w15 = uint32x4_p8{0x100, 0x100, 0x100, 0x100};
+    Round(b, c, d, e, f, g, h, a, 0x14292967, (w15 += sigma1(w13) + w8 + sigma0(w0)));
+    Round(a, b, c, d, e, f, g, h, 0x27b70a85, (w0 += sigma1(w14) + w9 + sigma0(w1)));
+    Round(h, a, b, c, d, e, f, g, 0x2e1b2138, (w1 += sigma1(w15) + w10 + sigma0(w2)));
+    Round(g, h, a, b, c, d, e, f, 0x4d2c6dfc, (w2 += sigma1(w0) + w11 + sigma0(w3)));
+    Round(f, g, h, a, b, c, d, e, 0x53380d13, (w3 += sigma1(w1) + w12 + sigma0(w4)));
+    Round(e, f, g, h, a, b, c, d, 0x650a7354, (w4 += sigma1(w2) + w13 + sigma0(w5)));
+    Round(d, e, f, g, h, a, b, c, 0x766a0abb, (w5 += sigma1(w3) + w14 + sigma0(w6)));
+    Round(c, d, e, f, g, h, a, b, 0x81c2c92e, (w6 += sigma1(w4) + w15 + sigma0(w7)));
+    Round(b, c, d, e, f, g, h, a, 0x92722c85, (w7 += sigma1(w5) + w0 + sigma0(w8)));
+    Round(a, b, c, d, e, f, g, h, 0xa2bfe8a1, (w8 += sigma1(w6) + w1 + sigma0(w9)));
+    Round(h, a, b, c, d, e, f, g, 0xa81a664b, (w9 += sigma1(w7) + w2 + sigma0(w10)));
+    Round(g, h, a, b, c, d, e, f, 0xc24b8b70, (w10 += sigma1(w8) + w3 + sigma0(w11)));
+    Round(f, g, h, a, b, c, d, e, 0xc76c51a3, (w11 += sigma1(w9) + w4 + sigma0(w12)));
+    Round(e, f, g, h, a, b, c, d, 0xd192e819, (w12 += sigma1(w10) + w5 + sigma0(w13)));
+    Round(d, e, f, g, h, a, b, c, 0xd6990624, (w13 += sigma1(w11) + w6 + sigma0(w14)));
+    Round(c, d, e, f, g, h, a, b, 0xf40e3585, (w14 += sigma1(w12) + w7 + sigma0(w15)));
+    Round(b, c, d, e, f, g, h, a, 0x106aa070, (w15 += sigma1(w13) + w8 + sigma0(w0)));
+    Round(a, b, c, d, e, f, g, h, 0x19a4c116, (w0 += sigma1(w14) + w9 + sigma0(w1)));
+    Round(h, a, b, c, d, e, f, g, 0x1e376c08, (w1 += sigma1(w15) + w10 + sigma0(w2)));
+    Round(g, h, a, b, c, d, e, f, 0x2748774c, (w2 += sigma1(w0) + w11 + sigma0(w3)));
+    Round(f, g, h, a, b, c, d, e, 0x34b0bcb5, (w3 += sigma1(w1) + w12 + sigma0(w4)));
+    Round(e, f, g, h, a, b, c, d, 0x391c0cb3, (w4 += sigma1(w2) + w13 + sigma0(w5)));
+    Round(d, e, f, g, h, a, b, c, 0x4ed8aa4a, (w5 += sigma1(w3) + w14 + sigma0(w6)));
+    Round(c, d, e, f, g, h, a, b, 0x5b9cca4f, (w6 += sigma1(w4) + w15 + sigma0(w7)));
+    Round(b, c, d, e, f, g, h, a, 0x682e6ff3, (w7 += sigma1(w5) + w0 + sigma0(w8)));
+    Round(a, b, c, d, e, f, g, h, 0x748f82ee, (w8 += sigma1(w6) + w1 + sigma0(w9)));
+    Round(h, a, b, c, d, e, f, g, 0x78a5636f, (w9 += sigma1(w7) + w2 + sigma0(w10)));
+    Round(g, h, a, b, c, d, e, f, 0x84c87814, (w10 += sigma1(w8) + w3 + sigma0(w11)));
+    Round(f, g, h, a, b, c, d, e, 0x8cc70208, (w11 += sigma1(w9) + w4 + sigma0(w12)));
+    Round(e, f, g, h, a, b, c, d, 0x90befffa, (w12 += sigma1(w10) + w5 + sigma0(w13)));
+    Round(d, e, f, g, h, a, b, c, 0xa4506ceb, (w13 += sigma1(w11) + w6 + sigma0(w14)));
+    Round(c, d, e, f, g, h, a, b, 0xbef9a3f7, (w14 + sigma1(w12) + w7 + sigma0(w15)));
+    Round(b, c, d, e, f, g, h, a, 0xc67178f2, (w15 + sigma1(w13) + w8 + sigma0(w0)));
+
+    a += uint32x4_p8{0x6a09e667ul, 0x6a09e667ul, 0x6a09e667ul, 0x6a09e667ul};
+    b += uint32x4_p8{0xbb67ae85ul, 0xbb67ae85ul, 0xbb67ae85ul, 0xbb67ae85ul};
+    c += uint32x4_p8{0x3c6ef372ul, 0x3c6ef372ul, 0x3c6ef372ul, 0x3c6ef372ul};
+    d += uint32x4_p8{0xa54ff53aul, 0xa54ff53aul, 0xa54ff53aul, 0xa54ff53aul};
+    e += uint32x4_p8{0x510e527ful, 0x510e527ful, 0x510e527ful, 0x510e527ful};
+    f += uint32x4_p8{0x9b05688cul, 0x9b05688cul, 0x9b05688cul, 0x9b05688cul};
+    g += uint32x4_p8{0x1f83d9abul, 0x1f83d9abul, 0x1f83d9abul, 0x1f83d9abul};
+    h += uint32x4_p8{0x5be0cd19ul, 0x5be0cd19ul, 0x5be0cd19ul, 0x5be0cd19ul};
+
+    w0123_0 = (uint8x16_p8)pack_bytes<0 >((uint8x16_p8)a, (uint8x16_p8)b, (uint8x16_p8)c, (uint8x16_p8)d);
+    w4567_0 = (uint8x16_p8)pack_bytes<0 >((uint8x16_p8)e, (uint8x16_p8)f, (uint8x16_p8)g, (uint8x16_p8)h);
+
+    w0123_1 = (uint8x16_p8)pack_bytes<4 >((uint8x16_p8)a, (uint8x16_p8)b, (uint8x16_p8)c, (uint8x16_p8)d);
+    w4567_1 = (uint8x16_p8)pack_bytes<4 >((uint8x16_p8)e, (uint8x16_p8)f, (uint8x16_p8)g, (uint8x16_p8)h);
+
+    w0123_2 = (uint8x16_p8)pack_bytes<8 >((uint8x16_p8)a, (uint8x16_p8)b, (uint8x16_p8)c, (uint8x16_p8)d);
+    w4567_2 = (uint8x16_p8)pack_bytes<8 >((uint8x16_p8)e, (uint8x16_p8)f, (uint8x16_p8)g, (uint8x16_p8)h);
+
+    w0123_3 = (uint8x16_p8)pack_bytes<12>((uint8x16_p8)a, (uint8x16_p8)b, (uint8x16_p8)c, (uint8x16_p8)d);
+    w4567_3 = (uint8x16_p8)pack_bytes<12>((uint8x16_p8)e, (uint8x16_p8)f, (uint8x16_p8)g, (uint8x16_p8)h);
+
+    vec_vsx_st(w0123_0, 0 *16 + 0 , out);
+    vec_vsx_st(w4567_0, 1 *16 + 0 , out);
+
+    vec_vsx_st(w0123_1, 0 *16 + 32, out);
+    vec_vsx_st(w4567_1, 1 *16 + 32, out);
+
+    vec_vsx_st(w0123_2, 0 *16 + 64, out);
+    vec_vsx_st(w4567_2, 1 *16 + 64, out);
+
+    vec_vsx_st(w0123_3, 0 *16 + 96, out);
+    vec_vsx_st(w4567_3, 1 *16 + 96, out);
+}
+}

From 61b5e37d3939b3466845d7fb5ecf8775fe7209c9 Mon Sep 17 00:00:00 2001
From: Luke Dashjr
Date: Wed, 19 Aug 2020 16:51:36 +0000
Subject: [PATCH 2/3] Bugfix: crypto: Use GNU __vector extension for POWER8 SHA256

Standard Altivec `vector` has a number of issues:
- Strictly speaking, cannot use fixed-size types like uint32_t as we do
- GCC 10 won't allow it with standard C++11 mode
- configure is only checking for __vector working, not vector

Since __vector is near-universal and checked by configure anyway, just use
that instead
---
 src/crypto/sha256_power8.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/crypto/sha256_power8.cpp b/src/crypto/sha256_power8.cpp
index c0e0f8bdba..fe956f66b6 100644
--- a/src/crypto/sha256_power8.cpp
+++ b/src/crypto/sha256_power8.cpp
@@ -22,8 +22,8 @@
 namespace sha256_power8
 {
 
-typedef vector uint32_t uint32x4_p8;
-typedef vector uint8_t uint8x16_p8;
+typedef __vector uint32_t uint32x4_p8;
+typedef __vector uint8_t uint8x16_p8;
 
 //! Gets the first uint32_t from a, b, c, d, converts from BE to host endian, and returns them concatenated
 template <int OFFS> static inline uint32x4_p8 pack_bytes

From 5fa767af29c611516ace6f4bfb3d758351d1a7dd Mon Sep 17 00:00:00 2001
From: Luke Dashjr
Date: Tue, 26 Sep 2023 01:43:24 +0000
Subject: [PATCH 3/3] build: Adapt POWER8 bitcoincrypto for .la libraries

Inspired-by: c1e16cb31f4d8edde8fea310011189b8b272cb07 (#24937)
---
 src/Makefile.am | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/Makefile.am b/src/Makefile.am
index 56bfd4b022..5acadab865 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -67,7 +67,7 @@ LIBBITCOIN_CRYPTO_ARM_SHANI = crypto/libbitcoin_crypto_arm_shani.la
 LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_ARM_SHANI)
 endif
 if ENABLE_POWER8
-LIBBITCOIN_CRYPTO_POWER8 = crypto/libbitcoin_crypto_power8.a
+LIBBITCOIN_CRYPTO_POWER8 = crypto/libbitcoin_crypto_power8.la
 LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_POWER8)
 endif
 noinst_LTLIBRARIES += $(LIBBITCOIN_CRYPTO)
@@ -629,11 +629,15 @@ crypto_libbitcoin_crypto_arm_shani_la_CPPFLAGS += -DENABLE_ARM_SHANI
 crypto_libbitcoin_crypto_arm_shani_la_SOURCES = crypto/sha256_arm_shani.cpp
 #
-crypto_libbitcoin_crypto_power8_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
-crypto_libbitcoin_crypto_power8_a_CPPFLAGS = $(AM_CPPFLAGS)
-crypto_libbitcoin_crypto_power8_a_CXXFLAGS += $(POWER8_CXXFLAGS)
-crypto_libbitcoin_crypto_power8_a_CPPFLAGS += -DENABLE_POWER8
-crypto_libbitcoin_crypto_power8_a_SOURCES = crypto/sha256_power8.cpp
+# See explanation for -static in crypto_libbitcoin_crypto_base_la's LDFLAGS and
+# CXXFLAGS above
+crypto_libbitcoin_crypto_power8_la_LDFLAGS = $(AM_LDFLAGS) -static
+crypto_libbitcoin_crypto_power8_la_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS) -static
+crypto_libbitcoin_crypto_power8_la_CPPFLAGS = $(AM_CPPFLAGS)
+crypto_libbitcoin_crypto_power8_la_CXXFLAGS += $(POWER8_CXXFLAGS)
+crypto_libbitcoin_crypto_power8_la_CPPFLAGS += -DENABLE_POWER8
+crypto_libbitcoin_crypto_power8_la_SOURCES = crypto/sha256_power8.cpp
+#
 
 # consensus
 #
 libbitcoin_consensus_a_CPPFLAGS = $(AM_CPPFLAGS) $(BITCOIN_INCLUDES)
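
A few notes follow; none of them are part of the patches themselves.

The runtime dispatch in SHA256AutoDetect keys off the AT_HWCAP2 auxiliary vector: 0x02000000 is the PPC_FEATURE2_VEC_CRYPTO capability bit (the name glibc gives it in <bits/hwcap.h>), which the kernel sets when the vector crypto category — including the vshasigmaw instruction used above — is present. A minimal standalone sketch of the same check; HavePower8VecCrypto is a hypothetical helper name, not something defined in the patches:

    #include <sys/auxv.h>

    // Illustrative sketch: report whether the kernel advertises the POWER8
    // vector crypto category on this CPU.
    static bool HavePower8VecCrypto()
    {
    #if defined(__linux__) && defined(AT_HWCAP2)
        return (getauxval(AT_HWCAP2) & 0x02000000) != 0; // PPC_FEATURE2_VEC_CRYPTO
    #else
        return false;
    #endif
    }

getauxval(AT_HWCAP2) returns 0 when the entry is absent, so on older kernels the check degrades safely to the C implementation.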
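The round macros all lean on __builtin_crypto_vshasigmaw, which evaluates the SHA-256 sigma functions on each 32-bit lane: a second argument of 1 selects the "big" Sigma functions and 0 the "small" sigma functions, while the mask 0 picks Sigma0/sigma0 and 0xf picks Sigma1/sigma1 for all four lanes. Per lane, KRound is therefore the textbook FIPS 180-4 round. A scalar model under those assumptions (names here are illustrative):

    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
    // FIPS 180-4 round functions; vec_sel((z), (y), (x)) is Ch and
    // vec_sel((y), (z), vec_xor((x), (y))) is Maj, bit for bit.
    static inline uint32_t Sigma0(uint32_t x) { return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22); }
    static inline uint32_t Sigma1(uint32_t x) { return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25); }
    static inline uint32_t Ch(uint32_t x, uint32_t y, uint32_t z) { return z ^ (x & (y ^ z)); }
    static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) | (z & (x | y)); }

    // One SHA-256 round, as KRound performs it on each of the four lanes.
    static inline void RoundScalar(uint32_t a, uint32_t b, uint32_t c, uint32_t& d,
                                   uint32_t e, uint32_t f, uint32_t g, uint32_t& h,
                                   uint32_t k, uint32_t w)
    {
        uint32_t t1 = h + Sigma1(e) + Ch(e, f, g) + k + w;
        uint32_t t2 = Sigma0(a) + Maj(a, b, c);
        d += t1;
        h = t1 + t2;
    }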
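pack_bytes<OFFS> is the transposition step that makes the 4-way layout work: lane i of the result is the 32-bit big-endian word at byte offset OFFS of the i-th input, so four independent message blocks become one vector per schedule word. A rough scalar equivalent, glossing over the LE/BE element-ordering details that the two vec_perm calls handle:

    #include <stdint.h>

    // Big-endian 32-bit load, as SHA-256 interprets message bytes.
    static inline uint32_t be32_load(const unsigned char* p)
    {
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] << 8) | (uint32_t)p[3];
    }
    // pack_bytes<OFFS>(a, b, c, d) ~= { be32_load(a + OFFS), be32_load(b + OFFS),
    //                                   be32_load(c + OFFS), be32_load(d + OFFS) }
    // where a..d are 16-byte blocks from the four input streams.

The same routine is reused at the end of Transform_4way, with the state vectors cast back to bytes, to transpose the four finished hashes into four consecutive 32-byte outputs.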
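Finally, the odd-looking KRound2/Round2 constants are not new magic: they appear to be the standard K[i] with the known message words of the fixed padding blocks folded in, and the w1/w6/w7/w14 adjustments are precomputed sigma values of those constant words. A few spot checks, verifiable at compile time:

    // All sums are mod 2^32; K[i] values taken from the K[] table above.
    static_assert(0x428a2f98u + 0x80000000u == 0xc28a2f98u, "K[0] + w0 of the padding block");
    static_assert(0xc19bf174u + 0x200u == 0xc19bf374u, "K[15] + 512-bit length of the first block");
    static_assert(0xd807aa98u + 0x80000000u == 0x5807aa98u, "K[8] + w8 of the second hash's block");
    static_assert(0xc19bf174u + 0x100u == 0xc19bf274u, "K[15] + 256-bit length of the second hash input");

Likewise the 0xa00000 added to w1 is sigma1(0x100), the schedule contribution of the constant length word w15 = 0x100, computed once at build time rather than per call.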