diff --git a/README.md b/README.md
index e69de29..16b4196 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,9 @@
+# Mining pool algorithm files
+```
+Build commands (gcc compiler, Ubuntu)
+blake2b:
+sha3x:
+blake3:
+heavyHash:
+randomx:
+```
\ No newline at end of file
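The README reserves a build-command slot per algorithm but leaves every one of them blank. The only build recipe actually present in this patch is the heavyHash Makefile further down (gcc with -Wall, archived into libkas.a), so the following invocations are assumptions for illustration, not commands taken from the source:

```
# Assumed build lines -- the README leaves these blank.
blake2b:   gcc -Wall -c blake2b/blake2b.c
heavyHash: make -C heavyHash        # produces libkas.a via the provided Makefile
```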
diff --git a/blake2b/blake2b.c b/blake2b/blake2b.c
new file mode 100644
index 0000000..452d7ab
--- /dev/null
+++ b/blake2b/blake2b.c
@@ -0,0 +1,237 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include "blake2b.h"
+
+#ifndef ROTR64
+#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
+#endif
+// Little-endian byte access.
+#define B2B_GET64(p) \
+(((uint64_t) ((uint8_t *) (p))[0]) ^ \
+(((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \
+(((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
+(((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
+(((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
+(((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
+(((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
+(((uint64_t) ((uint8_t *) (p))[7]) << 56))
+// G Mixing function.
+#define B2B_G(a, b, c, d, x, y) { \
+v[a] = v[a] + v[b] + x; \
+v[d] = ROTR64(v[d] ^ v[a], 32); \
+v[c] = v[c] + v[d]; \
+v[b] = ROTR64(v[b] ^ v[c], 24); \
+v[a] = v[a] + v[b] + y; \
+v[d] = ROTR64(v[d] ^ v[a], 16); \
+v[c] = v[c] + v[d]; \
+v[b] = ROTR64(v[b] ^ v[c], 63); \
+}
+// Initialization Vector.
+static const uint64_t blake2b_iv[8] = {
+ 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
+ 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
+ 0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
+ 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
+};
+
+unsigned int be32toh(unsigned int x)
+{
+ return (((x & 0xff000000U) >> 24) | ((x & 0x00ff0000U) >> 8) |
+ ((x & 0x0000ff00U) << 8) | ((x & 0x000000ffU) << 24));
+}
+
+static void blake2b_compress(blake2b_ctx *ctx, int last)
+{
+ const uint8_t sigma[12][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+ };
+int i;
+uint64_t v[16], m[16];
+//long int v15;
+for (i = 0; i < 8; i++) { // init work variables
+ v[i] = ctx->h[i];
+ v[i + 8] = blake2b_iv[i];
+}
+//v15= v[15];
+//printf("the v15=%02lx\n" ,v15);
+
+v[12] ^= ctx->t[0]; // low 64 bits of offset
+v[13] ^= ctx->t[1]; // high 64 bits
+
+//printf("ctx->t[0]= %016llx\n",ctx->t[0]);
+//printf("ctx->t[1]= %016llx\n",ctx->t[1]);
+
+if (last) // last block flag set ?
+ v[14] = ~v[14];
+
+for (i = 0; i < 16; i++) // get little-endian words
+ m[i] = B2B_GET64(&ctx->b[8 * i]);
+
+//for (int i = 0; i < 16; ++i) printf("v[%0d]=%016llx\n", i,v[i]);
+//for (int i = 0; i < 16; ++i) printf("m[%0d]=%016llx\n", i,m[i]);
+
+for (i = 0; i < 12; i++) { // twelve rounds
+ //printf("i=%0d\n",i);
+ //for (int i = 0; i < 16; ++i) printf("v[%0d]=%016llx\n", i,v[i]);
+ //for (int i = 0; i < 16; ++i) printf("m[%0d]=%016llx\n", i,m[i]);
+
+ B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
+ B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
+ B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
+ B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
+ B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
+ B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
+ B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
+ B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
+
+ }
+ //for (int i = 0; i < 16; ++i) printf("v[%0d]=%016llx\n", i,v[i]);
+//v15= v[15];
+//printf("the v15=%02lx\n" ,v15);
+for( i = 0; i < 8; ++i )
+ ctx->h[i] ^= v[i] ^ v[i + 8];
+//v15= v[15];
+//printf("the v15=%02lx\n" ,v15);
+}
+
+void blake2b_update(blake2b_ctx *ctx, const void *in, size_t inlen) // data bytes
+{
+size_t i;
+for (i = 0; i < inlen; i++) {
+ if (ctx->c == 128) { // buffer full ?
+ ctx->t[0] += ctx->c; // add counters
+ if (ctx->t[0] < ctx->c) // carry overflow ?
+ ctx->t[1]++; // high word
+ blake2b_compress(ctx, 0); // compress (not last)
+ ctx->c = 0; // counter to zero
+ //for (int i = 0; i < 8; ++i) printf("ctx->h[%0d]=%016llx\n",i,ctx->h[i]);
+ }
+ ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
+ }
+/*
+int t0,t1;
+t0 = ctx->t[0];
+t1 = ctx->t[1];
+printf("the t[0]=%02x,the t[1]=%02x\n", t0,t1);
+*/
+ //printf("the t[0]=%02x,the t[1]=%02x\n", ctx->t[0],ctx->t[1]);
+}
+
+int blake2b_init(blake2b_ctx *ctx, size_t outlen) // (keylen=0: no key)
+{
+//size_t i;
+
+//if (outlen == 0 || outlen > 64 || keylen > 64)
+// return -1; // illegal parameters
+//
+//for (i = 0; i < 8; i++) // state, "param block"
+// ctx->h[i] = blake2b_iv[i];
+//
+// ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
+// ctx->t[0] = 0; // input count low word
+// ctx->t[1] = 0; // input count high word
+// ctx->c = 0; // pointer within buffer
+// ctx->outlen = outlen;
+//
+//for (i = keylen; i < 128; i++) // zero input block
+// ctx->b[i] = 0;
+//if (keylen > 0) {
+// blake2b_update(ctx, key, keylen);
+// ctx->c = 128; // at the end
+// }
+ctx->h[0]= 0x6a09e667f2bdc93a; // = blake2b_iv[0] ^ 0x01010032 (keylen 0, outlen 50)
+ctx->h[1]= 0xbb67ae8584caa73b;
+ctx->h[2]= 0x3c6ef372fe94f82b;
+ctx->h[3]= 0xa54ff53a5f1d36f1;
+ctx->h[4]= 0x510e527fade682d1;
+ctx->h[5]= 0x9b05688c2b3e6c1f;
+ctx->h[6]= 0x48ec89c38820de31;
+ctx->h[7]= 0x5be0cd10137e21b1;
+ctx->t[0] = 0; // input count low word
+ctx->t[1] = 0; // input count high word
+ctx->c = 0; // pointer within buffer
+ctx->outlen = outlen;
+return 0;
+}
+
+void blake2b_final(blake2b_ctx *ctx, void *out)
+{
+size_t i;
+ctx->t[0] += ctx->c; // mark last block offset
+if (ctx->t[0] < ctx->c) // carry overflow
+ ctx->t[1]++; // high word
+while (ctx->c < 128) // fill up with zeros
+ ctx->b[ctx->c++] = 0;
+//printf("the msg is :\n");
+//for (int i = 0; i < 128; ++i) printf("%02x",ctx->b[i]);
+//printf("\n");
+blake2b_compress(ctx, 1); // final block flag = 1
+// little endian convert and store
+/*
+int t0,t1;
+t0 = ctx->t[0];
+t1 = ctx->t[1];
+printf("the t[0]=%02x,the t[1]=%02x\n", t0,t1);
+*/
+//for (i = 0; i < 128; i++)
+// ((uint8_t *) msg_s1)[i] = ctx->b[i];
+//for (int i = 0; i < 8; ++i) printf("ctx->h[%0d]=%016llx\n",i,ctx->h[i]);
+//printf("ctx->outlen= %0d\n",ctx->outlen );
+for (i = 0; i < ctx->outlen; i++) {
+ ((uint8_t *) out)[i] = (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF;
+}
+}
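blake2b_init() above skips the usual BLAKE2b parameter-block setup and loads precomputed state words instead. h[1]..h[5] equal the standard BLAKE2b IV; h[0] equals the IV word xored with the parameter word 0x01010000 ^ (keylen << 8) ^ outlen for keylen = 0 and outlen = 50 (0x32); h[6] and h[7] do not match the plain IV, so salt/personalization material appears to be baked in as well. A standalone check of the h[0] relation:

```c
#include <assert.h>
#include <stdint.h>

int main(void) {
    /* BLAKE2b IV word 0, xored with parameter word
       0x01010000 ^ (keylen << 8) ^ outlen, keylen = 0, outlen = 0x32 (50). */
    uint64_t iv0 = 0x6A09E667F3BCC908ULL;
    assert((iv0 ^ 0x01010032ULL) == 0x6a09e667f2bdc93aULL);
    return 0;
}
```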
+
+/*
+int blake2b(void *out, size_t outlen,const void *key, size_t keylen,const void *in, size_t inlen,void *msg_s1)
+{
+ blake2b_ctx ctx;
+ if (blake2b_init(&ctx, outlen, key, keylen))
+ return -1;
+ blake2b_update(&ctx, in, inlen);
+ blake2b_final(&ctx, out);
+ return 0;
+}
+
+int main(int argc, char** argv) {
+int i,j;
+uint8_t md[50];
+uint8_t msg_s1[128];
+
+uint8_t in[140+4] = {
+ 0x04,0x00,0x00,0x00,
+ 0xe5,0x4c,0x27,0x54,0x40,0x50,0x66,0x8f,0x27,0x2e,0xc3,0xb4,0x60,0xe1,0xcd,0xe7,
+ 0x45,0xc6,0xb2,0x12,0x39,0xa8,0x1d,0xae,0x63,0x7f,0xde,0x47,0x04,0x00,0x00,0x00,
+ 0x84,0x4b,0xc0,0xc5,0x56,0x96,0xef,0x99,0x20,0xee,0xda,0x11,0xc1,0xeb,0x41,0xb0,
+ 0xc2,0xe7,0x32,0x4b,0x46,0xcc,0x2e,0x7a,0xa0,0xc2,0xaa,0x77,0x36,0x44,0x8d,0x7a,
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ 0x68,0x24,0x1a,0x58,
+ 0x7e,0x7e,0x06,0x1d,
+ 0x25,0x0e,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ 0x01,0x00,0x00,0x00
+ };
+
+blake2b(md, 50, NULL, 0, in, sizeof(in), msg_s1);
+
+printf("the hash out is \n");
+for (i=0; i < sizeof(md); ++i) {
+ printf("%02x",md[i]);
+}
+printf("\n");
+return 0;
+}
+*/
\ No newline at end of file
diff --git a/blake2b/blake2b.h b/blake2b/blake2b.h
new file mode 100644
index 0000000..f1b2276
--- /dev/null
+++ b/blake2b/blake2b.h
@@ -0,0 +1,24 @@
+#include <stdint.h>
+#include <stddef.h>
+
+
+enum blake2b_constant
+{
+ BLAKE2B_BLOCKBYTES = 128,
+ BLAKE2B_OUTBYTES = 64,
+ BLAKE2B_KEYBYTES = 64,
+ BLAKE2B_SALTBYTES = 16,
+ BLAKE2B_PERSONALBYTES = 16
+};
+
+typedef struct {
+ uint8_t b[128]; // input buffer
+ uint64_t h[8]; // chained state
+ uint64_t t[2]; // total number of bytes
+ size_t c; // pointer for b[]
+ size_t outlen; // digest size
+} blake2b_ctx;
+
+void blake2b_update(blake2b_ctx *ctx, const void *in, size_t inlen);
+int blake2b_init(blake2b_ctx *ctx, size_t outlen);
+void blake2b_final(blake2b_ctx *ctx, void *out);
\ No newline at end of file
diff --git a/heavyHash/DiagonalMatrix.h b/heavyHash/DiagonalMatrix.h
new file mode 100644
index 0000000..6a0634c
--- /dev/null
+++ b/heavyHash/DiagonalMatrix.h
@@ -0,0 +1,192 @@
+#ifndef _SINGULAR_DIAGONAL_MATRIX_H
+#define _SINGULAR_DIAGONAL_MATRIX_H
+
+#include "singular.h"
+
+
+
+//#define L(M,N) (M < N ? M : N)
+#define L(M,N) (M*N)
+#if 1
+
+typedef struct class_DiagonalMatrix DiagonalMatrix_t;
+struct class_DiagonalMatrix {
+ double *pBlock;
+
+ double (*operator)(struct class_DiagonalMatrix *p, int i, int j);
+ void (*release)(struct class_DiagonalMatrix *p);
+};
+
+#else
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+
+//namespace singular {
+
+ /**
+ * Diagonal matrix.
+ */
+ template < int M, int N >
+ class DiagonalMatrix {
+ public:
+ enum {
+ /** Number of diagonal elements. */
+ L = M < N ? M : N
+ };
+ private:
+ /**
+ * Memory block for the diagonal elements.
+ * The ith row and ith column is given by `elements[i]`.
+ */
+ double* pBlock;
+ public:
+ /** Initializes a diagonal matrix filled with 0. */
+ DiagonalMatrix() {
+ this->pBlock = new double[L];
+ std::fill(this->pBlock, this->pBlock + L, 0.0);
+ }
+
+ /**
+ * Initializes a diagonal matrix with given diagonal values.
+ * + * The diagonal matrix will look like, + * \f[ + * \begin{bmatrix} + * \text{values[0]} & & \\ + * & \ddots & \\ + * & & \text{values[min(M, N)-1]} + * \end{bmatrix} + * \f] + * + * The behavior is undefined if `values` has less than `min(M, N)` + * elements. + * + * @param values + * Diagonal values of the matrix. + */ + explicit DiagonalMatrix(const double values[]) { + this->pBlock = new double[L]; + memcpy(this->pBlock, values, sizeof(double) * L); + } + + /** + * Steals the memory block from a given diagonal matrix. + * + * @param[in,out] copyee + * Diagonal matrix from which the memory block is to be stolen. + * No loger valid after this call. + */ +#if SINGULAR_RVALUE_REFERENCE_SUPPORTED + DiagonalMatrix(DiagonalMatrix&& copyee) : pBlock(copyee.pBlock) { + copyee.pBlock = nullptr; + } +#else + DiagonalMatrix(const DiagonalMatrix& copyee) : pBlock(copyee.pBlock) { + const_cast< DiagonalMatrix& >(copyee).pBlock = nullptr; + } +#endif + + /** Releases the memory block of this diagonal matrix. */ + ~DiagonalMatrix() { + this->release(); + } + + /** + * Steals the memory block from a given diagonal matrix. + * + * @param[in,out] copyee + * Diagonal matrix from which the memory block is to be stolen. + * No longer valid after this call. + * @return + * Reference to this diagonal matrix. + */ +#if SINGULAR_RVALUE_REFERENCE_SUPPORTED + DiagonalMatrix& operator =(DiagonalMatrix&& copyee) { +#else + DiagonalMatrix& operator =(const DiagonalMatrix& copyee) { +#endif + this->release(); + this->pBlock = copyee.pBlock; +#if SINGULAR_RVALUE_REFERENCE_SUPPORTED + copyee.pBlock = nullptr; +#else + const_cast< DiagonalMatrix& >(copyee).pBlock = nullptr; +#endif + return *this; + } + + /** + * Returns a clone of this matrix. + * + * @return + * Clone of this matrix. + */ + inline DiagonalMatrix clone() const { + return DiagonalMatrix(this->pBlock); + } + + /** + * Returns the element at a given row and column. + * + * The behavior is undefined, + * - if `i < 0` or `i >= M`, + * - or if `j < 0` or `j >= N` + * + * @param i + * Index of the row to be obtained. + * @param j + * Index of the column to be obtained. + * @return + * Element at the ith row and jth column. + * 0 if `i != j`. + */ + double operator ()(int i, int j) const { + assert(i >= 0 && i < M); + assert(j >= 0 && j < N); + if (i == j) { + return this->pBlock[i]; + } else { + return 0.0; + } + } + + /** + * Transposes this matrix. + * + * @return + * Transposed matrix. + */ + DiagonalMatrix< N, M > transpose() const { + return DiagonalMatrix< N, M >(this->pBlock); + } + private: +#if SINGULAR_FUNCTION_DELETION_SUPPORTED + /** Copy constructor is not allowed. */ + DiagonalMatrix(const DiagonalMatrix& copyee) = delete; + + /** Copy assignment is not allowed. */ + DiagonalMatrix& operator =(const DiagonalMatrix& copyee) = delete; +#elif SINGULAR_RVALUE_REFERENCE_SUPPORTED + /** Copy constructor is not allowed. */ + DiagonalMatrix(const DiagonalMatrix& copyee) {} + + /** Copy assignment is not allowed. */ + DiagonalMatrix& operator =(const DiagonalMatrix& copyee) { + return *this; + } +#endif + + /** + * Releases the memory block of this matrix. + * Has no effect if the memory block has already been released. 
+ */ + inline void release() { + delete[] this->pBlock; + this->pBlock = nullptr; + } + }; + +//} +#endif +#endif diff --git a/heavyHash/Makefile b/heavyHash/Makefile new file mode 100644 index 0000000..ddc449a --- /dev/null +++ b/heavyHash/Makefile @@ -0,0 +1,14 @@ +SRCS = heavyhash.c obtc.c sha3.c + +OBJS = $(SRCS:.c=.o) +CC = gcc +CCFLAGS = -Wall + +libkas.a:$(OBJS) + ar -rv libkas.a $(OBJS) + +%.o:%.c + $(CC) $(CCFLAGS) -c $< -o $@ + +clean: + rm -rf *.o *.a diff --git a/heavyHash/Matrix.h b/heavyHash/Matrix.h new file mode 100644 index 0000000..377fa48 --- /dev/null +++ b/heavyHash/Matrix.h @@ -0,0 +1,25 @@ +#ifndef _SINGULAR_MATRIX_H +#define _SINGULAR_MATRIX_H + +#include "singular.h" +#include "Vector.h" + +//#include +//#include +//#include + + +typedef struct class_Matrix Matrix_t; +struct class_Matrix { + double* pBlock; + + Matrix_t (*clone)(struct class_Matrix *p); + void (*filledwith)(struct class_Matrix *p,const double values[]); + double (*operator)(struct class_Matrix *p, int i, int j); + Vector_t (*row)(struct class_Matrix *p, int i); + Vector_t (*column)(struct class_Matrix *p, int j); + void (*release)(struct class_Matrix *p); +}; + + +#endif diff --git a/heavyHash/Reflector.h b/heavyHash/Reflector.h new file mode 100644 index 0000000..fc9042b --- /dev/null +++ b/heavyHash/Reflector.h @@ -0,0 +1,19 @@ +#ifndef _SINGULAR_REFLECTOR_H +#define _SINGULAR_REFLECTOR_H + +#include "Matrix.h" +#include "singular.h" + + + +typedef struct class_Reflector Reflector_t; +struct class_Reflector { + Vector_t u; + double gamma; + size_t L; + + double* ptr; +}; + + +#endif diff --git a/heavyHash/Rotator.h b/heavyHash/Rotator.h new file mode 100644 index 0000000..03047d7 --- /dev/null +++ b/heavyHash/Rotator.h @@ -0,0 +1,17 @@ +#ifndef _SINGULAR_ROTATOR_H +#define _SINGULAR_ROTATOR_H + +#include "Matrix.h" +#include "singular.h" + + +typedef struct class_Rotator Rotator_t; +struct class_Rotator { + double elements[4]; + double (*operator)(struct class_Rotator *p, int i, int j); + void (*applyFromLeftTo)(struct class_Rotator *p, Matrix_t rhs, int k); + void (*applyFromRightTo)(struct class_Rotator *p, Matrix_t rhs, int k); +}; + + +#endif diff --git a/heavyHash/Svd.h b/heavyHash/Svd.h new file mode 100644 index 0000000..a070917 --- /dev/null +++ b/heavyHash/Svd.h @@ -0,0 +1,45 @@ +#ifndef _SINGULAR_SVD_H +#define _SINGULAR_SVD_H + +#include "DiagonalMatrix.h" +#include "Matrix.h" +#include "Reflector.h" +#include "Rotator.h" +//#include "singular.h" + +//#include +//#include +//#include + +typedef struct Svd Svd_t; +struct Svd { + //USV decomposeUSV(const Matrix< M, N >& m) + bool (*isFullRank)(Svd_t *p, DiagonalMatrix_t singularValues, const int size); +}; + +typedef struct class_BidiagonalMatrix BidiagonalMatrix_t; +struct class_BidiagonalMatrix { + double* pBlock; + double (*operator)(struct class_BidiagonalMatrix *p, int i, int j); + double (*applyFirstRotatorFromRight)(struct class_BidiagonalMatrix *p, Rotator_t *r); + double (*applyRotatorFromRight)(struct class_BidiagonalMatrix *p, Rotator_t *r, int n, double bulge); + double (*applyRotatorFromLeft)(struct class_BidiagonalMatrix *p, Rotator_t *r, int n, double bulge); + BidiagonalMatrix_t (*bidiagonalize)(struct class_BidiagonalMatrix *p, Matrix_t m); + void (*doFrancis)(struct class_BidiagonalMatrix *m,int n); + double (*calculateShift)(struct class_BidiagonalMatrix *m, int n); + void (*releases)(struct class_BidiagonalMatrix *p); +}; + +void BidiagonalMatrix_doFrancis(BidiagonalMatrix_t *m, int n); +double 
BidiagonalMatrix_calculateShift(BidiagonalMatrix_t *m, int n);
+double BidiagonalMatrix_applyRotatorFromLeft(BidiagonalMatrix_t *ptr, Rotator_t *r, int n, double bulge);
+double BidiagonalMatrix_applyRotatorFromRight(BidiagonalMatrix_t *ptr, Rotator_t *r, int n, double bulge);
+double BidiagonalMatrix_applyFirstRotatorFromRight(BidiagonalMatrix_t *p, Rotator_t *r);
+double BidiagonalMatrix_operator(BidiagonalMatrix_t *p, int i, int j);
+void BidiagonalMatrix_release(BidiagonalMatrix_t *p);
+void BidiagonalMatrix_init(BidiagonalMatrix_t *p, Matrix_t *m);
+void BidiagonalMatrix_def(BidiagonalMatrix_t *p);
+BidiagonalMatrix_t BidiagonalMatrix_bidiagonalize(BidiagonalMatrix_t *p, Matrix_t m);
+
+
+#endif
diff --git a/heavyHash/Vector.h b/heavyHash/Vector.h
new file mode 100644
index 0000000..458ceb6
--- /dev/null
+++ b/heavyHash/Vector.h
@@ -0,0 +1,22 @@
+#ifndef _SINGULAR_VECTOR_H
+#define _SINGULAR_VECTOR_H
+
+
+#include <stddef.h>
+#include "singular.h"
+
+
+typedef struct class_Vector Vector_t;
+struct class_Vector {
+ double* pBlock;
+ size_t len;
+ ptrdiff_t delta;
+
+ double* ptr;
+ void (*move)(struct class_Vector *p, ptrdiff_t delta);
+ double (*operator)(struct class_Vector *p, size_t idx);
+ Vector_t (*slice)(struct class_Vector *p, size_t start);
+};
+
+
+#endif
diff --git a/heavyHash/heavyhash.c b/heavyHash/heavyhash.c
new file mode 100644
index 0000000..13fce9d
--- /dev/null
+++ b/heavyHash/heavyhash.c
@@ -0,0 +1,150 @@
+#include "sha3.h"
+#include "obtc.h"
+
+
+
+
+void CSHA3_256_Write(CSHA3_256 *p, const unsigned char* data, size_t len) {
+ sha3_update(&p->context, data, len);
+ //return *this;
+}
+
+void CSHA3_256_Finalize(CSHA3_256 *p, unsigned char hash[OUTPUT_SIZE]) {
+ sha3_final(hash, &p->context);
+}
+
+/*void CSHA3_256_Reset(Obtc_t *Obtc, CSHA3_256 *p) {
+ sha3_init(Obtc,&p->context, OUTPUT_SIZE);
+ //return *this;
+}*/
+
+void CSHA3_256_init(Obtc_t *Obtc, CSHA3_256 *p) {
+
+ sha3_init(Obtc, &p->context, OUTPUT_SIZE);
+
+ p->Write = CSHA3_256_Write;
+ p->Finalize = CSHA3_256_Finalize;
+ //p->Reset = CSHA3_256_Reset;
+}
+
+void CSHA3_256_CSHA3_256(Obtc_t *Obtc,CSHA3_256 *p) {
+ sha3_init(Obtc,&p->context, OUTPUT_SIZE);
+
+}
+
+
+void CHeavyHash_Write(CHeavyHash *p, const unsigned char* data, size_t len) {
+ p->hasher.Write(&p->hasher,data, len);
+ //sha3_update(&CSHA3_256_p.context, data, len);
+ //CSHA3_256_Write(&CSHA3_256_p, data, OUTPUT_SIZE);
+ }
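CHeavyHash_Finalize below is the core of heavyhash: it finalizes the SHA3 pass over the data written so far, multiplies the fixed 64x64 matrix by that digest split into 4-bit values, xors the product back into the digest, and runs a second SHA3 pass over the result. As a comment-level outline (names as used in this file, not a drop-in function):

```c
/* Outline of CHeavyHash_Finalize:
 *   hash_first = SHA3(data written via CHeavyHash_Write)    -- p->hasher
 *   product    = MultiplyUsing4bitPrecision(p->matrix, hash_first)
 *   xored[i]   = hash_first[i] ^ product[i]   for i = 0..31
 *   hash       = SHA3(xored)                                -- second pass
 */
```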
+
+void CHeavyHash_Finalize(Obtc_t *Obtc, CHeavyHash *p, unsigned char hash[OUTPUT_SIZE]) {
+ uint256 hash_first;
+ uint8_t a[32];
+
+ p->hasher.Finalize(&p->hasher,&Obtc->g_hash_first.bb.data[0]);
+ memcpy(a,&Obtc->g_hash_first.bb.data[0],32);
+
+ uint256 product = MultiplyUsing4bitPrecision(p->matrix, Obtc->g_hash_first);
+
+ uint256 hash_xored;
+ for (size_t i = 0; i < OUTPUT_SIZE; ++i) {
+ //hash_xored.begin()[i] = hash_first.begin()[i] ^ product.begin()[i];
+ hash_xored.bb.data[i] = Obtc->g_hash_first.bb.data[i] ^ product.bb.data[i];
+
+
+ }
+
+ uint8_t temp[200]={ // re-seeds the Keccak state: sha3_init() copies const_data[] into st.b reversed
+ 0x16,0x19,0x32,0x7d,0x10,0xb9,0xda,0x35,0x54,0x9a,0xe0,0x31,0x2f,0x9f,0xc6,0x15,0x92,0xbb,0x39,0x9d,
+ 0xb5,0x29,0x0c,0x0a,0x47,0xc3,0x9f,0x67,0x51,0x12,0xc2,0x2e,0xc7,0x76,0xc5,0x04,0x84,0x81,0xb9,0x57,
+ 0xb9,0x92,0xf2,0xd3,0x7b,0x34,0xca,0x58,0xea,0x8f,0xdb,0x80,0xba,0xc4,0x6d,0x39,0x7e,0x8f,0x1d,0xb1,
+ 0x77,0x65,0xcc,0x07,0x87,0xe9,0x61,0xb0,0x36,0xbc,0x94,0x16,0x77,0x4c,0x86,0x83,0x54,0x34,0xf2,0xb0,
+ 0x4e,0xf7,0x4b,0x3a,0x99,0xcd,0xb0,0x44,0x2e,0xc6,0x5b,0xd3,0x56,0x24,0x93,0xe4,0x6c,0x6b,0x7d,0x01,
+ 0xa7,0x69,0xcc,0x3d,0xd3,0x1f,0x4c,0xc3,0x54,0xc1,0x8c,0x3f,0xf4,0x31,0xc0,0x5d,0xd0,0xa9,0xa2,0x26,
+ 0xa0,0xbc,0xaa,0x9f,0x79,0x2a,0x3d,0x0c,0x80,0x39,0xf9,0xa6,0x0d,0xcf,0x6a,0x48,0x5e,0x21,0x90,0x40,
+ 0x25,0x0f,0xc4,0x62,0xc1,0x00,0xff,0x2a,0x93,0x89,0x35,0xba,0x72,0xc7,0xd8,0x2e,0x14,0xf3,0x40,0x69,
+ 0xe7,0x20,0xe0,0xdf,0x44,0xee,0xce,0xde,0x11,0xa7,0x5f,0x4c,0x80,0x05,0x64,0x98,0x7a,0x14,0xff,0x48,
+ 0x16,0xc7,0xf8,0xee,0x79,0x62,0x9b,0x0e,0x2f,0x9f,0x42,0x16,0x3a,0xd7,0x4c,0x52,0xb2,0x24,0x85,0x09,
+ };
+ for(int i = 0 ;i< 200 ;i++)Obtc->const_data[i] = temp[i];
+
+ // CSHA3_256().Write(hash_xored.begin(), OUTPUT_SIZE).Finalize(hash);
+
+ CSHA3_256_CSHA3_256(Obtc, &p->hasher);
+ CSHA3_256_Write(&p->hasher, &hash_xored.bb.data[0], OUTPUT_SIZE);
+ CSHA3_256_Finalize(&p->hasher, hash) ;
+}
+
+void CHeavyHash_Reset(CHeavyHash *p, uint64_t matrix_[64*64]) {
+ for (int i = 0; i < 64*64; ++i)
+ p->matrix[i] = matrix_[i];
+}
+
+void CHeavyHash_init(Obtc_t *Obtc, CHeavyHash *p, uint64_t matrix_[64*64]){
+
+ p->Write = CHeavyHash_Write;
+ p->Finalize = CHeavyHash_Finalize;
+ p->Reset = CHeavyHash_Reset;
+
+ p->hasher.Write = CSHA3_256_Write;
+ p->hasher.Finalize = CSHA3_256_Finalize;
+ //p->hasher.Reset = CSHA3_256_Reset;
+
+ sha3_init(Obtc, &p->hasher.context, OUTPUT_SIZE);
+
+ for (int i = 0; i < 64*64; ++i)
+ p->matrix[i] = matrix_[i];
+
+}
+
+
+
+
+void MultiplyMatrices(uint64_t matrix[64*64], uint64_t vector[64], uint64_t product[64]){
+ for (int i = 0; i < 64; ++i) {
+ for (int j = 0; j < 64; ++j) {
+ product[i] += matrix[64*i + j]*vector[j];
+ }
+ }
+}
+
+uint256 MultiplyUsing4bitPrecision(uint64_t matrix[64*64], const uint256 hash) {
+ // conversion to matrix with 4 bit values
+ uint64_t vector[64] = {0};
+ ConvertTo4BitPrecisionVector(hash, vector);
+
+ // perform matrix multiplication
+ uint64_t product[64] = {0};
+ MultiplyMatrices(matrix, vector, product);
+ for (int i = 0; i < 64; ++i) {
+ product[i] >>= 10;
+ }
+ return Convert4bitVectorToUint(product);
+}
+
+void ConvertTo4BitPrecisionVector(uint256 bit_sequence, uint64_t vector[64]) {
+ int index = 0;
+ int i;
+
+ for (i = 0; i < WIDTH; i++) {
+
+ vector[index] = bit_sequence.bb.data[i] >> 4;
+ vector[index+1] = bit_sequence.bb.data[i] & 0xF;
+ index += 2;
+ }
+}
+
+uint256 Convert4bitVectorToUint(const uint64_t x[64]) {
+ uint256 bit_sequence;
+ int index = 0;
+ int i;
+
+ for (i = 0; i < WIDTH; i++) {
+ bit_sequence.bb.data[i] = ( x[index] << 4) | x[index+1];
+ index += 2;
+ }
+
+ return bit_sequence;
+}
diff --git a/heavyHash/heavyhash.h b/heavyHash/heavyhash.h
new file mode 100644
index 0000000..9243c3f
--- /dev/null
+++ b/heavyHash/heavyhash.h
@@ -0,0 +1,98 @@
+#ifndef OPOW_CRYPTO_HEAVYHASH_H
+#define OPOW_CRYPTO_HEAVYHASH_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "sha3.h"
+
+//#include
+//#include "obtc.h"
+
+
+#define OUTPUT_SIZE 32
+
+typedef struct class_CSHA3_256 CSHA3_256;
+
+struct class_CSHA3_256
+{
+ sha3_ctx_t context;
+
+ // static const size_t OUTPUT_SIZE = 32;
+
+ //CSHA3_256& Write(const unsigned char* data, size_t len);
+ void (*Write)(struct class_CSHA3_256 *p, const unsigned char* data, size_t len);
+ void (*Finalize)(struct class_CSHA3_256 *p, unsigned char hash[OUTPUT_SIZE]);
+ //CSHA3_256& Reset();
+};
+
+
+typedef struct class_CHeavyHash CHeavyHash;
+struct class_CHeavyHash
+{
+
+ uint64_t matrix[64*64];
+ CSHA3_256 hasher;
+
+ //static const size_t OUTPUT_SIZE = 32;
+ //explicit CHeavyHash(uint64_t matrix_[64*64]);
+ //CHeavyHash& Reset(uint64_t matrix_[64*64]);
+ //CHeavyHash& Write(const unsigned char*
data, size_t len); + //void Finalize(unsigned char hash[OUTPUT_SIZE]); + void (*Reset)(struct class_CHeavyHash *p, uint64_t matrix_[64*64]); + void (*Write)(struct class_CHeavyHash *p, const unsigned char* data, size_t len); + void (*Finalize)(struct class_CHeavyHash *p, unsigned char hash[OUTPUT_SIZE]); +}; + +#if 0 +/** A hasher class for SHA3-256. */ +class CSHA3_256 +{ +private: + sha3_ctx_t context; + +public: + static const size_t OUTPUT_SIZE = 32; + + CSHA3_256(); + CSHA3_256& Write(const unsigned char* data, size_t len); + void Finalize(unsigned char hash[OUTPUT_SIZE]); + CSHA3_256& Reset(); +}; + +class CHeavyHash +{ +private: + uint64_t matrix[64*64]; + CSHA3_256 hasher; + +public: + static const size_t OUTPUT_SIZE = 32; + explicit CHeavyHash(uint64_t matrix_[64*64]); + CHeavyHash& Reset(uint64_t matrix_[64*64]); + CHeavyHash& Write(const unsigned char* data, size_t len); + void Finalize(unsigned char hash[OUTPUT_SIZE]); +}; +#endif +uint256 MultiplyUsing4bitPrecision(uint64_t matrix[64*64], const uint256 hash); + +void ConvertTo4BitPrecisionVector(uint256 bit_sequence, uint64_t vector[64]); + +uint256 Convert4bitVectorToUint(const uint64_t x[64]); + + +//zzj add +/*extern void CSHA3_256_init(struct Obtc_opt *Obtc, CSHA3_256 *p); +void CSHA3_256_CSHA3_256(struct Obtc_opt *Obtc, CSHA3_256 *p); + +void CSHA3_256_Write(CSHA3_256 *p, const unsigned char* data, size_t len); + +void CSHA3_256_Finalize(CSHA3_256 *p, unsigned char hash[OUTPUT_SIZE]); +// + +void CHeavyHash_init(struct Obtc_opt *Obtc, CHeavyHash *p, uint64_t matrix_[64*64]); +void CHeavyHash_Write(CHeavyHash *p, const unsigned char* data, size_t len); + +void CHeavyHash_Finalize(struct Obtc_opt *Obtc, CHeavyHash *p, unsigned char hash[OUTPUT_SIZE]); +*/ + +#endif // OPOW_CRYPTO_HEAVYHASH_H diff --git a/heavyHash/heavyhash.o b/heavyHash/heavyhash.o new file mode 100644 index 0000000..8e2ead9 Binary files /dev/null and b/heavyHash/heavyhash.o differ diff --git a/heavyHash/libkas.a b/heavyHash/libkas.a new file mode 100644 index 0000000..478fbab Binary files /dev/null and b/heavyHash/libkas.a differ diff --git a/heavyHash/obtc.c b/heavyHash/obtc.c new file mode 100644 index 0000000..b5ff6ae --- /dev/null +++ b/heavyHash/obtc.c @@ -0,0 +1,907 @@ +//! heavyhash extracted from optical bitcoin +//! 
2022 barrystyle

+#include <stdbool.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h> //qsort
+#include <math.h>
+
+#include "obtc.h"
+
+
+#define M 64
+#define N 64
+
+bool Is4BitPrecision(const uint64_t matrix[64*64])
+{
+ for (int i = 0; i < 64; ++i) {
+ for (int j = 0; j < 64; ++j) {
+ if (matrix[ i*64 + j] > 0xF)
+ return false;
+ }
+ }
+ return true;
+}
+
+
+
+
+double DiagonalMatrix_operator(DiagonalMatrix_t *p, int i, int j)
+{
+ assert(i >= 0 && i < 64);
+ assert(j >= 0 && j < 64);
+ if (i == j) {
+ return p->pBlock[i];
+ } else {
+ return 0.0;
+ }
+}
+
+void DiagonalMatrix_release(DiagonalMatrix_t *p)
+{
+ if (p->pBlock != NULL){
+ free(p->pBlock);
+ p->pBlock = NULL;
+ }
+}
+
+void DiagonalMatrix_init(DiagonalMatrix_t *p, const double values[])
+{
+ p->pBlock = (double *)malloc(sizeof(double)*M);
+ //memset(pBlock, 0.0, sizeof(double)*L(64,64));
+ memcpy(p->pBlock, values, sizeof(double) * M);
+
+ p->operator = DiagonalMatrix_operator;
+ p->release = DiagonalMatrix_release;
+
+}
+
+void DiagonalMatrix_DiagonalMatrix(DiagonalMatrix_t *p)
+{
+ p->operator = DiagonalMatrix_operator;
+ p->release = DiagonalMatrix_release;
+}
+
+//-----------------------------vector-------------------------------//
+
+void vector_move(Vector_t *p, ptrdiff_t delta) {
+ p->ptr += delta;
+}
+
+Vector_t vector_slice(Vector_t v, size_t start) {
+ //assert(start >= 0 && start <= p->len);
+ Vector_t v_tmp;
+ v_tmp.pBlock = v.pBlock + start * v.delta;
+ v_tmp.len = v.len - start;
+ v_tmp.delta = v.delta;
+ return v_tmp;
+}
+
+double Vector_column_operator(Vector_t *p, size_t idx){
+ return p->pBlock[idx * p->delta];
+}
+
+double Vector_row_operator(Vector_t *p, size_t idx){
+ return p->pBlock[idx * p->delta];
+}
+
+void Vector_sync(Matrix_t *p, size_t idx, Vector_t vec, int offset){
+ for(int i = 0; i < vec.len; i++){
+ p->pBlock[idx+(offset+i)*N] = vec.pBlock[i];
+ }
+}
+
+void Vector_row_sync(Matrix_t *p, size_t idx, Vector_t vec, int offset){
+ for(int i = 0; i < vec.len; i++){
+ p->pBlock[offset+idx*N+i] = vec.pBlock[i];
+ }
+}
+
+
+//-----------------------------Matrix-------------------------------//
+Matrix_t Matrix_clone(Matrix_t *p)
+{
+ Matrix_t m;
+
+ m.pBlock = (double *)malloc(sizeof(double)*L(64,64));
+ memcpy(m.pBlock, p->pBlock, sizeof(double)*L(64,64));
+
+ return m;
+}
+
+void Matrix_filledwith(Matrix_t *p, const double values[])
+{
+ //p->pBlock = (double *)malloc(sizeof(double)*L(64,64));
+ //memset(pBlock, 0.0, sizeof(double)*L(64,64));
+ memcpy(p->pBlock, values, sizeof(double) * L(64,64));
+}
+
+double Matrix_operator(Matrix_t *p, int i, int j)
+{
+ assert(i >= 0 && i < N);
+ assert(j >= 0 && j < N);
+
+ return p->pBlock[i*N+j];
+
+}
+
+
+
+Vector_t Matrix_row(Matrix_t *p, int i)
+{
+ Vector_t vec_tmp;
+ vec_tmp.len = N;
+ vec_tmp.delta = 1;
+ vec_tmp.pBlock = p->pBlock + i*N;
+ //return Vector< const double >(this->pBlock + i * N, N, 1);
+ return vec_tmp;
+
+}
+
+Vector_t Matrix_column(Matrix_t *p, int j)
+{
+ Vector_t vec_tmp;
+ vec_tmp.len = M;
+ vec_tmp.delta = N;
+ vec_tmp.pBlock = p->pBlock + j;
+
+ return vec_tmp;
+ //return Vector< double >(this->pBlock + j, M, N);
+
+}
+
+void Matrix_release(Matrix_t *p)
+{
+ if (p->pBlock != NULL){
+ free(p->pBlock);
+ p->pBlock = NULL;
+ }
+}
+
+void Matrix_init(Matrix_t *p)
+{
+ p->pBlock = (double *)malloc(sizeof(double)*L(64,64));
+ memset(p->pBlock, 0, sizeof(double)*L(64,64));
+ //memcpy(p->pBlock, values, sizeof(double) * L(64,64));
+}
+
+void Matrix_def(Matrix_t *p)
+{
+ //p->clone = Matrix_clone;
+ p->filledwith = Matrix_filledwith;
+ p->operator = Matrix_operator;
+ p->row = Matrix_row;
+ p->column = Matrix_column;
+ p->release = Matrix_release;
+}
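Matrix_def() binds function pointers onto the struct so call sites can mimic the methods of the original C++ Matrix class. A usage sketch; note the Matrix_* helpers are defined in obtc.c but not declared in obtc.h, so the extern declarations here are an assumption about how a caller would expose them:

```c
#include <stdio.h>
#include "Matrix.h"

/* Defined in obtc.c; not declared in any header as the code stands. */
extern void Matrix_init(Matrix_t *p);
extern void Matrix_def(Matrix_t *p);

int main(void) {
    Matrix_t m;
    Matrix_init(&m);                       /* zero-filled 64x64 block */
    Matrix_def(&m);                        /* bind the "methods" */
    printf("%f\n", m.operator(&m, 0, 0));  /* m(0, 0) in the C++ original */
    m.release(&m);
    return 0;
}
```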
+
+
+
+
+
+
+//-----------------------------Rotator-------------------------------//
+
+double max(double a, double b)
+{
+ return a > b ? a : b;
+}
+
+double Rotator_operator(Rotator_t *p, int i, int j){
+ assert(0 <= i && i < 2);
+ assert(0 <= j && j < 2);
+ return p->elements[i * 2 + j];
+}
+
+void Rotator_init(Rotator_t *p, double x1, double x2)
+{
+ // normalizes by the maximum magnitude
+ // to avoid harmful underflow and overflow
+ double mx = max(fabs(x1), fabs(x2));
+
+ x1 /= mx;
+ x2 /= mx;
+ double norm = sqrt(x1 * x1 + x2 * x2);
+ double cs = x1 / norm;
+ double sn = x2 / norm;
+ p->elements[0] = cs;
+ p->elements[1] = -sn;
+ p->elements[2] = sn;
+ p->elements[3] = cs;
+
+ p->operator = Rotator_operator;
+}
+
+//-----------------------------Reflector-------------------------------//
+
+
+void Reflector_transform(Reflector_t *p, double u0, size_t len){
+ int i;
+ for (i = 0; i < len; i++){
+ p->u.pBlock[i] = p->u.pBlock[i] /u0;
+ }
+}
+
+void Reflector_transform_left(Reflector_t *src1, Vector_t src2, Vector_t dst, double gUM, size_t len){
+ int i;
+ for (i = 0; i < len; i++){
+ dst.pBlock[i] = src2.pBlock[i] - src1->u.pBlock[i] * gUM;
+ }
+}
+
+void Reflector_transform_right(Reflector_t *src1, Vector_t src2, Vector_t dst, double gMU, size_t len){
+ int i;
+ for (i = 0; i < len; i++){
+ dst.pBlock[i] = src2.pBlock[i] - gMU * src1->u.pBlock[i];
+ }
+}
+
+
+
+void Reflector_init(Reflector_t *p, Vector_t v) {
+ //assert(v.size() > 0 && v.size() <= L);
+ //const size_t N = v.size();
+ //const size_t p->L = sizeof(v)/sizeof(double);
+ p->L = v.len;
+
+ p->u.pBlock = (double *)malloc(sizeof(double)*v.len);
+ memcpy(p->u.pBlock, v.pBlock, sizeof(double)*v.len);
+
+ // normalizes elements by the maximum amplitude
+ // to avoid harmful underflow and overflow
+
+ double mx = 0.0;
+
+ for (size_t i = 0; i < p->L; ++i) {
+ mx = max(fabs(p->u.pBlock[i]), mx);
+ }
+
+ if (mx > 0.0) {
+ // calculates the normalized norm
+ double tau = 0.0;
+ for (size_t i = 0; i < p->L; ++i) {
+ double x = p->u.pBlock[i] / mx;
+ p->u.pBlock[i] = x;
+ tau += x * x;
+ }
+ tau = sqrt(tau);
+ // tau's sign should be the same as the first element in `u`
+ if (p->u.pBlock[0] < 0.0) {
+ tau = -tau;
+ }
+ double u0 = p->u.pBlock[0] + tau;
+ p->u.pBlock[0] = u0;
+ Reflector_transform(p, u0, p->L);
+
+ p->gamma = u0 / tau;
+ } else {
+ // v is a zero vector
+ p->gamma = 0.0;
+ memset(p->u.pBlock, 0, sizeof(double) * p->L); // length in bytes, not elements
+ }
+}
+
+void Reflector_release(Reflector_t *p){
+ if (p->u.pBlock != NULL){
+ free(p->u.pBlock);
+ p->u.pBlock = NULL;
+ }
+}
+
+
+double inner_product(double *a,double *b,int n){
+ int i;
+ double sum = 0.0;
+
+ for(i = 0; i < n; i++)
+ {
+ sum += (*(a+i))*(*(b+i));
+ }
+ return sum;
+}
+
+Matrix_t Reflector_applyFromLeftTo(Reflector_t *p, Matrix_t m){
+ // H * m = m - gamma * u * u^T * m
+ Matrix_t m2 = Matrix_clone(&m);//m->clone(m);
+ Vector_t vec_m;
+ Vector_t vec_m2;
+
+ int offset = N - p->L;
+ for (int i = 0; i < N; ++i) {
+ // caches gamma * u^T * m
+ vec_m = Matrix_column(&m, i);
+
+ Vector_t srcColumn = vector_slice(vec_m, offset);
+ double v_src_column[srcColumn.len];
+
+ for(size_t i = 0; i < srcColumn.len; i++){
+ v_src_column[i] = Vector_column_operator(&srcColumn, i);
+ }
+ srcColumn.pBlock = v_src_column;
+
+ double gUM = inner_product(p->u.pBlock, srcColumn.pBlock, p->L);
+ //Vector< const double > srcColumn = m->column(m, i).slice(offset);
+
+ gUM *= p->gamma;
+ // H * m = m - u * gUM
+ vec_m2 = Matrix_column(&m2,
i); + Vector_t dstColumn = vector_slice(vec_m2, offset); + double v_dstcolumn[dstColumn.len]; + + for(size_t i = 0; i < dstColumn.len; i++){ + v_dstcolumn[i] = Vector_column_operator(&dstColumn, i); + } + dstColumn.pBlock = v_dstcolumn; + + Reflector_transform_left(p, srcColumn, dstColumn, gUM, p->L); + Vector_sync(&m2, i, dstColumn, offset); + } + Matrix_release(&m); + return m2; +} + +Matrix_t Reflector_applyFromRightTo(Reflector_t *p, Matrix_t m){ + // m * H = m - m * gamma * u * u^T + Matrix_t m2 = Matrix_clone(&m); + Vector_t vec_m; + Vector_t vec_m2; + + int offset = 64 - p->L; + + for (int i = 0; i < M; ++i) { + // caches gamma * m * u + vec_m = Matrix_row(&m, i); + Vector_t srcRow = vector_slice(vec_m, offset); + + double v_src_row[srcRow.len]; + for(size_t j = 0; j< srcRow.len; j++){ + v_src_row[j] = Vector_row_operator(&srcRow, j); + } + srcRow.pBlock = v_src_row; + + double gMU = inner_product(p->u.pBlock, srcRow.pBlock, p->L); + + gMU *= p->gamma; + // m * H = m - gMU * u^T + vec_m2 = Matrix_row(&m2, i); + + Vector_t dstRow = vector_slice(vec_m2, offset); + + double v_dstrow[dstRow.len]; + + for(size_t j = 0; j < dstRow.len; j++){ + v_dstrow[j] = Vector_row_operator(&dstRow, j); + } + dstRow.pBlock = v_dstrow; + + Reflector_transform_right(p ,srcRow, dstRow, gMU, p->L); + Vector_row_sync(&m2, i, dstRow, offset); + } + Matrix_release(&m); + return m2; +} + + +//-----------------------------Svd-------------------------------// + +int cmp_double(const void* e1, const void* e2) +{ + if ((*(double*)e2 - *(double*)e1) > 0.00000) + return 1; + else if ((*(double*)e2 - *(double*)e1) == 0.000000) + return 0; + else + return -1; +} + +DiagonalMatrix_t Svd_decomposeUSV(BidiagonalMatrix_t *p, Matrix_t *m) { + const int MAX_ITERATIONS = N * 10; + // allocates matrices + Matrix_t m1 = Matrix_clone(m); + Matrix_def(&m1); + + + + // bidiagonalizes a given matrix + BidiagonalMatrix_t m2 = p->bidiagonalize(p, m1); + // repeats Francis iteration + + + int iteration = 0; + int n = N; + + while (n >= 2) { + // processes the n-1 x n-1 submatrix + // if the current n x n submatrix has converged + double bn = m2.operator(&m2, n - 1, n - 1); + + if (bn == 0.0 || fabs(m2.operator(&m2, n - 2, n - 1) / bn) < 1.0e-15) { + --n; + } else { + // aborts if too many iterations + ++iteration; + if (iteration > MAX_ITERATIONS) { + break; + } + m2.doFrancis(&m2, n); + } + } + + // copies the diagonal elements + // and makes all singular values positive + double ss[N]; + for (int i = 0; i < N; ++i) { + if (m2.operator(&m2, i, i) < 0) { + ss[i] = -m2.operator(&m2, i, i); + // inverts the sign of the right singular vector + //Vector< double > vi = v.column(i); + //std::transform( + // vi.begin(), vi.end(), vi.begin(), + // [](double x) { + // return -x; + // }); + } else { + ss[i] = m2.operator(&m2, i, i); + } + } + + // sorts singular values in descending order if necessary + int shuffle[M]; // M >= N + bool sortNeeded = false; + for (int i = 0; i < M; ++i) { + shuffle[i] = i; + sortNeeded = sortNeeded || (i < N - 1 && ss[i] < ss[i + 1]); + } + + m1.release(&m1); + BidiagonalMatrix_release(p); + + + DiagonalMatrix_t dm; + if (sortNeeded) { + // shuffles the N (<= M) singular values + qsort(ss, N,sizeof(double), cmp_double); + + double ss2[M]; + + memcpy(ss2, ss, M*sizeof(double)); + DiagonalMatrix_init(&dm, ss2); + + return dm; + } else { + DiagonalMatrix_init(&dm, ss); + return dm; + } +} + + + + +bool Svd_isFullRank(DiagonalMatrix_t *p, const int size) { + const double round_off = 1.000009e-12; + for (int i = 0; i 
< size; ++i) { + if (fabs( p->operator(p, i, i) ) < round_off){ + p->release(p); + return false; + } + } + p->release(p); + return true; +} + + +//-----------------------------BidiagonalMatrix_t-------------------------------// +BidiagonalMatrix_t BidiagonalMatrix_bidiagonalize(BidiagonalMatrix_t *p, Matrix_t m) +{ + assert(M >= N); + + Vector_t vec_m; + Vector_t vec_m2; + + for (int i = 0; i < N; ++i) { + Reflector_t rU; + + vec_m = Matrix_column(&m, i); + Vector_t column_slice = vector_slice(vec_m, i); + // applies a householder transform to the column vector i + + double v_column[column_slice.len]; + + for(size_t i = 0; i < column_slice.len; i++){ + v_column[i] = Vector_column_operator(&column_slice, i); + } + column_slice.pBlock = v_column; + + Reflector_init(&rU, column_slice); + + m = Reflector_applyFromLeftTo(&rU, m); + + Reflector_release(&rU); + //u = rU.applyFromRightTo(u); // U1^T*U0^T = U0*U1 + if (i < N - 1) { + // applies a householder transform to the row vector i + 1 + //Reflector< N > rV(m.row(i).slice(i + 1)); + Reflector_t rV; + vec_m2 = Matrix_row(&m, i); + Vector_t row_slice = vector_slice(vec_m2, i+1); + + double v_row[row_slice.len]; + + for(size_t i = 0; i < row_slice.len; i++){ + v_row[i] = Vector_row_operator(&row_slice, i); + } + row_slice.pBlock = v_row; + Reflector_init(&rV, row_slice); + + m = Reflector_applyFromRightTo(&rV, m); + //m = rV.applyFromRightTo(m); + //v = rV.applyFromRightTo(v); + + Reflector_release(&rV); + + } + } + + BidiagonalMatrix_init(p, &m); + return *p; +} + +void BidiagonalMatrix_release(BidiagonalMatrix_t *p) +{ + if (p->pBlock != NULL){ + free(p->pBlock); + p->pBlock = NULL; + } +} + +double BidiagonalMatrix_operator(BidiagonalMatrix_t *p, int i, int j) +{ + assert(i >= 0 && i < M); + assert(j >= 0 && j < N); + if (i == j) { + return p->pBlock[2 * i]; + } else if (i + 1 == j) { + return p->pBlock[2 * i + 1]; + } else { + return 0.0; + } + +} + +double BidiagonalMatrix_applyFirstRotatorFromRight(BidiagonalMatrix_t *p, Rotator_t *r) +{ + double b1 = p->pBlock[0]; + double g1 = p->pBlock[1]; + double b2 = p->pBlock[2]; + double r11 = Rotator_operator(r, 0, 0);//r->operator(r, 0, 0); + double r12 = Rotator_operator(r, 0, 1);//r->operator(r, 0, 1); + double r21 = Rotator_operator(r, 1, 0);//r->operator(r, 1, 0); + double r22 = Rotator_operator(r, 1, 1);//r->operator(r, 1, 1); + //Rotator_operator + + p->pBlock[0] = b1 * r11 + g1 * r21; + p->pBlock[1] = b1 * r12 + g1 * r22; + p->pBlock[2] = b2 * r22; + return b2 * r21; +} + +double BidiagonalMatrix_applyRotatorFromRight(BidiagonalMatrix_t *ptr, Rotator_t *r, int n, double bulge) +{ + double* p = ptr->pBlock + n * 2; + double g0 = p[-1]; + double b1 = p[0]; + double g1 = p[1]; + double b2 = p[2]; + double r11 = r->operator(r, 0, 0); + double r12 = r->operator(r, 0, 1); + double r21 = r->operator(r, 1, 0); + double r22 = r->operator(r, 1, 1); + p[-1] = g0 * r11 + bulge * r21; + p[0] = b1 * r11 + g1 * r21; + p[1] = b1 * r12 + g1 * r22; + p[2] = b2 * r22; + return b2 * r21; +} + +double BidiagonalMatrix_applyRotatorFromLeft(BidiagonalMatrix_t *ptr, Rotator_t *r, int n, double bulge) +{ + double* p = ptr->pBlock + n * 2; + double b1 = p[0]; + double g1 = p[1]; + double b2 = p[2]; + double r11 = r->operator(r, 0, 0); + double r12 = r->operator(r, 0, 1); + double r21 = r->operator(r, 1, 0); + double r22 = r->operator(r, 1, 1); + + p[0] = r11 * b1 + r21 * bulge; + p[1] = r11 * g1 + r21 * b2; + p[2] = r12 * g1 + r22 * b2; + double newBulge; + if (n < N - 2) { + double g2 = p[3]; + newBulge = r21 * 
g2; + p[3] = r22 * g2; + } else { + newBulge = 0.0; + } + return newBulge; +} + +double BidiagonalMatrix_calculateShift(BidiagonalMatrix_t *m, int n) +{ + assert(M >= N); + assert(n >= 2); + double b1 = m->operator(m, n - 2, n - 2); + double b2 = m->operator(m, n - 1, n - 1); + double g1 = m->operator(m, n - 2, n - 1); + + // solves lambda^4 - d*lambda^2 + e = 0 + // where + // d = b1^2 + b2^2 + g1^2 + // e = b1^2 * b2^2 + // chooses lambda (rho) closest to b2 + double rho; + double d = b1 * b1 + b2 * b2 + g1 * g1; + double e = b1 * b1 * b2 * b2; + // lambda^2 = (d +- sqrt(d^2 - 4e)) / 2 + // so, f = d^2 - 4e must be positive + double f = d * d - 4 * e; + + if (f >= 0) { + f = sqrt(f); + // lambda = +-sqrt(d +- f) (d >= 0, f >= 0) + // if d > f, both d+f and d-f have real square roots + // otherwise considers only d+f + if (d > f) { + // lets l1 > l2 + double l1 = sqrt((d + f) * 0.5); + double l2 = sqrt((d - f) * 0.5); + // if b2 >= 0, chooses a positive shift + // otherwise chooses a negative shift + if (b2 >= 0) { + if (fabs(b2 - l1) < fabs(b2 - l2)) { + rho = l1; + } else { + rho = l2; + } + } else { + if (fabs(b2 + l1) < fabs(b2 + l2)) { + rho = -l1; + } else { + rho = -l2; + } + } + } else { + double l1 = sqrt((d + f) * 0.5); + if (fabs(b2 - l1) <= fabs(b2 + l1)) { + rho = l1; + } else { + rho = -l1; + } + } + } else { + // no solution. chooses b2 as the shift + rho = b2; + } + + return rho; +} + + + +void BidiagonalMatrix_doFrancis(BidiagonalMatrix_t *m, int n) +{ + assert(M >= N); + assert(n >= 2); + // calculates the shift + double rho = m->calculateShift(m, n); + + // applies the first right rotator + double b1 = m->operator(m, 0, 0); + double g1 = m->operator(m, 0, 1); + double mx = max(fabs(rho), max(fabs(b1), fabs(g1))); + rho /= mx; + b1 /= mx; + g1 /= mx; + //Rotator_t r0(b1 * b1 - rho * rho, b1 * g1); + + Rotator_t r0; + Rotator_init(&r0, b1 * b1 - rho * rho, b1 * g1); + + double bulge = m->applyFirstRotatorFromRight(m, &r0); + //v = r0.applyFromRightTo(&r0, v, 0); + // applies the first left rotator + + Rotator_t r1; + Rotator_init(&r1, m->operator(m, 0, 0), bulge); + //Rotator_t r1(m(0, 0), bulge); + bulge = m->applyRotatorFromLeft(m, &r1, 0, bulge); + //u = r1.applyFromRightTo(&r1, u, 0); // U1^T*U0^T = U0*U1 + + for (int i = 1; i + 1 < n; ++i) { + // calculates (i+1)-th right rotator + //Rotator rV(m(i - 1, i), bulge); + Rotator_t rV; + Rotator_init(&rV, m->operator(m, i - 1, i), bulge); + + bulge = m->applyRotatorFromRight(m, &rV, i, bulge); + //v = rV.applyFromRightTo(&rV, v, i); + // calculates (i+1)-th left rotator + //Rotator rU(m(i, i), bulge); + Rotator_t rU; + Rotator_init(&rU, m->operator(m, i, i), bulge); + + bulge = m->applyRotatorFromLeft(m, &rU, i, bulge); + //u = rU.applyFromRightTo(rU, u, i); // U1^T*U0^T = U0*U1 + } +} + +void BidiagonalMatrix_def(BidiagonalMatrix_t *p) +{ + p->applyFirstRotatorFromRight = BidiagonalMatrix_applyFirstRotatorFromRight; + p->applyRotatorFromLeft = BidiagonalMatrix_applyRotatorFromLeft; + p->applyRotatorFromRight = BidiagonalMatrix_applyRotatorFromRight; + p->bidiagonalize = BidiagonalMatrix_bidiagonalize; + p->calculateShift = BidiagonalMatrix_calculateShift; + p->doFrancis = BidiagonalMatrix_doFrancis; + p->operator = BidiagonalMatrix_operator; + p->releases = BidiagonalMatrix_release; + +} + +void BidiagonalMatrix_init(BidiagonalMatrix_t *p, Matrix_t *m) +{ + assert(M >= N); + int len; + len = 2 * N - 1; + + p->pBlock = (double *)malloc(sizeof(double)*len); + memset(p->pBlock, 0.0,sizeof(double)*len); + + for (int i = 0; i 
< N; ++i) { + p->pBlock[i * 2] = Matrix_operator(m, i, i);//m->operator(m, i, i); + if (i < N - 1) { + p->pBlock[i * 2 + 1] = Matrix_operator(m, i, i + 1);//m->operator(m, i, i + 1); + } + } +} + + +bool IsFullRank(const uint64_t matrix_[64*64]) +{ + double matrix__ [64*64]; + // Matrix<64, 64> matrix; + + + for (int i = 0; i < 64; ++i) { + for (int j = 0; j < 64; ++j) { + matrix__[64*i + j] = (double) matrix_[64*i + j]; + } + } + + DiagonalMatrix_t dm; + Matrix_t mt; + BidiagonalMatrix_t bt; + + DiagonalMatrix_init(&dm, matrix__); + //matrix.fill(matrix__); + + Matrix_init(&mt); + Matrix_def(&mt); + mt.filledwith(&mt, matrix__); + + BidiagonalMatrix_def(&bt); + DiagonalMatrix_t usv = Svd_decomposeUSV(&bt, &mt); + DiagonalMatrix_t singularValues = usv; + mt.release(&mt); + dm.release(&dm); + //DiagonalMatrix_release(&dm); + return Svd_isFullRank(&usv,64); + + + +} + + +uint64_t GetUint64_t(uint8_t *data, int pos) +{ + const uint8_t* ptr = data + pos * 8; + return ((uint64_t)ptr[0]) | \ + ((uint64_t)ptr[1]) << 8 | \ + ((uint64_t)ptr[2]) << 16 | \ + ((uint64_t)ptr[3]) << 24 | \ + ((uint64_t)ptr[4]) << 32 | \ + ((uint64_t)ptr[5]) << 40 | \ + ((uint64_t)ptr[6]) << 48 | \ + ((uint64_t)ptr[7]) << 56; +} + +void XoShiRo256PlusPlus_init(Obtc_t *Obtc, uint64_t *s, uint256 seed) { + for (int i = 0; i < 4; ++i) { + //p->s[i] = seed.GetUint64(i); + s[i] = GetUint64_t(Obtc->data_r,i); + } +} + +uint64_t RotateLeft64(const uint64_t x, int k) { + return (x << k) | (x >> (64 - k)); +} + + +uint64_t XoShiRo256PlusPlus_operator(uint64_t *s){ + const uint64_t result = RotateLeft64(s[0] + s[3], 23) + s[0]; + + const uint64_t t = s[1] << 17; + + s[2] ^= s[0]; + s[3] ^= s[1]; + s[1] ^= s[2]; + s[0] ^= s[3]; + + s[2] ^= t; + + s[3] = RotateLeft64(s[3], 45); + + return result; +} + +void GenerateHeavyHashMatrix_t(Obtc_t *Obtc, uint256 matrix_seed, uint64_t matrix[64*64]) +{ + XoShiRo256PlusPlus_init(Obtc, Obtc->ss, matrix_seed); + + do { + for (int i = 0; i < 64; ++i) { + for (int j = 0; j < 64; j += 16) { + uint64_t value = XoShiRo256PlusPlus_operator(Obtc->ss);//generator(); + for (int shift = 0; shift < 16; ++shift) { + matrix[64*i + j + shift] = (value >> (4 * shift)) & 0xF; + } + } + } + //} while (!Is4BitPrecision(matrix) || !IsFullRank(matrix)); + }while(!Is4BitPrecision(matrix)); +} + + + + +void serialize_heavyhash(Obtc_t *Obtc, uint64_t matrix[64*64], const char* in, char* out, int len) +{ + uint8_t temp[200]={ + 0x02,0xb9,0x7c,0x78,0x6f,0x82,0x43,0x83,0x5d,0x11,0x29,0xcf,0x82,0xaf,0xa5,0xbc,0xb1,0xfc,0xce,0x9c, + 0xe7,0x8b,0x52,0x72,0x48,0xb0,0x94,0x27,0xa8,0x74,0x2e,0xdb,0x89,0xca,0x4e,0x84,0x9b,0xce,0xcf,0x4a, + 0xd1,0x02,0x57,0x41,0x05,0x09,0x5f,0x8d,0xba,0x1d,0xe5,0xe4,0x45,0x16,0x68,0xe4,0xc1,0xa2,0x02,0x1d, + 0x56,0x3b,0xb1,0x42,0x8f,0x06,0xdd,0x1c,0x7a,0x2f,0x85,0x1a,0x34,0x85,0x54,0x90,0x64,0xa3,0x6a,0x46, + 0xb2,0x1a,0x60,0x1f,0x85,0xb4,0xb2,0x23,0xe6,0xc8,0x5d,0x8f,0x82,0xe9,0xda,0x89,0xec,0x70,0xf1,0xa4, + 0x25,0xb1,0x37,0x15,0x44,0xe3,0x67,0x87,0x5b,0x29,0x91,0x52,0x0f,0x96,0x07,0x05,0x40,0xf1,0x4a,0x0e, + 0x2e,0x65,0x1c,0x3c,0x43,0x28,0x5f,0xf0,0xf8,0xeb,0xf1,0x33,0x88,0x66,0x31,0x40,0x77,0x6b,0xf6,0x0c, + 0x78,0x9b,0xc2,0x9c,0x18,0x3a,0x98,0x1e,0xad,0x41,0x5b,0x10,0x4a,0xef,0x61,0xd6,0x29,0xdc,0xe2,0x46, + 0x7b,0x2f,0xaf,0xca,0x87,0x5e,0x2d,0x65,0x1b,0xa5,0xa4,0xa3,0xf5,0x98,0x69,0xa0,0x1e,0x5f,0x2e,0x72, + 0x0e,0xfb,0x44,0xd2,0x29,0xbf,0x88,0x55,0xb7,0x02,0x7e,0x3c,0x11,0x3c,0xff,0x0d,0xa1,0xf6,0xd8,0x3d + }; + for(int i = 0 ;i< 200 ;i++)Obtc->const_data[i] = temp[i]; + + 
CHeavyHash_init(Obtc, &Obtc->CHeavyHash_p, matrix); + CHeavyHash_Write(&Obtc->CHeavyHash_p, (const unsigned char*)in, len); + CHeavyHash_Finalize(Obtc, &Obtc->CHeavyHash_p, (unsigned char*)out); +} + + + +void opticalbtc_hash(const char* in, char* out, int len) +{ + uint8_t *ptr = (uint8_t*) in; + uint256 seed, hashprev; + uint64_t matrix[64*64]; + + Obtc_t Obtc; + + CSHA3_256_init(&Obtc, &Obtc.CSHA3_256_p); + memcpy(Obtc.data_r,ptr, 32); + GenerateHeavyHashMatrix_t(&Obtc, seed, matrix); + serialize_heavyhash(&Obtc, matrix, in, out, len); + +} + diff --git a/heavyHash/obtc.h b/heavyHash/obtc.h new file mode 100644 index 0000000..bdbaa2c --- /dev/null +++ b/heavyHash/obtc.h @@ -0,0 +1,52 @@ +#ifndef OBTC_H +#define OBTC_H + + +#include "uint256.h" +#include "xoshiro256pp.h" +#include "Svd.h" +#include "DiagonalMatrix.h" +#include "Matrix.h" +#include "Rotator.h" +#include "heavyhash.h" + + +typedef struct Obtc_opt Obtc_t; +struct Obtc_opt{ + uint8_t data_r[32]; + uint64_t ss[4]; + uint8_t const_data[200]; + CSHA3_256 CSHA3_256_p; + CHeavyHash CHeavyHash_p; + uint256 g_hash_first; + XoShiRo256PlusPlus_t *xo; + DiagonalMatrix_t g_DiagonalMatrix; + +}; + +//struct Obtc_opt; + + +bool Is4BitPrecision(const uint64_t matrix[64*64]); +bool IsFullRank(const uint64_t matrix_[64*64]); +void GenerateHeavyHashMatrix(uint256 matrix_seed, uint64_t matrix[64*64]); +void serialize_heavyhash(Obtc_t *Obtc, uint64_t matrix[64*64], const char* in, char* out, int len); +void opticalbtc_hash(const char* in, char* out, int len); + +extern void CSHA3_256_init(Obtc_t *Obtc, CSHA3_256 *p); +extern void CSHA3_256_CSHA3_256(Obtc_t *Obtc, CSHA3_256 *p); + +extern void CSHA3_256_Write(CSHA3_256 *p, const unsigned char* data, size_t len); + +extern void CSHA3_256_Finalize(CSHA3_256 *p, unsigned char hash[OUTPUT_SIZE]); +//extern void CSHA3_256_Reset(Obtc_t *Obtc, CSHA3_256 *p); + +extern void CHeavyHash_init(Obtc_t *Obtc, CHeavyHash *p, uint64_t matrix_[64*64]); +extern void CHeavyHash_Write(CHeavyHash *p, const unsigned char* data, size_t len); + +extern void CHeavyHash_Finalize(Obtc_t *Obtc, CHeavyHash *p, unsigned char hash[OUTPUT_SIZE]); + +extern int sha3_init(Obtc_t *Obtc,sha3_ctx_t *c, int mdlen); // mdlen = hash output in bytes + + +#endif // OBTC_H diff --git a/heavyHash/obtc.o b/heavyHash/obtc.o new file mode 100644 index 0000000..bf11e3b Binary files /dev/null and b/heavyHash/obtc.o differ diff --git a/heavyHash/sha3.c b/heavyHash/sha3.c new file mode 100644 index 0000000..6bf9256 --- /dev/null +++ b/heavyHash/sha3.c @@ -0,0 +1,199 @@ +// sha3.c +// 19-Nov-11 Markku-Juhani O. 
Saarinen + +// Revised 07-Aug-15 to match with official release of FIPS PUB 202 "SHA3" +// Revised 03-Sep-15 for portability + OpenSSL - style API +#include +#include "sha3.h" +#include "obtc.h" + + + + +// update the state with given number of rounds + +void sha3_keccakf(uint64_t st[25]) +{ + // constants + const uint64_t keccakf_rndc[24] = { + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 + }; + const int keccakf_rotc[24] = { + 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, + 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 + }; + const int keccakf_piln[24] = { + 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, + 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 + }; + + // variables + int i, j, r; + uint64_t t, bc[5]; + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + uint8_t *v; + + // endianess conversion. this is redundant on little-endian targets + for (i = 0; i < 25; i++) { + v = (uint8_t *) &st[i]; + st[i] = ((uint64_t) v[0]) | (((uint64_t) v[1]) << 8) | + (((uint64_t) v[2]) << 16) | (((uint64_t) v[3]) << 24) | + (((uint64_t) v[4]) << 32) | (((uint64_t) v[5]) << 40) | + (((uint64_t) v[6]) << 48) | (((uint64_t) v[7]) << 56); + } +#endif + + // actual iteration + for (r = 0; r < KECCAKF_ROUNDS; r++) { + + // Theta + for (i = 0; i < 5; i++) + bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20]; + + for (i = 0; i < 5; i++) { + t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); + for (j = 0; j < 25; j += 5) + st[j + i] ^= t; + } + + // Rho Pi + t = st[1]; + for (i = 0; i < 24; i++) { + j = keccakf_piln[i]; + bc[0] = st[j]; + st[j] = ROTL64(t, keccakf_rotc[i]); + t = bc[0]; + } + + // Chi + for (j = 0; j < 25; j += 5) { + for (i = 0; i < 5; i++) + bc[i] = st[j + i]; + for (i = 0; i < 5; i++) + st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5]; + } + + // Iota + st[0] ^= keccakf_rndc[r]; + } + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + // endianess conversion. 
this is redundant on little-endian targets
+ for (i = 0; i < 25; i++) {
+ v = (uint8_t *) &st[i];
+ t = st[i];
+ v[0] = t & 0xFF;
+ v[1] = (t >> 8) & 0xFF;
+ v[2] = (t >> 16) & 0xFF;
+ v[3] = (t >> 24) & 0xFF;
+ v[4] = (t >> 32) & 0xFF;
+ v[5] = (t >> 40) & 0xFF;
+ v[6] = (t >> 48) & 0xFF;
+ v[7] = (t >> 56) & 0xFF;
+ }
+#endif
+}
+
+// Initialize the context for SHA3
+
+int sha3_init(Obtc_t *Obtc, sha3_ctx_t *c, int mdlen)
+{
+ int i;
+
+ for (i = 0; i < 200; i++){
+ c->st.b[i] = Obtc->const_data[199-i];
+ }
+
+ c->mdlen = mdlen;
+ c->rsiz = 200 - 2 * mdlen;
+ c->pt = 0;
+
+ return 1;
+}
+
+// update state with more data
+
+int sha3_update(sha3_ctx_t *c, const void *data, size_t len)
+{
+ size_t i;
+ int j;
+
+ j = c->pt;
+
+ for (i = 0; i < len; i++) {
+ c->st.b[j++] ^= ((const uint8_t *) data)[i];
+ if (j >= c->rsiz) {
+ sha3_keccakf(c->st.q);
+ j = 0;
+ }
+ }
+ c->pt = j;
+
+ return 1;
+}
+
+// finalize and output a hash
+
+int sha3_final(void *md, sha3_ctx_t *c)
+{
+ int i;
+
+ // c->st.b[c->pt] ^= 0x06;
+ c->st.b[c->pt] ^= 0x04;
+ c->st.b[c->rsiz - 1] ^= 0x80;
+ sha3_keccakf(c->st.q);
+
+ for (i = 0; i < c->mdlen; i++) {
+ ((uint8_t *) md)[i] = c->st.b[i];
+ }
+
+ return 1;
+}
+
+// compute a SHA-3 hash (md) of given byte length from "in"
+
+/*void *sha3(const void *in, size_t inlen, void *md, int mdlen)
+{
+ sha3_ctx_t sha3;
+
+ sha3_init(&sha3, mdlen);
+ sha3_update(&sha3, in, inlen);
+ sha3_final(md, &sha3);
+
+ return md;
+}*/
+
+// SHAKE128 and SHAKE256 extensible-output functionality
+
+void shake_xof(sha3_ctx_t *c)
+{
+ c->st.b[c->pt] ^= 0x1F;
+ c->st.b[c->rsiz - 1] ^= 0x80;
+ sha3_keccakf(c->st.q);
+ c->pt = 0;
+}
+
+void shake_out(sha3_ctx_t *c, void *out, size_t len)
+{
+ size_t i;
+ int j;
+
+ j = c->pt;
+ for (i = 0; i < len; i++) {
+ if (j >= c->rsiz) {
+ sha3_keccakf(c->st.q);
+ j = 0;
+ }
+ ((uint8_t *) out)[i] = c->st.b[j++];
+ }
+ c->pt = j;
+}
+
diff --git a/heavyHash/sha3.h b/heavyHash/sha3.h
new file mode 100644
index 0000000..b4ff036
--- /dev/null
+++ b/heavyHash/sha3.h
@@ -0,0 +1,51 @@
+// sha3.h
+// 19-Nov-11 Markku-Juhani O. Saarinen
+
+#ifndef SHA3_H
+#define SHA3_H
+
+
+#include <stddef.h>
+#include <stdint.h>
+
+
+
+
+#ifndef KECCAKF_ROUNDS
+#define KECCAKF_ROUNDS 24
+#endif
+
+#ifndef ROTL64
+#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
+#endif
+
+// state context
+typedef struct {
+ union { // state:
+ uint8_t b[200]; // 8-bit bytes
+ uint64_t q[25]; // 64-bit words
+ } st;
+ int pt, rsiz, mdlen; // these don't overflow
+} sha3_ctx_t;
+
+// Compression function.
+void sha3_keccakf(uint64_t st[25]);
+
+// OpenSSL-like interface
+
+int sha3_update(sha3_ctx_t *c, const void *data, size_t len);
+int sha3_final(void *md, sha3_ctx_t *c); // digest goes to md
+
+// compute a sha3 hash (md) of given byte length from "in"
+void *sha3(const void *in, size_t inlen, void *md, int mdlen);
+
+// SHAKE128 and SHAKE256 extensible-output functions
+//#define shake128_init(c) sha3_init(c, 16)
+//#define shake256_init(c) sha3_init(c, 32)
+//#define shake_update sha3_update
+
+void shake_xof(sha3_ctx_t *c);
+void shake_out(sha3_ctx_t *c, void *out, size_t len);
+
+#endif
+
diff --git a/heavyHash/sha3.o b/heavyHash/sha3.o
new file mode 100644
index 0000000..d84927a
Binary files /dev/null and b/heavyHash/sha3.o differ
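This sha3.c diverges from stock FIPS-202 SHA3-256 in two ways: sha3_init() seeds the Keccak state from Obtc->const_data (copied in reverse) rather than from zeros, and sha3_final() xors in a 0x04 domain byte where standard SHA3 uses 0x06, so digests will not match a standard SHA3-256 implementation. A minimal usage sketch against the declarations in obtc.h and sha3.h:

```c
#include <stdio.h>
#include "obtc.h"   /* declares sha3_init(Obtc_t *, sha3_ctx_t *, int) */

int main(void) {
    Obtc_t obtc = {0};      /* const_data left all zero for this sketch */
    sha3_ctx_t ctx;
    uint8_t md[32];

    sha3_init(&obtc, &ctx, 32);   /* state seeded from obtc.const_data */
    sha3_update(&ctx, "abc", 3);
    sha3_final(md, &ctx);         /* 0x04 padding, not SHA3's 0x06 */

    for (int i = 0; i < 32; i++)
        printf("%02x", md[i]);
    printf("\n");
    return 0;
}
```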
diff --git a/heavyHash/singular.h b/heavyHash/singular.h
new file mode 100644
index 0000000..cd3540f
--- /dev/null
+++ b/heavyHash/singular.h
@@ -0,0 +1,42 @@
+#ifndef _SINGULAR_SINGULAR_H
+#define _SINGULAR_SINGULAR_H
+
+/** The version of the singular library. */
+#define SINGULAR_VERSION "@PROJECT_VERSION@"
+
+/**
+ * Whether rvalue references are supported.
+ *
+ * Visual Studio 2010 and lower do not have rvalue references so far.
+ */
+#if defined(_MSC_VER) && _MSC_VER < 1700
+#define SINGULAR_RVALUE_REFERENCE_SUPPORTED 0
+#else
+#define SINGULAR_RVALUE_REFERENCE_SUPPORTED 1
+#endif
+
+/**
+ * Whether function deletions are supported.
+ *
+ * Visual Studio 2012 and lower do not like "delete" stuff so far.
+ */
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#define SINGULAR_FUNCTION_DELETION_SUPPORTED 0
+#else
+#define SINGULAR_FUNCTION_DELETION_SUPPORTED 1
+#endif
+
+/**
+ * Whether template friend operator overloadings are supported.
+ *
+ * Visual Studio 2012 and lower do not like overloading template friend
+ * operators.
+ * Neither does GCC.
+ */
+#if (defined(_MSC_VER) && _MSC_VER < 1800) || (defined(__GNUC__) && !defined(__clang__))
+#define SINGULAR_TEMPLATE_FRIEND_OPERATOR_OVERLOADING_SUPPORTED 0
+#else
+#define SINGULAR_TEMPLATE_FRIEND_OPERATOR_OVERLOADING_SUPPORTED 1
+#endif
+
+#endif
diff --git a/heavyHash/test.c b/heavyHash/test.c
new file mode 100644
index 0000000..2d28982
--- /dev/null
+++ b/heavyHash/test.c
@@ -0,0 +1,183 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "obtc.h"
+#include "singular.h"
+#include <time.h>
+
+
+//uint8_t const_data[200];
+
+static const int hex2bin_tbl[256] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
+ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
+{
+ int nibble1, nibble2;
+ unsigned char idx;
+ bool ret = false;
+
+ while (*hexstr && len) {
+ if ((!hexstr[1])) {
+ printf("hex2bin str truncated");
+ return ret;
+ }
+
+ idx = *hexstr++;
+ nibble1 = hex2bin_tbl[idx];
+ idx = *hexstr++;
+ nibble2 = hex2bin_tbl[idx];
+
+ if (((nibble1 < 0) || (nibble2 < 0))) {
+ printf("hex2bin scan failed");
+ return ret;
+ }
+
+ *p++ = (((unsigned char)nibble1) << 4) | ((unsigned char)nibble2);
+ --len;
+ }
+
+ if ((len == 0 && *hexstr == 0))
+ ret = true;
+ return ret;
+}
+
+
+int main(int argc, char **argv)
+{
+ uint8_t genesis_block[80];
+ uint8_t hash[32];
+
+ uint8_t last_prehash[32];
+ uint8_t last_prehash2[32];
+ uint8_t prehash_tab[32];
+ uint8_t nonce_tab[8];
+ char *prehash_str = "d76ffb1d8e31ec04579b0452b52bde7dbd088e912ab1b11ba924ff309ab44a43";//argv[1];
+ char *nonce_str = "80aa59a7901f2502";//argv[2];
+ //char *last_prehash_str = argv[3];
+ //char *last_prehash_str2 = argv[4];
+
+ hex2bin(prehash_tab, prehash_str, strlen(prehash_str)/2);
+ hex2bin(nonce_tab, nonce_str, strlen(nonce_str)/2);
+
+
+int main(int argc, char **argv)
+{
+    uint8_t genesis_block[80];
+    uint8_t hash[32];
+
+    uint8_t last_prehash[32];
+    uint8_t last_prehash2[32];
+    uint8_t prehash_tab[32];
+    uint8_t nonce_tab[8];
+    char *prehash_str = "d76ffb1d8e31ec04579b0452b52bde7dbd088e912ab1b11ba924ff309ab44a43";//argv[1];
+    char *nonce_str = "80aa59a7901f2502";//argv[2];
+    //char *last_prehash_str = argv[3];
+    //char *last_prehash_str2 = argv[4];
+
+    hex2bin(prehash_tab, prehash_str, strlen(prehash_str)/2);
+    hex2bin(nonce_tab, nonce_str, strlen(nonce_str)/2);
+
+    //hex2bin(last_prehash, last_prehash_str, strlen(last_prehash_str)/2);
+    //hex2bin(last_prehash2, last_prehash_str2, strlen(last_prehash_str2)/2);
+    /*for (uint8_t i = 0; i<32;i++){
+        printf("0x%x, ",prehash_tab[i]);
+    }
+    printf("\n");
+
+    for (uint8_t i = 0; i<8;i++){
+        printf("0x%x, ",nonce_tab[i]);
+    }
+    printf("\n");*/
+
+    //uint8_t prehash[32] = {0x81,0x55,0x3a,0x69,0x5a,0x05,0x88,0x99,0x8c,0x41,0x37,0x92,0xe7,0x4c,0xe8,0xb8,0xf8,0xa0,0x96,0xd6,0x4b,0x3e,0xe4,0x73,0x87,0x37,0x24,0x34,0x48,0x5c,0x0b,0x6f};
+    //uint8_t utime[8] = {0x00,0x00,0x01,0x84,0x8c,0xa8,0x7c,0x49};
+    uint8_t pad[32] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
+    //uint8_t nonce[8] = {0x2f,0x84,0x00,0x00,0x0e,0xba,0x16,0x7c};
+
+#if 0
+    //uint8_t prehash[32] = {0xa4,0x8f,0xae,0x69,0xeb,0x28,0xc7,0xe0,0x14,0x11,0x4f,0x01,0xae,0x60,0xc8,0xc3,0x82,0x73,0xc4,0x60,0x66,0xcf,0x95,0xd6,0x77,0x1a,0x55,0xd6,0x16,0xd7,0xa1,0x9a};// big-endian
+    //uint8_t utime[8] = {0x00,0x00,0x01,0x87,0x22,0x1e,0xad,0x44};
+    //uint8_t nonce[8] = {0x8e,0xd4,0x00,0x10,0x6b,0xe7,0xe4,0x00};
+    //uint8_t nonce[8] = {0x8e,0xd4,0x00,0x12,0x27,0xc6,0x90,0xa0};
+    //uint8_t nonce[8] = {0x8e,0xd4,0x00,0x32,0x0b,0x6b,0xd6,0xd1};
+
+
+    //3f 9a aa c6 32 af 1a 4e 0e 1f ea 8a f8 e3 d5 32 b7 5a a4 71 b2 e4 ef fe a5 bd cc fa 3b dd b6 61
+    uint8_t prehash[32] = {0x3f,0x9a,0xaa,0xc6,0x32,0xaf,0x1a,0x4e,0x0e,0x1f,0xea,0x8a,0xf8,0xe3,0xd5,0x32,0xb7,0x5a,0xa4,0x71,0xb2,0xe4,0xef,0xfe,0xa5,0xbd,0xcc,0xfa,0x3b,0xdd,0xb6,0x61};// big-endian
+    uint8_t utime[8] = {0x00,0x00,0x01,0x87,0x21,0xeb,0x73,0x79};
+    uint8_t nonce[8] = {0xa3,0xdd,0x02,0x10,0x1a,0x87,0xb4,0x70};
+
+
+
+
+#else
+
+
+    /*443e01000000ffff00000000
+    e0af2a3ba173157d3f70c94aad742fdf16d9930fdfc9d6301e869bcef04ced6c
+    e0af2a3ba173157d3f70c94aad742fdf16d9930fdfc9d6301e869bcef04ced6c
+    dbee84288701000000000000901f25020000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+
+    [2023-03-28 22:00:46.549] 00 cc 01 11 70 83 85 16 90 1f 25 02
+
+    kas_pow_hash: in:e0af2a3ba173157d3f70c94aad742fdf16d9930fdfc9d6301e869bcef04ced6cdbee842887010000000000000000000000000000000000000000000000000000000000000000000070838516901f2502
+    kas_pow_hash: out:dae78f5008d3b66f
+    01a740ce33c812ba
+    772b3f5763da7bc6
+    da24cb6c00000000*/
+
+    uint8_t prehash[32] = {0xe0,0xaf,0x2a,0x3b,0xa1,0x73,0x15,0x7d,0x3f,0x70,0xc9,0x4a,0xad,0x74,0x2f,0xdf,0x16,0xd9,0x93,0x0f,0xdf,0xc9,0xd6,0x30,0x1e,0x86,0x9b,0xce,0xf0,0x4c,0xed,0x6c};
+    //uint8_t utime[8] = {0x00,0x00,0x01,0x87,0x28,0x84,0xee,0xdb};
+    uint8_t nonce[8] = {0x02,0x25,0x1f,0x90,0x16,0x85,0x83,0x70};
+
+#endif
+
+    /*for (int i = 0; i < 32; ++i) genesis_block[i] = prehash[i];
+    for (int i = 0; i < 8; ++i) genesis_block[i+32] = utime[7-i];
+    for (int i = 0; i < 32; ++i) genesis_block[i+40] = pad[31-i];
+    for (int i = 0; i < 8; ++i) genesis_block[i+72] = nonce[7-i];*/
+    //uint8_t utime[8] = {0x00,0x00,0x01,0x87,0x21,0xeb,0x73,0x79};
+    //dbee8428870100000
+    uint8_t utime[8] = {0x00,0x00,0x01,0x87,0x28,0x84,0xee,0xdb};
+
+    for (int i = 0; i < 32; ++i) genesis_block[i] = prehash_tab[i];
+    for (int i = 0; i < 8; ++i) genesis_block[i+32] = utime[7-i];
+    for (int i = 0; i < 32; ++i) genesis_block[i+40] = pad[31-i];
+    for (int i = 0; i < 8; ++i) genesis_block[i+72] = nonce_tab[i];
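For clarity, the 80-byte buffer assembled by the four loops above has this layout (the struct and field names are illustrative only; the code operates on the raw `genesis_block[80]`):

```
// Illustrative view of the opticalbtc_hash() input built above.
typedef struct {
    uint8_t prehash[32];   // decoded prehash string, copied as-is
    uint8_t utime_le[8];   // utime[] byte-reversed into little-endian
    uint8_t pad[32];       // 32 zero bytes (pad[] reversed; all zeros anyway)
    uint8_t nonce_le[8];   // decoded nonce bytes, used as-is
} pow_input;               // 80 bytes, matching genesis_block
```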
+
+    clock_t start, finish;
+    double Total_time;
+    uint32_t cnt = 0;
+    //while(1)
+    {
+        start = clock();
+        opticalbtc_hash((const char*)&genesis_block, (char*)&hash, sizeof(genesis_block));
+        finish = clock();
+        Total_time = (double)(finish-start) / CLOCKS_PER_SEC;
+        printf( "\n cnt = %d, opticalbtc_hash run time: %f seconds\n", cnt++, Total_time);
+
+        for (int i=31; i>-1; i--) {
+            printf("%02hhx", hash[i]);
+        }
+        printf("\n");
+    }
+
+
+    //if (hash[31] != 0 || hash[30] != 0){
+    //    for (int i = 0; i < 32; ++i) genesis_block[i] = last_prehash[i];
+    //    opticalbtc_hash((const char*)&genesis_block, (char*)&hash, sizeof(genesis_block));
+    //}
+
+    //if (hash[31] != 0 || hash[30] != 0){
+    //    for (int i = 0; i < 32; ++i) genesis_block[i] = last_prehash2[i];
+    //    opticalbtc_hash((const char*)&genesis_block, (char*)&hash, sizeof(genesis_block));
+    //}
+
+    if (hash[31] != 0 && hash[30] != 0){
+        printf("reject\n");
+    }
+
+    return 0;
+}
+
+
+//g++ -std=c++11 *.cpp
diff --git a/heavyHash/uint256.h b/heavyHash/uint256.h
new file mode 100644
index 0000000..72b36d1
--- /dev/null
+++ b/heavyHash/uint256.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2009-2010 Satoshi Nakamoto
+// Copyright (c) 2009-2016 The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#ifndef BITCOIN_UINT256_H
+#define BITCOIN_UINT256_H
+
+#include <stdint.h>
+
+
+
+/** 256-bit opaque blob.
+ * @note This type is called uint256 for historical reasons only. It is an
+ * opaque blob of 256 bits and has no integer operations. Use arith_uint256 if
+ * those are required.
+ */
+#define UPPER_P(x) x->elements[0]
+#define LOWER_P(x) x->elements[1]
+#define UPPER(x) x.elements[0]
+#define LOWER(x) x.elements[1]
+#define WIDTH 32
+
+typedef struct class_base_blob base_blob_t;
+struct class_base_blob{
+    uint8_t data[WIDTH];
+};
+
+
+typedef struct uint128_t { uint64_t elements[2]; } uint128_t;
+typedef struct uint256_t {
+    uint128_t elements[2];
+    base_blob_t bb;
+} uint256;
+
+
+
+#endif // BITCOIN_UINT256_H
diff --git a/heavyHash/xoshiro256pp.h b/heavyHash/xoshiro256pp.h
new file mode 100644
index 0000000..4bf39fa
--- /dev/null
+++ b/heavyHash/xoshiro256pp.h
@@ -0,0 +1,15 @@
+#ifndef OPOW_CRYPTO_XOSHIRO256PP_H
+#define OPOW_CRYPTO_XOSHIRO256PP_H
+
+#include <stdint.h>
+#include "uint256.h"
+
+
+typedef struct class_XoShiRo256PlusPlus XoShiRo256PlusPlus_t;
+struct class_XoShiRo256PlusPlus{
+    uint64_t s[4];
+};
+
+
+
+#endif //OPOW_CRYPTO_XOSHIRO256PP_H
diff --git a/randomx/a.exe b/randomx/a.exe
new file mode 100644
index 0000000..6241a1e
Binary files /dev/null and b/randomx/a.exe differ
diff --git a/randomx/aes_hash.cpp b/randomx/aes_hash.cpp
new file mode 100644
index 0000000..c8db76c
--- /dev/null
+++ b/randomx/aes_hash.cpp
@@ -0,0 +1,241 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	notice, this list of conditions and the following disclaimer in the
+	documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	names of its contributors may be used to endorse or promote products
+	derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "soft_aes.h"
+#include <cassert>
+
+//NOTE: The functions below were tuned for maximum performance
+//and are not cryptographically secure outside of the scope of RandomX.
+//It's not recommended to use them as general hash functions and PRNGs.
+
+//AesHash1R:
+//state0, state1, state2, state3 = Blake2b-512("RandomX AesHash1R state")
+//xkey0, xkey1 = Blake2b-256("RandomX AesHash1R xkeys")
+
+#define AES_HASH_1R_STATE0 0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d
+#define AES_HASH_1R_STATE1 0xace78057, 0xf59e125a, 0x15c7b798, 0x338d996e
+#define AES_HASH_1R_STATE2 0xe8a07ce4, 0x5079506b, 0xae62c7d0, 0x6a770017
+#define AES_HASH_1R_STATE3 0x7e994948, 0x79a10005, 0x07ad828d, 0x630a240c
+
+#define AES_HASH_1R_XKEY0 0x06890201, 0x90dc56bf, 0x8b24949f, 0xf6fa8389
+#define AES_HASH_1R_XKEY1 0xed18f99b, 0xee1043c6, 0x51f4e03c, 0x61b263d1
+
+/*
+	Calculate a 512-bit hash of 'input' using 4 lanes of AES.
+	The input is treated as a set of round keys for the encryption
+	of the initial state.
+
+	'inputSize' must be a multiple of 64.
+
+	For a 2 MiB input, this has the same security as 32768-round
+	AES encryption.
+
+	Hashing throughput: >20 GiB/s per CPU core with hardware AES
+*/
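A minimal call sketch for `hashAes1Rx4` (the `<bool softAes>` template flag, restored here from upstream RandomX, selects the software AES fallback; `false` assumes hardware AES support):

```
// Hash a 2 MiB scratchpad into a 64-byte result across 4 AES lanes.
// inputSize must be a multiple of 64.
alignas(16) static uint8_t scratchpad[2 * 1024 * 1024];
alignas(16) uint8_t result[64];
hashAes1Rx4<false>(scratchpad, sizeof(scratchpad), result);
```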
+template<bool softAes>
+void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
+	assert(inputSize % 64 == 0);
+	const uint8_t* inptr = (uint8_t*)input;
+	const uint8_t* inputEnd = inptr + inputSize;
+
+	rx_vec_i128 state0, state1, state2, state3;
+	rx_vec_i128 in0, in1, in2, in3;
+
+	//initial state
+	state0 = rx_set_int_vec_i128(AES_HASH_1R_STATE0);
+	state1 = rx_set_int_vec_i128(AES_HASH_1R_STATE1);
+	state2 = rx_set_int_vec_i128(AES_HASH_1R_STATE2);
+	state3 = rx_set_int_vec_i128(AES_HASH_1R_STATE3);
+
+	//process 64 bytes at a time in 4 lanes
+	while (inptr < inputEnd) {
+		in0 = rx_load_vec_i128((rx_vec_i128*)inptr + 0);
+		in1 = rx_load_vec_i128((rx_vec_i128*)inptr + 1);
+		in2 = rx_load_vec_i128((rx_vec_i128*)inptr + 2);
+		in3 = rx_load_vec_i128((rx_vec_i128*)inptr + 3);
+
+		state0 = aesenc<softAes>(state0, in0);
+		state1 = aesdec<softAes>(state1, in1);
+		state2 = aesenc<softAes>(state2, in2);
+		state3 = aesdec<softAes>(state3, in3);
+
+		inptr += 64;
+	}
+
+	//two extra rounds to achieve full diffusion
+	rx_vec_i128 xkey0 = rx_set_int_vec_i128(AES_HASH_1R_XKEY0);
+	rx_vec_i128 xkey1 = rx_set_int_vec_i128(AES_HASH_1R_XKEY1);
+
+	state0 = aesenc<softAes>(state0, xkey0);
+	state1 = aesdec<softAes>(state1, xkey0);
+	state2 = aesenc<softAes>(state2, xkey0);
+	state3 = aesdec<softAes>(state3, xkey0);
+
+	state0 = aesenc<softAes>(state0, xkey1);
+	state1 = aesdec<softAes>(state1, xkey1);
+	state2 = aesenc<softAes>(state2, xkey1);
+	state3 = aesdec<softAes>(state3, xkey1);
+
+	//output hash
+	rx_store_vec_i128((rx_vec_i128*)hash + 0, state0);
+	rx_store_vec_i128((rx_vec_i128*)hash + 1, state1);
+	rx_store_vec_i128((rx_vec_i128*)hash + 2, state2);
+	rx_store_vec_i128((rx_vec_i128*)hash + 3, state3);
+}
+
+template void hashAes1Rx4<false>(const void *input, size_t inputSize, void *hash);
+template void hashAes1Rx4<true>(const void *input, size_t inputSize, void *hash);
+
+//AesGenerator1R:
+//key0, key1, key2, key3 = Blake2b-512("RandomX AesGenerator1R keys")
+
+#define AES_GEN_1R_KEY0 0xb4f44917, 0xdbb5552b, 0x62716609, 0x6daca553
+#define AES_GEN_1R_KEY1 0x0da1dc4e, 0x1725d378, 0x846a710d, 0x6d7caf07
+#define AES_GEN_1R_KEY2 0x3e20e345, 0xf4c0794f, 0x9f947ec6, 0x3f1262f1
+#define AES_GEN_1R_KEY3 0x49169154, 0x16314c88, 0xb1ba317c, 0x6aef8135
+
+/*
+	Fill 'buffer' with pseudorandom data based on 512-bit 'state'.
+	The state is encrypted using a single AES round per 16 bytes of output
+	in 4 lanes.
+
+	'outputSize' must be a multiple of 64.
+
+	The modified state is written back to 'state' to allow multiple
+	calls to this function.
+*/
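Usage sketch for the generator described above: seed the 64-byte state (e.g. with Blake2b-512, as the key-derivation comments suggest), then expand; because the modified state is written back, repeated calls continue the same pseudorandom stream:

```
alignas(16) uint8_t state[64];        // assumed already seeded elsewhere
alignas(16) uint8_t buffer[2097152];  // outputSize must be a multiple of 64
fillAes1Rx4<false>(state, sizeof(buffer), buffer);
fillAes1Rx4<false>(state, 64, buffer);  // continues the stream
```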
+template<bool softAes>
+void fillAes1Rx4(void *state, size_t outputSize, void *buffer) {
+	assert(outputSize % 64 == 0);
+	const uint8_t* outptr = (uint8_t*)buffer;
+	const uint8_t* outputEnd = outptr + outputSize;
+
+	rx_vec_i128 state0, state1, state2, state3;
+	rx_vec_i128 key0, key1, key2, key3;
+
+	key0 = rx_set_int_vec_i128(AES_GEN_1R_KEY0);
+	key1 = rx_set_int_vec_i128(AES_GEN_1R_KEY1);
+	key2 = rx_set_int_vec_i128(AES_GEN_1R_KEY2);
+	key3 = rx_set_int_vec_i128(AES_GEN_1R_KEY3);
+
+	state0 = rx_load_vec_i128((rx_vec_i128*)state + 0);
+	state1 = rx_load_vec_i128((rx_vec_i128*)state + 1);
+	state2 = rx_load_vec_i128((rx_vec_i128*)state + 2);
+	state3 = rx_load_vec_i128((rx_vec_i128*)state + 3);
+
+	while (outptr < outputEnd) {
+		state0 = aesdec<softAes>(state0, key0);
+		state1 = aesenc<softAes>(state1, key1);
+		state2 = aesdec<softAes>(state2, key2);
+		state3 = aesenc<softAes>(state3, key3);
+
+		rx_store_vec_i128((rx_vec_i128*)outptr + 0, state0);
+		rx_store_vec_i128((rx_vec_i128*)outptr + 1, state1);
+		rx_store_vec_i128((rx_vec_i128*)outptr + 2, state2);
+		rx_store_vec_i128((rx_vec_i128*)outptr + 3, state3);
+
+		outptr += 64;
+	}
+
+	rx_store_vec_i128((rx_vec_i128*)state + 0, state0);
+	rx_store_vec_i128((rx_vec_i128*)state + 1, state1);
+	rx_store_vec_i128((rx_vec_i128*)state + 2, state2);
+	rx_store_vec_i128((rx_vec_i128*)state + 3, state3);
+}
+
+template void fillAes1Rx4<false>(void *state, size_t outputSize, void *buffer);
+template void fillAes1Rx4<true>(void *state, size_t outputSize, void *buffer);
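AesGenerator4R below is the heavier, four-round variant; judging by the tracing comment inside `fillAes4Rx4` (outputSize = 2176 = 128 + 256 x 8), it expands a seed into a RandomX program buffer. A hedged call sketch:

```
alignas(16) uint8_t seed[64];               // assumed seeded from a prior hash
alignas(16) uint8_t program[128 + 256 * 8]; // 2176 bytes, per the comment below
fillAes4Rx4<false>(seed, sizeof(program), program);
```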
+
+//AesGenerator4R:
+//key0, key1, key2, key3 = Blake2b-512("RandomX AesGenerator4R keys 0-3")
+//key4, key5, key6, key7 = Blake2b-512("RandomX AesGenerator4R keys 4-7")
+
+#define AES_GEN_4R_KEY0 0x99e5d23f, 0x2f546d2b, 0xd1833ddb, 0x6421aadd
+#define AES_GEN_4R_KEY1 0xa5dfcde5, 0x06f79d53, 0xb6913f55, 0xb20e3450
+#define AES_GEN_4R_KEY2 0x171c02bf, 0x0aa4679f, 0x515e7baf, 0x5c3ed904
+#define AES_GEN_4R_KEY3 0xd8ded291, 0xcd673785, 0xe78f5d08, 0x85623763
+#define AES_GEN_4R_KEY4 0x229effb4, 0x3d518b6d, 0xe3d6a7a6, 0xb5826f73
+#define AES_GEN_4R_KEY5 0xb272b7d2, 0xe9024d4e, 0x9c10b3d9, 0xc7566bf3
+#define AES_GEN_4R_KEY6 0xf63befa7, 0x2ba9660a, 0xf765a38b, 0xf273c9e7
+#define AES_GEN_4R_KEY7 0xc0b0762d, 0x0c06d1fd, 0x915839de, 0x7a7cd609
+
+template<bool softAes>
+void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
+	assert(outputSize % 64 == 0);
+	const uint8_t* outptr = (uint8_t*)buffer;
+	const uint8_t* outputEnd = outptr + outputSize;
+	//printf("outputSize= %zu\n",outputSize); //outputSize = 2176: fills the program buffer, 128 + 256*8 bytes in total
+	rx_vec_i128 state0, state1, state2, state3;
+	rx_vec_i128 key0, key1, key2, key3, key4, key5, key6, key7;
+
+	key0 = rx_set_int_vec_i128(AES_GEN_4R_KEY0);
+	key1 = rx_set_int_vec_i128(AES_GEN_4R_KEY1);
+	key2 = rx_set_int_vec_i128(AES_GEN_4R_KEY2);
+	key3 = rx_set_int_vec_i128(AES_GEN_4R_KEY3);
+	key4 = rx_set_int_vec_i128(AES_GEN_4R_KEY4);
+	key5 = rx_set_int_vec_i128(AES_GEN_4R_KEY5);
+	key6 = rx_set_int_vec_i128(AES_GEN_4R_KEY6);
+	key7 = rx_set_int_vec_i128(AES_GEN_4R_KEY7);
+
+	state0 = rx_load_vec_i128((rx_vec_i128*)state + 0);
+	state1 = rx_load_vec_i128((rx_vec_i128*)state + 1);
+	state2 = rx_load_vec_i128((rx_vec_i128*)state + 2);
+	state3 = rx_load_vec_i128((rx_vec_i128*)state + 3);
+
+	while (outptr < outputEnd) {
+		state0 = aesdec<softAes>(state0, key0);
+		state1 = aesenc<softAes>(state1, key0);
+		state2 = aesdec<softAes>(state2, key4);
+		state3 = aesenc<softAes>(state3, key4);
+
+		state0 = aesdec<softAes>(state0, key1);
+		state1 = aesenc<softAes>(state1, key1);
+		state2 = aesdec<softAes>(state2, key5);
+		state3 = aesenc<softAes>(state3, key5);
+
+		state0 = aesdec<softAes>(state0, key2);
+		state1 = aesenc<softAes>(state1, key2);
+		state2 = aesdec<softAes>(state2, key6);
+		state3 = aesenc<softAes>(state3, key6);
+
+		state0 = aesdec<softAes>(state0, key3);
+		state1 = aesenc<softAes>(state1, key3);
+		state2 = aesdec<softAes>(state2, key7);
+		state3 = aesenc<softAes>(state3, key7);
+
+		rx_store_vec_i128((rx_vec_i128*)outptr + 0, state0);
+		rx_store_vec_i128((rx_vec_i128*)outptr + 1, state1);
+		rx_store_vec_i128((rx_vec_i128*)outptr + 2, state2);
+		rx_store_vec_i128((rx_vec_i128*)outptr + 3, state3);
+
+		outptr += 64;
+	}
+}
+
+template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);
+template void fillAes4Rx4<true>(void *state, size_t outputSize, void *buffer);
diff --git a/randomx/aes_hash.hpp b/randomx/aes_hash.hpp
new file mode 100644
index 0000000..b4d0e94
--- /dev/null
+++ b/randomx/aes_hash.hpp
@@ -0,0 +1,40 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	notice, this list of conditions and the following disclaimer in the
+	documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	names of its contributors may be used to endorse or promote products
+	derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <cstddef>
+
+template<bool softAes>
+void hashAes1Rx4(const void *input, size_t inputSize, void *hash);
+
+template<bool softAes>
+void fillAes1Rx4(void *state, size_t outputSize, void *buffer);
+
+template<bool softAes>
+void fillAes4Rx4(void *state, size_t outputSize, void *buffer);
diff --git a/randomx/allocator.cpp b/randomx/allocator.cpp
new file mode 100644
index 0000000..2ddbed9
--- /dev/null
+++ b/randomx/allocator.cpp
@@ -0,0 +1,60 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	notice, this list of conditions and the following disclaimer in the
+	documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	names of its contributors may be used to endorse or promote products
+	derived from this software without specific prior written permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include "allocator.hpp" +#include "intrin_portable.h" +#include "virtual_memory.hpp" +#include "common.hpp" + +namespace randomx { + + template + void* AlignedAllocator::allocMemory(size_t count) { + void *mem = rx_aligned_alloc(count, alignment); + if (mem == nullptr) + throw std::bad_alloc(); + return mem; + } + + template + void AlignedAllocator::freeMemory(void* ptr, size_t count) { + rx_aligned_free(ptr); + } + + template class AlignedAllocator; + + void* LargePageAllocator::allocMemory(size_t count) { + return allocLargePagesMemory(count); + } + + void LargePageAllocator::freeMemory(void* ptr, size_t count) { + freePagedMemory(ptr, count); + }; + +} \ No newline at end of file diff --git a/randomx/allocator.hpp b/randomx/allocator.hpp new file mode 100644 index 0000000..d7aa3f9 --- /dev/null +++ b/randomx/allocator.hpp @@ -0,0 +1,46 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#pragma once + +#include + +namespace randomx { + + template + struct AlignedAllocator { + static void* allocMemory(size_t); + static void freeMemory(void*, size_t); + }; + + struct LargePageAllocator { + static void* allocMemory(size_t); + static void freeMemory(void*, size_t); + }; + +} \ No newline at end of file diff --git a/randomx/argon2.h b/randomx/argon2.h new file mode 100644 index 0000000..9052f42 --- /dev/null +++ b/randomx/argon2.h @@ -0,0 +1,261 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#pragma once + +#include +#include +#include + +/* + * Argon2 input parameter restrictions + */ + + /* Minimum and maximum number of lanes (degree of parallelism) */ +#define ARGON2_MIN_LANES UINT32_C(1) +#define ARGON2_MAX_LANES UINT32_C(0xFFFFFF) + +/* Minimum and maximum number of threads */ +#define ARGON2_MIN_THREADS UINT32_C(1) +#define ARGON2_MAX_THREADS UINT32_C(0xFFFFFF) + +/* Number of synchronization points between lanes per pass */ +#define ARGON2_SYNC_POINTS UINT32_C(4) + +/* Minimum and maximum digest size in bytes */ +#define ARGON2_MIN_OUTLEN UINT32_C(4) +#define ARGON2_MAX_OUTLEN UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */ +#define ARGON2_MIN_MEMORY (2 * ARGON2_SYNC_POINTS) /* 2 blocks per slice */ + +#define ARGON2_MIN(a, b) ((a) < (b) ? 
(a) : (b)) +/* Max memory size is addressing-space/2, topping at 2^32 blocks (4 TB) */ +#define ARGON2_MAX_MEMORY_BITS \ + ARGON2_MIN(UINT32_C(32), (sizeof(void *) * CHAR_BIT - 10 - 1)) +#define ARGON2_MAX_MEMORY \ + ARGON2_MIN(UINT32_C(0xFFFFFFFF), UINT64_C(1) << ARGON2_MAX_MEMORY_BITS) + +/* Minimum and maximum number of passes */ +#define ARGON2_MIN_TIME UINT32_C(1) +#define ARGON2_MAX_TIME UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum password length in bytes */ +#define ARGON2_MIN_PWD_LENGTH UINT32_C(0) +#define ARGON2_MAX_PWD_LENGTH UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum associated data length in bytes */ +#define ARGON2_MIN_AD_LENGTH UINT32_C(0) +#define ARGON2_MAX_AD_LENGTH UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum salt length in bytes */ +#define ARGON2_MIN_SALT_LENGTH UINT32_C(8) +#define ARGON2_MAX_SALT_LENGTH UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum key length in bytes */ +#define ARGON2_MIN_SECRET UINT32_C(0) +#define ARGON2_MAX_SECRET UINT32_C(0xFFFFFFFF) + +/* Flags to determine which fields are securely wiped (default = no wipe). */ +#define ARGON2_DEFAULT_FLAGS UINT32_C(0) +#define ARGON2_FLAG_CLEAR_PASSWORD (UINT32_C(1) << 0) +#define ARGON2_FLAG_CLEAR_SECRET (UINT32_C(1) << 1) + + +/* Error codes */ +typedef enum Argon2_ErrorCodes { + ARGON2_OK = 0, + + ARGON2_OUTPUT_PTR_NULL = -1, + + ARGON2_OUTPUT_TOO_SHORT = -2, + ARGON2_OUTPUT_TOO_LONG = -3, + + ARGON2_PWD_TOO_SHORT = -4, + ARGON2_PWD_TOO_LONG = -5, + + ARGON2_SALT_TOO_SHORT = -6, + ARGON2_SALT_TOO_LONG = -7, + + ARGON2_AD_TOO_SHORT = -8, + ARGON2_AD_TOO_LONG = -9, + + ARGON2_SECRET_TOO_SHORT = -10, + ARGON2_SECRET_TOO_LONG = -11, + + ARGON2_TIME_TOO_SMALL = -12, + ARGON2_TIME_TOO_LARGE = -13, + + ARGON2_MEMORY_TOO_LITTLE = -14, + ARGON2_MEMORY_TOO_MUCH = -15, + + ARGON2_LANES_TOO_FEW = -16, + ARGON2_LANES_TOO_MANY = -17, + + ARGON2_PWD_PTR_MISMATCH = -18, /* NULL ptr with non-zero length */ + ARGON2_SALT_PTR_MISMATCH = -19, /* NULL ptr with non-zero length */ + ARGON2_SECRET_PTR_MISMATCH = -20, /* NULL ptr with non-zero length */ + ARGON2_AD_PTR_MISMATCH = -21, /* NULL ptr with non-zero length */ + + ARGON2_MEMORY_ALLOCATION_ERROR = -22, + + ARGON2_FREE_MEMORY_CBK_NULL = -23, + ARGON2_ALLOCATE_MEMORY_CBK_NULL = -24, + + ARGON2_INCORRECT_PARAMETER = -25, + ARGON2_INCORRECT_TYPE = -26, + + ARGON2_OUT_PTR_MISMATCH = -27, + + ARGON2_THREADS_TOO_FEW = -28, + ARGON2_THREADS_TOO_MANY = -29, + + ARGON2_MISSING_ARGS = -30, + + ARGON2_ENCODING_FAIL = -31, + + ARGON2_DECODING_FAIL = -32, + + ARGON2_THREAD_FAIL = -33, + + ARGON2_DECODING_LENGTH_FAIL = -34, + + ARGON2_VERIFY_MISMATCH = -35 +} argon2_error_codes; + +/* Memory allocator types --- for external allocation */ +typedef int(*allocate_fptr)(uint8_t **memory, size_t bytes_to_allocate); +typedef void(*deallocate_fptr)(uint8_t *memory, size_t bytes_to_allocate); + +/* Argon2 external data structures */ + +/* + ***** + * Context: structure to hold Argon2 inputs: + * output array and its length, + * password and its length, + * salt and its length, + * secret and its length, + * associated data and its length, + * number of passes, amount of used memory (in KBytes, can be rounded up a bit) + * number of parallel threads that will be run. + * All the parameters above affect the output hash value. + * Additionally, two function pointers can be provided to allocate and + * deallocate the memory (if NULL, memory will be allocated internally). 
+ * Also, three flags indicate whether to erase password, secret as soon as they + * are pre-hashed (and thus not needed anymore), and the entire memory + ***** + * Simplest situation: you have output array out[8], password is stored in + * pwd[32], salt is stored in salt[16], you do not have keys nor associated + * data. You need to spend 1 GB of RAM and you run 5 passes of Argon2d with + * 4 parallel lanes. + * You want to erase the password, but you're OK with last pass not being + * erased. You want to use the default memory allocator. + * Then you initialize: + Argon2_Context(out,8,pwd,32,salt,16,NULL,0,NULL,0,5,1<<20,4,4,NULL,NULL,true,false,false,false) + */ +typedef struct Argon2_Context { + uint8_t *out; /* output array */ + uint32_t outlen; /* digest length */ + + uint8_t *pwd; /* password array */ + uint32_t pwdlen; /* password length */ + + uint8_t *salt; /* salt array */ + uint32_t saltlen; /* salt length */ + + uint8_t *secret; /* key array */ + uint32_t secretlen; /* key length */ + + uint8_t *ad; /* associated data array */ + uint32_t adlen; /* associated data length */ + + uint32_t t_cost; /* number of passes */ + uint32_t m_cost; /* amount of memory requested (KB) */ + uint32_t lanes; /* number of lanes */ + uint32_t threads; /* maximum number of threads */ + + uint32_t version; /* version number */ + + allocate_fptr allocate_cbk; /* pointer to memory allocator */ + deallocate_fptr free_cbk; /* pointer to memory deallocator */ + + uint32_t flags; /* array of bool options */ +} argon2_context; + +/* Argon2 primitive type */ +typedef enum Argon2_type { + Argon2_d = 0, + Argon2_i = 1, + Argon2_id = 2 +} argon2_type; + +/* Version of the algorithm */ +typedef enum Argon2_version { + ARGON2_VERSION_10 = 0x10, + ARGON2_VERSION_13 = 0x13, + ARGON2_VERSION_NUMBER = ARGON2_VERSION_13 +} argon2_version; + +//Argon2 instance - forward declaration +typedef struct Argon2_instance_t argon2_instance_t; + +//Argon2 position = forward declaration +typedef struct Argon2_position_t argon2_position_t; + +//Argon2 implementation function +typedef void randomx_argon2_impl(const argon2_instance_t* instance, + argon2_position_t position); + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * Function that fills the segment using previous segments also from other + * threads + * @param context current context + * @param instance Pointer to the current instance + * @param position Current position + * @pre all block pointers must be valid + */ +void randomx_argon2_fill_segment_ref(const argon2_instance_t* instance, + argon2_position_t position); + +randomx_argon2_impl *randomx_argon2_impl_ssse3(); +randomx_argon2_impl *randomx_argon2_impl_avx2(); + +#if defined(__cplusplus) +} +#endif diff --git a/randomx/argon2_avx2.c b/randomx/argon2_avx2.c new file mode 100644 index 0000000..6fb5fda --- /dev/null +++ b/randomx/argon2_avx2.c @@ -0,0 +1,175 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#include +#include +#include + +#include "argon2.h" + +void randomx_argon2_fill_segment_avx2(const argon2_instance_t* instance, + argon2_position_t position); + +randomx_argon2_impl* randomx_argon2_impl_avx2() { +#if defined(__AVX2__) + return &randomx_argon2_fill_segment_avx2; +#endif + return NULL; +} + +#if defined(__AVX2__) + +#include "argon2_core.h" + +#include "blake2/blamka-round-avx2.h" +#include "blake2/blake2-impl.h" +#include "blake2/blake2.h" + +static void fill_block(__m256i* state, const block* ref_block, + block* next_block, int with_xor) { + __m256i block_XY[ARGON2_HWORDS_IN_BLOCK]; + unsigned int i; + + if (with_xor) { + for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { + state[i] = _mm256_xor_si256( + state[i], _mm256_loadu_si256((const __m256i*)ref_block->v + i)); + block_XY[i] = _mm256_xor_si256( + state[i], _mm256_loadu_si256((const __m256i*)next_block->v + i)); + } + } + else { + for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { + block_XY[i] = state[i] = _mm256_xor_si256( + state[i], _mm256_loadu_si256((const __m256i*)ref_block->v + i)); + } + } + + for (i = 0; i < 4; ++i) { + BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5], + state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]); + } + + for (i = 0; i < 4; ++i) { + BLAKE2_ROUND_2(state[0 + i], state[4 + i], state[8 + i], state[12 + i], + state[16 + i], state[20 + i], state[24 + i], state[28 + i]); + } + + for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { + state[i] = _mm256_xor_si256(state[i], block_XY[i]); + _mm256_storeu_si256((__m256i*)next_block->v + i, state[i]); + } +} + +void randomx_argon2_fill_segment_avx2(const argon2_instance_t* instance, + argon2_position_t position) { + // printf("randomx_argon2_fill_segment_avx2\n"); + block* ref_block = NULL, * curr_block = NULL; + block address_block, input_block; + uint64_t pseudo_rand, ref_index, ref_lane; + uint32_t prev_offset, curr_offset; + uint32_t starting_index, i; + __m256i state[ARGON2_HWORDS_IN_BLOCK]; + + if (instance == NULL) { + return; + } + + starting_index = 0; + + if ((0 == position.pass) && (0 == position.slice)) { + starting_index = 2; /* we have already generated the first two blocks */ + } + + /* Offset of the current block */ + curr_offset = position.lane * instance->lane_length + + position.slice * 
instance->segment_length + starting_index; + + if (0 == curr_offset % instance->lane_length) { + /* Last block in this lane */ + prev_offset = curr_offset + instance->lane_length - 1; + } + else { + /* Previous block */ + prev_offset = curr_offset - 1; + } + + memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE); + + for (i = starting_index; i < instance->segment_length; + ++i, ++curr_offset, ++prev_offset) { + /*1.1 Rotating prev_offset if needed */ + if (curr_offset % instance->lane_length == 1) { + prev_offset = curr_offset - 1; + } + + /* 1.2 Computing the index of the reference block */ + /* 1.2.1 Taking pseudo-random value from the previous block */ + pseudo_rand = instance->memory[prev_offset].v[0]; + + /* 1.2.2 Computing the lane of the reference block */ + ref_lane = ((pseudo_rand >> 32)) % instance->lanes; + + if ((position.pass == 0) && (position.slice == 0)) { + /* Can not reference other lanes yet */ + ref_lane = position.lane; + } + + /* 1.2.3 Computing the number of possible reference block within the + * lane. + */ + position.index = i; + ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, + ref_lane == position.lane); + + /* 2 Creating a new block */ + ref_block = + instance->memory + instance->lane_length * ref_lane + ref_index; + curr_block = instance->memory + curr_offset; + if (ARGON2_VERSION_10 == instance->version) { + /* version 1.2.1 and earlier: overwrite, not XOR */ + fill_block(state, ref_block, curr_block, 0); + } + else { + if (0 == position.pass) { + fill_block(state, ref_block, curr_block, 0); + } + else { + fill_block(state, ref_block, curr_block, 1); + } + } + } +} + +#endif diff --git a/randomx/argon2_core.c b/randomx/argon2_core.c new file mode 100644 index 0000000..53d00df --- /dev/null +++ b/randomx/argon2_core.c @@ -0,0 +1,396 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + + /*For memory wiping*/ +#ifdef _MSC_VER +#include +#include /* For SecureZeroMemory */ +#endif +#if defined __STDC_LIB_EXT1__ +#define __STDC_WANT_LIB_EXT1__ 1 +#endif +#define VC_GE_2005(version) (version >= 1400) + +#include +#include +#include + +#include "argon2_core.h" +#include "blake2/blake2.h" +#include "blake2/blake2-impl.h" + +#ifdef GENKAT +#include "genkat.h" +#endif + +#if defined(__clang__) +#if __has_attribute(optnone) +#define NOT_OPTIMIZED __attribute__((optnone)) +#endif +#elif defined(__GNUC__) +#define GCC_VERSION \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#if GCC_VERSION >= 40400 +#define NOT_OPTIMIZED __attribute__((optimize("O0"))) +#endif +#endif +#ifndef NOT_OPTIMIZED +#define NOT_OPTIMIZED +#endif + +/***************Instance and Position constructors**********/ + +static void load_block(block *dst, const void *input) { + unsigned i; + for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { + dst->v[i] = load64((const uint8_t *)input + i * sizeof(dst->v[i])); + } +} + +static void store_block(void *output, const block *src) { + unsigned i; + for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { + store64((uint8_t *)output + i * sizeof(src->v[i]), src->v[i]); + } +} + +uint32_t randomx_argon2_index_alpha(const argon2_instance_t *instance,const argon2_position_t *position, uint32_t pseudo_rand,int same_lane) { + /* + * Pass 0: + * This lane : all already finished segments plus already constructed + * blocks in this segment + * Other lanes : all already finished segments + * Pass 1+: + * This lane : (SYNC_POINTS - 1) last segments plus already constructed + * blocks in this segment + * Other lanes : (SYNC_POINTS - 1) last segments + */ + uint32_t reference_area_size; + uint64_t relative_position; + uint32_t start_position, absolute_position; + + if (0 == position->pass) { + /* First pass */ + if (0 == position->slice) { + /* First slice */ + reference_area_size = position->index - 1; /* all but the previous */ + } + else { + if (same_lane) { + /* The same lane => add current segment */ + reference_area_size =position->slice * instance->segment_length +position->index - 1; + } + else { + reference_area_size =position->slice * instance->segment_length +((position->index == 0) ? (-1) : 0); + } + } + } + else { + /* Second pass */ + if (same_lane) { + reference_area_size = instance->lane_length - + instance->segment_length + position->index - + 1; + } + else { + reference_area_size = instance->lane_length - + instance->segment_length + + ((position->index == 0) ? (-1) : 0); + } + } + + /* 1.2.4. Mapping pseudo_rand to 0.. and produce + * relative position */ + relative_position = pseudo_rand; + relative_position = relative_position * relative_position >> 32; + relative_position = reference_area_size - 1 - (reference_area_size * relative_position >> 32); + + /* 1.2.5 Computing starting position */ + start_position = 0; + + if (0 != position->pass) { + start_position = (position->slice == ARGON2_SYNC_POINTS - 1)? 0 : (position->slice + 1) * instance->segment_length; + } + + /* 1.2.6. 
Computing absolute position */ + absolute_position = (start_position + relative_position) % instance->lane_length; /* absolute position */ + return absolute_position; +} + +/* Single-threaded version for p=1 case */ +static int fill_memory_blocks_st(argon2_instance_t *instance) { + uint32_t r, s, l; + + for (r = 0; r < instance->passes; ++r) { //3 + for (s = 0; s < ARGON2_SYNC_POINTS; ++s) { //4 + for (l = 0; l < instance->lanes; ++l) { //1 + argon2_position_t position = { r, l, (uint8_t)s, 0 }; + //fill the segment using the selected implementation + instance->impl(instance, position); + } + } + } + return ARGON2_OK; +} + +int randomx_argon2_fill_memory_blocks(argon2_instance_t *instance) { + if (instance == NULL || instance->lanes == 0) { + return ARGON2_INCORRECT_PARAMETER; + } + return fill_memory_blocks_st(instance); +} + +int randomx_argon2_validate_inputs(const argon2_context *context) { + if (NULL == context) { + return ARGON2_INCORRECT_PARAMETER; + } + + /* Validate password (required param) */ + if (NULL == context->pwd) { + if (0 != context->pwdlen) { + return ARGON2_PWD_PTR_MISMATCH; + } + } + + if (ARGON2_MIN_PWD_LENGTH > context->pwdlen) { + return ARGON2_PWD_TOO_SHORT; + } + + if (ARGON2_MAX_PWD_LENGTH < context->pwdlen) { + return ARGON2_PWD_TOO_LONG; + } + + /* Validate salt (required param) */ + if (NULL == context->salt) { + if (0 != context->saltlen) { + return ARGON2_SALT_PTR_MISMATCH; + } + } + + if (ARGON2_MIN_SALT_LENGTH > context->saltlen) { + return ARGON2_SALT_TOO_SHORT; + } + + if (ARGON2_MAX_SALT_LENGTH < context->saltlen) { + return ARGON2_SALT_TOO_LONG; + } + + /* Validate secret (optional param) */ + if (NULL == context->secret) { + if (0 != context->secretlen) { + return ARGON2_SECRET_PTR_MISMATCH; + } + } + else { + if (ARGON2_MIN_SECRET > context->secretlen) { + return ARGON2_SECRET_TOO_SHORT; + } + if (ARGON2_MAX_SECRET < context->secretlen) { + return ARGON2_SECRET_TOO_LONG; + } + } + + /* Validate associated data (optional param) */ + if (NULL == context->ad) { + if (0 != context->adlen) { + return ARGON2_AD_PTR_MISMATCH; + } + } + else { + if (ARGON2_MIN_AD_LENGTH > context->adlen) { + return ARGON2_AD_TOO_SHORT; + } + if (ARGON2_MAX_AD_LENGTH < context->adlen) { + return ARGON2_AD_TOO_LONG; + } + } + + /* Validate memory cost */ + if (ARGON2_MIN_MEMORY > context->m_cost) { + return ARGON2_MEMORY_TOO_LITTLE; + } + + if (ARGON2_MAX_MEMORY < context->m_cost) { + return ARGON2_MEMORY_TOO_MUCH; + } + + if (context->m_cost < 8 * context->lanes) { + return ARGON2_MEMORY_TOO_LITTLE; + } + + /* Validate time cost */ + if (ARGON2_MIN_TIME > context->t_cost) { + return ARGON2_TIME_TOO_SMALL; + } + + if (ARGON2_MAX_TIME < context->t_cost) { + return ARGON2_TIME_TOO_LARGE; + } + + /* Validate lanes */ + if (ARGON2_MIN_LANES > context->lanes) { + return ARGON2_LANES_TOO_FEW; + } + + if (ARGON2_MAX_LANES < context->lanes) { + return ARGON2_LANES_TOO_MANY; + } + + /* Validate threads */ + if (ARGON2_MIN_THREADS > context->threads) { + return ARGON2_THREADS_TOO_FEW; + } + + if (ARGON2_MAX_THREADS < context->threads) { + return ARGON2_THREADS_TOO_MANY; + } + + if (NULL != context->allocate_cbk && NULL == context->free_cbk) { + return ARGON2_FREE_MEMORY_CBK_NULL; + } + + if (NULL == context->allocate_cbk && NULL != context->free_cbk) { + return ARGON2_ALLOCATE_MEMORY_CBK_NULL; + } + + return ARGON2_OK; +} + +void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) { + uint32_t l; + /* Make the first and second block in each lane as G(H0||0||i) 
or + G(H0||1||i) */ + uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE]; //ARGON2_BLOCK_SIZE=1024 + for (l = 0; l < instance->lanes; ++l) { + + store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 0); + store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4, l); + blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash,ARGON2_PREHASH_SEED_LENGTH); + load_block(&instance->memory[l * instance->lane_length + 0],blockhash_bytes); + + store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 1); + blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash,ARGON2_PREHASH_SEED_LENGTH); + load_block(&instance->memory[l * instance->lane_length + 1],blockhash_bytes); + } +} + +void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type type) { + blake2b_state BlakeHash; + uint8_t value[sizeof(uint32_t)]; + + if (NULL == context || NULL == blockhash) { + return; + } + + blake2b_init(&BlakeHash, ARGON2_PREHASH_DIGEST_LENGTH); + + store32(&value, context->lanes); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->outlen); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->m_cost); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->t_cost); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->version); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, (uint32_t)type); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->pwdlen); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + if (context->pwd != NULL) { + blake2b_update(&BlakeHash, (const uint8_t *)context->pwd, + context->pwdlen); + } + + store32(&value, context->saltlen); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + if (context->salt != NULL) { + blake2b_update(&BlakeHash, (const uint8_t *)context->salt, context->saltlen); + } + + store32(&value, context->secretlen); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + if (context->secret != NULL) { + blake2b_update(&BlakeHash, (const uint8_t *)context->secret, + context->secretlen); + } + + store32(&value, context->adlen); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + if (context->ad != NULL) { + blake2b_update(&BlakeHash, (const uint8_t *)context->ad, + context->adlen); + } + + blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH); //ARGON2_PREHASH_DIGEST_LENGTH=64 +} + +int randomx_argon2_initialize(argon2_instance_t *instance, argon2_context *context) { + uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; //ARGON2_PREHASH_SEED_LENGTH=72 + int result = ARGON2_OK; + + if (instance == NULL || context == NULL) + return ARGON2_INCORRECT_PARAMETER; + instance->context_ptr = context; + + /* 1. Memory allocation */ + //RandomX takes care of memory allocation + + /* 2. Initial hashing */ + /* H_0 + 8 extra bytes to produce the first blocks */ + /* uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; */ + /* Hashing all inputs */ + rxa2_initial_hash(blockhash, context, instance->type); + /* Zeroing 8 extra bytes */ + /*rxa2_clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, + ARGON2_PREHASH_SEED_LENGTH - + ARGON2_PREHASH_DIGEST_LENGTH);*/ + + /* 3. 
Creating first blocks, we always have at least two blocks in a slice + */ + rxa2_fill_first_blocks(blockhash, instance); + + return ARGON2_OK; +} diff --git a/randomx/argon2_core.h b/randomx/argon2_core.h new file mode 100644 index 0000000..def27c6 --- /dev/null +++ b/randomx/argon2_core.h @@ -0,0 +1,163 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#ifndef ARGON2_CORE_H +#define ARGON2_CORE_H + +#include +#include "argon2.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define CONST_CAST(x) (x)(uintptr_t) + + /**********************Argon2 internal constants*******************************/ + +enum argon2_core_constants { + /* Memory block size in bytes */ + ARGON2_BLOCK_SIZE = 1024, + ARGON2_QWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8, + ARGON2_OWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 16, + ARGON2_HWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 32, + ARGON2_512BIT_WORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 64, + + /* Number of pseudo-random values generated by one call to Blake in Argon2i + to + generate reference block positions */ + ARGON2_ADDRESSES_IN_BLOCK = 128, + + /* Pre-hashing digest length and its extension*/ + ARGON2_PREHASH_DIGEST_LENGTH = 64, + ARGON2_PREHASH_SEED_LENGTH = 72 +}; + +/*************************Argon2 internal data types***********************/ + +/* + * Structure for the (1KB) memory block implemented as 128 64-bit words. + * Memory blocks can be copied, XORed. Internal words can be accessed by [] (no + * bounds checking). + */ +typedef struct block_ { uint64_t v[ARGON2_QWORDS_IN_BLOCK]; } block; + +/* + * Argon2 instance: memory pointer, number of passes, amount of memory, type, + * and derived values. 
+ * Used to evaluate the number and location of blocks to construct in each + * thread + */ +typedef struct Argon2_instance_t { + block *memory; /* Memory pointer */ + uint32_t version; + uint32_t passes; /* Number of passes */ + uint32_t memory_blocks; /* Number of blocks in memory */ + uint32_t segment_length; + uint32_t lane_length; + uint32_t lanes; + uint32_t threads; + argon2_type type; + int print_internals; /* whether to print the memory blocks */ + argon2_context *context_ptr; /* points back to original context */ + randomx_argon2_impl *impl; +} argon2_instance_t; + +/* + * Argon2 position: where we construct the block right now. Used to distribute + * work between threads. + */ +typedef struct Argon2_position_t { + uint32_t pass; + uint32_t lane; + uint8_t slice; + uint32_t index; +} argon2_position_t; + +/*Struct that holds the inputs for thread handling FillSegment*/ +typedef struct Argon2_thread_data { + argon2_instance_t *instance_ptr; + argon2_position_t pos; +} argon2_thread_data; + +/*************************Argon2 core functions********************************/ + +/* + * Computes absolute position of reference block in the lane following a skewed + * distribution and using a pseudo-random value as input + * @param instance Pointer to the current instance + * @param position Pointer to the current position + * @param pseudo_rand 32-bit pseudo-random value used to determine the position + * @param same_lane Indicates if the block will be taken from the current lane. + * If so we can reference the current segment + * @pre All pointers must be valid + */ +uint32_t randomx_argon2_index_alpha(const argon2_instance_t *instance, + const argon2_position_t *position, uint32_t pseudo_rand, + int same_lane); + +/* + * Function that validates all inputs against predefined restrictions and return + * an error code + * @param context Pointer to current Argon2 context + * @return ARGON2_OK if everything is all right, otherwise one of error codes + * (all defined in + */ +int randomx_argon2_validate_inputs(const argon2_context *context); + +/* + * Function allocates memory, hashes the inputs with Blake, and creates first + * two blocks. Returns the pointer to the main memory with 2 blocks per lane + * initialized + * @param context Pointer to the Argon2 internal structure containing memory + * pointer, and parameters for time and space requirements. + * @param instance Current Argon2 instance + * @return Zero if successful, -1 if memory failed to allocate. @context->state + * will be modified if successful. + */ +int randomx_argon2_initialize(argon2_instance_t *instance, argon2_context *context); + +/* + * Function that fills the entire memory t_cost times based on the first two + * blocks in each lane + * @param instance Pointer to the current instance + * @return ARGON2_OK if successful, @context->state + */ +int randomx_argon2_fill_memory_blocks(argon2_instance_t* instance); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/randomx/argon2_ref.c b/randomx/argon2_ref.c new file mode 100644 index 0000000..b03fd91 --- /dev/null +++ b/randomx/argon2_ref.c @@ -0,0 +1,181 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/randomx/argon2_ref.c b/randomx/argon2_ref.c new file mode 100644 index 0000000..b03fd91 --- /dev/null +++ b/randomx/argon2_ref.c @@ -0,0 +1,181 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "argon2.h" +#include "argon2_core.h" + +#include "blake2/blamka-round-ref.h" +#include "blake2/blake2-impl.h" +#include "blake2/blake2.h" + +static void copy_block(block* dst, const block* src) { + memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK); +} + +static void xor_block(block* dst, const block* src) { + int i; + for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { + dst->v[i] ^= src->v[i]; + } +} + + /* + * Function fills a new memory block and optionally XORs the old block over the new one. + * @next_block must be initialized. + * @param prev_block Pointer to the previous block + * @param ref_block Pointer to the reference block + * @param next_block Pointer to the block to be constructed + * @param with_xor Whether to XOR into the new block (1) or just overwrite (0) + * @pre all block pointers must be valid + */ +static void fill_block(const block *prev_block, const block *ref_block, block *next_block, int with_xor) { + block blockR, block_tmp; + unsigned i; + + copy_block(&blockR, ref_block); + xor_block(&blockR, prev_block); + copy_block(&block_tmp, &blockR); + /* Now blockR = ref_block + prev_block and block_tmp = ref_block + prev_block */ + if (with_xor) { + /* Saving the next block contents for XOR over: */ + xor_block(&block_tmp, next_block); + /* Now blockR = ref_block + prev_block and + block_tmp = ref_block + prev_block + next_block */ + } + + /* Apply Blake2 on columns of 64-bit words: (0,1,...,15) , then + (16,17,..31)... finally (112,113,...127) */ + for (i = 0; i < 8; ++i) { + BLAKE2_ROUND_NOMSG( + blockR.v[16 * i], blockR.v[16 * i + 1], blockR.v[16 * i + 2], + blockR.v[16 * i + 3], blockR.v[16 * i + 4], blockR.v[16 * i + 5], + blockR.v[16 * i + 6], blockR.v[16 * i + 7], blockR.v[16 * i + 8], + blockR.v[16 * i + 9], blockR.v[16 * i + 10], blockR.v[16 * i + 11], + blockR.v[16 * i + 12], blockR.v[16 * i + 13], blockR.v[16 * i + 14], + blockR.v[16 * i + 15]); + } + + /* Apply Blake2 on rows of 64-bit words: (0,1,16,17,...112,113), then + (2,3,18,19,...,114,115).. finally (14,15,30,31,...,126,127) */ + for (i = 0; i < 8; i++) { + BLAKE2_ROUND_NOMSG( + blockR.v[2 * i], blockR.v[2 * i + 1], blockR.v[2 * i + 16], + blockR.v[2 * i + 17], blockR.v[2 * i + 32], blockR.v[2 * i + 33], + blockR.v[2 * i + 48], blockR.v[2 * i + 49], blockR.v[2 * i + 64], + blockR.v[2 * i + 65], blockR.v[2 * i + 80], blockR.v[2 * i + 81], + blockR.v[2 * i + 96], blockR.v[2 * i + 97], blockR.v[2 * i + 112], + blockR.v[2 * i + 113]); + } + + copy_block(next_block, &block_tmp); + xor_block(next_block, &blockR); +} + +void randomx_argon2_fill_segment_ref(const argon2_instance_t *instance, argon2_position_t position) { + printf("randomx_argon2_fill_segment_ref\n"); + block *ref_block = NULL, *curr_block = NULL; + block address_block, input_block, zero_block; + uint64_t pseudo_rand, ref_index, ref_lane; + uint32_t prev_offset, curr_offset; + uint32_t starting_index; + uint32_t i; + + if (instance == NULL) { + return; + } + + starting_index = 0; + + if ((0 == position.pass) && (0 == position.slice)) { + starting_index = 2; /* we have already generated the first two blocks */ + } + + /* Offset of the current block */ + curr_offset = position.lane * instance->lane_length + position.slice * instance->segment_length + starting_index; + + if (0 == curr_offset % instance->lane_length) { + /* Last block in this lane */ + prev_offset = curr_offset + instance->lane_length - 1; + } + else { + /* Previous block */ + prev_offset = curr_offset - 1; + } + + for (i = starting_index; i < instance->segment_length; ++i, ++curr_offset, ++prev_offset) { + /* 1.1 Rotating prev_offset if needed */ + if (curr_offset % instance->lane_length == 1) { + prev_offset = curr_offset - 1; + } + + /* 1.2 Computing the index of the reference block */ + /* 1.2.1 Taking pseudo-random value from the previous block */ + pseudo_rand = instance->memory[prev_offset].v[0]; + + /* 1.2.2 Computing the lane of the reference block */ + ref_lane = ((pseudo_rand >> 32)) % instance->lanes; + + if ((position.pass == 0) && (position.slice == 0)) { + /* Cannot reference other lanes yet */ + ref_lane = position.lane; + } + + /* 1.2.3 Computing the number of possible reference blocks within the + * lane. + */ + position.index = i; + ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, ref_lane == position.lane); + + /* 2 Creating a new block */ + ref_block = instance->memory + instance->lane_length * ref_lane + ref_index; + curr_block = instance->memory + curr_offset; + if (i == starting_index && 0 == position.pass && 0 == position.slice) printf("ref_index = %llu, curr_offset=%u, prev_offset=%u, ref_lane=%llu\n", (unsigned long long)ref_index, curr_offset, prev_offset, (unsigned long long)ref_lane); + if (ARGON2_VERSION_10 == instance->version) { + /* version 1.2.1 and earlier: overwrite, not XOR */ + fill_block(instance->memory + prev_offset, ref_block, curr_block, 0); + } + else { + if (0 == position.pass) { + fill_block(instance->memory + prev_offset, ref_block, curr_block, 0); + } + else { + fill_block(instance->memory + prev_offset, ref_block, curr_block, 1); + } + } + } +}
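+ +/* + * Summary of the addressing rule implemented above: the 64-bit pseudo_rand + * word read from the previous block is split in half; the high 32 bits select + * the reference lane and the low 32 bits are passed to + * randomx_argon2_index_alpha() to pick a block inside that lane. During the + * first slice of the first pass only the current lane may be referenced, and + * for versions newer than 1.0 blocks are overwritten on pass 0 and XORed on + * later passes. + */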
diff --git a/randomx/argon2_ssse3.c b/randomx/argon2_ssse3.c new file mode 100644 index 0000000..930fafb --- /dev/null +++ b/randomx/argon2_ssse3.c @@ -0,0 +1,183 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> + +#include "argon2.h" + +#if defined(_MSC_VER) //MSVC doesn't define SSSE3 +#define __SSSE3__ +#endif + +void randomx_argon2_fill_segment_ssse3(const argon2_instance_t* instance, + argon2_position_t position); + +randomx_argon2_impl* randomx_argon2_impl_ssse3() { +#if defined(__SSSE3__) + return &randomx_argon2_fill_segment_ssse3; +#endif + return NULL; +} + +#if defined(__SSSE3__) + +#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */ + +#include "argon2_core.h" + +#include "blake2/blamka-round-ssse3.h" +#include "blake2/blake2-impl.h" +#include "blake2/blake2.h" + +static void fill_block(__m128i* state, const block* ref_block, + block* next_block, int with_xor) { + __m128i block_XY[ARGON2_OWORDS_IN_BLOCK]; + unsigned int i; + + if (with_xor) { + for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { + state[i] = _mm_xor_si128( + state[i], _mm_loadu_si128((const __m128i*)ref_block->v + i)); + block_XY[i] = _mm_xor_si128( + state[i], _mm_loadu_si128((const __m128i*)next_block->v + i)); + } + } + else { + for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { + block_XY[i] = state[i] = _mm_xor_si128( + state[i], _mm_loadu_si128((const __m128i*)ref_block->v + i)); + } + } + + for (i = 0; i < 8; ++i) { + BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], + state[8 * i + 3], state[8 * i + 4], state[8 * i + 5], + state[8 * i + 6], state[8 * i + 7]); + } + + for (i = 0; i < 8; ++i) { + BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], + state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i], + state[8 * 6 + i], state[8 * 7 + i]); + } + + for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { + state[i] = _mm_xor_si128(state[i], block_XY[i]); + _mm_storeu_si128((__m128i*)next_block->v + i, state[i]); + } +} + +void randomx_argon2_fill_segment_ssse3(const argon2_instance_t* instance, +
argon2_position_t position) { + // printf("randomx_argon2_fill_segment_ssse3\n"); + block* ref_block = NULL, * curr_block = NULL; + block address_block, input_block; + uint64_t pseudo_rand, ref_index, ref_lane; + uint32_t prev_offset, curr_offset; + uint32_t starting_index, i; + __m128i state[ARGON2_OWORDS_IN_BLOCK]; + + if (instance == NULL) { + return; + } + + starting_index = 0; + + if ((0 == position.pass) && (0 == position.slice)) { + starting_index = 2; /* we have already generated the first two blocks */ + } + + /* Offset of the current block */ + curr_offset = position.lane * instance->lane_length + + position.slice * instance->segment_length + starting_index; + + if (0 == curr_offset % instance->lane_length) { + /* Last block in this lane */ + prev_offset = curr_offset + instance->lane_length - 1; + } + else { + /* Previous block */ + prev_offset = curr_offset - 1; + } + + memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE); + + for (i = starting_index; i < instance->segment_length; + ++i, ++curr_offset, ++prev_offset) { + /*1.1 Rotating prev_offset if needed */ + if (curr_offset % instance->lane_length == 1) { + prev_offset = curr_offset - 1; + } + + /* 1.2 Computing the index of the reference block */ + /* 1.2.1 Taking pseudo-random value from the previous block */ + pseudo_rand = instance->memory[prev_offset].v[0]; + + /* 1.2.2 Computing the lane of the reference block */ + ref_lane = ((pseudo_rand >> 32)) % instance->lanes; + + if ((position.pass == 0) && (position.slice == 0)) { + /* Can not reference other lanes yet */ + ref_lane = position.lane; + } + + /* 1.2.3 Computing the number of possible reference block within the + * lane. + */ + position.index = i; + ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, + ref_lane == position.lane); + + /* 2 Creating a new block */ + ref_block = + instance->memory + instance->lane_length * ref_lane + ref_index; + curr_block = instance->memory + curr_offset; + if (ARGON2_VERSION_10 == instance->version) { + /* version 1.2.1 and earlier: overwrite, not XOR */ + fill_block(state, ref_block, curr_block, 0); + } + else { + if (0 == position.pass) { + fill_block(state, ref_block, curr_block, 0); + } + else { + fill_block(state, ref_block, curr_block, 1); + } + } + } +} + +#endif diff --git a/randomx/asm/configuration.asm b/randomx/asm/configuration.asm new file mode 100644 index 0000000..794d7ad --- /dev/null +++ b/randomx/asm/configuration.asm @@ -0,0 +1,48 @@ +; File start: ..\src\configuration.h +RANDOMX_ARGON_MEMORY EQU 262144t +RANDOMX_ARGON_ITERATIONS EQU 3t +RANDOMX_ARGON_LANES EQU 1t +RANDOMX_ARGON_SALT TEXTEQU <"RandomX\x03"> +RANDOMX_CACHE_ACCESSES EQU 8t +RANDOMX_SUPERSCALAR_LATENCY EQU 170t +RANDOMX_DATASET_BASE_SIZE EQU 2147483648t +RANDOMX_DATASET_EXTRA_SIZE EQU 33554368t +RANDOMX_PROGRAM_SIZE EQU 256t +RANDOMX_PROGRAM_ITERATIONS EQU 2048t +RANDOMX_PROGRAM_COUNT EQU 8t +RANDOMX_SCRATCHPAD_L3 EQU 2097152t +RANDOMX_SCRATCHPAD_L2 EQU 262144t +RANDOMX_SCRATCHPAD_L1 EQU 16384t +RANDOMX_JUMP_BITS EQU 8t +RANDOMX_JUMP_OFFSET EQU 8t +RANDOMX_FREQ_IADD_RS EQU 16t +RANDOMX_FREQ_IADD_M EQU 7t +RANDOMX_FREQ_ISUB_R EQU 16t +RANDOMX_FREQ_ISUB_M EQU 7t +RANDOMX_FREQ_IMUL_R EQU 16t +RANDOMX_FREQ_IMUL_M EQU 4t +RANDOMX_FREQ_IMULH_R EQU 4t +RANDOMX_FREQ_IMULH_M EQU 1t +RANDOMX_FREQ_ISMULH_R EQU 4t +RANDOMX_FREQ_ISMULH_M EQU 1t +RANDOMX_FREQ_IMUL_RCP EQU 8t +RANDOMX_FREQ_INEG_R EQU 2t +RANDOMX_FREQ_IXOR_R EQU 15t +RANDOMX_FREQ_IXOR_M EQU 5t +RANDOMX_FREQ_IROR_R EQU 8t +RANDOMX_FREQ_IROL_R 
EQU 2t +RANDOMX_FREQ_ISWAP_R EQU 4t +RANDOMX_FREQ_FSWAP_R EQU 4t +RANDOMX_FREQ_FADD_R EQU 16t +RANDOMX_FREQ_FADD_M EQU 5t +RANDOMX_FREQ_FSUB_R EQU 16t +RANDOMX_FREQ_FSUB_M EQU 5t +RANDOMX_FREQ_FSCAL_R EQU 6t +RANDOMX_FREQ_FMUL_R EQU 32t +RANDOMX_FREQ_FDIV_M EQU 4t +RANDOMX_FREQ_FSQRT_R EQU 6t +RANDOMX_FREQ_CBRANCH EQU 25t +RANDOMX_FREQ_CFROUND EQU 1t +RANDOMX_FREQ_ISTORE EQU 16t +RANDOMX_FREQ_NOP EQU 0t +; File end: ..\src\configuration.h diff --git a/randomx/asm/program_epilogue_linux.inc b/randomx/asm/program_epilogue_linux.inc new file mode 100644 index 0000000..eaacae5 --- /dev/null +++ b/randomx/asm/program_epilogue_linux.inc @@ -0,0 +1,10 @@ + ;# restore callee-saved registers - System V AMD64 ABI + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + + ;# program finished + ret 0 \ No newline at end of file diff --git a/randomx/asm/program_epilogue_store.inc b/randomx/asm/program_epilogue_store.inc new file mode 100644 index 0000000..b94fa4d --- /dev/null +++ b/randomx/asm/program_epilogue_store.inc @@ -0,0 +1,19 @@ + ;# save VM register values + pop rcx + mov qword ptr [rcx+0], r8 + mov qword ptr [rcx+8], r9 + mov qword ptr [rcx+16], r10 + mov qword ptr [rcx+24], r11 + mov qword ptr [rcx+32], r12 + mov qword ptr [rcx+40], r13 + mov qword ptr [rcx+48], r14 + mov qword ptr [rcx+56], r15 + movdqa xmmword ptr [rcx+64], xmm0 + movdqa xmmword ptr [rcx+80], xmm1 + movdqa xmmword ptr [rcx+96], xmm2 + movdqa xmmword ptr [rcx+112], xmm3 + lea rcx, [rcx+64] + movdqa xmmword ptr [rcx+64], xmm4 + movdqa xmmword ptr [rcx+80], xmm5 + movdqa xmmword ptr [rcx+96], xmm6 + movdqa xmmword ptr [rcx+112], xmm7 \ No newline at end of file diff --git a/randomx/asm/program_epilogue_win64.inc b/randomx/asm/program_epilogue_win64.inc new file mode 100644 index 0000000..8d70a0a --- /dev/null +++ b/randomx/asm/program_epilogue_win64.inc @@ -0,0 +1,24 @@ + ;# restore callee-saved registers - Microsoft x64 calling convention + movdqu xmm15, xmmword ptr [rsp] + movdqu xmm14, xmmword ptr [rsp+16] + movdqu xmm13, xmmword ptr [rsp+32] + movdqu xmm12, xmmword ptr [rsp+48] + movdqu xmm11, xmmword ptr [rsp+64] + add rsp, 80 + movdqu xmm10, xmmword ptr [rsp] + movdqu xmm9, xmmword ptr [rsp+16] + movdqu xmm8, xmmword ptr [rsp+32] + movdqu xmm7, xmmword ptr [rsp+48] + movdqu xmm6, xmmword ptr [rsp+64] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rsi + pop rdi + pop rbp + pop rbx + + ;# program finished + ret diff --git a/randomx/asm/program_loop_load.inc b/randomx/asm/program_loop_load.inc new file mode 100644 index 0000000..c293323 --- /dev/null +++ b/randomx/asm/program_loop_load.inc @@ -0,0 +1,28 @@ + lea rcx, [rsi+rax] + push rcx + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] + lea rcx, [rsi+rdx] + push rcx + cvtdq2pd xmm0, qword ptr [rcx+0] + cvtdq2pd xmm1, qword ptr [rcx+8] + cvtdq2pd xmm2, qword ptr [rcx+16] + cvtdq2pd xmm3, qword ptr [rcx+24] + cvtdq2pd xmm4, qword ptr [rcx+32] + cvtdq2pd xmm5, qword ptr [rcx+40] + cvtdq2pd xmm6, qword ptr [rcx+48] + cvtdq2pd xmm7, qword ptr [rcx+56] + andps xmm4, xmm13 + andps xmm5, xmm13 + andps xmm6, xmm13 + andps xmm7, xmm13 + orps xmm4, xmm14 + orps xmm5, xmm14 + orps xmm6, xmm14 + orps xmm7, xmm14 diff --git a/randomx/asm/program_loop_store.inc b/randomx/asm/program_loop_store.inc new file mode 100644 index 0000000..1ba1635 --- /dev/null +++ 
b/randomx/asm/program_loop_store.inc @@ -0,0 +1,18 @@ + pop rcx + mov qword ptr [rcx+0], r8 + mov qword ptr [rcx+8], r9 + mov qword ptr [rcx+16], r10 + mov qword ptr [rcx+24], r11 + mov qword ptr [rcx+32], r12 + mov qword ptr [rcx+40], r13 + mov qword ptr [rcx+48], r14 + mov qword ptr [rcx+56], r15 + pop rcx + xorpd xmm0, xmm4 + xorpd xmm1, xmm5 + xorpd xmm2, xmm6 + xorpd xmm3, xmm7 + movapd xmmword ptr [rcx+0], xmm0 + movapd xmmword ptr [rcx+16], xmm1 + movapd xmmword ptr [rcx+32], xmm2 + movapd xmmword ptr [rcx+48], xmm3 diff --git a/randomx/asm/program_prologue_linux.inc b/randomx/asm/program_prologue_linux.inc new file mode 100644 index 0000000..ffde152 --- /dev/null +++ b/randomx/asm/program_prologue_linux.inc @@ -0,0 +1,34 @@ + ;# callee-saved registers - System V AMD64 ABI + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + + ;# function arguments + mov rbx, rcx ;# loop counter + push rdi ;# RegisterFile& registerFile + mov rcx, rdi + mov rbp, qword ptr [rsi] ;# "mx", "ma" + mov rdi, qword ptr [rsi+8] ;# uint8_t* dataset + mov rsi, rdx ;# uint8_t* scratchpad + + mov rax, rbp + + ;# zero integer registers + xor r8, r8 + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + + ;# load constant registers + lea rcx, [rcx+120] + movapd xmm8, xmmword ptr [rcx+72] + movapd xmm9, xmmword ptr [rcx+88] + movapd xmm10, xmmword ptr [rcx+104] + movapd xmm11, xmmword ptr [rcx+120] diff --git a/randomx/asm/program_prologue_win64.inc b/randomx/asm/program_prologue_win64.inc new file mode 100644 index 0000000..590a98d --- /dev/null +++ b/randomx/asm/program_prologue_win64.inc @@ -0,0 +1,47 @@ + ;# callee-saved registers - Microsoft x64 calling convention + push rbx + push rbp + push rdi + push rsi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + movdqu xmmword ptr [rsp+64], xmm6 + movdqu xmmword ptr [rsp+48], xmm7 + movdqu xmmword ptr [rsp+32], xmm8 + movdqu xmmword ptr [rsp+16], xmm9 + movdqu xmmword ptr [rsp+0], xmm10 + sub rsp, 80 + movdqu xmmword ptr [rsp+64], xmm11 + movdqu xmmword ptr [rsp+48], xmm12 + movdqu xmmword ptr [rsp+32], xmm13 + movdqu xmmword ptr [rsp+16], xmm14 + movdqu xmmword ptr [rsp+0], xmm15 + + ;# function arguments + push rcx ;# RegisterFile& registerFile + mov rbp, qword ptr [rdx] ;# "mx", "ma" + mov rdi, qword ptr [rdx+8] ;# uint8_t* dataset + mov rsi, r8 ;# uint8_t* scratchpad + mov rbx, r9 ;# loop counter + + mov rax, rbp + + ;# zero integer registers + xor r8, r8 + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + + ;# load constant registers + lea rcx, [rcx+120] + movapd xmm8, xmmword ptr [rcx+72] + movapd xmm9, xmmword ptr [rcx+88] + movapd xmm10, xmmword ptr [rcx+104] + movapd xmm11, xmmword ptr [rcx+120] diff --git a/randomx/asm/program_read_dataset.inc b/randomx/asm/program_read_dataset.inc new file mode 100644 index 0000000..b81d0c3 --- /dev/null +++ b/randomx/asm/program_read_dataset.inc @@ -0,0 +1,17 @@ + xor rbp, rax ;# modify "mx" + mov edx, ebp ;# edx = mx + and edx, RANDOMX_DATASET_BASE_MASK + prefetchnta byte ptr [rdi+rdx] + ror rbp, 32 ;# swap "ma" and "mx" + mov edx, ebp ;# edx = ma + and edx, RANDOMX_DATASET_BASE_MASK + lea rcx, [rdi+rdx] ;# dataset cache line + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] + \ No newline at end of file diff 
--git a/randomx/asm/program_read_dataset_sshash_fin.inc b/randomx/asm/program_read_dataset_sshash_fin.inc new file mode 100644 index 0000000..f5a067d --- /dev/null +++ b/randomx/asm/program_read_dataset_sshash_fin.inc @@ -0,0 +1,10 @@ + mov rbx, qword ptr [rsp+64] + xor r8, qword ptr [rsp+56] + xor r9, qword ptr [rsp+48] + xor r10, qword ptr [rsp+40] + xor r11, qword ptr [rsp+32] + xor r12, qword ptr [rsp+24] + xor r13, qword ptr [rsp+16] + xor r14, qword ptr [rsp+8] + xor r15, qword ptr [rsp+0] + add rsp, 72 \ No newline at end of file diff --git a/randomx/asm/program_read_dataset_sshash_init.inc b/randomx/asm/program_read_dataset_sshash_init.inc new file mode 100644 index 0000000..6fe9525 --- /dev/null +++ b/randomx/asm/program_read_dataset_sshash_init.inc @@ -0,0 +1,17 @@ + sub rsp, 72 + mov qword ptr [rsp+64], rbx + mov qword ptr [rsp+56], r8 + mov qword ptr [rsp+48], r9 + mov qword ptr [rsp+40], r10 + mov qword ptr [rsp+32], r11 + mov qword ptr [rsp+24], r12 + mov qword ptr [rsp+16], r13 + mov qword ptr [rsp+8], r14 + mov qword ptr [rsp+0], r15 + xor rbp, rax ;# modify "mx" + ror rbp, 32 ;# swap "ma" and "mx" + mov ebx, ebp ;# ebx = ma + and ebx, RANDOMX_DATASET_BASE_MASK + shr ebx, 6 ;# ebx = Dataset block number + ;# add ebx, datasetOffset / 64 + ;# call 32768 \ No newline at end of file diff --git a/randomx/asm/program_sshash_constants.inc b/randomx/asm/program_sshash_constants.inc new file mode 100644 index 0000000..53dc175 --- /dev/null +++ b/randomx/asm/program_sshash_constants.inc @@ -0,0 +1,24 @@ +r0_mul: + ;#/ 6364136223846793005 + db 45, 127, 149, 76, 45, 244, 81, 88 +r1_add: + ;#/ 9298411001130361340 + db 252, 161, 245, 89, 138, 151, 10, 129 +r2_add: + ;#/ 12065312585734608966 + db 70, 216, 194, 56, 223, 153, 112, 167 +r3_add: + ;#/ 9306329213124626780 + db 92, 73, 34, 191, 28, 185, 38, 129 +r4_add: + ;#/ 5281919268842080866 + db 98, 138, 159, 23, 151, 37, 77, 73 +r5_add: + ;#/ 10536153434571861004 + db 12, 236, 170, 206, 185, 239, 55, 146 +r6_add: + ;#/ 3398623926847679864 + db 120, 45, 230, 108, 116, 86, 42, 47 +r7_add: + ;#/ 9549104520008361294 + db 78, 229, 44, 182, 247, 59, 133, 132 \ No newline at end of file diff --git a/randomx/asm/program_sshash_load.inc b/randomx/asm/program_sshash_load.inc new file mode 100644 index 0000000..5351356 --- /dev/null +++ b/randomx/asm/program_sshash_load.inc @@ -0,0 +1,8 @@ + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] \ No newline at end of file diff --git a/randomx/asm/program_sshash_prefetch.inc b/randomx/asm/program_sshash_prefetch.inc new file mode 100644 index 0000000..26efb51 --- /dev/null +++ b/randomx/asm/program_sshash_prefetch.inc @@ -0,0 +1,4 @@ + and rbx, RANDOMX_CACHE_MASK + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] \ No newline at end of file diff --git a/randomx/asm/program_xmm_constants.inc b/randomx/asm/program_xmm_constants.inc new file mode 100644 index 0000000..296237a --- /dev/null +++ b/randomx/asm/program_xmm_constants.inc @@ -0,0 +1,6 @@ +mantissaMask: + db 255, 255, 255, 255, 255, 255, 255, 0, 255, 255, 255, 255, 255, 255, 255, 0 +exp240: + db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +scaleMask: + db 0, 0, 0, 0, 0, 0, 240, 128, 0, 0, 0, 0, 0, 0, 240, 128 \ No newline at end of file diff --git a/randomx/asm/randomx_reciprocal.inc b/randomx/asm/randomx_reciprocal.inc new file mode 100644 index
0000000..e1f22fd --- /dev/null +++ b/randomx/asm/randomx_reciprocal.inc @@ -0,0 +1,7 @@ + mov edx, 1 + mov r8, rcx + xor eax, eax + bsr rcx, rcx + shl rdx, cl + div r8 + ret \ No newline at end of file diff --git a/randomx/assembly_generator_x86.cpp b/randomx/assembly_generator_x86.cpp new file mode 100644 index 0000000..550b222 --- /dev/null +++ b/randomx/assembly_generator_x86.cpp @@ -0,0 +1,612 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include <cstdint> +#include "assembly_generator_x86.hpp" +#include "common.hpp" +#include "reciprocal.h" +#include "program.hpp" +#include "superscalar.hpp" + +namespace randomx { + + static const char* regR[] = { "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" }; + static const char* regR32[] = { "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d" }; + static const char* regFE[] = { "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" }; + static const char* regF[] = { "xmm0", "xmm1", "xmm2", "xmm3" }; + static const char* regE[] = { "xmm4", "xmm5", "xmm6", "xmm7" }; + static const char* regA[] = { "xmm8", "xmm9", "xmm10", "xmm11" }; + + static const char* tempRegx = "xmm12"; + static const char* mantissaMaskReg = "xmm13"; + static const char* exponentMaskReg = "xmm14"; + static const char* scaleMaskReg = "xmm15"; + static const char* regIc = "rbx"; + static const char* regIc32 = "ebx"; + static const char* regIc8 = "bl"; + static const char* regScratchpadAddr = "rsi"; + + void AssemblyGeneratorX86::generateProgram(Program& prog) { + //printf("---\n"); + for (unsigned i = 0; i < RegistersCount; ++i) { + registerUsage[i] = -1; + } + asmCode.str(std::string()); //clear + for (unsigned i = 0; i < prog.getSize(); ++i) { + asmCode << "randomx_isn_" << i << ":" << std::endl; + Instruction& instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + generateCode(instr, i); + } + }
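+ + //Illustrative driver for this generator (sketch only; constructing and + //seeding the randomx::Program 'prog' is elided): + // AssemblyGeneratorX86 gen; + // gen.generateProgram(prog); + // gen.printCode(std::cout); //printCode is declared in assembly_generator_x86.hpp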
+ + void AssemblyGeneratorX86::generateAsm(SuperscalarProgram& prog) { + asmCode.str(std::string()); //clear +#ifdef RANDOMX_ALIGN + asmCode << "ALIGN 16" << std::endl; +#endif + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction& instr = prog(i); + switch ((SuperscalarInstructionType)instr.opcode) + { + case SuperscalarInstructionType::ISUB_R: + asmCode << "sub " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + break; + case SuperscalarInstructionType::IXOR_R: + asmCode << "xor " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + break; + case SuperscalarInstructionType::IADD_RS: + asmCode << "lea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.getModShift())) << "]" << std::endl; + break; + case SuperscalarInstructionType::IMUL_R: + asmCode << "imul " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + break; + case SuperscalarInstructionType::IROR_C: + asmCode << "ror " << regR[instr.dst] << ", " << instr.getImm32() << std::endl; + break; + case SuperscalarInstructionType::IADD_C7: + asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + break; + case SuperscalarInstructionType::IXOR_C7: + asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + break; + case SuperscalarInstructionType::IADD_C8: + asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; +#ifdef RANDOMX_ALIGN + asmCode << "nop" << std::endl; +#endif + break; + case SuperscalarInstructionType::IXOR_C8: + asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; +#ifdef RANDOMX_ALIGN + asmCode << "nop" << std::endl; +#endif + break; + case SuperscalarInstructionType::IADD_C9: + asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; +#ifdef RANDOMX_ALIGN + asmCode << "xchg ax, ax ;nop" << std::endl; +#endif + break; + case SuperscalarInstructionType::IXOR_C9: + asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; +#ifdef RANDOMX_ALIGN + asmCode << "xchg ax, ax ;nop" << std::endl; +#endif + break; + case SuperscalarInstructionType::IMULH_R: + asmCode << "mov rax, " << regR[instr.dst] << std::endl; + asmCode << "mul " << regR[instr.src] << std::endl; + asmCode << "mov " << regR[instr.dst] << ", rdx" << std::endl; + break; + case SuperscalarInstructionType::ISMULH_R: + asmCode << "mov rax, " << regR[instr.dst] << std::endl; + asmCode << "imul " << regR[instr.src] << std::endl; + asmCode << "mov " << regR[instr.dst] << ", rdx" << std::endl; + break; + case SuperscalarInstructionType::IMUL_RCP: + asmCode << "mov rax, " << (int64_t)randomx_reciprocal(instr.getImm32()) << std::endl; + asmCode << "imul " << regR[instr.dst] << ", rax" << std::endl; + break; + default: + UNREACHABLE; + } + } + } + + void AssemblyGeneratorX86::generateC(SuperscalarProgram& prog) { + asmCode.str(std::string()); //clear + asmCode << "#include <stdint.h>" << std::endl; + asmCode << "#if defined(__SIZEOF_INT128__)" << std::endl; + asmCode << " static inline uint64_t mulh(uint64_t a, uint64_t b) {" << std::endl; + asmCode << " return ((unsigned __int128)a * b) >> 64;" << std::endl; + asmCode << " }" << std::endl; + asmCode << " static inline int64_t smulh(int64_t a, int64_t b) {" << std::endl; + asmCode << " return ((__int128)a * b) >> 64;" << std::endl; + asmCode << " }" << std::endl; + asmCode << " #define HAVE_MULH" << std::endl; + asmCode << " #define HAVE_SMULH" << std::endl; + asmCode << "#endif" << std::endl; + asmCode << "#if defined(_MSC_VER)" << std::endl; + asmCode << " #define HAS_VALUE(X) X ## 0" << std::endl; + asmCode << " #define EVAL_DEFINE(X) HAS_VALUE(X)" << std::endl; + asmCode << " #include <intrin.h>" << std::endl; + asmCode << " #include <stdlib.h>" << std::endl; + asmCode << " static __inline uint64_t rotr(uint64_t x , int c) {" << std::endl; + asmCode << " return _rotr64(x, c);" << std::endl; + asmCode << " }" << std::endl; + asmCode << " #define HAVE_ROTR" << std::endl; + asmCode << " #if EVAL_DEFINE(__MACHINEARM64_X64(1))" << std::endl; + asmCode << " static __inline uint64_t mulh(uint64_t a, uint64_t b) {" << std::endl; + asmCode << " return __umulh(a, b);" << std::endl; + asmCode << " }" << std::endl; + asmCode << " #define HAVE_MULH" << std::endl; + asmCode << " #endif" << std::endl; + asmCode << " #if EVAL_DEFINE(__MACHINEX64(1))" << std::endl; + asmCode << " static __inline int64_t smulh(int64_t a, int64_t b) {" << std::endl; + asmCode << " int64_t hi;" << std::endl; + asmCode << " _mul128(a, b, &hi);" << std::endl; + asmCode << " return hi;" << std::endl; + asmCode << " }" << std::endl; + asmCode << " #define HAVE_SMULH" << std::endl; + asmCode << " #endif" << std::endl; + asmCode << "#endif" << std::endl; + asmCode << "#ifndef HAVE_ROTR" << std::endl; + asmCode << " static inline uint64_t rotr(uint64_t a, int b) {" << std::endl; + asmCode << " return (a >> b) | (a << (64 - b));" << std::endl; + asmCode << " }" << std::endl; + asmCode << " #define HAVE_ROTR" << std::endl; + asmCode << "#endif" << std::endl; + asmCode << "#if !defined(HAVE_MULH) || !defined(HAVE_SMULH) || !defined(HAVE_ROTR)" << std::endl; + asmCode << " #error \"Required functions are not defined\"" << std::endl; + asmCode << "#endif" << std::endl; + asmCode << "void superScalar(uint64_t r[8]) {" << std::endl; + asmCode << "uint64_t r8 = r[0], r9 = r[1], r10 = r[2], r11 = r[3], r12 = r[4], r13 = r[5], r14 = r[6], r15 = r[7];" << std::endl; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction& instr = prog(i); + switch
((SuperscalarInstructionType)instr.opcode) + { + case SuperscalarInstructionType::ISUB_R: + asmCode << regR[instr.dst] << " -= " << regR[instr.src] << ";" << std::endl; + break; + case SuperscalarInstructionType::IXOR_R: + asmCode << regR[instr.dst] << " ^= " << regR[instr.src] << ";" << std::endl; + break; + case SuperscalarInstructionType::IADD_RS: + asmCode << regR[instr.dst] << " += " << regR[instr.src] << "*" << (1 << (instr.getModShift())) << ";" << std::endl; + break; + case SuperscalarInstructionType::IMUL_R: + asmCode << regR[instr.dst] << " *= " << regR[instr.src] << ";" << std::endl; + break; + case SuperscalarInstructionType::IROR_C: + asmCode << regR[instr.dst] << " = rotr(" << regR[instr.dst] << ", " << instr.getImm32() << ");" << std::endl; + break; + case SuperscalarInstructionType::IADD_C7: + case SuperscalarInstructionType::IADD_C8: + case SuperscalarInstructionType::IADD_C9: + asmCode << regR[instr.dst] << " += " << (int32_t)instr.getImm32() << ";" << std::endl; + break; + case SuperscalarInstructionType::IXOR_C7: + case SuperscalarInstructionType::IXOR_C8: + case SuperscalarInstructionType::IXOR_C9: + asmCode << regR[instr.dst] << " ^= " << (int32_t)instr.getImm32() << ";" << std::endl; + break; + case SuperscalarInstructionType::IMULH_R: + asmCode << regR[instr.dst] << " = mulh(" << regR[instr.dst] << ", " << regR[instr.src] << ");" << std::endl; + break; + case SuperscalarInstructionType::ISMULH_R: + asmCode << regR[instr.dst] << " = smulh(" << regR[instr.dst] << ", " << regR[instr.src] << ");" << std::endl; + break; + case SuperscalarInstructionType::IMUL_RCP: + asmCode << regR[instr.dst] << " *= " << (int64_t)randomx_reciprocal(instr.getImm32()) << ";" << std::endl; + break; + default: + UNREACHABLE; + } + } + asmCode << "r[0] = r8; r[1] = r9; r[2] = r10; r[3] = r11; r[4] = r12; r[5] = r13; r[6] = r14; r[7] = r15;" << std::endl; + asmCode << "}" << std::endl; + } + + void AssemblyGeneratorX86::traceint(Instruction& instr) { + if (trace) { + asmCode << "\tpush " << regR[instr.dst] << std::endl; + } + } + + void AssemblyGeneratorX86::traceflt(Instruction& instr) { + if (trace) { + asmCode << "\tpush 0" << std::endl; + } + } + + void AssemblyGeneratorX86::tracenop(Instruction& instr) { + if (trace) { + asmCode << "\tpush 0" << std::endl; + } + } + + void AssemblyGeneratorX86::generateCode(Instruction& instr, int i) { + asmCode << "\t; " << instr; + auto generator = engine[instr.opcode]; + (this->*generator)(instr, i); + } + + void AssemblyGeneratorX86::genAddressReg(Instruction& instr, const char* reg = "eax") { + asmCode << "\tlea " << reg << ", [" << regR32[instr.src] << std::showpos << (int32_t)instr.getImm32() << std::noshowpos << "]" << std::endl; + asmCode << "\tand " << reg << ", " << ((instr.getModMem()) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl; + } + + void AssemblyGeneratorX86::genAddressRegDst(Instruction& instr, int maskAlign = 8) { + asmCode << "\tlea eax, [" << regR32[instr.dst] << std::showpos << (int32_t)instr.getImm32() << std::noshowpos << "]" << std::endl; + int mask; + if (instr.getModCond() < StoreL3Condition) { + mask = instr.getModMem() ? 
ScratchpadL1Mask : ScratchpadL2Mask; + } + else { + mask = ScratchpadL3Mask; + } + asmCode << "\tand eax" << ", " << (mask & (-maskAlign)) << std::endl; + } + + int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) { + return (int32_t)instr.getImm32() & ScratchpadL3Mask; + } + + void AssemblyGeneratorX86::h_IADD_RS(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if(instr.dst == RegisterNeedsDisplacement) + asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.getModShift())) << std::showpos << (int32_t)instr.getImm32() << std::noshowpos << "]" << std::endl; + else + asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.getModShift())) << "]" << std::endl; + traceint(instr); + } + + void AssemblyGeneratorX86::h_IADD_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr); + asmCode << "\tadd " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl; + } + else { + asmCode << "\tadd " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+" << genAddressImm(instr) << "]" << std::endl; + } + traceint(instr); + } + + void AssemblyGeneratorX86::h_ISUB_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + asmCode << "\tsub " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + } + else { + asmCode << "\tsub " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + } + traceint(instr); + } + + void AssemblyGeneratorX86::h_ISUB_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr); + asmCode << "\tsub " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl; + } + else { + asmCode << "\tsub " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+" << genAddressImm(instr) << "]" << std::endl; + } + traceint(instr); + } + + void AssemblyGeneratorX86::h_IMUL_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + asmCode << "\timul " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + } + else { + asmCode << "\timul " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + } + traceint(instr); + } + + void AssemblyGeneratorX86::h_IMUL_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr); + asmCode << "\timul " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl; + } + else { + asmCode << "\timul " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+" << genAddressImm(instr) << "]" << std::endl; + } + traceint(instr); + } + + void AssemblyGeneratorX86::h_IMULH_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\tmul " << regR[instr.src] << std::endl; + asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; + traceint(instr); + } + + void AssemblyGeneratorX86::h_IMULH_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr, "ecx"); + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\tmul qword ptr [" << regScratchpadAddr << "+rcx]" << std::endl; + } + else { + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\tmul qword ptr 
[" << regScratchpadAddr << "+" << genAddressImm(instr) << "]" << std::endl; + } + asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; + traceint(instr); + } + + void AssemblyGeneratorX86::h_ISMULH_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\timul " << regR[instr.src] << std::endl; + asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; + traceint(instr); + } + + void AssemblyGeneratorX86::h_ISMULH_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr, "ecx"); + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\timul qword ptr [" << regScratchpadAddr << "+rcx]" << std::endl; + } + else { + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\timul qword ptr [" << regScratchpadAddr << "+" << genAddressImm(instr) << "]" << std::endl; + } + asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; + traceint(instr); + } + + void AssemblyGeneratorX86::h_INEG_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + asmCode << "\tneg " << regR[instr.dst] << std::endl; + traceint(instr); + } + + void AssemblyGeneratorX86::h_IXOR_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + asmCode << "\txor " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + } + else { + asmCode << "\txor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + } + traceint(instr); + } + + void AssemblyGeneratorX86::h_IXOR_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr); + asmCode << "\txor " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl; + } + else { + asmCode << "\txor " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+" << genAddressImm(instr) << "]" << std::endl; + } + traceint(instr); + } + + void AssemblyGeneratorX86::h_IROR_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + asmCode << "\tmov ecx, " << regR32[instr.src] << std::endl; + asmCode << "\tror " << regR[instr.dst] << ", cl" << std::endl; + } + else { + asmCode << "\tror " << regR[instr.dst] << ", " << (instr.getImm32() & 63) << std::endl; + } + traceint(instr); + } + + void AssemblyGeneratorX86::h_IROL_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + asmCode << "\tmov ecx, " << regR32[instr.src] << std::endl; + asmCode << "\trol " << regR[instr.dst] << ", cl" << std::endl; + } + else { + asmCode << "\trol " << regR[instr.dst] << ", " << (instr.getImm32() & 63) << std::endl; + } + traceint(instr); + } + + void AssemblyGeneratorX86::h_IMUL_RCP(Instruction& instr, int i) { + uint64_t divisor = instr.getImm32(); + if (!isZeroOrPowerOf2(divisor)) { + registerUsage[instr.dst] = i; + asmCode << "\tmov rax, " << randomx_reciprocal(divisor) << std::endl; + asmCode << "\timul " << regR[instr.dst] << ", rax" << std::endl; + traceint(instr); + } + else { + tracenop(instr); + } + } + + void AssemblyGeneratorX86::h_ISWAP_R(Instruction& instr, int i) { + if (instr.src != instr.dst) { + registerUsage[instr.dst] = i; + registerUsage[instr.src] = i; + asmCode << "\txchg " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + traceint(instr); + } + else { + tracenop(instr); + } + } + + void AssemblyGeneratorX86::h_FSWAP_R(Instruction& instr, int i) { + 
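+ //shufpd with immediate 1 swaps the low and high 64-bit halves of the xmm + //register, which is exactly the FSWAP_R semantics (the same register is + //used as both source and destination).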
asmCode << "\tshufpd " << regFE[instr.dst] << ", " << regFE[instr.dst] << ", 1" << std::endl; + traceflt(instr); + } + + void AssemblyGeneratorX86::h_FADD_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + instr.src %= RegisterCountFlt; + asmCode << "\taddpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl; + traceflt(instr); + } + + void AssemblyGeneratorX86::h_FADD_M(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + genAddressReg(instr); + asmCode << "\tcvtdq2pd " << tempRegx << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl; + asmCode << "\taddpd " << regF[instr.dst] << ", " << tempRegx << std::endl; + traceflt(instr); + } + + void AssemblyGeneratorX86::h_FSUB_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + instr.src %= RegisterCountFlt; + asmCode << "\tsubpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl; + traceflt(instr); + } + + void AssemblyGeneratorX86::h_FSUB_M(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + genAddressReg(instr); + asmCode << "\tcvtdq2pd " << tempRegx << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl; + asmCode << "\tsubpd " << regF[instr.dst] << ", " << tempRegx << std::endl; + traceflt(instr); + } + + void AssemblyGeneratorX86::h_FSCAL_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + asmCode << "\txorps " << regF[instr.dst] << ", " << scaleMaskReg << std::endl; + traceflt(instr); + } + + void AssemblyGeneratorX86::h_FMUL_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + instr.src %= RegisterCountFlt; + asmCode << "\tmulpd " << regE[instr.dst] << ", " << regA[instr.src] << std::endl; + traceflt(instr); + } + + void AssemblyGeneratorX86::h_FDIV_M(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + genAddressReg(instr); + asmCode << "\tcvtdq2pd " << tempRegx << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl; + asmCode << "\tandps " << tempRegx << ", " << mantissaMaskReg << std::endl; + asmCode << "\torps " << tempRegx << ", " << exponentMaskReg << std::endl; + asmCode << "\tdivpd " << regE[instr.dst] << ", " << tempRegx << std::endl; + traceflt(instr); + } + + void AssemblyGeneratorX86::h_FSQRT_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + asmCode << "\tsqrtpd " << regE[instr.dst] << ", " << regE[instr.dst] << std::endl; + traceflt(instr); + } + + void AssemblyGeneratorX86::h_CFROUND(Instruction& instr, int i) { + asmCode << "\tmov rax, " << regR[instr.src] << std::endl; + int rotate = (13 - (instr.getImm32() & 63)) & 63; + if (rotate != 0) + asmCode << "\trol rax, " << rotate << std::endl; + asmCode << "\tand eax, 24576" << std::endl; + asmCode << "\tor eax, 40896" << std::endl; + asmCode << "\tpush rax" << std::endl; + asmCode << "\tldmxcsr dword ptr [rsp]" << std::endl; + asmCode << "\tpop rax" << std::endl; + tracenop(instr); + } + + void AssemblyGeneratorX86::h_CBRANCH(Instruction& instr, int i) { + int reg = instr.dst; + int target = registerUsage[reg] + 1; + int shift = instr.getModCond() + ConditionOffset; + int32_t imm = instr.getImm32() | (1L << shift); + if (ConditionOffset > 0 || shift > 0) + imm &= ~(1L << (shift - 1)); + asmCode << "\tadd " << regR[reg] << ", " << imm << std::endl; + asmCode << "\ttest " << regR[reg] << ", " << (ConditionMask << shift) << std::endl; + asmCode << "\tjz randomx_isn_" << target << std::endl; + //mark all registers as used + for (unsigned j = 0; j < RegistersCount; ++j) { + registerUsage[j] = i; + } + } + + void 
AssemblyGeneratorX86::h_ISTORE(Instruction& instr, int i) { + genAddressRegDst(instr); + asmCode << "\tmov qword ptr [" << regScratchpadAddr << "+rax], " << regR[instr.src] << std::endl; + tracenop(instr); + } + + void AssemblyGeneratorX86::h_NOP(Instruction& instr, int i) { + asmCode << "\tnop" << std::endl; + tracenop(instr); + } + +#include "instruction_weights.hpp" +#define INST_HANDLE(x) REPN(&AssemblyGeneratorX86::h_##x, WT(x)) + + InstructionGenerator AssemblyGeneratorX86::engine[256] = { + INST_HANDLE(IADD_RS) + INST_HANDLE(IADD_M) + INST_HANDLE(ISUB_R) + INST_HANDLE(ISUB_M) + INST_HANDLE(IMUL_R) + INST_HANDLE(IMUL_M) + INST_HANDLE(IMULH_R) + INST_HANDLE(IMULH_M) + INST_HANDLE(ISMULH_R) + INST_HANDLE(ISMULH_M) + INST_HANDLE(IMUL_RCP) + INST_HANDLE(INEG_R) + INST_HANDLE(IXOR_R) + INST_HANDLE(IXOR_M) + INST_HANDLE(IROR_R) + INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) + INST_HANDLE(FSWAP_R) + INST_HANDLE(FADD_R) + INST_HANDLE(FADD_M) + INST_HANDLE(FSUB_R) + INST_HANDLE(FSUB_M) + INST_HANDLE(FSCAL_R) + INST_HANDLE(FMUL_R) + INST_HANDLE(FDIV_M) + INST_HANDLE(FSQRT_R) + INST_HANDLE(CBRANCH) + INST_HANDLE(CFROUND) + INST_HANDLE(ISTORE) + INST_HANDLE(NOP) + }; +} \ No newline at end of file diff --git a/randomx/assembly_generator_x86.hpp b/randomx/assembly_generator_x86.hpp new file mode 100644 index 0000000..e962398 --- /dev/null +++ b/randomx/assembly_generator_x86.hpp @@ -0,0 +1,94 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#pragma once + +#include "common.hpp" +#include <sstream> + +namespace randomx { + + class Program; + class SuperscalarProgram; + class AssemblyGeneratorX86; + class Instruction; + + typedef void(AssemblyGeneratorX86::*InstructionGenerator)(Instruction&, int); + + class AssemblyGeneratorX86 { + public: + void generateProgram(Program& prog); + void generateAsm(SuperscalarProgram& prog); + void generateC(SuperscalarProgram& prog); + void printCode(std::ostream& os) { + os << asmCode.rdbuf(); + } + private: + void genAddressReg(Instruction&, const char*); + void genAddressRegDst(Instruction&, int); + int32_t genAddressImm(Instruction&); + void generateCode(Instruction&, int); + void traceint(Instruction&); + void traceflt(Instruction&); + void tracenop(Instruction&); + void h_IADD_RS(Instruction&, int); + void h_IADD_M(Instruction&, int); + void h_ISUB_R(Instruction&, int); + void h_ISUB_M(Instruction&, int); + void h_IMUL_R(Instruction&, int); + void h_IMUL_M(Instruction&, int); + void h_IMULH_R(Instruction&, int); + void h_IMULH_M(Instruction&, int); + void h_ISMULH_R(Instruction&, int); + void h_ISMULH_M(Instruction&, int); + void h_IMUL_RCP(Instruction&, int); + void h_INEG_R(Instruction&, int); + void h_IXOR_R(Instruction&, int); + void h_IXOR_M(Instruction&, int); + void h_IROR_R(Instruction&, int); + void h_IROL_R(Instruction&, int); + void h_ISWAP_R(Instruction&, int); + void h_FSWAP_R(Instruction&, int); + void h_FADD_R(Instruction&, int); + void h_FADD_M(Instruction&, int); + void h_FSUB_R(Instruction&, int); + void h_FSUB_M(Instruction&, int); + void h_FSCAL_R(Instruction&, int); + void h_FMUL_R(Instruction&, int); + void h_FDIV_M(Instruction&, int); + void h_FSQRT_R(Instruction&, int); + void h_CBRANCH(Instruction&, int); + void h_CFROUND(Instruction&, int); + void h_ISTORE(Instruction&, int); + void h_NOP(Instruction&, int); + + static InstructionGenerator engine[256]; + std::stringstream asmCode; + int registerUsage[RegistersCount]; + }; +}
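+ + //The engine table declared above has one slot per 8-bit opcode; it is filled + //in assembly_generator_x86.cpp by the INST_HANDLE/WT macros from + //instruction_weights.hpp, so each h_* handler occupies as many slots as its + //RANDOMX_FREQ_* weight.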
\ No newline at end of file diff --git a/randomx/blake2/blake2-impl.h b/randomx/blake2/blake2-impl.h new file mode 100644 index 0000000..617f7c8 --- /dev/null +++ b/randomx/blake2/blake2-impl.h @@ -0,0 +1,76 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#ifndef PORTABLE_BLAKE2_IMPL_H +#define PORTABLE_BLAKE2_IMPL_H + +#include <stdint.h> + +#include "endian.h" + +static FORCE_INLINE uint64_t load48(const void *src) { + const uint8_t *p = (const uint8_t *)src; + uint64_t w = *p++; + w |= (uint64_t)(*p++) << 8; + w |= (uint64_t)(*p++) << 16; + w |= (uint64_t)(*p++) << 24; + w |= (uint64_t)(*p++) << 32; + w |= (uint64_t)(*p++) << 40; + return w; +} + +static FORCE_INLINE void store48(void *dst, uint64_t w) { + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +} + +static FORCE_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) { + return (w >> c) | (w << (32 - c)); +} + +static FORCE_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) { + return (w >> c) | (w << (64 - c)); +} + +#endif diff --git a/randomx/blake2/blake2.h b/randomx/blake2/blake2.h new file mode 100644 index 0000000..3d15be1 --- /dev/null +++ b/randomx/blake2/blake2.h @@ -0,0 +1,116 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#ifndef PORTABLE_BLAKE2_H +#define PORTABLE_BLAKE2_H + +#include <stdint.h> +#include <stddef.h> +#include <limits.h> + +#if defined(__cplusplus) +extern "C" { +#endif + + enum blake2b_constant { + BLAKE2B_BLOCKBYTES = 128, + BLAKE2B_OUTBYTES = 64, + BLAKE2B_KEYBYTES = 64, + BLAKE2B_SALTBYTES = 16, + BLAKE2B_PERSONALBYTES = 16 + }; + +#pragma pack(push, 1) + typedef struct __blake2b_param { + uint8_t digest_length; /* 1 */ + uint8_t key_length; /* 2 */ + uint8_t fanout; /* 3 */ + uint8_t depth; /* 4 */ + uint32_t leaf_length; /* 8 */ + uint64_t node_offset; /* 16 */ + uint8_t node_depth; /* 17 */ + uint8_t inner_length; /* 18 */ + uint8_t reserved[14]; /* 32 */ + uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */ + uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */ + } blake2b_param; +#pragma pack(pop) + + typedef struct __blake2b_state { + uint64_t h[8]; + uint64_t t[2]; + uint64_t f[2]; + uint8_t buf[BLAKE2B_BLOCKBYTES]; + unsigned buflen; + unsigned outlen; + uint8_t last_node; + } blake2b_state; + + /* Ensure param structs have not been wrongly padded */ + /* Poor man's static_assert */ + enum { + blake2_size_check_0 = 1 / !!(CHAR_BIT == 8), + blake2_size_check_2 = + 1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT) + }; + + //randomx namespace +#define blake2b_init randomx_blake2b_init +#define blake2b_init_key randomx_blake2b_init_key +#define blake2b_init_param randomx_blake2b_init_param +#define blake2b_update randomx_blake2b_update +#define blake2b_final randomx_blake2b_final +#define blake2b randomx_blake2b +#define blake2b_long randomx_blake2b_long + + /* Streaming API */ + int blake2b_init(blake2b_state *S, size_t outlen); + int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, + size_t keylen); + int blake2b_init_param(blake2b_state *S, const blake2b_param *P); + int blake2b_update(blake2b_state *S, const void *in, size_t inlen); + int blake2b_final(blake2b_state *S, void *out, size_t outlen); + + /* Simple API */ + int blake2b(void *out, size_t outlen, const void *in, size_t inlen, + const void *key, size_t keylen); + + /* Argon2 Team - Begin Code */ + int blake2b_long(void *out, size_t outlen, const void *in, size_t inlen); + /* Argon2 Team - End Code */ + +#if defined(__cplusplus) +} +#endif
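+ +/* + * Streaming-API sketch (illustrative only; a 32-byte digest is an arbitrary + * choice within BLAKE2B_OUTBYTES): + * + * blake2b_state S; + * uint8_t digest[32]; + * if (blake2b_init(&S, sizeof(digest)) == 0 && + * blake2b_update(&S, msg, msg_len) == 0) + * blake2b_final(&S, digest, sizeof(digest)); + */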
+ +#endif diff --git a/randomx/blake2/blake2b.c b/randomx/blake2/blake2b.c new file mode 100644 index 0000000..1bfd8d4 --- /dev/null +++ b/randomx/blake2/blake2b.c @@ -0,0 +1,409 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#include <stdint.h> +#include <stddef.h> +#include <string.h> + +#include "blake2.h" +#include "blake2-impl.h" + +static const uint64_t blake2b_IV[8] = { + UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), + UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), + UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), + UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) }; + +static const unsigned int blake2b_sigma[12][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, +}; + +static FORCE_INLINE void blake2b_set_lastnode(blake2b_state *S) { + S->f[1] = (uint64_t)-1; +} + +static FORCE_INLINE void blake2b_set_lastblock(blake2b_state *S) { + if (S->last_node) { + blake2b_set_lastnode(S); + } + S->f[0] = (uint64_t)-1; +} + +static FORCE_INLINE void blake2b_increment_counter(blake2b_state *S, + uint64_t inc) { + S->t[0] += inc; + S->t[1] += (S->t[0] < inc); +} + +static FORCE_INLINE void blake2b_invalidate_state(blake2b_state *S) { + //clear_internal_memory(S, sizeof(*S)); /* wipe */ + blake2b_set_lastblock(S); /* invalidate for further use */ +} + +static FORCE_INLINE void blake2b_init0(blake2b_state *S) { + memset(S, 0, sizeof(*S)); + memcpy(S->h, blake2b_IV, sizeof(S->h)); +} + +int blake2b_init_param(blake2b_state *S, const blake2b_param *P) { + const unsigned char *p = (const unsigned char *)P; + unsigned int i; + + if (NULL == P || NULL == S) { + return -1; + } + + blake2b_init0(S); + /* IV XOR Parameter Block */ + for (i = 0; i < 8; ++i) { + S->h[i] ^= load64(&p[i * sizeof(S->h[i])]); + } + S->outlen = P->digest_length; + return 0; +} + +/* Sequential blake2b initialization */ +int blake2b_init(blake2b_state *S, size_t outlen) { + blake2b_param P; + + if (S == NULL) { + return -1; + } + + if ((outlen == 0) || (outlen > BLAKE2B_OUTBYTES)) { + blake2b_invalidate_state(S); + return -1; + } + + /* Setup Parameter Block for unkeyed BLAKE2 */ + P.digest_length =
(uint8_t)outlen; + P.key_length = 0; + P.fanout = 1; + P.depth = 1; + P.leaf_length = 0; + P.node_offset = 0; + P.node_depth = 0; + P.inner_length = 0; + memset(P.reserved, 0, sizeof(P.reserved)); + memset(P.salt, 0, sizeof(P.salt)); + memset(P.personal, 0, sizeof(P.personal)); + + return blake2b_init_param(S, &P); +} + +int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t keylen) { + blake2b_param P; + + if (S == NULL) { + return -1; + } + + if ((outlen == 0) || (outlen > BLAKE2B_OUTBYTES)) { + blake2b_invalidate_state(S); + return -1; + } + + if ((key == 0) || (keylen == 0) || (keylen > BLAKE2B_KEYBYTES)) { + blake2b_invalidate_state(S); + return -1; + } + + /* Setup Parameter Block for keyed BLAKE2 */ + P.digest_length = (uint8_t)outlen; + P.key_length = (uint8_t)keylen; + P.fanout = 1; + P.depth = 1; + P.leaf_length = 0; + P.node_offset = 0; + P.node_depth = 0; + P.inner_length = 0; + memset(P.reserved, 0, sizeof(P.reserved)); + memset(P.salt, 0, sizeof(P.salt)); + memset(P.personal, 0, sizeof(P.personal)); + + if (blake2b_init_param(S, &P) < 0) { + blake2b_invalidate_state(S); + return -1; + } + + { + uint8_t block[BLAKE2B_BLOCKBYTES]; + memset(block, 0, BLAKE2B_BLOCKBYTES); + memcpy(block, key, keylen); + blake2b_update(S, block, BLAKE2B_BLOCKBYTES); + /* Burn the key from stack */ + //clear_internal_memory(block, BLAKE2B_BLOCKBYTES); + } + return 0; +} + +static void blake2b_compress(blake2b_state *S, const uint8_t *block) { + uint64_t m[16]; + uint64_t v[16]; + unsigned int i, r; + + for (i = 0; i < 16; ++i) { + m[i] = load64(block + i * sizeof(m[i])); + } + + for (i = 0; i < 8; ++i) { + v[i] = S->h[i]; + } + + v[8] = blake2b_IV[0]; + v[9] = blake2b_IV[1]; + v[10] = blake2b_IV[2]; + v[11] = blake2b_IV[3]; + v[12] = blake2b_IV[4] ^ S->t[0]; + v[13] = blake2b_IV[5] ^ S->t[1]; + v[14] = blake2b_IV[6] ^ S->f[0]; + v[15] = blake2b_IV[7] ^ S->f[1]; + +#define G(r, i, a, b, c, d) \ + do { \ + a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while ((void)0, 0) + +#define ROUND(r) \ + do { \ + G(r, 0, v[0], v[4], v[8], v[12]); \ + G(r, 1, v[1], v[5], v[9], v[13]); \ + G(r, 2, v[2], v[6], v[10], v[14]); \ + G(r, 3, v[3], v[7], v[11], v[15]); \ + G(r, 4, v[0], v[5], v[10], v[15]); \ + G(r, 5, v[1], v[6], v[11], v[12]); \ + G(r, 6, v[2], v[7], v[8], v[13]); \ + G(r, 7, v[3], v[4], v[9], v[14]); \ + } while ((void)0, 0) + + for (r = 0; r < 12; ++r) { + ROUND(r); + } + + for (i = 0; i < 8; ++i) { + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + } + +#undef G +#undef ROUND +} + +int blake2b_update(blake2b_state *S, const void *in, size_t inlen) { + const uint8_t *pin = (const uint8_t *)in; + + if (inlen == 0) { + return 0; + } + + /* Sanity check */ + if (S == NULL || in == NULL) { + return -1; + } + + /* Is this a reused state? 
*/ + if (S->f[0] != 0) { + return -1; + } + + if (S->buflen + inlen > BLAKE2B_BLOCKBYTES) { //BLAKE2B_BLOCKBYTES =128 + /* Complete current block */ + size_t left = S->buflen; + size_t fill = BLAKE2B_BLOCKBYTES - left; + memcpy(&S->buf[left], pin, fill); + blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); + blake2b_compress(S, S->buf); + S->buflen = 0; + inlen -= fill; + pin += fill; + /* Avoid buffer copies when possible */ + while (inlen > BLAKE2B_BLOCKBYTES) { + blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); + blake2b_compress(S, pin); + inlen -= BLAKE2B_BLOCKBYTES; + pin += BLAKE2B_BLOCKBYTES; + } + } + memcpy(&S->buf[S->buflen], pin, inlen); + S->buflen += (unsigned int)inlen; + return 0; +} + +int blake2b_final(blake2b_state *S, void *out, size_t outlen) { + uint8_t buffer[BLAKE2B_OUTBYTES] = { 0 }; + unsigned int i; + + /* Sanity checks */ + if (S == NULL || out == NULL || outlen < S->outlen) { + return -1; + } + + /* Is this a reused state? */ + if (S->f[0] != 0) { + return -1; + } + + blake2b_increment_counter(S, S->buflen); + blake2b_set_lastblock(S); + memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ + blake2b_compress(S, S->buf); + + for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */ + store64(buffer + sizeof(S->h[i]) * i, S->h[i]); + } + + memcpy(out, buffer, S->outlen); + //clear_internal_memory(buffer, sizeof(buffer)); + //clear_internal_memory(S->buf, sizeof(S->buf)); + //clear_internal_memory(S->h, sizeof(S->h)); + return 0; +} + +int blake2b(void *out, size_t outlen, const void *in, size_t inlen, + const void *key, size_t keylen) { + blake2b_state S; + int ret = -1; + + /* Verify parameters */ + if (NULL == in && inlen > 0) { + goto fail; + } + + if (NULL == out || outlen == 0 || outlen > BLAKE2B_OUTBYTES) { + goto fail; + } + + if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) { + goto fail; + } + + if (keylen > 0) { + if (blake2b_init_key(&S, outlen, key, keylen) < 0) { + goto fail; + } + } + else { + if (blake2b_init(&S, outlen) < 0) { + goto fail; + } + } + + if (blake2b_update(&S, in, inlen) < 0) { + goto fail; + } + ret = blake2b_final(&S, out, outlen); + +fail: + //clear_internal_memory(&S, sizeof(S)); + return ret; +} + +/* Argon2 Team - Begin Code */ +int blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) { + uint8_t *out = (uint8_t *)pout; + blake2b_state blake_state; + uint8_t outlen_bytes[sizeof(uint32_t)] = { 0 }; + int ret = -1; + + if (outlen > UINT32_MAX) { + goto fail; + } + + /* Ensure little-endian byte order! 
*/
+    store32(outlen_bytes, (uint32_t)outlen);
+
+#define TRY(statement) \
+    do { \
+        ret = statement; \
+        if (ret < 0) { \
+            goto fail; \
+        } \
+    } while ((void)0, 0)
+
+    if (outlen <= BLAKE2B_OUTBYTES) {
+        TRY(blake2b_init(&blake_state, outlen));
+        TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
+        TRY(blake2b_update(&blake_state, in, inlen));
+        TRY(blake2b_final(&blake_state, out, outlen));
+    }
+    else {
+        uint32_t toproduce;
+        uint8_t out_buffer[BLAKE2B_OUTBYTES];
+        uint8_t in_buffer[BLAKE2B_OUTBYTES];
+        TRY(blake2b_init(&blake_state, BLAKE2B_OUTBYTES));
+        TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
+        TRY(blake2b_update(&blake_state, in, inlen));
+        TRY(blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES));
+        memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+        out += BLAKE2B_OUTBYTES / 2;
+        toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2;
+
+        while (toproduce > BLAKE2B_OUTBYTES) {
+            memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+            TRY(blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer,
+                BLAKE2B_OUTBYTES, NULL, 0));
+            memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+            out += BLAKE2B_OUTBYTES / 2;
+            toproduce -= BLAKE2B_OUTBYTES / 2;
+        }
+
+        memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+        TRY(blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL,
+            0));
+        memcpy(out, out_buffer, toproduce);
+    }
+fail:
+    //clear_internal_memory(&blake_state, sizeof(blake_state));
+    return ret;
+#undef TRY
+}
+/* Argon2 Team - End Code */
+
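
blake2b_long above is Argon2's variable-length hash: for outputs longer than 64 bytes it produces a chain of full-width BLAKE2b digests, copies out only the first 32 bytes of each link, and feeds the entire previous digest into the next call. Schematically (notation mine, not from the source):

```
V_1 = BLAKE2b-512( LE32(outlen) || in )     ->  out[ 0..31] = V_1[0..31]
V_2 = BLAKE2b-512( V_1 )                    ->  out[32..63] = V_2[0..31]
...
V_k = BLAKE2b-(8*toproduce)( V_{k-1} )      ->  final toproduce (<= 64) bytes
```

The 32-byte overlap means no output block ever exposes the full chaining value that seeds the next one.
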
diff --git a/randomx/blake2/blamka-round-avx2.h b/randomx/blake2/blamka-round-avx2.h
new file mode 100644
index 0000000..4838261
--- /dev/null
+++ b/randomx/blake2/blamka-round-avx2.h
@@ -0,0 +1,189 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Original code from Argon2 reference source code package used under CC0 Licence
+ * https://github.com/P-H-C/phc-winner-argon2
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+*/
+
+#ifndef BLAKE_ROUND_MKA_OPT_H
+#define BLAKE_ROUND_MKA_OPT_H
+
+#include "blake2-impl.h"
+
+#ifdef __GNUC__
+#include <x86intrin.h>
+#else
+#include <intrin.h>
+#endif
+
+#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
+#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
+
+#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i ml = _mm256_mul_epu32(A0, B0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        D0 = _mm256_xor_si256(D0, A0); \
+        D0 = rotr32(D0); \
+        \
+        ml = _mm256_mul_epu32(C0, D0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+        \
+        B0 = _mm256_xor_si256(B0, C0); \
+        B0 = rotr24(B0); \
+        \
+        ml = _mm256_mul_epu32(A1, B1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+        D1 = _mm256_xor_si256(D1, A1); \
+        D1 = rotr32(D1); \
+        \
+        ml = _mm256_mul_epu32(C1, D1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+        \
+        B1 = _mm256_xor_si256(B1, C1); \
+        B1 = rotr24(B1); \
+    } while((void)0, 0);
+
+#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i ml = _mm256_mul_epu32(A0, B0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        D0 = _mm256_xor_si256(D0, A0); \
+        D0 = rotr16(D0); \
+        \
+        ml = _mm256_mul_epu32(C0, D0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+        B0 = _mm256_xor_si256(B0, C0); \
+        B0 = rotr63(B0); \
+        \
+        ml = _mm256_mul_epu32(A1, B1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+        D1 = _mm256_xor_si256(D1, A1); \
+        D1 = rotr16(D1); \
+        \
+        ml = _mm256_mul_epu32(C1, D1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+        B1 = _mm256_xor_si256(B1, C1); \
+        B1 = rotr63(B1); \
+    } while((void)0, 0);
+
+#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        \
+        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while((void)0, 0);
+
+#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+        \
+        tmp1 = C0; \
+        C0 = C1; \
+        C1 = tmp1; \
+        \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
+        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        D1 =
_mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ + } while(0); + +#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \ + C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ + D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \ + \ + B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \ + C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ + D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \ + } while((void)0, 0); + +#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ + do { \ + __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ + __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ + B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ + B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ + \ + tmp1 = C0; \ + C0 = C1; \ + C1 = tmp1; \ + \ + tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \ + tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \ + D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ + D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ + } while((void)0, 0); + +#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \ + do{ \ + G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + \ + DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ + \ + G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + \ + UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ + } while((void)0, 0); + +#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \ + do{ \ + G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + \ + DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ + \ + G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + \ + UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ + } while((void)0, 0); + +#endif /* BLAKE_ROUND_MKA_OPT_H */ diff --git a/randomx/blake2/blamka-round-ref.h b/randomx/blake2/blamka-round-ref.h new file mode 100644 index 0000000..f1fb50b --- /dev/null +++ b/randomx/blake2/blamka-round-ref.h @@ -0,0 +1,73 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Original code from Argon2 reference source code package used under CC0 Licence
+ * https://github.com/P-H-C/phc-winner-argon2
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+*/
+
+#ifndef BLAKE_ROUND_MKA_H
+#define BLAKE_ROUND_MKA_H
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+    /* designed by the Lyra PHC team */
+static FORCE_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
+    const uint64_t m = UINT64_C(0xFFFFFFFF);
+    const uint64_t xy = (x & m) * (y & m);
+    return x + y + 2 * xy;
+}
+
+#define G(a, b, c, d) \
+    do { \
+        a = fBlaMka(a, b); \
+        d = rotr64(d ^ a, 32); \
+        c = fBlaMka(c, d); \
+        b = rotr64(b ^ c, 24); \
+        a = fBlaMka(a, b); \
+        d = rotr64(d ^ a, 16); \
+        c = fBlaMka(c, d); \
+        b = rotr64(b ^ c, 63); \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, \
+    v12, v13, v14, v15) \
+    do { \
+        G(v0, v4, v8, v12); \
+        G(v1, v5, v9, v13); \
+        G(v2, v6, v10, v14); \
+        G(v3, v7, v11, v15); \
+        G(v0, v5, v10, v15); \
+        G(v1, v6, v11, v12); \
+        G(v2, v7, v8, v13); \
+        G(v3, v4, v9, v14); \
+    } while ((void)0, 0)
+
+#endif
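
fBlaMka above is the only place where this G differs from standard BLAKE2b: the plain additions `a + b` are replaced by `a + b + 2*lo32(a)*lo32(b)`, the multiplication-hardened permutation designed by the Lyra PHC team and adopted by Argon2 to raise the latency of hardware implementations. A scalar sanity check, runnable on its own (the input values are made up):

```
#include <stdint.h>
#include <stdio.h>

/* same formula as fBlaMka above */
static uint64_t blamka(uint64_t x, uint64_t y) {
    const uint64_t m = UINT64_C(0xFFFFFFFF);
    const uint64_t xy = (x & m) * (y & m); /* 32x32 -> 64-bit product */
    return x + y + 2 * xy;
}

int main(void) {
    uint64_t x = 2, y = 3;       /* made-up inputs */
    /* 2 + 3 + 2*(2*3) = 17 */
    printf("%llu\n", (unsigned long long)blamka(x, y));
    return 0;
}
```
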
diff --git a/randomx/blake2/blamka-round-ssse3.h b/randomx/blake2/blamka-round-ssse3.h
new file mode 100644
index 0000000..f2d3b5d
--- /dev/null
+++ b/randomx/blake2/blamka-round-ssse3.h
@@ -0,0 +1,162 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Original code from Argon2 reference source code package used under CC0 Licence
+ * https://github.com/P-H-C/phc-winner-argon2
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+*/
+
+#ifndef BLAKE_ROUND_MKA_OPT_H
+#define BLAKE_ROUND_MKA_OPT_H
+
+#include "blake2-impl.h"
+
+#ifdef __GNUC__
+#include <x86intrin.h>
+#else
+#include <intrin.h>
+#endif
+
+#ifdef _mm_roti_epi64 //clang defines it using the XOP instruction set
+#undef _mm_roti_epi64
+#endif
+
+#define r16 \
+    (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define r24 \
+    (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define _mm_roti_epi64(x, c) \
+    (-(c) == 32) \
+        ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
+        : (-(c) == 24) \
+            ? _mm_shuffle_epi8((x), r24) \
+            : (-(c) == 16) \
+                ? _mm_shuffle_epi8((x), r16) \
+                : (-(c) == 63) \
+                    ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
+                        _mm_add_epi64((x), (x))) \
+                    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
+                        _mm_slli_epi64((x), 64 - (-(c))))
+
+static FORCE_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
+    const __m128i z = _mm_mul_epu32(x, y);
+    return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
+}
+
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = fBlaMka(A0, B0); \
+        A1 = fBlaMka(A1, B1); \
+        \
+        D0 = _mm_xor_si128(D0, A0); \
+        D1 = _mm_xor_si128(D1, A1); \
+        \
+        D0 = _mm_roti_epi64(D0, -32); \
+        D1 = _mm_roti_epi64(D1, -32); \
+        \
+        C0 = fBlaMka(C0, D0); \
+        C1 = fBlaMka(C1, D1); \
+        \
+        B0 = _mm_xor_si128(B0, C0); \
+        B1 = _mm_xor_si128(B1, C1); \
+        \
+        B0 = _mm_roti_epi64(B0, -24); \
+        B1 = _mm_roti_epi64(B1, -24); \
+    } while ((void)0, 0)
+
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = fBlaMka(A0, B0); \
+        A1 = fBlaMka(A1, B1); \
+        \
+        D0 = _mm_xor_si128(D0, A0); \
+        D1 = _mm_xor_si128(D1, A1); \
+        \
+        D0 = _mm_roti_epi64(D0, -16); \
+        D1 = _mm_roti_epi64(D1, -16); \
+        \
+        C0 = fBlaMka(C0, D0); \
+        C1 = fBlaMka(C1, D1); \
+        \
+        B0 = _mm_xor_si128(B0, C0); \
+        B1 = _mm_xor_si128(B1, C1); \
+        \
+        B0 = _mm_roti_epi64(B0, -63); \
+        B1 = _mm_roti_epi64(B1, -63); \
+    } while ((void)0, 0)
+
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
+        __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
+        B0 = t0; \
+        B1 = t1; \
+        \
+        t0 = C0; \
+        C0 = C1; \
+        C1 = t0; \
+        \
+        t0 = _mm_alignr_epi8(D1, D0, 8); \
+        t1 = _mm_alignr_epi8(D0, D1, 8); \
+        D0 = t1; \
+        D1 = t0; \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
+        __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
+        B0 = t0; \
+        B1 = t1; \
+        \
+        t0 = C0; \
+        C0 = C1; \
+        C1 = t0; \
+        \
+        t0 = _mm_alignr_epi8(D0, D1, 8); \
+        t1 = _mm_alignr_epi8(D1, D0, 8); \
+        D0 = t1; \
+        D1 = t0; \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+        \
+        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+        \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+        \
+        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+    } while ((void)0, 0)
+
+
+#endif /* BLAKE_ROUND_MKA_OPT_H */
diff --git a/randomx/blake2/endian.h b/randomx/blake2/endian.h
new file mode 100644
index 0000000..c7afed2
--- /dev/null
+++ b/randomx/blake2/endian.h
@@ -0,0 +1,107 @@
+#pragma once
+#include <stdint.h>
+#include <string.h>
+
+#if defined(_MSC_VER)
+#define FORCE_INLINE __inline
+#elif defined(__GNUC__) ||
defined(__clang__) +#define FORCE_INLINE __inline__ +#else +#define FORCE_INLINE +#endif + + /* Argon2 Team - Begin Code */ + /* + Not an exhaustive list, but should cover the majority of modern platforms + Additionally, the code will always be correct---this is only a performance + tweak. + */ +#if (defined(__BYTE_ORDER__) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ + defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \ + defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) || \ + defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || \ + defined(_M_ARM) +#define NATIVE_LITTLE_ENDIAN +#endif + /* Argon2 Team - End Code */ + +static FORCE_INLINE uint32_t load32(const void *src) { +#if defined(NATIVE_LITTLE_ENDIAN) + uint32_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = (const uint8_t *)src; + uint32_t w = *p++; + w |= (uint32_t)(*p++) << 8; + w |= (uint32_t)(*p++) << 16; + w |= (uint32_t)(*p++) << 24; + return w; +#endif +} + +static FORCE_INLINE uint64_t load64_native(const void *src) { + uint64_t w; + memcpy(&w, src, sizeof w); + return w; +} + +static FORCE_INLINE uint64_t load64(const void *src) { +#if defined(NATIVE_LITTLE_ENDIAN) + return load64_native(src); +#else + const uint8_t *p = (const uint8_t *)src; + uint64_t w = *p++; + w |= (uint64_t)(*p++) << 8; + w |= (uint64_t)(*p++) << 16; + w |= (uint64_t)(*p++) << 24; + w |= (uint64_t)(*p++) << 32; + w |= (uint64_t)(*p++) << 40; + w |= (uint64_t)(*p++) << 48; + w |= (uint64_t)(*p++) << 56; + return w; +#endif +} + +static FORCE_INLINE void store32(void *dst, uint32_t w) { +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +#endif +} + +static FORCE_INLINE void store64_native(void *dst, uint64_t w) { + memcpy(dst, &w, sizeof w); +} + +static FORCE_INLINE void store64(void *dst, uint64_t w) { +#if defined(NATIVE_LITTLE_ENDIAN) + store64_native(dst, w); +#else + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +#endif +} diff --git a/randomx/blake2_generator.cpp b/randomx/blake2_generator.cpp new file mode 100644 index 0000000..3f2d028 --- /dev/null +++ b/randomx/blake2_generator.cpp @@ -0,0 +1,62 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <cstring>
+#include "blake2/blake2.h"
+#include "blake2/endian.h"
+#include "blake2_generator.hpp"
+
+namespace randomx {
+
+    constexpr int maxSeedSize = 60;
+
+    Blake2Generator::Blake2Generator(const void* seed, size_t seedSize, int nonce) : dataIndex(sizeof(data)) {
+        memset(data, 0, sizeof(data));
+        memcpy(data, seed, seedSize > maxSeedSize ? maxSeedSize : seedSize);
+        store32(&data[maxSeedSize], nonce);
+    }
+
+    uint8_t Blake2Generator::getByte() {
+        checkData(1);
+        return data[dataIndex++];
+    }
+
+    uint32_t Blake2Generator::getUInt32() {
+        checkData(4);
+        auto ret = load32(&data[dataIndex]);
+        dataIndex += 4;
+        return ret;
+    }
+
+    void Blake2Generator::checkData(const size_t bytesNeeded) {
+        if (dataIndex + bytesNeeded > sizeof(data)) {
+            blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0);
+            dataIndex = 0;
+        }
+    }
+}
\ No newline at end of file
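
A small usage sketch for the class implemented above (its declaration follows in the next file). The seed value is made up; the point is that the generator is a deterministic byte stream: a 64-byte buffer holding (seed, nonce) is consumed byte by byte and re-hashed in place with BLAKE2b whenever it runs out, so the same inputs always yield the same sequence.

```
// Illustrative only; the seed string is made up.
#include <cstdio>
#include "blake2_generator.hpp"

int main() {
    const char seed[] = "example seed";
    randomx::Blake2Generator gen(seed, sizeof(seed), 0 /* nonce */);

    for (int i = 0; i < 4; ++i)
        std::printf("%02x ", gen.getByte());
    std::printf("\n%08x\n", gen.getUInt32());
    return 0;
}
```
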
diff --git a/randomx/blake2_generator.hpp b/randomx/blake2_generator.hpp
new file mode 100644
index 0000000..5e7f61f
--- /dev/null
+++ b/randomx/blake2_generator.hpp
@@ -0,0 +1,46 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <cstdint>
+
+namespace randomx {
+
+    class Blake2Generator {
+    public:
+        Blake2Generator(const void* seed, size_t seedSize, int nonce = 0);
+        uint8_t getByte();
+        uint32_t getUInt32();
+    private:
+        void checkData(const size_t);
+
+        uint8_t data[64];
+        size_t dataIndex;
+    };
+}
\ No newline at end of file
diff --git a/randomx/bytecode_machine.cpp b/randomx/bytecode_machine.cpp
new file mode 100644
index 0000000..98e85af
--- /dev/null
+++ b/randomx/bytecode_machine.cpp
@@ -0,0 +1,494 @@
+/*
+Copyright (c) 2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "bytecode_machine.hpp"
+#include "reciprocal.h"
+
+namespace randomx {
+
+    const int_reg_t BytecodeMachine::zero = 0;
+
+#define INSTR_CASE(x) case InstructionType::x: \
+    exe_ ## x(ibc, pc, scratchpad, config); \
+    break;
+
+    void BytecodeMachine::executeInstruction(RANDOMX_EXE_ARGS) {
+        switch (ibc.type)
+        {
+            INSTR_CASE(IADD_RS)
+            INSTR_CASE(IADD_M)
+            INSTR_CASE(ISUB_R)
+            INSTR_CASE(ISUB_M)
+            INSTR_CASE(IMUL_R)
+            INSTR_CASE(IMUL_M)
+            INSTR_CASE(IMULH_R)
+            INSTR_CASE(IMULH_M)
+            INSTR_CASE(ISMULH_R)
+            INSTR_CASE(ISMULH_M)
+            INSTR_CASE(INEG_R)
+            INSTR_CASE(IXOR_R)
+            INSTR_CASE(IXOR_M)
+            INSTR_CASE(IROR_R)
+            INSTR_CASE(IROL_R)
+            INSTR_CASE(ISWAP_R)
+            INSTR_CASE(FSWAP_R)
+            INSTR_CASE(FADD_R)
+            INSTR_CASE(FADD_M)
+            INSTR_CASE(FSUB_R)
+            INSTR_CASE(FSUB_M)
+            INSTR_CASE(FSCAL_R)
+            INSTR_CASE(FMUL_R)
+            INSTR_CASE(FDIV_M)
+            INSTR_CASE(FSQRT_R)
+            INSTR_CASE(CBRANCH)
+            INSTR_CASE(CFROUND)
+            INSTR_CASE(ISTORE)
+
+        case InstructionType::NOP:
+            break;
+
+        case InstructionType::IMUL_RCP: //executed as IMUL_R
+        default:
+            UNREACHABLE;
+        }
+    }
+
+    void BytecodeMachine::compileInstruction(RANDOMX_GEN_ARGS) {
+        int opcode = instr.opcode;
+
+//        printf("nreg.r[0]=%016llx\n",nreg->r[0]);
+//        printf("nreg.r[1]=%016llx\n",nreg->r[1]);
+//        printf("nreg.r[2]=%016llx\n",nreg->r[2]);
+//        printf("nreg.r[3]=%016llx\n",nreg->r[3]);
+//        printf("nreg.r[4]=%016llx\n",nreg->r[4]);
+//        printf("nreg.r[5]=%016llx\n",nreg->r[5]);
+//        printf("nreg.r[6]=%016llx\n",nreg->r[6]);
+//        printf("nreg.r[7]=%016llx\n",nreg->r[7]);
+
+        //nreg.r: all 8 integer registers start out as 0
+
+        //printf("ceil_IADD_RS= %0d,ceil_IADD_M=%0d,ceil_ISUB_R=%0d\n",ceil_IADD_RS,ceil_IADD_M,ceil_ISUB_R);
+        //Decode by cumulative frequency: an opcode in [ceil_<prev>, ceil_<curr>) selects instruction <curr>.
+        if (opcode < ceil_IADD_RS) {
+            auto dst = instr.dst % RegistersCount;
+            auto src = instr.src % RegistersCount;
+            ibc.type = InstructionType::IADD_RS;
+            ibc.idst = &nreg->r[dst];
+            if (dst != RegisterNeedsDisplacement) { //RegisterNeedsDisplacement=5
+                ibc.isrc = &nreg->r[src];
+                ibc.shift = instr.getModShift();
+                ibc.imm = 0;
+            }
+            else {
+                ibc.isrc = &nreg->r[src];
+                ibc.shift = instr.getModShift();
+                ibc.imm = signExtend2sCompl(instr.getImm32());
+            }
+            registerUsage[dst] = i;
+            return;
+        }
+
+        if (opcode < ceil_IADD_M) {
+            auto dst = instr.dst % RegistersCount;
+            auto src = instr.src % RegistersCount;
+            ibc.type = InstructionType::IADD_M;
+            ibc.idst = &nreg->r[dst];
+            ibc.imm = signExtend2sCompl(instr.getImm32());
+            if (src != dst) {
+                ibc.isrc = &nreg->r[src];
+                ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask);
+            }
+            else {
+                ibc.isrc = &zero;
+                ibc.memMask = ScratchpadL3Mask;
+            }
+            registerUsage[dst] = i;
+            return;
+        }
+
+        if (opcode < ceil_ISUB_R) {
+            auto dst = instr.dst % RegistersCount;
+            auto src = instr.src % RegistersCount;
+            ibc.type = InstructionType::ISUB_R;
+            ibc.idst = &nreg->r[dst];
+            if (src != dst) {
+                ibc.isrc = &nreg->r[src];
+            }
+            else {
+                ibc.imm = signExtend2sCompl(instr.getImm32());
+                ibc.isrc = &ibc.imm;
+            }
+            registerUsage[dst] = i;
+            return;
+        }
+
+        if (opcode < ceil_ISUB_M) {
+            auto dst = instr.dst % RegistersCount;
+            auto src = instr.src % RegistersCount;
+            ibc.type = InstructionType::ISUB_M;
+            ibc.idst = &nreg->r[dst];
+            ibc.imm = signExtend2sCompl(instr.getImm32());
+            if (src != dst) {
+                ibc.isrc = &nreg->r[src];
+                ibc.memMask = (instr.getModMem() ?
ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.isrc = &zero; + ibc.memMask = ScratchpadL3Mask; + } + registerUsage[dst] = i; + return; + } + + if (opcode < ceil_IMUL_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMUL_R; + ibc.idst = &nreg->r[dst]; + if (src != dst) { + ibc.isrc = &nreg->r[src]; + } + else { + ibc.imm = signExtend2sCompl(instr.getImm32()); + ibc.isrc = &ibc.imm; + } + registerUsage[dst] = i; + return; + } + + if (opcode < ceil_IMUL_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMUL_M; + ibc.idst = &nreg->r[dst]; + ibc.imm = signExtend2sCompl(instr.getImm32()); + if (src != dst) { + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.isrc = &zero; + ibc.memMask = ScratchpadL3Mask; + } + registerUsage[dst] = i; + return; + } + + if (opcode < ceil_IMULH_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMULH_R; + ibc.idst = &nreg->r[dst]; + ibc.isrc = &nreg->r[src]; + registerUsage[dst] = i; + return; + } + + if (opcode < ceil_IMULH_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMULH_M; + ibc.idst = &nreg->r[dst]; + ibc.imm = signExtend2sCompl(instr.getImm32()); + if (src != dst) { + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.isrc = &zero; + ibc.memMask = ScratchpadL3Mask; + } + registerUsage[dst] = i; + return; + } + + if (opcode < ceil_ISMULH_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISMULH_R; + ibc.idst = &nreg->r[dst]; + ibc.isrc = &nreg->r[src]; + registerUsage[dst] = i; + return; + } + + if (opcode < ceil_ISMULH_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISMULH_M; + ibc.idst = &nreg->r[dst]; + ibc.imm = signExtend2sCompl(instr.getImm32()); + if (src != dst) { + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? 
ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.isrc = &zero; + ibc.memMask = ScratchpadL3Mask; + } + registerUsage[dst] = i; + return; + } + + if (opcode < ceil_IMUL_RCP) { + uint64_t divisor = instr.getImm32(); + if (!isZeroOrPowerOf2(divisor)) { + auto dst = instr.dst % RegistersCount; + ibc.type = InstructionType::IMUL_R; + ibc.idst = &nreg->r[dst]; + ibc.imm = randomx_reciprocal(divisor); //******8 + ibc.isrc = &ibc.imm; + registerUsage[dst] = i; + } + else { + ibc.type = InstructionType::NOP; + } + return; + } + + if (opcode < ceil_INEG_R) { + auto dst = instr.dst % RegistersCount; + ibc.type = InstructionType::INEG_R; + ibc.idst = &nreg->r[dst]; + registerUsage[dst] = i; + return; + } + + if (opcode < ceil_IXOR_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IXOR_R; + ibc.idst = &nreg->r[dst]; + if (src != dst) { + ibc.isrc = &nreg->r[src]; + } + else { + ibc.imm = signExtend2sCompl(instr.getImm32()); + ibc.isrc = &ibc.imm; + } + registerUsage[dst] = i; + return; + } + + if (opcode < ceil_IXOR_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IXOR_M; + ibc.idst = &nreg->r[dst]; + ibc.imm = signExtend2sCompl(instr.getImm32()); + if (src != dst) { + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.isrc = &zero; + ibc.memMask = ScratchpadL3Mask; + } + registerUsage[dst] = i; + return; + } + + if (opcode < ceil_IROR_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IROR_R; + ibc.idst = &nreg->r[dst]; + if (src != dst) { + ibc.isrc = &nreg->r[src]; + } + else { + ibc.imm = instr.getImm32(); + ibc.isrc = &ibc.imm; + } + registerUsage[dst] = i; + return; + } + + if (opcode < ceil_IROL_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IROL_R; + ibc.idst = &nreg->r[dst]; + if (src != dst) { + ibc.isrc = &nreg->r[src]; + } + else { + ibc.imm = instr.getImm32(); + ibc.isrc = &ibc.imm; + } + registerUsage[dst] = i; + return; + } + + if (opcode < ceil_ISWAP_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + if (src != dst) { + ibc.idst = &nreg->r[dst]; + ibc.isrc = &nreg->r[src]; + ibc.type = InstructionType::ISWAP_R; + registerUsage[dst] = i; + registerUsage[src] = i; + } + else { + ibc.type = InstructionType::NOP; + } + return; + } + + if (opcode < ceil_FSWAP_R) { + auto dst = instr.dst % RegistersCount; + ibc.type = InstructionType::FSWAP_R; + if (dst < RegisterCountFlt) + ibc.fdst = &nreg->f[dst]; + else + ibc.fdst = &nreg->e[dst - RegisterCountFlt]; + return; + } + + if (opcode < ceil_FADD_R) { + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegisterCountFlt; + ibc.type = InstructionType::FADD_R; + ibc.fdst = &nreg->f[dst]; + ibc.fsrc = &nreg->a[src]; + return; + } + + if (opcode < ceil_FADD_M) { + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::FADD_M; + ibc.fdst = &nreg->f[dst]; + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? 
ScratchpadL1Mask : ScratchpadL2Mask); + ibc.imm = signExtend2sCompl(instr.getImm32()); + return; + } + + if (opcode < ceil_FSUB_R) { + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegisterCountFlt; + ibc.type = InstructionType::FSUB_R; + ibc.fdst = &nreg->f[dst]; + ibc.fsrc = &nreg->a[src]; + return; + } + + if (opcode < ceil_FSUB_M) { + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::FSUB_M; + ibc.fdst = &nreg->f[dst]; + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + ibc.imm = signExtend2sCompl(instr.getImm32()); + return; + } + + if (opcode < ceil_FSCAL_R) { + auto dst = instr.dst % RegisterCountFlt; + ibc.fdst = &nreg->f[dst]; + ibc.type = InstructionType::FSCAL_R; + return; + } + + if (opcode < ceil_FMUL_R) { + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegisterCountFlt; + ibc.type = InstructionType::FMUL_R; + ibc.fdst = &nreg->e[dst]; + ibc.fsrc = &nreg->a[src]; + return; + } + + if (opcode < ceil_FDIV_M) { + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::FDIV_M; + ibc.fdst = &nreg->e[dst]; + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + ibc.imm = signExtend2sCompl(instr.getImm32()); + return; + } + + if (opcode < ceil_FSQRT_R) { + auto dst = instr.dst % RegisterCountFlt; + ibc.type = InstructionType::FSQRT_R; + ibc.fdst = &nreg->e[dst]; + return; + } + + if (opcode < ceil_CBRANCH) { + ibc.type = InstructionType::CBRANCH; + //jump condition + int creg = instr.dst % RegistersCount; + ibc.idst = &nreg->r[creg]; + ibc.target = registerUsage[creg]; + int shift = instr.getModCond() + ConditionOffset; + ibc.imm = signExtend2sCompl(instr.getImm32()) | (1ULL << shift); + if (ConditionOffset > 0 || shift > 0) //clear the bit below the condition mask - this limits the number of successive jumps to 2 + ibc.imm &= ~(1ULL << (shift - 1)); + ibc.memMask = ConditionMask << shift; + //mark all registers as used + for (unsigned j = 0; j < RegistersCount; ++j) { + registerUsage[j] = i; + } + return; + } + + if (opcode < ceil_CFROUND) { + auto src = instr.src % RegistersCount; + ibc.isrc = &nreg->r[src]; + ibc.type = InstructionType::CFROUND; + ibc.imm = instr.getImm32() & 63; + return; + } + + if (opcode < ceil_ISTORE) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISTORE; + ibc.idst = &nreg->r[dst]; + ibc.isrc = &nreg->r[src]; + ibc.imm = signExtend2sCompl(instr.getImm32()); + if (instr.getModCond() < StoreL3Condition) //StoreL3Condition= 14 + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + else + ibc.memMask = ScratchpadL3Mask; + return; + } + + if (opcode < ceil_NOP) { + ibc.type = InstructionType::NOP; + return; + } + + UNREACHABLE; + } +} diff --git a/randomx/bytecode_machine.hpp b/randomx/bytecode_machine.hpp new file mode 100644 index 0000000..46697f6 --- /dev/null +++ b/randomx/bytecode_machine.hpp @@ -0,0 +1,322 @@ +/* +Copyright (c) 2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include "common.hpp" +#include "intrin_portable.h" +#include "instruction.hpp" +#include "program.hpp" + +namespace randomx { + + //register file in machine byte order + struct NativeRegisterFile { + int_reg_t r[RegistersCount] = { 0 }; + rx_vec_f128 f[RegisterCountFlt]; + rx_vec_f128 e[RegisterCountFlt]; + rx_vec_f128 a[RegisterCountFlt]; + }; + + struct InstructionByteCode { + union { + int_reg_t* idst; + rx_vec_f128* fdst; + }; + union { + const int_reg_t* isrc; + const rx_vec_f128* fsrc; + }; + union { + uint64_t imm; + int64_t simm; + }; + InstructionType type; + union { + int16_t target; + uint16_t shift; + }; + uint32_t memMask; + }; + +#define OPCODE_CEIL_DECLARE(curr, prev) constexpr int ceil_ ## curr = ceil_ ## prev + RANDOMX_FREQ_ ## curr; + constexpr int ceil_NULL = 0; + OPCODE_CEIL_DECLARE(IADD_RS, NULL); + OPCODE_CEIL_DECLARE(IADD_M, IADD_RS); + OPCODE_CEIL_DECLARE(ISUB_R, IADD_M); + OPCODE_CEIL_DECLARE(ISUB_M, ISUB_R); + OPCODE_CEIL_DECLARE(IMUL_R, ISUB_M); + OPCODE_CEIL_DECLARE(IMUL_M, IMUL_R); + OPCODE_CEIL_DECLARE(IMULH_R, IMUL_M); + OPCODE_CEIL_DECLARE(IMULH_M, IMULH_R); + OPCODE_CEIL_DECLARE(ISMULH_R, IMULH_M); + OPCODE_CEIL_DECLARE(ISMULH_M, ISMULH_R); + OPCODE_CEIL_DECLARE(IMUL_RCP, ISMULH_M); + OPCODE_CEIL_DECLARE(INEG_R, IMUL_RCP); + OPCODE_CEIL_DECLARE(IXOR_R, INEG_R); + OPCODE_CEIL_DECLARE(IXOR_M, IXOR_R); + OPCODE_CEIL_DECLARE(IROR_R, IXOR_M); + OPCODE_CEIL_DECLARE(IROL_R, IROR_R); + OPCODE_CEIL_DECLARE(ISWAP_R, IROL_R); + OPCODE_CEIL_DECLARE(FSWAP_R, ISWAP_R); + OPCODE_CEIL_DECLARE(FADD_R, FSWAP_R); + OPCODE_CEIL_DECLARE(FADD_M, FADD_R); + OPCODE_CEIL_DECLARE(FSUB_R, FADD_M); + OPCODE_CEIL_DECLARE(FSUB_M, FSUB_R); + OPCODE_CEIL_DECLARE(FSCAL_R, FSUB_M); + OPCODE_CEIL_DECLARE(FMUL_R, FSCAL_R); + OPCODE_CEIL_DECLARE(FDIV_M, FMUL_R); + OPCODE_CEIL_DECLARE(FSQRT_R, FDIV_M); + OPCODE_CEIL_DECLARE(CBRANCH, FSQRT_R); + OPCODE_CEIL_DECLARE(CFROUND, CBRANCH); + OPCODE_CEIL_DECLARE(ISTORE, CFROUND); + OPCODE_CEIL_DECLARE(NOP, ISTORE); +#undef OPCODE_CEIL_DECLARE + +#define RANDOMX_EXE_ARGS InstructionByteCode& ibc, int& pc, uint8_t* scratchpad, ProgramConfiguration& config +#define RANDOMX_GEN_ARGS Instruction& instr, int i, InstructionByteCode& ibc + + class BytecodeMachine; + + typedef void(BytecodeMachine::*InstructionGenBytecode)(RANDOMX_GEN_ARGS); + + class BytecodeMachine { + public: + void 
beginCompilation(NativeRegisterFile& regFile) { + for (unsigned i = 0; i < RegistersCount; ++i) { + registerUsage[i] = -1; + } + nreg = ®File; + } + + void compileProgram(Program& program, InstructionByteCode bytecode[RANDOMX_PROGRAM_SIZE], NativeRegisterFile& regFile) { + beginCompilation(regFile); + for (unsigned i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { //256 + auto& instr = program(i); + auto& ibc = bytecode[i]; + compileInstruction(instr, i, ibc); + } + } + + static void executeBytecode(InstructionByteCode bytecode[RANDOMX_PROGRAM_SIZE], uint8_t* scratchpad, ProgramConfiguration& config) { + for (int pc = 0; pc < RANDOMX_PROGRAM_SIZE; ++pc) { + auto& ibc = bytecode[pc]; + executeInstruction(ibc, pc, scratchpad, config); + } + } + + void compileInstruction(RANDOMX_GEN_ARGS) +#ifdef RANDOMX_GEN_TABLE + { + auto generator = genTable[instr.opcode]; + (this->*generator)(instr, i, ibc); + } +#else + ; +#endif + + static void executeInstruction(RANDOMX_EXE_ARGS); + + static void exe_IADD_RS(RANDOMX_EXE_ARGS) { + *ibc.idst += (*ibc.isrc << ibc.shift) + ibc.imm; + } + + static void exe_IADD_M(RANDOMX_EXE_ARGS) { + *ibc.idst += load64(getScratchpadAddress(ibc, scratchpad)); + } + + static void exe_ISUB_R(RANDOMX_EXE_ARGS) { + *ibc.idst -= *ibc.isrc; + } + + static void exe_ISUB_M(RANDOMX_EXE_ARGS) { + *ibc.idst -= load64(getScratchpadAddress(ibc, scratchpad)); + } + + static void exe_IMUL_R(RANDOMX_EXE_ARGS) { + *ibc.idst *= *ibc.isrc; + } + + static void exe_IMUL_M(RANDOMX_EXE_ARGS) { + *ibc.idst *= load64(getScratchpadAddress(ibc, scratchpad)); + } + + static void exe_IMULH_R(RANDOMX_EXE_ARGS) { + *ibc.idst = mulh(*ibc.idst, *ibc.isrc); + } + + static void exe_IMULH_M(RANDOMX_EXE_ARGS) { + *ibc.idst = mulh(*ibc.idst, load64(getScratchpadAddress(ibc, scratchpad))); + } + + static void exe_ISMULH_R(RANDOMX_EXE_ARGS) { + *ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(*ibc.isrc)); + } + + static void exe_ISMULH_M(RANDOMX_EXE_ARGS) { + *ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(load64(getScratchpadAddress(ibc, scratchpad)))); + } + + static void exe_INEG_R(RANDOMX_EXE_ARGS) { + *ibc.idst = ~(*ibc.idst) + 1; //two's complement negative + } + + static void exe_IXOR_R(RANDOMX_EXE_ARGS) { + *ibc.idst ^= *ibc.isrc; + } + + static void exe_IXOR_M(RANDOMX_EXE_ARGS) { + *ibc.idst ^= load64(getScratchpadAddress(ibc, scratchpad)); + } + + static void exe_IROR_R(RANDOMX_EXE_ARGS) { + *ibc.idst = rotr(*ibc.idst, *ibc.isrc & 63); + } + + static void exe_IROL_R(RANDOMX_EXE_ARGS) { + *ibc.idst = rotl(*ibc.idst, *ibc.isrc & 63); + } + + static void exe_ISWAP_R(RANDOMX_EXE_ARGS) { + int_reg_t temp = *ibc.isrc; + *(int_reg_t*)ibc.isrc = *ibc.idst; + *ibc.idst = temp; + } + + static void exe_FSWAP_R(RANDOMX_EXE_ARGS) { + *ibc.fdst = rx_swap_vec_f128(*ibc.fdst); + } + + static void exe_FADD_R(RANDOMX_EXE_ARGS) { + *ibc.fdst = rx_add_vec_f128(*ibc.fdst, *ibc.fsrc); + } + + static void exe_FADD_M(RANDOMX_EXE_ARGS) { + rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc, scratchpad)); + *ibc.fdst = rx_add_vec_f128(*ibc.fdst, fsrc); + } + + static void exe_FSUB_R(RANDOMX_EXE_ARGS) { + *ibc.fdst = rx_sub_vec_f128(*ibc.fdst, *ibc.fsrc); + } + + static void exe_FSUB_M(RANDOMX_EXE_ARGS) { + rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc, scratchpad)); + *ibc.fdst = rx_sub_vec_f128(*ibc.fdst, fsrc); + } + + static void exe_FSCAL_R(RANDOMX_EXE_ARGS) { + const rx_vec_f128 mask = 
rx_set1_vec_f128(0x80F0000000000000);
+            *ibc.fdst = rx_xor_vec_f128(*ibc.fdst, mask);
+        }
+
+        static void exe_FMUL_R(RANDOMX_EXE_ARGS) {
+            *ibc.fdst = rx_mul_vec_f128(*ibc.fdst, *ibc.fsrc);
+        }
+
+        static void exe_FDIV_M(RANDOMX_EXE_ARGS) {
+            rx_vec_f128 fsrc = maskRegisterExponentMantissa(
+                config,
+                rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc, scratchpad))
+            );
+            *ibc.fdst = rx_div_vec_f128(*ibc.fdst, fsrc);
+        }
+
+        static void exe_FSQRT_R(RANDOMX_EXE_ARGS) {
+            *ibc.fdst = rx_sqrt_vec_f128(*ibc.fdst);
+        }
+
+        static void exe_CBRANCH(RANDOMX_EXE_ARGS) {
+            *ibc.idst += ibc.imm;
+            if ((*ibc.idst & ibc.memMask) == 0) { //branch taken when the masked condition bits are all zero
+                pc = ibc.target;
+            }
+        }
+
+        static void exe_CFROUND(RANDOMX_EXE_ARGS) {
+            rx_set_rounding_mode(rotr(*ibc.isrc, ibc.imm) % 4);
+        }
+
+        static void exe_ISTORE(RANDOMX_EXE_ARGS) {
+            store64(scratchpad + ((*ibc.idst + ibc.imm) & ibc.memMask), *ibc.isrc);
+        }
+    protected:
+        static rx_vec_f128 maskRegisterExponentMantissa(ProgramConfiguration& config, rx_vec_f128 x) {
+            const rx_vec_f128 xmantissaMask = rx_set_vec_f128(dynamicMantissaMask, dynamicMantissaMask);
+            const rx_vec_f128 xexponentMask = rx_load_vec_f128((const double*)&config.eMask);
+            x = rx_and_vec_f128(x, xmantissaMask);
+            x = rx_or_vec_f128(x, xexponentMask);
+            return x;
+        }
+
+    private:
+        static const int_reg_t zero;
+        int registerUsage[RegistersCount];
+        NativeRegisterFile* nreg;
+
+        static void* getScratchpadAddress(InstructionByteCode& ibc, uint8_t* scratchpad) {
+            uint32_t addr = (*ibc.isrc + ibc.imm) & ibc.memMask;
+            return scratchpad + addr;
+        }
+
+#ifdef RANDOMX_GEN_TABLE
+        static InstructionGenBytecode genTable[256];
+
+        void gen_IADD_RS(RANDOMX_GEN_ARGS);
+        void gen_IADD_M(RANDOMX_GEN_ARGS);
+        void gen_ISUB_R(RANDOMX_GEN_ARGS);
+        void gen_ISUB_M(RANDOMX_GEN_ARGS);
+        void gen_IMUL_R(RANDOMX_GEN_ARGS);
+        void gen_IMUL_M(RANDOMX_GEN_ARGS);
+        void gen_IMULH_R(RANDOMX_GEN_ARGS);
+        void gen_IMULH_M(RANDOMX_GEN_ARGS);
+        void gen_ISMULH_R(RANDOMX_GEN_ARGS);
+        void gen_ISMULH_M(RANDOMX_GEN_ARGS);
+        void gen_IMUL_RCP(RANDOMX_GEN_ARGS);
+        void gen_INEG_R(RANDOMX_GEN_ARGS);
+        void gen_IXOR_R(RANDOMX_GEN_ARGS);
+        void gen_IXOR_M(RANDOMX_GEN_ARGS);
+        void gen_IROR_R(RANDOMX_GEN_ARGS);
+        void gen_IROL_R(RANDOMX_GEN_ARGS);
+        void gen_ISWAP_R(RANDOMX_GEN_ARGS);
+        void gen_FSWAP_R(RANDOMX_GEN_ARGS);
+        void gen_FADD_R(RANDOMX_GEN_ARGS);
+        void gen_FADD_M(RANDOMX_GEN_ARGS);
+        void gen_FSUB_R(RANDOMX_GEN_ARGS);
+        void gen_FSUB_M(RANDOMX_GEN_ARGS);
+        void gen_FSCAL_R(RANDOMX_GEN_ARGS);
+        void gen_FMUL_R(RANDOMX_GEN_ARGS);
+        void gen_FDIV_M(RANDOMX_GEN_ARGS);
+        void gen_FSQRT_R(RANDOMX_GEN_ARGS);
+        void gen_CBRANCH(RANDOMX_GEN_ARGS);
+        void gen_CFROUND(RANDOMX_GEN_ARGS);
+        void gen_ISTORE(RANDOMX_GEN_ARGS);
+        void gen_NOP(RANDOMX_GEN_ARGS);
+#endif
+    };
+}
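
The long `if (opcode < ceil_*)` chain in compileInstruction (bytecode_machine.cpp above) decodes the 8-bit opcode by cumulative frequency: the OPCODE_CEIL_DECLARE chain turns the per-instruction RANDOMX_FREQ_* weights into running sums over [0, 256), and an opcode selects whichever bucket it lands in. A self-contained sketch of the same idea, with a made-up three-entry table (not RandomX's real frequencies):

```
// Cumulative-frequency opcode decoding, simplified; values are made up.
#include <cstdint>
#include <cstdio>

enum { FREQ_ADD = 128, FREQ_MUL = 96, FREQ_NOP = 32 };  // sums to 256
constexpr int ceil_ADD = FREQ_ADD;                      // [0, 128)   -> ADD
constexpr int ceil_MUL = ceil_ADD + FREQ_MUL;           // [128, 224) -> MUL
constexpr int ceil_NOP = ceil_MUL + FREQ_NOP;           // [224, 256) -> NOP

const char* decode(uint8_t opcode) {
    if (opcode < ceil_ADD) return "ADD";
    if (opcode < ceil_MUL) return "MUL";
    return "NOP";
}

int main() {
    std::printf("%s %s %s\n", decode(5), decode(130), decode(250)); // ADD MUL NOP
    return 0;
}
```
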
diff --git a/randomx/common.hpp b/randomx/common.hpp
new file mode 100644
index 0000000..a77feb3
--- /dev/null
+++ b/randomx/common.hpp
@@ -0,0 +1,187 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <cstdint>
+#include <cstddef>
+#include <climits>
+#include "blake2/endian.h"
+#include "configuration.h"
+#include "randomx.h"
+
+namespace randomx {
+
+	static_assert(RANDOMX_ARGON_MEMORY >= 8, "RANDOMX_ARGON_MEMORY must be at least 8.");
+	static_assert(RANDOMX_ARGON_MEMORY <= 2097152, "RANDOMX_ARGON_MEMORY must not exceed 2097152.");
+	static_assert((RANDOMX_ARGON_MEMORY & (RANDOMX_ARGON_MEMORY - 1)) == 0, "RANDOMX_ARGON_MEMORY must be a power of 2.");
+	static_assert(RANDOMX_ARGON_ITERATIONS > 0 && RANDOMX_ARGON_ITERATIONS < UINT32_MAX, "RANDOMX_ARGON_ITERATIONS must be a positive 32-bit integer.");
+	static_assert(RANDOMX_ARGON_LANES > 0 && RANDOMX_ARGON_LANES <= 16777215, "RANDOMX_ARGON_LANES out of range");
+	static_assert(RANDOMX_DATASET_BASE_SIZE >= 64, "RANDOMX_DATASET_BASE_SIZE must be at least 64.");
+	static_assert((RANDOMX_DATASET_BASE_SIZE & (RANDOMX_DATASET_BASE_SIZE - 1)) == 0, "RANDOMX_DATASET_BASE_SIZE must be a power of 2.");
+	static_assert(RANDOMX_DATASET_BASE_SIZE <= 4294967296ULL, "RANDOMX_DATASET_BASE_SIZE must not exceed 4294967296.");
+	static_assert(RANDOMX_DATASET_EXTRA_SIZE % 64 == 0, "RANDOMX_DATASET_EXTRA_SIZE must be divisible by 64.");
+	static_assert((uint64_t)RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE <= 17179869184, "Dataset size must not exceed 16 GiB.");
+	static_assert(RANDOMX_PROGRAM_SIZE > 0, "RANDOMX_PROGRAM_SIZE must be greater than 0");
+	static_assert(RANDOMX_PROGRAM_SIZE <= 32768, "RANDOMX_PROGRAM_SIZE must not exceed 32768");
+	static_assert(RANDOMX_PROGRAM_ITERATIONS > 0, "RANDOMX_PROGRAM_ITERATIONS must be greater than 0");
+	static_assert(RANDOMX_PROGRAM_COUNT > 0, "RANDOMX_PROGRAM_COUNT must be greater than 0");
+	static_assert((RANDOMX_SCRATCHPAD_L3 & (RANDOMX_SCRATCHPAD_L3 - 1)) == 0, "RANDOMX_SCRATCHPAD_L3 must be a power of 2.");
+	static_assert(RANDOMX_SCRATCHPAD_L3 >= RANDOMX_SCRATCHPAD_L2, "RANDOMX_SCRATCHPAD_L3 must be greater than or equal to RANDOMX_SCRATCHPAD_L2.");
+	static_assert((RANDOMX_SCRATCHPAD_L2 & (RANDOMX_SCRATCHPAD_L2 - 1)) == 0, "RANDOMX_SCRATCHPAD_L2 must be a power of 2.");
+	static_assert(RANDOMX_SCRATCHPAD_L2 >= RANDOMX_SCRATCHPAD_L1, "RANDOMX_SCRATCHPAD_L2 must be greater than or equal to RANDOMX_SCRATCHPAD_L1.");
+	static_assert(RANDOMX_SCRATCHPAD_L1 >= 64, "RANDOMX_SCRATCHPAD_L1 must be at least 64.");
+	static_assert((RANDOMX_SCRATCHPAD_L1 & (RANDOMX_SCRATCHPAD_L1 - 1)) == 0, "RANDOMX_SCRATCHPAD_L1 must be a power of 2.");
+	static_assert(RANDOMX_CACHE_ACCESSES > 1, "RANDOMX_CACHE_ACCESSES must be greater than 1");
static_assert(RANDOMX_SUPERSCALAR_LATENCY > 0, "RANDOMX_SUPERSCALAR_LATENCY must be greater than 0"); + static_assert(RANDOMX_SUPERSCALAR_LATENCY <= 10000, "RANDOMX_SUPERSCALAR_LATENCY must not exceed 10000"); + static_assert(RANDOMX_JUMP_BITS > 0, "RANDOMX_JUMP_BITS must be greater than 0."); + static_assert(RANDOMX_JUMP_OFFSET >= 0, "RANDOMX_JUMP_OFFSET must be greater than or equal to 0."); + static_assert(RANDOMX_JUMP_BITS + RANDOMX_JUMP_OFFSET <= 16, "RANDOMX_JUMP_BITS + RANDOMX_JUMP_OFFSET must not exceed 16."); + + constexpr int wtSum = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_ISUB_R + \ + RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + \ + RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M + RANDOMX_FREQ_IMUL_RCP + \ + RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R + RANDOMX_FREQ_ISWAP_R + \ + RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + \ + RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R + RANDOMX_FREQ_CBRANCH + \ + RANDOMX_FREQ_CFROUND + RANDOMX_FREQ_ISTORE + RANDOMX_FREQ_NOP; + + static_assert(wtSum == 256, "Sum of instruction frequencies must be 256."); + + + constexpr uint32_t ArgonBlockSize = 1024; + constexpr int ArgonSaltSize = sizeof("" RANDOMX_ARGON_SALT) - 1; + static_assert(ArgonSaltSize >= 8, "RANDOMX_ARGON_SALT must be at least 8 characters long"); + constexpr int SuperscalarMaxSize = 3 * RANDOMX_SUPERSCALAR_LATENCY + 2; + constexpr size_t CacheLineSize = RANDOMX_DATASET_ITEM_SIZE; + constexpr int ScratchpadSize = RANDOMX_SCRATCHPAD_L3; + constexpr uint32_t CacheLineAlignMask = (RANDOMX_DATASET_BASE_SIZE - 1) & ~(CacheLineSize - 1); + constexpr uint32_t CacheSize = RANDOMX_ARGON_MEMORY * ArgonBlockSize; + constexpr uint64_t DatasetSize = RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE; + constexpr uint32_t DatasetExtraItems = RANDOMX_DATASET_EXTRA_SIZE / RANDOMX_DATASET_ITEM_SIZE; + constexpr uint32_t ConditionMask = ((1 << RANDOMX_JUMP_BITS) - 1); + constexpr int ConditionOffset = RANDOMX_JUMP_OFFSET; + constexpr int StoreL3Condition = 14; + + //Prevent some unsafe configurations. 
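The guards below are plain arithmetic over the tunable parameters and are worth checking once by hand. The memory-time tradeoff assert demands that the work spent building the Cache covers the Dataset size; the Scratchpad assert demands that an average hash overwrites the whole Scratchpad at least once (roughly 128 implicit writes plus RANDOMX_PROGRAM_SIZE * RANDOMX_FREQ_ISTORE / 256 explicit ISTOREs per iteration). A hand-check sketch with the default values from configuration.h (the local constexpr names are introduced here only for readability):

```c++
#include <cstdint>

// Default parameter values, copied from configuration.h.
constexpr uint64_t argonMemory = 262144, argonBlock = 1024, cacheAccesses = 8;
constexpr uint64_t datasetBase = 2147483648ULL, datasetExtra = 33554368;
constexpr uint64_t progSize = 256, freqIstore = 16, progCount = 8, progIters = 2048;
constexpr uint64_t scratchpadL3 = 2097152;

// Memory-time tradeoff: 1024 * 8 * 262144 + 33554432 = 2181038080
//                    >= 2147483648 + 33554368       = 2181038016  (64 bytes to spare)
static_assert(argonBlock * cacheAccesses * argonMemory + 33554432 >= datasetBase + datasetExtra, "");

// Scratchpad writes: (128 + 256 * 16 / 256) * (8 * 2048) = 144 * 16384 = 2359296 >= 2097152
static_assert((128 + progSize * freqIstore / 256) * (progCount * progIters) >= scratchpadL3, "");

int main() {}
```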
+#ifndef RANDOMX_UNSAFE + static_assert((uint64_t)ArgonBlockSize * RANDOMX_CACHE_ACCESSES * RANDOMX_ARGON_MEMORY + 33554432 >= (uint64_t)RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE, "Unsafe configuration: Memory-time tradeoffs"); + static_assert((128 + RANDOMX_PROGRAM_SIZE * RANDOMX_FREQ_ISTORE / 256) * (RANDOMX_PROGRAM_COUNT * RANDOMX_PROGRAM_ITERATIONS) >= RANDOMX_SCRATCHPAD_L3, "Unsafe configuration: Insufficient Scratchpad writes"); + static_assert(RANDOMX_PROGRAM_COUNT > 1, "Unsafe configuration: Program filtering strategies"); + static_assert(RANDOMX_PROGRAM_SIZE >= 64, "Unsafe configuration: Low program entropy"); + static_assert(RANDOMX_PROGRAM_ITERATIONS >= 400, "Unsafe configuration: High compilation overhead"); +#endif + +#ifdef TRACE + constexpr bool trace = true; +#else + constexpr bool trace = false; +#endif + +#ifndef UNREACHABLE +#ifdef __GNUC__ +#define UNREACHABLE __builtin_unreachable() +#elif _MSC_VER +#define UNREACHABLE __assume(false) +#else +#define UNREACHABLE +#endif +#endif + +#if defined(_M_X64) || defined(__x86_64__) + #define RANDOMX_HAVE_COMPILER 1 + class JitCompilerX86; + using JitCompiler = JitCompilerX86; +#elif defined(__aarch64__) + #define RANDOMX_HAVE_COMPILER 1 + class JitCompilerA64; + using JitCompiler = JitCompilerA64; +#else + #define RANDOMX_HAVE_COMPILER 0 + class JitCompilerFallback; + using JitCompiler = JitCompilerFallback; +#endif + + using addr_t = uint32_t; + + using int_reg_t = uint64_t; + + struct fpu_reg_t { + double lo; + double hi; + }; + + constexpr uint32_t ScratchpadL1 = RANDOMX_SCRATCHPAD_L1 / sizeof(int_reg_t); + constexpr uint32_t ScratchpadL2 = RANDOMX_SCRATCHPAD_L2 / sizeof(int_reg_t); + constexpr uint32_t ScratchpadL3 = RANDOMX_SCRATCHPAD_L3 / sizeof(int_reg_t); + constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8; + constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8; + constexpr int ScratchpadL1Mask16 = (ScratchpadL1 / 2 - 1) * 16; + constexpr int ScratchpadL2Mask16 = (ScratchpadL2 / 2 - 1) * 16; + constexpr int ScratchpadL3Mask = (ScratchpadL3 - 1) * 8; + constexpr int ScratchpadL3Mask64 = (ScratchpadL3 / 8 - 1) * 64; + constexpr int RegistersCount = 8; + constexpr int RegisterCountFlt = RegistersCount / 2; + constexpr int RegisterNeedsDisplacement = 5; //x86 r13 register + constexpr int RegisterNeedsSib = 4; //x86 r12 register + + inline bool isZeroOrPowerOf2(uint64_t x) { + return (x & (x - 1)) == 0; + } + + constexpr int mantissaSize = 52; + constexpr int exponentSize = 11; + constexpr uint64_t mantissaMask = (1ULL << mantissaSize) - 1; + constexpr uint64_t exponentMask = (1ULL << exponentSize) - 1; + constexpr int exponentBias = 1023; + constexpr int dynamicExponentBits = 4; + constexpr int staticExponentBits = 4; + constexpr uint64_t constExponentBits = 0x300; + constexpr uint64_t dynamicMantissaMask = (1ULL << (mantissaSize + dynamicExponentBits)) - 1; + + struct MemoryRegisters { + addr_t mx, ma; + uint8_t* memory = nullptr; + }; + + //register file in little-endian byte order + struct RegisterFile { + int_reg_t r[RegistersCount]; + fpu_reg_t f[RegisterCountFlt]; + fpu_reg_t e[RegisterCountFlt]; + fpu_reg_t a[RegisterCountFlt]; + }; + + typedef void(ProgramFunc)(RegisterFile&, MemoryRegisters&, uint8_t* /* scratchpad */, uint64_t); + typedef void(DatasetInitFunc)(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock); + + typedef void(DatasetDeallocFunc)(randomx_dataset*); + typedef void(CacheDeallocFunc)(randomx_cache*); + typedef 
void(CacheInitializeFunc)(randomx_cache*, const void*, size_t); +} diff --git a/randomx/configuration.h b/randomx/configuration.h new file mode 100644 index 0000000..84400dd --- /dev/null +++ b/randomx/configuration.h @@ -0,0 +1,125 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +//Cache size in KiB. Must be a power of 2. +#define RANDOMX_ARGON_MEMORY 262144 + +//Number of Argon2d iterations for Cache initialization. +#define RANDOMX_ARGON_ITERATIONS 3 + +//Number of parallel lanes for Cache initialization. +#define RANDOMX_ARGON_LANES 1 + +//Argon2d salt +#define RANDOMX_ARGON_SALT "RandomX\x03" + +//Number of random Cache accesses per Dataset item. Minimum is 2. +#define RANDOMX_CACHE_ACCESSES 8 + +//Target latency for SuperscalarHash (in cycles of the reference CPU). +#define RANDOMX_SUPERSCALAR_LATENCY 170 + +//Dataset base size in bytes. Must be a power of 2. +#define RANDOMX_DATASET_BASE_SIZE 2147483648 + +//Dataset extra size. Must be divisible by 64. +#define RANDOMX_DATASET_EXTRA_SIZE 33554368 + +//Number of instructions in a RandomX program. Must be divisible by 8. +#define RANDOMX_PROGRAM_SIZE 256 + +//Number of iterations during VM execution. +#define RANDOMX_PROGRAM_ITERATIONS 2048 + +//Number of chained VM executions per hash. +#define RANDOMX_PROGRAM_COUNT 8 + +//Scratchpad L3 size in bytes. Must be a power of 2. +#define RANDOMX_SCRATCHPAD_L3 2097152 + +//Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. +#define RANDOMX_SCRATCHPAD_L2 262144 + +//Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. +#define RANDOMX_SCRATCHPAD_L1 16384 + +//Jump condition mask size in bits. +#define RANDOMX_JUMP_BITS 8 + +//Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 
+#define RANDOMX_JUMP_OFFSET 8 + +/* +Instruction frequencies (per 256 opcodes) +Total sum of frequencies must be 256 +*/ + +//Integer instructions +#define RANDOMX_FREQ_IADD_RS 16 +#define RANDOMX_FREQ_IADD_M 7 +#define RANDOMX_FREQ_ISUB_R 16 +#define RANDOMX_FREQ_ISUB_M 7 +#define RANDOMX_FREQ_IMUL_R 16 +#define RANDOMX_FREQ_IMUL_M 4 +#define RANDOMX_FREQ_IMULH_R 4 +#define RANDOMX_FREQ_IMULH_M 1 +#define RANDOMX_FREQ_ISMULH_R 4 +#define RANDOMX_FREQ_ISMULH_M 1 +#define RANDOMX_FREQ_IMUL_RCP 8 +#define RANDOMX_FREQ_INEG_R 2 +#define RANDOMX_FREQ_IXOR_R 15 +#define RANDOMX_FREQ_IXOR_M 5 +#define RANDOMX_FREQ_IROR_R 8 +#define RANDOMX_FREQ_IROL_R 2 +#define RANDOMX_FREQ_ISWAP_R 4 + +//Floating point instructions +#define RANDOMX_FREQ_FSWAP_R 4 +#define RANDOMX_FREQ_FADD_R 16 +#define RANDOMX_FREQ_FADD_M 5 +#define RANDOMX_FREQ_FSUB_R 16 +#define RANDOMX_FREQ_FSUB_M 5 +#define RANDOMX_FREQ_FSCAL_R 6 +#define RANDOMX_FREQ_FMUL_R 32 +#define RANDOMX_FREQ_FDIV_M 4 +#define RANDOMX_FREQ_FSQRT_R 6 + +//Control instructions +#define RANDOMX_FREQ_CBRANCH 25 +#define RANDOMX_FREQ_CFROUND 1 + +//Store instruction +#define RANDOMX_FREQ_ISTORE 16 + +//No-op instruction +#define RANDOMX_FREQ_NOP 0 +/* ------ + 256 +*/ diff --git a/randomx/cpu.cpp b/randomx/cpu.cpp new file mode 100644 index 0000000..be9f1b1 --- /dev/null +++ b/randomx/cpu.cpp @@ -0,0 +1,72 @@ +/* +Copyright (c) 2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
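Taken together, the configuration.h defaults above pin down every derived size computed in common.hpp: the Cache is RANDOMX_ARGON_MEMORY KiB of Argon2 blocks, and the Dataset is the 2 GiB base plus the extra bytes, carved into 64-byte items (RANDOMX_DATASET_ITEM_SIZE). A quick arithmetic sketch (the local names are illustrative only):

```c++
#include <cstdint>
#include <cstdio>

int main() {
    constexpr uint64_t cacheSize    = 262144ULL * 1024;          // 268435456 B = 256 MiB
    constexpr uint64_t datasetSize  = 2147483648ULL + 33554368;  // 2181038016 B ~ 2.03 GiB
    constexpr uint64_t datasetItems = datasetSize / 64;          // 34078719 64-byte items
    constexpr uint64_t extraItems   = 33554368 / 64;             // 524287 = 2^19 - 1
    printf("cache: %llu MiB, dataset: %llu items (%llu extra)\n",
           (unsigned long long)(cacheSize >> 20),
           (unsigned long long)datasetItems,
           (unsigned long long)extraItems);
}
```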
+*/ + +#include "cpu.hpp" + +#if defined(_M_X64) || defined(__x86_64__) + #define HAVE_CPUID + #ifdef _WIN32 + #include <intrin.h> + #define cpuid(info, x) __cpuidex(info, x, 0) + #else //GCC + #include <cpuid.h> + void cpuid(int info[4], int InfoType) { + __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); + } + #endif +#endif + +#if defined(HAVE_HWCAP) + #include <sys/auxv.h> + #include <asm/hwcap.h> +#endif + +namespace randomx { + + Cpu::Cpu() : aes_(false), ssse3_(false), avx2_(false) { +#ifdef HAVE_CPUID + int info[4]; + cpuid(info, 0); + int nIds = info[0]; + if (nIds >= 0x00000001) { + cpuid(info, 0x00000001); + ssse3_ = (info[2] & (1 << 9)) != 0; + aes_ = (info[2] & (1 << 25)) != 0; + } + if (nIds >= 0x00000007) { + cpuid(info, 0x00000007); + avx2_ = (info[1] & (1 << 5)) != 0; + } +#elif defined(__aarch64__) && defined(HWCAP_AES) + long hwcaps = getauxval(AT_HWCAP); + aes_ = (hwcaps & HWCAP_AES) != 0; +#endif + //TODO POWER8 AES + } + +} diff --git a/randomx/cpu.hpp b/randomx/cpu.hpp new file mode 100644 index 0000000..516dd47 --- /dev/null +++ b/randomx/cpu.hpp @@ -0,0 +1,49 @@ +/* +Copyright (c) 2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +namespace randomx { + + class Cpu { + public: + Cpu(); + bool hasAes() const { + return aes_; + } + bool hasSsse3() const { + return ssse3_; + } + bool hasAvx2() const { + return avx2_; + } + private: + bool aes_, ssse3_, avx2_; + }; + +} diff --git a/randomx/dataset.cpp b/randomx/dataset.cpp new file mode 100644 index 0000000..a777a81 --- /dev/null +++ b/randomx/dataset.cpp @@ -0,0 +1,212 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" +#include "dataset.hpp" +#include "virtual_memory.hpp" +#include "superscalar.hpp" +#include "blake2_generator.hpp" +#include "reciprocal.h" +#include "blake2/endian.h" +#include "argon2.h" +#include "argon2_core.h" +#include "jit_compiler.hpp" +#include "intrin_portable.h" + +static_assert(RANDOMX_ARGON_MEMORY % (RANDOMX_ARGON_LANES * ARGON2_SYNC_POINTS) == 0, "RANDOMX_ARGON_MEMORY - invalid value"); +static_assert(ARGON2_BLOCK_SIZE == randomx::ArgonBlockSize, "Unpexpected value of ARGON2_BLOCK_SIZE"); + +namespace randomx { + + template + void deallocCache(randomx_cache* cache) { + if (cache->memory != nullptr) + Allocator::freeMemory(cache->memory, CacheSize); + if (cache->jit != nullptr) + delete cache->jit; + } + + template void deallocCache(randomx_cache* cache); + template void deallocCache(randomx_cache* cache); + + void initCache(randomx_cache* cache, const void* key, size_t keySize) { + uint32_t memory_blocks, segment_length; + argon2_instance_t instance; + argon2_context context; + + context.out = nullptr; + context.outlen = 0; + context.pwd = CONST_CAST(uint8_t *)key; + context.pwdlen = (uint32_t)keySize; + context.salt = CONST_CAST(uint8_t *)RANDOMX_ARGON_SALT; + context.saltlen = (uint32_t)randomx::ArgonSaltSize; + context.secret = NULL; + context.secretlen = 0; + context.ad = NULL; + context.adlen = 0; + context.t_cost = RANDOMX_ARGON_ITERATIONS; + context.m_cost = RANDOMX_ARGON_MEMORY; + context.lanes = RANDOMX_ARGON_LANES; //1 + context.threads = 1; + context.allocate_cbk = NULL; + context.free_cbk = NULL; + context.flags = ARGON2_DEFAULT_FLAGS; + context.version = ARGON2_VERSION_NUMBER; + + int inputsValid = randomx_argon2_validate_inputs(&context); + assert(inputsValid == ARGON2_OK); + + /* 2. 
Align memory size */ + /* Minimum memory_blocks = 8L blocks, where L is the number of lanes */ + memory_blocks = context.m_cost; //262144 + + segment_length = memory_blocks / (context.lanes * ARGON2_SYNC_POINTS); //ARGON2_SYNC_POINTS=4 + + instance.version = context.version; + instance.memory = NULL; + instance.passes = context.t_cost; + instance.memory_blocks = memory_blocks; + instance.segment_length = segment_length; + instance.lane_length = segment_length * ARGON2_SYNC_POINTS; + instance.lanes = context.lanes; + instance.threads = context.threads; + instance.type = Argon2_d; + instance.memory = (block*)cache->memory; + instance.impl = cache->argonImpl; + + if (instance.threads > instance.lanes) { + instance.threads = instance.lanes; + } + + /* 3. Initialization: Hashing inputs, allocating memory, filling first + * blocks + */ + randomx_argon2_initialize(&instance, &context); + + randomx_argon2_fill_memory_blocks(&instance); + + cache->reciprocalCache.clear(); + randomx::Blake2Generator gen(key, keySize); + for (int i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { //RANDOMX_CACHE_ACCESSES =8 + randomx::generateSuperscalar(cache->programs[i], gen); + for (unsigned j = 0; j < cache->programs[i].getSize(); ++j) { + auto& instr = cache->programs[i](j); + if ((SuperscalarInstructionType)instr.opcode == SuperscalarInstructionType::IMUL_RCP) { + auto rcp = randomx_reciprocal(instr.getImm32()); + instr.setImm32(cache->reciprocalCache.size()); + cache->reciprocalCache.push_back(rcp); + } + } + } + printf("initial the cache finished\n"); + } + + void initCacheCompile(randomx_cache* cache, const void* key, size_t keySize) { + initCache(cache, key, keySize); + cache->jit->enableWriting(); + cache->jit->generateSuperscalarHash(cache->programs, cache->reciprocalCache); + cache->jit->generateDatasetInitCode(); + cache->jit->enableExecution(); + } + + constexpr uint64_t superscalarMul0 = 6364136223846793005ULL; + constexpr uint64_t superscalarAdd1 = 9298411001130361340ULL; + constexpr uint64_t superscalarAdd2 = 12065312585734608966ULL; + constexpr uint64_t superscalarAdd3 = 9306329213124626780ULL; + constexpr uint64_t superscalarAdd4 = 5281919268842080866ULL; + constexpr uint64_t superscalarAdd5 = 10536153434571861004ULL; + constexpr uint64_t superscalarAdd6 = 3398623926847679864ULL; + constexpr uint64_t superscalarAdd7 = 9549104520008361294ULL; + + static inline uint8_t* getMixBlock(uint64_t registerValue, uint8_t *memory) { + constexpr uint32_t mask = CacheSize / CacheLineSize - 1; + return memory + (registerValue & mask) * CacheLineSize; + } + + void initDatasetItem(randomx_cache* cache, uint8_t* out, uint64_t itemNumber) { + //printf("xxxxxxxx\n"); + int_reg_t rl[8]; + uint8_t* mixBlock; + uint64_t registerValue = itemNumber; + rl[0] = (itemNumber + 1) * superscalarMul0; + rl[1] = rl[0] ^ superscalarAdd1; + rl[2] = rl[0] ^ superscalarAdd2; + rl[3] = rl[0] ^ superscalarAdd3; + rl[4] = rl[0] ^ superscalarAdd4; + rl[5] = rl[0] ^ superscalarAdd5; + rl[6] = rl[0] ^ superscalarAdd6; + rl[7] = rl[0] ^ superscalarAdd7; + for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { //RANDOMX_CACHE_ACCESSES=8 + mixBlock = getMixBlock(registerValue, cache->memory); + rx_prefetch_nta(mixBlock); + SuperscalarProgram& prog = cache->programs[i]; + + executeSuperscalar(rl, prog, &cache->reciprocalCache); + + for (unsigned q = 0; q < 8; ++q) + rl[q] ^= load64_native(mixBlock + 8 * q); + + registerValue = rl[prog.getAddressRegister()]; + } + + memcpy(out, &rl, CacheLineSize); + } + + void initDataset(randomx_cache* cache, 
uint8_t* dataset, uint32_t startItem, uint32_t endItem) { + printf("initializing the dataset\n"); + for (uint32_t itemNumber = startItem; itemNumber < endItem; ++itemNumber, dataset += CacheLineSize){ + initDatasetItem(cache, dataset, itemNumber); + + //if (itemNumber==(endItem-1)) + //{ + // printf("endItem= %0d\n",endItem); + // for (int i = 0; i < CacheLineSize; ++i) + // { + // printf("%02x ",dataset[i]); + // } + // printf("\n"); + //} + } + } +} + +//b3 1f 7e c5 cd 28 eb 4b b6 72 7e 15 7d b0 6a 63 0b d4 dc 32 fb 18 eb 25 b4 f2 09 9b 9b 5d 39 ab 0d 2d d0 e9 ed 5f b7 a5 ae 31 bc d1 8f 01 d4 04 91 aa 62 01 db 47 a7 0d 2b 42 b9 b3 43 cd 78 c9 \ No newline at end of file diff --git a/randomx/dataset.hpp b/randomx/dataset.hpp new file mode 100644 index 0000000..0f39af5 --- /dev/null +++ b/randomx/dataset.hpp @@ -0,0 +1,94 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
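initDataset above fills one contiguous range of 64-byte items, so a full 2+ GiB dataset is normally built by giving each worker thread a disjoint [startItem, endItem) slice together with the matching output offset. A sketch of such a driver, under the assumption that initCache() has already filled cache->memory (initDatasetThreaded is a hypothetical helper, not part of this code; initDataset, DatasetSize and CacheLineSize are the ones defined here):

```c++
#include <algorithm>
#include <cstdint>
#include <thread>
#include <vector>
#include "dataset.hpp"  // randomx::initDataset, randomx::DatasetSize, randomx::CacheLineSize

// Hypothetical multi-threaded driver for dataset initialization.
void initDatasetThreaded(randomx_cache* cache, uint8_t* dataset) {
    const uint32_t itemCount = (uint32_t)(randomx::DatasetSize / randomx::CacheLineSize);
    const uint32_t threads = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::thread> workers;
    uint32_t start = 0;
    for (uint32_t t = 0; t < threads; ++t) {
        // The last thread takes the remainder so every item is written exactly once.
        const uint32_t end = (t == threads - 1) ? itemCount : start + itemCount / threads;
        workers.emplace_back(randomx::initDataset, cache,
                             dataset + (uint64_t)start * randomx::CacheLineSize, start, end);
        start = end;
    }
    for (auto& w : workers) w.join();
}
```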
+*/ + +#pragma once + +#include +#include +#include +#include "common.hpp" +#include "superscalar_program.hpp" +#include "allocator.hpp" +#include "argon2.h" + +/* Global scope for C binding */ +struct randomx_dataset { + uint8_t* memory = nullptr; + randomx::DatasetDeallocFunc* dealloc; +}; + +/* Global scope for C binding */ +struct randomx_cache { + uint8_t* memory = nullptr; + randomx::CacheDeallocFunc* dealloc; + randomx::JitCompiler* jit; + randomx::CacheInitializeFunc* initialize; + randomx::DatasetInitFunc* datasetInit; + randomx::SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES]; //RANDOMX_CACHE_ACCESSES =8 + std::vector reciprocalCache; + std::string cacheKey; + randomx_argon2_impl* argonImpl; + + bool isInitialized() { + return programs[0].getSize() != 0; + } +}; + +//A pointer to a standard-layout struct object points to its initial member +static_assert(std::is_standard_layout(), "randomx_dataset must be a standard-layout struct"); +//the following assert fails when compiling Debug in Visual Studio (JIT mode will crash in Debug) +static_assert(std::is_standard_layout(), "randomx_cache must be a standard-layout struct"); + +namespace randomx { + + using DefaultAllocator = AlignedAllocator; + + template + void deallocDataset(randomx_dataset* dataset) { + if (dataset->memory != nullptr) + Allocator::freeMemory(dataset->memory, DatasetSize); + } + + template + void deallocCache(randomx_cache* cache); + + void initCache(randomx_cache*, const void*, size_t); + void initCacheCompile(randomx_cache*, const void*, size_t); + void initDatasetItem(randomx_cache* cache, uint8_t* out, uint64_t blockNumber); + void initDataset(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock); + + inline randomx_argon2_impl* selectArgonImpl(randomx_flags flags) { + if (flags & RANDOMX_FLAG_ARGON2_AVX2) { + return randomx_argon2_impl_avx2(); + } + if (flags & RANDOMX_FLAG_ARGON2_SSSE3) { + return randomx_argon2_impl_ssse3(); + } + return &randomx_argon2_fill_segment_ref; + } +} diff --git a/randomx/instruction.cpp b/randomx/instruction.cpp new file mode 100644 index 0000000..12e6f49 --- /dev/null +++ b/randomx/instruction.cpp @@ -0,0 +1,390 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "instruction.hpp" +#include "common.hpp" + +namespace randomx { + + void Instruction::print(std::ostream& os) const { + os << names[opcode] << " "; + auto handler = engine[opcode]; + (this->*handler)(os); + } + + void Instruction::genAddressReg(std::ostream& os, int srcIndex) const { + os << (getModMem() ? "L1" : "L2") << "[r" << srcIndex << std::showpos << (int32_t)getImm32() << std::noshowpos << "]"; + } + + void Instruction::genAddressRegDst(std::ostream& os, int dstIndex) const { + if (getModCond() < StoreL3Condition) + os << (getModMem() ? "L1" : "L2"); + else + os << "L3"; + os << "[r" << dstIndex << std::showpos << (int32_t)getImm32() << std::noshowpos << "]"; + } + + void Instruction::genAddressImm(std::ostream& os) const { + os << "L3" << "[" << (getImm32() & ScratchpadL3Mask) << "]"; + } + + void Instruction::h_IADD_RS(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + os << "r" << dstIndex << ", r" << srcIndex; + if(dstIndex == RegisterNeedsDisplacement) { + os << ", " << (int32_t)getImm32(); + } + os << ", SHFT " << getModShift() << std::endl; + } + + void Instruction::h_IADD_M(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + if (dstIndex != srcIndex) { + os << "r" << dstIndex << ", "; + genAddressReg(os, srcIndex); + os << std::endl; + } + else { + os << "r" << dstIndex << ", "; + genAddressImm(os); + os << std::endl; + } + } + + void Instruction::h_ISUB_R(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + if (dstIndex != srcIndex) { + os << "r" << dstIndex << ", r" << srcIndex << std::endl; + } + else { + os << "r" << dstIndex << ", " << (int32_t)getImm32() << std::endl; + } + } + + void Instruction::h_ISUB_M(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + if (dstIndex != srcIndex) { + os << "r" << dstIndex << ", "; + genAddressReg(os, srcIndex); + os << std::endl; + } + else { + os << "r" << dstIndex << ", "; + genAddressImm(os); + os << std::endl; + } + } + + void Instruction::h_IMUL_R(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + if (dstIndex != srcIndex) { + os << "r" << dstIndex << ", r" << srcIndex << std::endl; + } + else { + os << "r" << dstIndex << ", " << (int32_t)getImm32() << std::endl; + } + } + + void Instruction::h_IMUL_M(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + if (dstIndex != srcIndex) { + os << "r" << dstIndex << ", "; + genAddressReg(os, srcIndex); + os << std::endl; + } + else { + os << "r" << dstIndex << ", "; + genAddressImm(os); + os << std::endl; + } + } + + void Instruction::h_IMULH_R(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + os << "r" << dstIndex << ", r" << srcIndex << std::endl; + } + + void 
Instruction::h_IMULH_M(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + if (dstIndex != srcIndex) { + os << "r" << dstIndex << ", "; + genAddressReg(os, srcIndex); + os << std::endl; + } + else { + os << "r" << dstIndex << ", "; + genAddressImm(os); + os << std::endl; + } + } + + void Instruction::h_ISMULH_R(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + os << "r" << dstIndex << ", r" << srcIndex << std::endl; + } + + void Instruction::h_ISMULH_M(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + if (dstIndex != srcIndex) { + os << "r" << dstIndex << ", "; + genAddressReg(os, srcIndex); + os << std::endl; + } + else { + os << "r" << dstIndex << ", "; + genAddressImm(os); + os << std::endl; + } + } + + void Instruction::h_INEG_R(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + os << "r" << dstIndex << std::endl; + } + + void Instruction::h_IXOR_R(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + if (dstIndex != srcIndex) { + os << "r" << dstIndex << ", r" << srcIndex << std::endl; + } + else { + os << "r" << dstIndex << ", " << (int32_t)getImm32() << std::endl; + } + } + + void Instruction::h_IXOR_M(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + if (dstIndex != srcIndex) { + os << "r" << dstIndex << ", "; + genAddressReg(os, srcIndex); + os << std::endl; + } + else { + os << "r" << dstIndex << ", "; + genAddressImm(os); + os << std::endl; + } + } + + void Instruction::h_IROR_R(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + if (dstIndex != srcIndex) { + os << "r" << dstIndex << ", r" << srcIndex << std::endl; + } + else { + os << "r" << dstIndex << ", " << (getImm32() & 63) << std::endl; + } + } + + void Instruction::h_IROL_R(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + if (dstIndex != srcIndex) { + os << "r" << dstIndex << ", r" << srcIndex << std::endl; + } + else { + os << "r" << dstIndex << ", " << (getImm32() & 63) << std::endl; + } + } + + void Instruction::h_IMUL_RCP(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + os << "r" << dstIndex << ", " << getImm32() << std::endl; + } + + void Instruction::h_ISWAP_R(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + os << "r" << dstIndex << ", r" << srcIndex << std::endl; + } + + void Instruction::h_FSWAP_R(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + const char reg = (dstIndex >= RegisterCountFlt) ? 
'e' : 'f'; + dstIndex %= RegisterCountFlt; + os << reg << dstIndex << std::endl; + } + + void Instruction::h_FADD_R(std::ostream& os) const { + auto dstIndex = dst % RegisterCountFlt; + auto srcIndex = src % RegisterCountFlt; + os << "f" << dstIndex << ", a" << srcIndex << std::endl; + } + + void Instruction::h_FADD_M(std::ostream& os) const { + auto dstIndex = dst % RegisterCountFlt; + auto srcIndex = src % RegistersCount; + os << "f" << dstIndex << ", "; + genAddressReg(os, srcIndex); + os << std::endl; + } + + void Instruction::h_FSUB_R(std::ostream& os) const { + auto dstIndex = dst % RegisterCountFlt; + auto srcIndex = src % RegisterCountFlt; + os << "f" << dstIndex << ", a" << srcIndex << std::endl; + } + + void Instruction::h_FSUB_M(std::ostream& os) const { + auto dstIndex = dst % RegisterCountFlt; + auto srcIndex = src % RegistersCount; + os << "f" << dstIndex << ", "; + genAddressReg(os, srcIndex); + os << std::endl; + } + + void Instruction::h_FSCAL_R(std::ostream& os) const { + auto dstIndex = dst % RegisterCountFlt; + os << "f" << dstIndex << std::endl; + } + + void Instruction::h_FMUL_R(std::ostream& os) const { + auto dstIndex = dst % RegisterCountFlt; + auto srcIndex = src % RegisterCountFlt; + os << "e" << dstIndex << ", a" << srcIndex << std::endl; + } + + void Instruction::h_FDIV_M(std::ostream& os) const { + auto dstIndex = dst % RegisterCountFlt; + auto srcIndex = src % RegistersCount; + os << "e" << dstIndex << ", "; + genAddressReg(os, srcIndex); + os << std::endl; + } + + void Instruction::h_FSQRT_R(std::ostream& os) const { + auto dstIndex = dst % RegisterCountFlt; + os << "e" << dstIndex << std::endl; + } + + void Instruction::h_CFROUND(std::ostream& os) const { + auto srcIndex = src % RegistersCount; + os << "r" << srcIndex << ", " << (getImm32() & 63) << std::endl; + } + + void Instruction::h_CBRANCH(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + os << "r" << dstIndex << ", " << (int32_t)getImm32() << ", COND " << (int)(getModCond()) << std::endl; + } + + void Instruction::h_ISTORE(std::ostream& os) const { + auto dstIndex = dst % RegistersCount; + auto srcIndex = src % RegistersCount; + genAddressRegDst(os, dstIndex); + os << ", r" << srcIndex << std::endl; + } + + void Instruction::h_NOP(std::ostream& os) const { + os << std::endl; + } + +#include "instruction_weights.hpp" +#define INST_NAME(x) REPN(#x, WT(x)) +#define INST_HANDLE(x) REPN(&Instruction::h_##x, WT(x)) + + const char* Instruction::names[256] = { + INST_NAME(IADD_RS) + INST_NAME(IADD_M) + INST_NAME(ISUB_R) + INST_NAME(ISUB_M) + INST_NAME(IMUL_R) + INST_NAME(IMUL_M) + INST_NAME(IMULH_R) + INST_NAME(IMULH_M) + INST_NAME(ISMULH_R) + INST_NAME(ISMULH_M) + INST_NAME(IMUL_RCP) + INST_NAME(INEG_R) + INST_NAME(IXOR_R) + INST_NAME(IXOR_M) + INST_NAME(IROR_R) + INST_NAME(IROL_R) + INST_NAME(ISWAP_R) + INST_NAME(FSWAP_R) + INST_NAME(FADD_R) + INST_NAME(FADD_M) + INST_NAME(FSUB_R) + INST_NAME(FSUB_M) + INST_NAME(FSCAL_R) + INST_NAME(FMUL_R) + INST_NAME(FDIV_M) + INST_NAME(FSQRT_R) + INST_NAME(CBRANCH) + INST_NAME(CFROUND) + INST_NAME(ISTORE) + INST_NAME(NOP) + }; + + InstructionFormatter Instruction::engine[256] = { + INST_HANDLE(IADD_RS) + INST_HANDLE(IADD_M) + INST_HANDLE(ISUB_R) + INST_HANDLE(ISUB_M) + INST_HANDLE(IMUL_R) + INST_HANDLE(IMUL_M) + INST_HANDLE(IMULH_R) + INST_HANDLE(IMULH_M) + INST_HANDLE(ISMULH_R) + INST_HANDLE(ISMULH_M) + INST_HANDLE(IMUL_RCP) + INST_HANDLE(INEG_R) + INST_HANDLE(IXOR_R) + INST_HANDLE(IXOR_M) + INST_HANDLE(IROR_R) 
+ INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) + INST_HANDLE(FSWAP_R) + INST_HANDLE(FADD_R) + INST_HANDLE(FADD_M) + INST_HANDLE(FSUB_R) + INST_HANDLE(FSUB_M) + INST_HANDLE(FSCAL_R) + INST_HANDLE(FMUL_R) + INST_HANDLE(FDIV_M) + INST_HANDLE(FSQRT_R) + INST_HANDLE(CBRANCH) + INST_HANDLE(CFROUND) + INST_HANDLE(ISTORE) + INST_HANDLE(NOP) + }; + +} \ No newline at end of file diff --git a/randomx/instruction.hpp b/randomx/instruction.hpp new file mode 100644 index 0000000..b1863b5 --- /dev/null +++ b/randomx/instruction.hpp @@ -0,0 +1,149 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
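Both tables above expand to exactly 256 entries: the REPN machinery in instruction_weights.hpp repeats each name or member-function pointer RANDOMX_FREQ_* times, so a raw opcode byte dispatches with a single array index (exactly what print() does with engine[opcode]). The Instruction struct declared in the header that follows packs a whole instruction into 8 bytes, and its mod byte carries three sub-fields; a worked decode of one arbitrary value (0xB6 is made up for illustration):

```c++
#include <cstdint>
#include <cstdio>

int main() {
    // One RandomX instruction is 8 bytes: opcode, dst, src, mod, imm32.
    const uint8_t mod = 0xB6;            // 1011 0110 - an arbitrary example value
    const int modMem   = mod % 4;        // bits 0-1 -> 2   (L1 vs L2 operand addressing)
    const int modShift = (mod >> 2) % 4; // bits 2-3 -> 1   (IADD_RS shift amount)
    const int modCond  = mod >> 4;       // bits 4-7 -> 11  (CBRANCH condition selector)
    printf("mem=%d shift=%d cond=%d\n", modMem, modShift, modCond);
}
```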
+*/ + +#pragma once + +#include +#include +#include +#include "blake2/endian.h" + +namespace randomx { + + class Instruction; + + typedef void(Instruction::*InstructionFormatter)(std::ostream&) const; + + enum class InstructionType : uint16_t { + IADD_RS = 0, + IADD_M = 1, + ISUB_R = 2, + ISUB_M = 3, + IMUL_R = 4, + IMUL_M = 5, + IMULH_R = 6, + IMULH_M = 7, + ISMULH_R = 8, + ISMULH_M = 9, + IMUL_RCP = 10, + INEG_R = 11, + IXOR_R = 12, + IXOR_M = 13, + IROR_R = 14, + IROL_R = 15, + ISWAP_R = 16, + FSWAP_R = 17, + FADD_R = 18, + FADD_M = 19, + FSUB_R = 20, + FSUB_M = 21, + FSCAL_R = 22, + FMUL_R = 23, + FDIV_M = 24, + FSQRT_R = 25, + CBRANCH = 26, + CFROUND = 27, + ISTORE = 28, + NOP = 29, + }; + + class Instruction { + public: + uint32_t getImm32() const { + return load32(&imm32); + } + void setImm32(uint32_t val) { + return store32(&imm32, val); + } + const char* getName() const { + return names[opcode]; + } + friend std::ostream& operator<<(std::ostream& os, const Instruction& i) { + i.print(os); + return os; + } + int getModMem() const { + return mod % 4; //bits 0-1 + } + int getModShift() const { + return (mod >> 2) % 4; //bits 2-3 + } + int getModCond() const { + return mod >> 4; //bits 4-7 + } + void setMod(uint8_t val) { + mod = val; + } + + uint8_t opcode; + uint8_t dst; + uint8_t src; + uint8_t mod; + uint32_t imm32; + private: + void print(std::ostream&) const; + static const char* names[256]; + static InstructionFormatter engine[256]; + void genAddressReg(std::ostream& os, int) const; + void genAddressImm(std::ostream& os) const; + void genAddressRegDst(std::ostream&, int) const; + void h_IADD_RS(std::ostream&) const; + void h_IADD_M(std::ostream&) const; + void h_ISUB_R(std::ostream&) const; + void h_ISUB_M(std::ostream&) const; + void h_IMUL_R(std::ostream&) const; + void h_IMUL_M(std::ostream&) const; + void h_IMULH_R(std::ostream&) const; + void h_IMULH_M(std::ostream&) const; + void h_ISMULH_R(std::ostream&) const; + void h_ISMULH_M(std::ostream&) const; + void h_IMUL_RCP(std::ostream&) const; + void h_INEG_R(std::ostream&) const; + void h_IXOR_R(std::ostream&) const; + void h_IXOR_M(std::ostream&) const; + void h_IROR_R(std::ostream&) const; + void h_IROL_R(std::ostream&) const; + void h_ISWAP_R(std::ostream&) const; + void h_FSWAP_R(std::ostream&) const; + void h_FADD_R(std::ostream&) const; + void h_FADD_M(std::ostream&) const; + void h_FSUB_R(std::ostream&) const; + void h_FSUB_M(std::ostream&) const; + void h_FSCAL_R(std::ostream&) const; + void h_FMUL_R(std::ostream&) const; + void h_FDIV_M(std::ostream&) const; + void h_FSQRT_R(std::ostream&) const; + void h_CBRANCH(std::ostream&) const; + void h_CFROUND(std::ostream&) const; + void h_ISTORE(std::ostream&) const; + void h_NOP(std::ostream&) const; + }; + + static_assert(sizeof(Instruction) == 8, "Invalid size of struct randomx::Instruction"); + static_assert(std::is_standard_layout(), "randomx::Instruction must be a standard-layout struct"); +} \ No newline at end of file diff --git a/randomx/instruction_weights.hpp b/randomx/instruction_weights.hpp new file mode 100644 index 0000000..f6c8873 --- /dev/null +++ b/randomx/instruction_weights.hpp @@ -0,0 +1,73 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#define REP0(x) +#define REP1(x) x, +#define REP2(x) REP1(x) x, +#define REP3(x) REP2(x) x, +#define REP4(x) REP3(x) x, +#define REP5(x) REP4(x) x, +#define REP6(x) REP5(x) x, +#define REP7(x) REP6(x) x, +#define REP8(x) REP7(x) x, +#define REP9(x) REP8(x) x, +#define REP10(x) REP9(x) x, +#define REP11(x) REP10(x) x, +#define REP12(x) REP11(x) x, +#define REP13(x) REP12(x) x, +#define REP14(x) REP13(x) x, +#define REP15(x) REP14(x) x, +#define REP16(x) REP15(x) x, +#define REP17(x) REP16(x) x, +#define REP18(x) REP17(x) x, +#define REP19(x) REP18(x) x, +#define REP20(x) REP19(x) x, +#define REP21(x) REP20(x) x, +#define REP22(x) REP21(x) x, +#define REP23(x) REP22(x) x, +#define REP24(x) REP23(x) x, +#define REP25(x) REP24(x) x, +#define REP26(x) REP25(x) x, +#define REP27(x) REP26(x) x, +#define REP28(x) REP27(x) x, +#define REP29(x) REP28(x) x, +#define REP30(x) REP29(x) x, +#define REP31(x) REP30(x) x, +#define REP32(x) REP31(x) x, +#define REP33(x) REP32(x) x, +#define REP40(x) REP32(x) REP8(x) +#define REP64(x) REP32(x) REP32(x) +#define REP128(x) REP32(x) REP32(x) REP32(x) REP32(x) +#define REP232(x) REP128(x) REP40(x) REP40(x) REP24(x) +#define REP256(x) REP128(x) REP128(x) +#define REPNX(x,N) REP##N(x) +#define REPN(x,N) REPNX(x,N) +#define NUM(x) x +#define WT(x) NUM(RANDOMX_FREQ_##x) diff --git a/randomx/instructions_portable.cpp b/randomx/instructions_portable.cpp new file mode 100644 index 0000000..d1253d8 --- /dev/null +++ b/randomx/instructions_portable.cpp @@ -0,0 +1,193 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
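The REP macros above are the glue between the frequency table in configuration.h and the 256-entry tables in instruction.cpp: REPN(x, N) pastes x exactly N times, so every instruction owns a contiguous run of opcodes whose length is its RANDOMX_FREQ_* value (IADD_RS takes opcodes 0-15, IADD_M the next 7, and so on). A miniature, self-contained version of the same trick:

```c++
#include <cstdio>

#define REP1(x) x,
#define REP2(x) REP1(x) x,
#define REP3(x) REP2(x) x,
#define REPNX(x, N) REP##N(x)
#define REPN(x, N) REPNX(x, N)

// 'A' is given 3 opcode slots and 'B' 2, mirroring how engine[256] is built.
static const char table[] = { REPN('A', 3) REPN('B', 2) };

int main() {
    for (int op = 0; op < 5; ++op)
        printf("opcode %d -> %c\n", op, table[op]); // dispatch is a plain array index
}
```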
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include "common.hpp" +#include "intrin_portable.h" +#include "blake2/endian.h" + +#if defined(__SIZEOF_INT128__) + typedef unsigned __int128 uint128_t; + typedef __int128 int128_t; + uint64_t mulh(uint64_t a, uint64_t b) { + return ((uint128_t)a * b) >> 64; + } + int64_t smulh(int64_t a, int64_t b) { + return ((int128_t)a * b) >> 64; + } + #define HAVE_MULH + #define HAVE_SMULH +#endif + +#if defined(_MSC_VER) + #define HAS_VALUE(X) X ## 0 + #define EVAL_DEFINE(X) HAS_VALUE(X) + #include + #include + + uint64_t rotl(uint64_t x, unsigned int c) { + return _rotl64(x, c); + } + uint64_t rotr(uint64_t x, unsigned int c) { + return _rotr64(x, c); + } + #define HAVE_ROTL + #define HAVE_ROTR + + #if EVAL_DEFINE(__MACHINEARM64_X64(1)) + uint64_t mulh(uint64_t a, uint64_t b) { + return __umulh(a, b); + } + #define HAVE_MULH + #endif + + #if EVAL_DEFINE(__MACHINEX64(1)) + int64_t smulh(int64_t a, int64_t b) { + int64_t hi; + _mul128(a, b, &hi); + return hi; + } + #define HAVE_SMULH + #endif + + static void setRoundMode_(uint32_t mode) { + _controlfp(mode, _MCW_RC); + } + #define HAVE_SETROUNDMODE_IMPL +#endif + +#ifndef HAVE_SETROUNDMODE_IMPL + static void setRoundMode_(uint32_t mode) { + fesetround(mode); + } +#endif + +#ifndef HAVE_ROTR + uint64_t rotr(uint64_t a, unsigned int b) { + return (a >> b) | (a << (-b & 63)); + } + #define HAVE_ROTR +#endif + +#ifndef HAVE_ROTL + uint64_t rotl(uint64_t a, unsigned int b) { + return (a << b) | (a >> (-b & 63)); + } + #define HAVE_ROTL +#endif + +#ifndef HAVE_MULH + #define LO(x) ((x)&0xffffffff) + #define HI(x) ((x)>>32) + uint64_t mulh(uint64_t a, uint64_t b) { + uint64_t ah = HI(a), al = LO(a); + uint64_t bh = HI(b), bl = LO(b); + uint64_t x00 = al * bl; + uint64_t x01 = al * bh; + uint64_t x10 = ah * bl; + uint64_t x11 = ah * bh; + uint64_t m1 = LO(x10) + LO(x01) + HI(x00); + uint64_t m2 = HI(x10) + HI(x01) + LO(x11) + HI(m1); + uint64_t m3 = HI(x11) + HI(m2); + + return (m3 << 32) + LO(m2); + } + #define HAVE_MULH +#endif + +#ifndef HAVE_SMULH + int64_t smulh(int64_t a, int64_t b) { + int64_t hi = mulh(a, b); + if (a < 0LL) hi -= b; + if (b < 0LL) hi -= a; + return hi; + } + #define HAVE_SMULH +#endif + +#ifdef RANDOMX_DEFAULT_FENV + +void rx_reset_float_state() { + setRoundMode_(FE_TONEAREST); + rx_set_double_precision(); //set precision to 53 bits if needed by the platform +} + +void rx_set_rounding_mode(uint32_t mode) { + switch (mode & 3) { + case RoundDown: + setRoundMode_(FE_DOWNWARD); + break; + case RoundUp: + setRoundMode_(FE_UPWARD); + break; + case RoundToZero: + setRoundMode_(FE_TOWARDZERO); + break; + case RoundToNearest: + setRoundMode_(FE_TONEAREST); + break; + default: + UNREACHABLE; + } +} + +#endif + +#ifdef RANDOMX_USE_X87 + +#if defined(_MSC_VER) && 
defined(_M_IX86) + +void rx_set_double_precision() { + _control87(_PC_53, _MCW_PC); +} + +#elif defined(__i386) + +void rx_set_double_precision() { + uint16_t volatile x87cw; + asm volatile("fstcw %0" : "=m" (x87cw)); + x87cw &= ~0x300; + x87cw |= 0x200; + asm volatile("fldcw %0" : : "m" (x87cw)); +} + +#endif + +#endif //RANDOMX_USE_X87 + +union double_ser_t { + double f; + uint64_t i; +}; + +double loadDoublePortable(const void* addr) { + double_ser_t ds; + ds.i = load64(addr); + return ds.f; +} diff --git a/randomx/intrin_portable.h b/randomx/intrin_portable.h new file mode 100644 index 0000000..b5ad91a --- /dev/null +++ b/randomx/intrin_portable.h @@ -0,0 +1,738 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include "blake2/endian.h" + +constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) { + return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x); +} + +constexpr int64_t unsigned64ToSigned2sCompl(uint64_t x) { + return (-1 == ~0) ? (int64_t)x : (x > INT64_MAX ? (-(int64_t)(UINT64_MAX - x) - 1) : (int64_t)x); +} + +constexpr uint64_t signExtend2sCompl(uint32_t x) { + return (-1 == ~0) ? (int64_t)(int32_t)(x) : (x > INT32_MAX ? 
(x | 0xffffffff00000000ULL) : (uint64_t)x);
+}
+
+constexpr int RoundToNearest = 0;
+constexpr int RoundDown = 1;
+constexpr int RoundUp = 2;
+constexpr int RoundToZero = 3;
+
+//MSVC doesn't define __SSE2__, so we have to define it manually if SSE2 is available
+#if !defined(__SSE2__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2))
+#define __SSE2__ 1
+#endif
+
+//MSVC doesn't define __AES__
+#if defined(_MSC_VER) && defined(__SSE2__)
+#define __AES__
+#endif
+
+//the library "sqrt" function provided by MSVC for x86 targets doesn't give
+//the correct results, so we have to use inline assembly to call x87 fsqrt directly
+#if !defined(__SSE2__)
+#if defined(_MSC_VER) && defined(_M_IX86)
+inline double __cdecl rx_sqrt(double x) {
+	__asm {
+		fld x
+		fsqrt
+	}
+}
+#define rx_sqrt rx_sqrt
+
+void rx_set_double_precision();
+#define RANDOMX_USE_X87
+
+#elif defined(__i386)
+
+void rx_set_double_precision();
+#define RANDOMX_USE_X87
+
+#endif
+#endif //__SSE2__
+
+#if !defined(rx_sqrt)
+#define rx_sqrt sqrt
+#endif
+
+#if !defined(RANDOMX_USE_X87)
+#define rx_set_double_precision(x)
+#endif
+
+#ifdef __SSE2__
+#ifdef __GNUC__
+#include <x86intrin.h>
+#else
+#include <intrin.h>
+#endif
+
+typedef __m128i rx_vec_i128;
+typedef __m128d rx_vec_f128;
+
+#define rx_aligned_alloc(a, b) _mm_malloc(a,b)
+#define rx_aligned_free(a) _mm_free(a)
+#define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
+
+#define rx_load_vec_f128 _mm_load_pd
+#define rx_store_vec_f128 _mm_store_pd
+#define rx_add_vec_f128 _mm_add_pd
+#define rx_sub_vec_f128 _mm_sub_pd
+#define rx_mul_vec_f128 _mm_mul_pd
+#define rx_div_vec_f128 _mm_div_pd
+#define rx_sqrt_vec_f128 _mm_sqrt_pd
+
+FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
+	return _mm_shuffle_pd(a, a, 1);
+}
+
+FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
+	return _mm_castsi128_pd(_mm_set_epi64x(x1, x0));
+}
+
+FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
+	return _mm_castsi128_pd(_mm_set1_epi64x(x));
+}
+
+#define rx_xor_vec_f128 _mm_xor_pd
+#define rx_and_vec_f128 _mm_and_pd
+#define rx_or_vec_f128 _mm_or_pd
+
+#ifdef __AES__
+
+#define rx_aesenc_vec_i128 _mm_aesenc_si128
+#define rx_aesdec_vec_i128 _mm_aesdec_si128
+
+#define HAVE_AES 1
+
+#endif //__AES__
+
+FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
+	return _mm_cvtsi128_si32(a);
+}
+
+FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
+}
+
+FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xaa));
+}
+
+FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xff));
+}
+
+#define rx_set_int_vec_i128 _mm_set_epi32
+#define rx_xor_vec_i128 _mm_xor_si128
+#define rx_load_vec_i128 _mm_load_si128
+#define rx_store_vec_i128 _mm_store_si128
+
+FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
+	__m128i ix = _mm_loadl_epi64((const __m128i*)addr);
+	return _mm_cvtepi32_pd(ix);
+}
+
+constexpr uint32_t rx_mxcsr_default = 0x9FC0; //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
+
+FORCE_INLINE void rx_reset_float_state() {
+	_mm_setcsr(rx_mxcsr_default);
+}
+
+FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) {
+	_mm_setcsr(rx_mxcsr_default | (mode << 13));
+}
+
+#elif defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__) //sadly only POWER7 and newer will be able to use SIMD acceleration. Earlier processors can't use doubles or 64-bit integers with SIMD
+#include <cstdint>
+#include <stdexcept>
+#include <cstdlib>
+#include <altivec.h>
+#undef vector
+#undef pixel
+#undef bool
+
+typedef __vector uint8_t __m128i;
+typedef __vector uint32_t __m128l;
+typedef __vector int __m128li;
+typedef __vector uint64_t __m128ll;
+typedef __vector double __m128d;
+
+typedef __m128i rx_vec_i128;
+typedef __m128d rx_vec_f128;
+typedef union{
+	rx_vec_i128 i;
+	rx_vec_f128 d;
+	uint64_t u64[2];
+	double d64[2];
+	uint32_t u32[4];
+	int i32[4];
+} vec_u;
+
+#define rx_aligned_alloc(a, b) malloc(a)
+#define rx_aligned_free(a) free(a)
+#define rx_prefetch_nta(x)
+
+/* Splat 64-bit long long to 2 64-bit long longs */
+FORCE_INLINE __m128i vec_splat2sd (int64_t scalar)
+{ return (__m128i) vec_splats (scalar); }
+
+FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	return (rx_vec_f128)vec_vsx_ld(0,pd);
+#else
+	vec_u t;
+	t.u64[0] = load64(pd + 0);
+	t.u64[1] = load64(pd + 1);
+	return (rx_vec_f128)t.d;
+#endif
+}
+
+FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	vec_vsx_st(a,0,(rx_vec_f128*)mem_addr);
+#else
+	vec_u _a;
+	_a.d = a;
+	store64(mem_addr + 0, _a.u64[0]);
+	store64(mem_addr + 1, _a.u64[1]);
+#endif
+}
+
+FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
+	return (rx_vec_f128)vec_perm((__m128i)a,(__m128i)a,(__m128i){8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7});
+}
+
+FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_add(a,b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_sub(a,b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_mul(a,b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_div(a,b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) {
+	return (rx_vec_f128)vec_sqrt(a);
+}
+
+FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) {
+	return (rx_vec_i128)vec_splat2sd(a);
+}
+
+FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) {
+	return (rx_vec_f128)a;
+}
+
+FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
+	return (rx_vec_f128)(__m128ll){x0,x1};
+}
+
+FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
+	return (rx_vec_f128)vec_splat2sd(x);
+}
+
+FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_xor(a,b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_and(a,b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_or(a,b);
+}
+
+#if defined(__CRYPTO__)
+
+FORCE_INLINE __m128ll vrev(__m128i v){
+#if defined(NATIVE_LITTLE_ENDIAN)
+	return (__m128ll)vec_perm((__m128i)v,(__m128i){0},(__m128i){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0});
+#else
+	return (__m128ll)vec_perm((__m128i)v,(__m128i){0},(__m128i){3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12});
+#endif
+}
+
+FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
+	__m128ll _v = vrev(v);
+	__m128ll _rkey = vrev(rkey);
+	__m128ll result = vrev((__m128i)__builtin_crypto_vcipher(_v,_rkey));
+	return (rx_vec_i128)result;
+}
+
+FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
+	__m128ll _v = vrev(v);
+	__m128ll zero = (__m128ll){0};
+	__m128ll out = vrev((__m128i)__builtin_crypto_vncipher(_v,zero));
+	return (rx_vec_i128)vec_xor((__m128i)out,rkey);
+}
+#define HAVE_AES 1
+
+#endif //__CRYPTO__
+
+FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
+	vec_u _a;
+	_a.i = a;
+	return _a.i32[0];
+}
+
+FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
+	vec_u _a;
+	_a.i = a;
+	return _a.i32[1];
+}
+
+FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
+	vec_u _a;
+	_a.i = a;
+	return _a.i32[2];
+}
+
+FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
+	vec_u _a;
+	_a.i = a;
+	return _a.i32[3];
+}
+
+FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
+	return (rx_vec_i128)((__m128li){_I0,_I1,_I2,_I3});
+};
+
+FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 _A, rx_vec_i128 _B) {
+	return (rx_vec_i128)vec_xor(_A,_B);
+}
+
+FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const *_P) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	return *_P;
+#else
+	uint32_t* ptr = (uint32_t*)_P;
+	vec_u c;
+	c.u32[0] = load32(ptr + 0);
+	c.u32[1] = load32(ptr + 1);
+	c.u32[2] = load32(ptr + 2);
+	c.u32[3] = load32(ptr + 3);
+	return (rx_vec_i128)c.i;
+#endif
+}
+
+FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *_P, rx_vec_i128 _B) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	*_P = _B;
+#else
+	uint32_t* ptr = (uint32_t*)_P;
+	vec_u B;
+	B.i = _B;
+	store32(ptr + 0, B.u32[0]);
+	store32(ptr + 1, B.u32[1]);
+	store32(ptr + 2, B.u32[2]);
+	store32(ptr + 3, B.u32[3]);
+#endif
+}
+
+FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
+	vec_u x;
+	x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
+	x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
+	return (rx_vec_f128)x.d;
+}
+
+#define RANDOMX_DEFAULT_FENV
+
+#elif defined(__aarch64__)
+
+#include <stdlib.h>
+#include <arm_neon.h>
+#include <arm_acle.h>
+
+typedef uint8x16_t rx_vec_i128;
+typedef float64x2_t rx_vec_f128;
+
+inline void* rx_aligned_alloc(size_t size, size_t align) {
+	void* p;
+	if (posix_memalign(&p, align, size) == 0)
+		return p;
+
+	return 0;
+};
+
+#define rx_aligned_free(a) free(a)
+
+inline void rx_prefetch_nta(void* ptr) {
+	asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
+}
+
+FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
+	return vld1q_f64((const float64_t*)pd);
+}
+
+FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 val) {
+	vst1q_f64((float64_t*)mem_addr, val);
+}
+
+FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
+	float64x2_t temp;
+	temp = vcopyq_laneq_f64(temp, 1, a, 1);
+	a = vcopyq_laneq_f64(a, 1, a, 0);
+	return vcopyq_laneq_f64(a, 0, temp, 1);
+}
+
+FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
+	uint64x2_t temp0 = vdupq_n_u64(x0);
+	uint64x2_t temp1 = vdupq_n_u64(x1);
+	return vreinterpretq_f64_u64(vcopyq_laneq_u64(temp0, 1, temp1, 0));
+}
+
+FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
+	return vreinterpretq_f64_u64(vdupq_n_u64(x));
+}
+
+#define rx_add_vec_f128 vaddq_f64
+#define rx_sub_vec_f128 vsubq_f64
+#define rx_mul_vec_f128 vmulq_f64
+#define rx_div_vec_f128 vdivq_f64
+#define rx_sqrt_vec_f128 vsqrtq_f64
+
+FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return vreinterpretq_f64_u8(veorq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
+}
+
+FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return vreinterpretq_f64_u8(vandq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
+}
+
+FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return vreinterpretq_f64_u8(vorrq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
+}
+
+#ifdef __ARM_FEATURE_CRYPTO
+
+FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 a, rx_vec_i128 key) {
+	const uint8x16_t zero = { 0 };
+	return vaesmcq_u8(vaeseq_u8(a, zero)) ^ key;
+}
+
+FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 a, rx_vec_i128 key) {
+	const uint8x16_t zero = { 0 };
+	return vaesimcq_u8(vaesdq_u8(a, zero)) ^ key;
+}
+
+#define HAVE_AES 1
+
+#endif
+
+#define rx_xor_vec_i128 veorq_u8
+
+FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
+	return vgetq_lane_s32(vreinterpretq_s32_u8(a), 0);
+}
+
+FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
+	return vgetq_lane_s32(vreinterpretq_s32_u8(a), 1);
+}
+
+FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
+	return vgetq_lane_s32(vreinterpretq_s32_u8(a), 2);
+}
+
+FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
+	return vgetq_lane_s32(vreinterpretq_s32_u8(a), 3);
+}
+
+FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
+	int32_t data[4];
+	data[0] = _I0;
+	data[1] = _I1;
+	data[2] = _I2;
+	data[3] = _I3;
+	return vreinterpretq_u8_s32(vld1q_s32(data));
+};
+
+#define rx_xor_vec_i128 veorq_u8
+
+FORCE_INLINE rx_vec_i128 rx_load_vec_i128(const rx_vec_i128* mem_addr) {
+	return vld1q_u8((const uint8_t*)mem_addr);
+}
+
+FORCE_INLINE void rx_store_vec_i128(rx_vec_i128* mem_addr, rx_vec_i128 val) {
+	vst1q_u8((uint8_t*)mem_addr, val);
+}
+
+FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
+	double lo = unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
+	double hi = unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
+	rx_vec_f128 x;
+	x = vsetq_lane_f64(lo, x, 0);
+	x = vsetq_lane_f64(hi, x, 1);
+	return x;
+}
+
+#define RANDOMX_DEFAULT_FENV
+
+#else //portable fallback
+
+#include <cstdint>
+#include <stdlib.h>
+#include <math.h>
+#include <cstring>
+
+typedef union {
+	uint64_t u64[2];
+	uint32_t u32[4];
+	uint16_t u16[8];
+	uint8_t u8[16];
+} rx_vec_i128;
+
+typedef union {
+	struct {
+		double lo;
+		double hi;
+	};
+	rx_vec_i128 i;
+} rx_vec_f128;
+
+#define rx_aligned_alloc(a, b) malloc(a)
+#define rx_aligned_free(a) free(a)
+#define rx_prefetch_nta(x)
+
+FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
+	rx_vec_f128 x;
+	x.i.u64[0] = load64(pd + 0);
+	x.i.u64[1] = load64(pd + 1);
+	return x;
+}
+
+FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) {
+	store64(mem_addr + 0, a.i.u64[0]);
+	store64(mem_addr + 1, a.i.u64[1]);
+}
+
+FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
+	double temp = a.hi;
+	a.hi = a.lo;
+	a.lo = temp;
+	return a;
+}
+
+FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	rx_vec_f128 x;
+	x.lo = a.lo + b.lo;
+	x.hi = a.hi + b.hi;
+	return x;
+}
+
+FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	rx_vec_f128 x;
+	x.lo = a.lo - b.lo;
+	x.hi = a.hi - b.hi;
+	return x;
+}
+
+FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	rx_vec_f128 x;
+	x.lo = a.lo * b.lo;
+	x.hi = a.hi * b.hi;
+	return x;
+}
+
+FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	rx_vec_f128 x;
+	x.lo = a.lo / b.lo;
+	x.hi = a.hi / b.hi;
+	return x;
+}
+
+FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) {
+	rx_vec_f128 x;
+	x.lo = rx_sqrt(a.lo);
+	x.hi = rx_sqrt(a.hi);
+	return x;
+}
+
+FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) {
+	rx_vec_i128 x;
+	x.u64[0] = a;
+	x.u64[1] = a;
+	return x;
+}
+
+FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) {
+	rx_vec_f128 x;
+	x.i = a;
+	return x;
+}
+
+FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
+	rx_vec_f128 v;
+	v.i.u64[0] = x0;
+	v.i.u64[1] = x1;
+	return v;
+}
+
+FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
+	rx_vec_f128 v;
+	v.i.u64[0] = x;
+	v.i.u64[1] = x;
+	return v;
+}
+
+FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	rx_vec_f128 x;
+	x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0];
+	x.i.u64[1] = a.i.u64[1] ^ b.i.u64[1];
+	return x;
+}
+
+FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	rx_vec_f128 x;
+	x.i.u64[0] = a.i.u64[0] & b.i.u64[0];
+	x.i.u64[1] = a.i.u64[1] & b.i.u64[1];
+	return x;
+}
+
+FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	rx_vec_f128 x;
+	x.i.u64[0] = a.i.u64[0] | b.i.u64[0];
+	x.i.u64[1] = a.i.u64[1] | b.i.u64[1];
+	return x;
+}
+
+FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
+	return a.u32[0];
+}
+
+FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
+	return a.u32[1];
+}
+
+FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
+	return a.u32[2];
+}
+
+FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
+	return a.u32[3];
+}
+
+FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
+	rx_vec_i128 v;
+	v.u32[0] = _I0;
+	v.u32[1] = _I1;
+	v.u32[2] = _I2;
+	v.u32[3] = _I3;
+	return v;
+};
+
+FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 _A, rx_vec_i128 _B) {
+	rx_vec_i128 c;
+	c.u32[0] = _A.u32[0] ^ _B.u32[0];
+	c.u32[1] = _A.u32[1] ^ _B.u32[1];
+	c.u32[2] = _A.u32[2] ^ _B.u32[2];
+	c.u32[3] = _A.u32[3] ^ _B.u32[3];
+	return c;
+}
+
+FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const*_P) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	return *_P;
+#else
+	uint32_t* ptr = (uint32_t*)_P;
+	rx_vec_i128 c;
+	c.u32[0] = load32(ptr + 0);
+	c.u32[1] = load32(ptr + 1);
+	c.u32[2] = load32(ptr + 2);
+	c.u32[3] = load32(ptr + 3);
+	return c;
+#endif
+}
+
+FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *_P, rx_vec_i128 _B) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	*_P = _B;
+#else
+	uint32_t* ptr = (uint32_t*)_P;
+	store32(ptr + 0, _B.u32[0]);
+	store32(ptr + 1, _B.u32[1]);
+	store32(ptr + 2, _B.u32[2]);
+	store32(ptr + 3, _B.u32[3]);
+#endif
+}
+
+FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
+	rx_vec_f128 x;
+	x.lo = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
+	x.hi = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
+	return x;
+}
+
+#define RANDOMX_DEFAULT_FENV
+
+#endif
+
+#ifndef HAVE_AES
+static const char* platformError = "Platform doesn't support hardware AES";
+
+#include <stdexcept>
+
+FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
+	throw std::runtime_error(platformError);
+}
+
+FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
+	throw std::runtime_error(platformError);
+}
+
+#define HAVE_AES 0
+
+#endif
+
+#ifdef RANDOMX_DEFAULT_FENV
+
+void rx_reset_float_state();
+
+void rx_set_rounding_mode(uint32_t mode);
+
+#endif
+
+double loadDoublePortable(const void* addr);
+uint64_t mulh(uint64_t, uint64_t);
+int64_t smulh(int64_t, int64_t);
+uint64_t rotl(uint64_t, unsigned int);
+uint64_t rotr(uint64_t, unsigned int);
diff --git a/randomx/jit_compiler.hpp b/randomx/jit_compiler.hpp
new file mode 100644
index 0000000..bd9c2b0
--- /dev/null
+++ b/randomx/jit_compiler.hpp
@@ -0,0 +1,37 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#if defined(_M_X64) || defined(__x86_64__) +#include "jit_compiler_x86.hpp" +#elif defined(__aarch64__) +#include "jit_compiler_a64.hpp" +#else +#include "jit_compiler_fallback.hpp" +#endif diff --git a/randomx/jit_compiler_fallback.hpp b/randomx/jit_compiler_fallback.hpp new file mode 100644 index 0000000..56ccb8c --- /dev/null +++ b/randomx/jit_compiler_fallback.hpp @@ -0,0 +1,76 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+
+#pragma once
+
+#include <cstdint>
+#include <stdexcept>
+#include <vector>
+#include "common.hpp"
+
+namespace randomx {
+
+	class Program;
+	class ProgramConfiguration;
+	class SuperscalarProgram;
+
+	class JitCompilerFallback {
+	public:
+		JitCompilerFallback() {
+			throw std::runtime_error("JIT compilation is not supported on this platform");
+		}
+		void generateProgram(Program&, ProgramConfiguration&) {
+
+		}
+		void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) {
+
+		}
+		template<size_t N>
+		void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &) {
+
+		}
+		void generateDatasetInitCode() {
+
+		}
+		ProgramFunc* getProgramFunc() {
+			return nullptr;
+		}
+		DatasetInitFunc* getDatasetInitFunc() {
+			return nullptr;
+		}
+		uint8_t* getCode() {
+			return nullptr;
+		}
+		size_t getCodeSize() {
+			return 0;
+		}
+		void enableWriting() {}
+		void enableExecution() {}
+		void enableAll() {}
+	};
+}
\ No newline at end of file
diff --git a/randomx/jit_compiler_x86.cpp b/randomx/jit_compiler_x86.cpp
new file mode 100644
index 0000000..63be868
--- /dev/null
+++ b/randomx/jit_compiler_x86.cpp
@@ -0,0 +1,845 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	notice, this list of conditions and the following disclaimer in the
+	documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	names of its contributors may be used to endorse or promote products
+	derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdexcept>
+#include <cstring>
+#include <climits>
+#include "jit_compiler_x86.hpp"
+#include "jit_compiler_x86_static.hpp"
+#include "superscalar.hpp"
+#include "program.hpp"
+#include "reciprocal.h"
+#include "virtual_memory.hpp"
+
+namespace randomx {
+	/*
+
+	REGISTER ALLOCATION:
+
+	; rax -> temporary
+	; rbx -> iteration counter "ic"
+	; rcx -> temporary
+	; rdx -> temporary
+	; rsi -> scratchpad pointer
+	; rdi -> dataset pointer
+	; rbp -> memory registers "ma" (high 32 bits), "mx" (low 32 bits)
+	; rsp -> stack pointer
+	; r8  -> "r0"
+	; r9  -> "r1"
+	; r10 -> "r2"
+	; r11 -> "r3"
+	; r12 -> "r4"
+	; r13 -> "r5"
+	; r14 -> "r6"
+	; r15 -> "r7"
+	; xmm0 -> "f0"
+	; xmm1 -> "f1"
+	; xmm2 -> "f2"
+	; xmm3 -> "f3"
+	; xmm4 -> "e0"
+	; xmm5 -> "e1"
+	; xmm6 -> "e2"
+	; xmm7 -> "e3"
+	; xmm8 -> "a0"
+	; xmm9 -> "a1"
+	; xmm10 -> "a2"
+	; xmm11 -> "a3"
+	; xmm12 -> temporary
+	; xmm13 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
+	; xmm14 -> E 'or' mask  = 0x3*00000000******3*00000000******
+	; xmm15 -> scale mask   = 0x81f000000000000081f0000000000000
+
+	*/
+
+	//Calculate the required code buffer size that is sufficient for the largest possible program:
+
+	constexpr size_t MaxRandomXInstrCodeSize = 32;   //FDIV_M requires up to 32 bytes of x86 code
+	constexpr size_t MaxSuperscalarInstrSize = 14;   //IMUL_RCP requires 14 bytes of x86 code
+	constexpr size_t SuperscalarProgramHeader = 128; //overhead per superscalar program
+	constexpr size_t CodeAlign = 4096;               //align code size to a multiple of 4 KiB
+	constexpr size_t ReserveCodeSize = CodeAlign;    //function prologue/epilogue + reserve
+
+	constexpr size_t RandomXCodeSize = alignSize(ReserveCodeSize + MaxRandomXInstrCodeSize * RANDOMX_PROGRAM_SIZE, CodeAlign);
+	constexpr size_t SuperscalarSize = alignSize(ReserveCodeSize + (SuperscalarProgramHeader + MaxSuperscalarInstrSize * SuperscalarMaxSize) * RANDOMX_CACHE_ACCESSES, CodeAlign);
+
+	static_assert(RandomXCodeSize < INT32_MAX / 2, "RandomXCodeSize is too large");
+	static_assert(SuperscalarSize < INT32_MAX / 2, "SuperscalarSize is too large");
+
+	constexpr uint32_t CodeSize = RandomXCodeSize + SuperscalarSize;
+
+	constexpr int32_t superScalarHashOffset = RandomXCodeSize;
+
+	const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
+	const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin;
+	const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load;
+	const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start;
+	const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset;
+	const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init;
+	const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin;
+	const uint8_t* codeDatasetInit = (uint8_t*)&randomx_dataset_init;
+	const uint8_t* codeLoopStore = (uint8_t*)&randomx_program_loop_store;
+	const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end;
+	const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
+	const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
+	const uint8_t* codeShhLoad = (uint8_t*)&randomx_sshash_load;
+	const uint8_t* codeShhPrefetch = (uint8_t*)&randomx_sshash_prefetch;
+	const uint8_t* codeShhEnd = (uint8_t*)&randomx_sshash_end;
+	const uint8_t* codeShhInit = (uint8_t*)&randomx_sshash_init;
+
+	const int32_t prologueSize = codeLoopBegin - codePrologue;
+	const int32_t loopLoadSize = codeProgamStart - codeLoopLoad;
+	const int32_t readDatasetSize = codeReadDatasetLightSshInit -
codeReadDataset; + const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit; + const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin; + const int32_t loopStoreSize = codeLoopEnd - codeLoopStore; + const int32_t datasetInitSize = codeEpilogue - codeDatasetInit; + const int32_t epilogueSize = codeShhLoad - codeEpilogue; + const int32_t codeSshLoadSize = codeShhPrefetch - codeShhLoad; + const int32_t codeSshPrefetchSize = codeShhEnd - codeShhPrefetch; + const int32_t codeSshInitSize = codeProgramEnd - codeShhInit; + + const int32_t epilogueOffset = CodeSize - epilogueSize; + + static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 }; + static const uint8_t REX_ADD_RM[] = { 0x4c, 0x03 }; + static const uint8_t REX_SUB_RR[] = { 0x4d, 0x2b }; + static const uint8_t REX_SUB_RM[] = { 0x4c, 0x2b }; + static const uint8_t REX_MOV_RR[] = { 0x41, 0x8b }; + static const uint8_t REX_MOV_RR64[] = { 0x49, 0x8b }; + static const uint8_t REX_MOV_R64R[] = { 0x4c, 0x8b }; + static const uint8_t REX_IMUL_RR[] = { 0x4d, 0x0f, 0xaf }; + static const uint8_t REX_IMUL_RRI[] = { 0x4d, 0x69 }; + static const uint8_t REX_IMUL_RM[] = { 0x4c, 0x0f, 0xaf }; + static const uint8_t REX_MUL_R[] = { 0x49, 0xf7 }; + static const uint8_t REX_MUL_M[] = { 0x48, 0xf7 }; + static const uint8_t REX_81[] = { 0x49, 0x81 }; + static const uint8_t AND_EAX_I = 0x25; + static const uint8_t MOV_EAX_I = 0xb8; + static const uint8_t MOV_RAX_I[] = { 0x48, 0xb8 }; + static const uint8_t MOV_RCX_I[] = { 0x48, 0xb9 }; + static const uint8_t REX_LEA[] = { 0x4f, 0x8d }; + static const uint8_t REX_MUL_MEM[] = { 0x48, 0xf7, 0x24, 0x0e }; + static const uint8_t REX_IMUL_MEM[] = { 0x48, 0xf7, 0x2c, 0x0e }; + static const uint8_t REX_SHR_RAX[] = { 0x48, 0xc1, 0xe8 }; + static const uint8_t RAX_ADD_SBB_1[] = { 0x48, 0x83, 0xC0, 0x01, 0x48, 0x83, 0xD8, 0x00 }; + static const uint8_t MUL_RCX[] = { 0x48, 0xf7, 0xe1 }; + static const uint8_t REX_SHR_RDX[] = { 0x48, 0xc1, 0xea }; + static const uint8_t REX_SH[] = { 0x49, 0xc1 }; + static const uint8_t MOV_RCX_RAX_SAR_RCX_63[] = { 0x48, 0x89, 0xc1, 0x48, 0xc1, 0xf9, 0x3f }; + static const uint8_t AND_ECX_I[] = { 0x81, 0xe1 }; + static const uint8_t ADD_RAX_RCX[] = { 0x48, 0x01, 0xC8 }; + static const uint8_t SAR_RAX_I8[] = { 0x48, 0xC1, 0xF8 }; + static const uint8_t NEG_RAX[] = { 0x48, 0xF7, 0xD8 }; + static const uint8_t ADD_R_RAX[] = { 0x4C, 0x03 }; + static const uint8_t XOR_EAX_EAX[] = { 0x33, 0xC0 }; + static const uint8_t ADD_RDX_R[] = { 0x4c, 0x01 }; + static const uint8_t SUB_RDX_R[] = { 0x4c, 0x29 }; + static const uint8_t SAR_RDX_I8[] = { 0x48, 0xC1, 0xFA }; + static const uint8_t TEST_RDX_RDX[] = { 0x48, 0x85, 0xD2 }; + static const uint8_t SETS_AL_ADD_RDX_RAX[] = { 0x0F, 0x98, 0xC0, 0x48, 0x03, 0xD0 }; + static const uint8_t REX_NEG[] = { 0x49, 0xF7 }; + static const uint8_t REX_XOR_RR[] = { 0x4D, 0x33 }; + static const uint8_t REX_XOR_RI[] = { 0x49, 0x81 }; + static const uint8_t REX_XOR_RM[] = { 0x4c, 0x33 }; + static const uint8_t REX_ROT_CL[] = { 0x49, 0xd3 }; + static const uint8_t REX_ROT_I8[] = { 0x49, 0xc1 }; + static const uint8_t SHUFPD[] = { 0x66, 0x0f, 0xc6 }; + static const uint8_t REX_ADDPD[] = { 0x66, 0x41, 0x0f, 0x58 }; + static const uint8_t REX_CVTDQ2PD_XMM12[] = { 0xf3, 0x44, 0x0f, 0xe6, 0x24, 0x06 }; + static const uint8_t REX_SUBPD[] = { 0x66, 0x41, 0x0f, 0x5c }; + static const uint8_t REX_XORPS[] = { 0x41, 0x0f, 0x57 }; + static const uint8_t REX_MULPD[] = { 0x66, 0x41, 0x0f, 0x59 }; + static const uint8_t 
REX_MAXPD[] = { 0x66, 0x41, 0x0f, 0x5f };
+	static const uint8_t REX_DIVPD[] = { 0x66, 0x41, 0x0f, 0x5e };
+	static const uint8_t SQRTPD[] = { 0x66, 0x0f, 0x51 };
+	static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x50, 0x0F, 0xAE, 0x14, 0x24, 0x58 };
+	static const uint8_t ROL_RAX[] = { 0x48, 0xc1, 0xc0 };
+	static const uint8_t XOR_ECX_ECX[] = { 0x33, 0xC9 };
+	static const uint8_t REX_CMP_R32I[] = { 0x41, 0x81 };
+	static const uint8_t REX_CMP_M32I[] = { 0x81, 0x3c, 0x06 };
+	static const uint8_t MOVAPD[] = { 0x66, 0x0f, 0x29 };
+	static const uint8_t REX_MOV_MR[] = { 0x4c, 0x89 };
+	static const uint8_t REX_XOR_EAX[] = { 0x41, 0x33 };
+	static const uint8_t SUB_EBX[] = { 0x83, 0xEB, 0x01 };
+	static const uint8_t JNZ[] = { 0x0f, 0x85 };
+	static const uint8_t JMP = 0xe9;
+	static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
+	static const uint8_t REX_XCHG[] = { 0x4d, 0x87 };
+	static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6 };
+	static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f };
+	static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 };
+	static const uint8_t CALL = 0xe8;
+	static const uint8_t REX_ADD_I[] = { 0x49, 0x81 };
+	static const uint8_t REX_TEST[] = { 0x49, 0xF7 };
+	static const uint8_t JZ[] = { 0x0f, 0x84 };
+	static const uint8_t RET = 0xc3;
+	static const uint8_t LEA_32[] = { 0x41, 0x8d };
+	static const uint8_t MOVNTI[] = { 0x4c, 0x0f, 0xc3 };
+	static const uint8_t ADD_EBX_I[] = { 0x81, 0xc3 };
+
+	static const uint8_t NOP1[] = { 0x90 };
+	static const uint8_t NOP2[] = { 0x66, 0x90 };
+	static const uint8_t NOP3[] = { 0x66, 0x66, 0x90 };
+	static const uint8_t NOP4[] = { 0x0F, 0x1F, 0x40, 0x00 };
+	static const uint8_t NOP5[] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 };
+	static const uint8_t NOP6[] = { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 };
+	static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 };
+	static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };
+
+	static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 };
+
+	size_t JitCompilerX86::getCodeSize() {
+		return CodeSize;
+	}
+
+	JitCompilerX86::JitCompilerX86() {
+		code = (uint8_t*)allocMemoryPages(CodeSize);
+		memcpy(code, codePrologue, prologueSize);
+		memcpy(code + epilogueOffset, codeEpilogue, epilogueSize);
+	}
+
+	JitCompilerX86::~JitCompilerX86() {
+		freePagedMemory(code, CodeSize);
+	}
+
+	void JitCompilerX86::enableAll() {
+		setPagesRWX(code, CodeSize);
+	}
+
+	void JitCompilerX86::enableWriting() {
+		setPagesRW(code, CodeSize);
+	}
+
+	void JitCompilerX86::enableExecution() {
+		setPagesRX(code, CodeSize);
+	}
+
+	void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) {
+		//printf("---\n");
+		generateProgramPrologue(prog, pcfg);
+		memcpy(code + codePos, codeReadDataset, readDatasetSize);
+		codePos += readDatasetSize;
+		generateProgramEpilogue(prog, pcfg);
+	}
+
+	void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) {
+		generateProgramPrologue(prog, pcfg);
+		emit(codeReadDatasetLightSshInit, readDatasetLightInitSize);
+		emit(ADD_EBX_I);
+		emit32(datasetOffset / CacheLineSize);
+		emitByte(CALL);
+		emit32(superScalarHashOffset - (codePos + 4));
+		emit(codeReadDatasetLightSshFin, readDatasetLightFinSize);
+		generateProgramEpilogue(prog, pcfg);
+	}
+
+	template<size_t N>
+	void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &reciprocalCache) {
+		memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize);
+		codePos = superScalarHashOffset + codeSshInitSize;
+		for (unsigned j = 0; j < N; ++j) {
+			SuperscalarProgram& prog = programs[j];
+			for (unsigned i = 0; i < prog.getSize(); ++i) {
+				Instruction& instr = prog(i);
+				generateSuperscalarCode(instr, reciprocalCache);
+			}
+			emit(codeShhLoad, codeSshLoadSize);
+			if (j < N - 1) {
+				emit(REX_MOV_RR64);
+				emitByte(0xd8 + prog.getAddressRegister());
+				emit(codeShhPrefetch, codeSshPrefetchSize);
+#ifdef RANDOMX_ALIGN
+				int align = (codePos % 16);
+				while (align != 0) {
+					int nopSize = 16 - align;
+					if (nopSize > 8) nopSize = 8;
+					emit(NOPX[nopSize - 1], nopSize);
+					align = (codePos % 16);
+				}
+#endif
+			}
+		}
+		emitByte(RET);
+	}
+
+	template
+	void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES], std::vector<uint64_t> &reciprocalCache);
+
+	void JitCompilerX86::generateDatasetInitCode() {
+		memcpy(code, codeDatasetInit, datasetInitSize);
+	}
+
+	void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) {
+		instructionOffsets.clear();
+		for (unsigned i = 0; i < 8; ++i) {
+			registerUsage[i] = -1;
+		}
+
+		codePos = ((uint8_t*)randomx_program_prologue_first_load) - ((uint8_t*)randomx_program_prologue);
+		code[codePos + sizeof(REX_XOR_RAX_R64)] = 0xc0 + pcfg.readReg0;
+		code[codePos + sizeof(REX_XOR_RAX_R64) * 2 + 1] = 0xc0 + pcfg.readReg1;
+
+		codePos = prologueSize;
+		memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask));
+		memcpy(code + codePos, codeLoopLoad, loopLoadSize);
+		codePos += loopLoadSize;
+		for (unsigned i = 0; i < prog.getSize(); ++i) {
+			Instruction& instr = prog(i);
+			instr.src %= RegistersCount;
+			instr.dst %= RegistersCount;
+			generateCode(instr, i);
+		}
+		emit(REX_MOV_RR);
+		emitByte(0xc0 + pcfg.readReg2);
+		emit(REX_XOR_EAX);
+		emitByte(0xc0 + pcfg.readReg3);
+	}
+
+	void JitCompilerX86::generateProgramEpilogue(Program& prog, ProgramConfiguration& pcfg) {
+		emit(REX_MOV_RR64);
+		emitByte(0xc0 + pcfg.readReg0);
+		emit(REX_XOR_RAX_R64);
+		emitByte(0xc0 + pcfg.readReg1);
+		emit((const uint8_t*)&randomx_prefetch_scratchpad, ((uint8_t*)&randomx_prefetch_scratchpad_end) - ((uint8_t*)&randomx_prefetch_scratchpad));
+		memcpy(code + codePos, codeLoopStore, loopStoreSize);
+		codePos += loopStoreSize;
+		emit(SUB_EBX);
+		emit(JNZ);
+		emit32(prologueSize - codePos - 4);
+		emitByte(JMP);
+		emit32(epilogueOffset - codePos - 4);
+	}
+
+	void JitCompilerX86::generateCode(Instruction& instr, int i) {
+		instructionOffsets.push_back(codePos);
+		auto generator = engine[instr.opcode];
+		(this->*generator)(instr, i);
+	}
+
+	void JitCompilerX86::generateSuperscalarCode(Instruction& instr, std::vector<uint64_t> &reciprocalCache) {
+		switch ((SuperscalarInstructionType)instr.opcode)
+		{
+		case randomx::SuperscalarInstructionType::ISUB_R:
+			emit(REX_SUB_RR);
+			emitByte(0xc0 + 8 * instr.dst + instr.src);
+			break;
+		case randomx::SuperscalarInstructionType::IXOR_R:
+			emit(REX_XOR_RR);
+			emitByte(0xc0 + 8 * instr.dst + instr.src);
+			break;
+		case randomx::SuperscalarInstructionType::IADD_RS:
+			emit(REX_LEA);
+			emitByte(0x04 + 8 * instr.dst);
+			genSIB(instr.getModShift(), instr.src, instr.dst);
+			break;
+		case randomx::SuperscalarInstructionType::IMUL_R:
+			emit(REX_IMUL_RR);
+			emitByte(0xc0 + 8 * instr.dst + instr.src);
+			break;
+		case randomx::SuperscalarInstructionType::IROR_C:
+			emit(REX_ROT_I8);
+			emitByte(0xc8 + instr.dst);
+			emitByte(instr.getImm32() & 63);
+			break;
+		case
randomx::SuperscalarInstructionType::IADD_C7: + emit(REX_81); + emitByte(0xc0 + instr.dst); + emit32(instr.getImm32()); + break; + case randomx::SuperscalarInstructionType::IXOR_C7: + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.getImm32()); + break; + case randomx::SuperscalarInstructionType::IADD_C8: + emit(REX_81); + emitByte(0xc0 + instr.dst); + emit32(instr.getImm32()); +#ifdef RANDOMX_ALIGN + emit(NOP1); +#endif + break; + case randomx::SuperscalarInstructionType::IXOR_C8: + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.getImm32()); +#ifdef RANDOMX_ALIGN + emit(NOP1); +#endif + break; + case randomx::SuperscalarInstructionType::IADD_C9: + emit(REX_81); + emitByte(0xc0 + instr.dst); + emit32(instr.getImm32()); +#ifdef RANDOMX_ALIGN + emit(NOP2); +#endif + break; + case randomx::SuperscalarInstructionType::IXOR_C9: + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.getImm32()); +#ifdef RANDOMX_ALIGN + emit(NOP2); +#endif + break; + case randomx::SuperscalarInstructionType::IMULH_R: + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe0 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + break; + case randomx::SuperscalarInstructionType::ISMULH_R: + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe8 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + break; + case randomx::SuperscalarInstructionType::IMUL_RCP: + emit(MOV_RAX_I); + emit64(reciprocalCache[instr.getImm32()]); + emit(REX_IMUL_RM); + emitByte(0xc0 + 8 * instr.dst); + break; + default: + UNREACHABLE; + } + } + + void JitCompilerX86::genAddressReg(Instruction& instr, bool rax = true) { + emit(LEA_32); + emitByte(0x80 + instr.src + (rax ? 0 : 8)); + if (instr.src == RegisterNeedsSib) { + emitByte(0x24); + } + emit32(instr.getImm32()); + if (rax) + emitByte(AND_EAX_I); + else + emit(AND_ECX_I); + emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + } + + void JitCompilerX86::genAddressRegDst(Instruction& instr) { + emit(LEA_32); + emitByte(0x80 + instr.dst); + if (instr.dst == RegisterNeedsSib) { + emitByte(0x24); + } + emit32(instr.getImm32()); + emitByte(AND_EAX_I); + if (instr.getModCond() < StoreL3Condition) { + emit32(instr.getModMem() ? 
ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + emit32(ScratchpadL3Mask); + } + } + + void JitCompilerX86::genAddressImm(Instruction& instr) { + emit32(instr.getImm32() & ScratchpadL3Mask); + } + + void JitCompilerX86::h_IADD_RS(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + emit(REX_LEA); + if (instr.dst == RegisterNeedsDisplacement) + emitByte(0xac); + else + emitByte(0x04 + 8 * instr.dst); + genSIB(instr.getModShift(), instr.src, instr.dst); + if (instr.dst == RegisterNeedsDisplacement) + emit32(instr.getImm32()); + } + + void JitCompilerX86::h_IADD_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_ADD_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); + } + else { + emit(REX_ADD_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); + } + } + + void JitCompilerX86::genSIB(int scale, int index, int base) { + emitByte((scale << 6) | (index << 3) | base); + } + + void JitCompilerX86::h_ISUB_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + emit(REX_SUB_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + } + else { + emit(REX_81); + emitByte(0xe8 + instr.dst); + emit32(instr.getImm32()); + } + } + + void JitCompilerX86::h_ISUB_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_SUB_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); + } + else { + emit(REX_SUB_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); + } + } + + void JitCompilerX86::h_IMUL_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + emit(REX_IMUL_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + } + else { + emit(REX_IMUL_RRI); + emitByte(0xc0 + 9 * instr.dst); + emit32(instr.getImm32()); + } + } + + void JitCompilerX86::h_IMUL_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_IMUL_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); + } + else { + emit(REX_IMUL_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); + } + } + + void JitCompilerX86::h_IMULH_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe0 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + + void JitCompilerX86::h_IMULH_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr, false); + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_MEM); + } + else { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_M); + emitByte(0xa6); + genAddressImm(instr); + } + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + + void JitCompilerX86::h_ISMULH_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe8 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + + void JitCompilerX86::h_ISMULH_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr, false); + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_IMUL_MEM); + } + else { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_M); + emitByte(0xae); + genAddressImm(instr); + } + emit(REX_MOV_R64R); + 
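+		// NB: REX_MOV_R64R is { 0x4c, 0x8b } (REX.W|REX.R prefix + "mov r64, r/m64");
+		// together with the ModRM byte 0xc2 + 8 * instr.dst emitted below it encodes
+		// `mov r(8+dst), rdx`, i.e. it copies the high 64 bits of the 128-bit product
+		// (left in rdx:rax by the mul/imul emitted above) into the destination register.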
emitByte(0xc2 + 8 * instr.dst); + } + + void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { + uint64_t divisor = instr.getImm32(); + if (!isZeroOrPowerOf2(divisor)) { + registerUsage[instr.dst] = i; + emit(MOV_RAX_I); + emit64(randomx_reciprocal_fast(divisor)); + emit(REX_IMUL_RM); + emitByte(0xc0 + 8 * instr.dst); + } + } + + void JitCompilerX86::h_INEG_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + emit(REX_NEG); + emitByte(0xd8 + instr.dst); + } + + void JitCompilerX86::h_IXOR_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + emit(REX_XOR_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + } + else { + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.getImm32()); + } + } + + void JitCompilerX86::h_IXOR_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_XOR_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); + } + else { + emit(REX_XOR_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); + } + } + + void JitCompilerX86::h_IROR_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + emit(REX_MOV_RR); + emitByte(0xc8 + instr.src); + emit(REX_ROT_CL); + emitByte(0xc8 + instr.dst); + } + else { + emit(REX_ROT_I8); + emitByte(0xc8 + instr.dst); + emitByte(instr.getImm32() & 63); + } + } + + void JitCompilerX86::h_IROL_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + emit(REX_MOV_RR); + emitByte(0xc8 + instr.src); + emit(REX_ROT_CL); + emitByte(0xc0 + instr.dst); + } + else { + emit(REX_ROT_I8); + emitByte(0xc0 + instr.dst); + emitByte(instr.getImm32() & 63); + } + } + + void JitCompilerX86::h_ISWAP_R(Instruction& instr, int i) { + if (instr.src != instr.dst) { + registerUsage[instr.dst] = i; + registerUsage[instr.src] = i; + emit(REX_XCHG); + emitByte(0xc0 + instr.src + 8 * instr.dst); + } + } + + void JitCompilerX86::h_FSWAP_R(Instruction& instr, int i) { + emit(SHUFPD); + emitByte(0xc0 + 9 * instr.dst); + emitByte(1); + } + + void JitCompilerX86::h_FADD_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + instr.src %= RegisterCountFlt; + emit(REX_ADDPD); + emitByte(0xc0 + instr.src + 8 * instr.dst); + } + + void JitCompilerX86::h_FADD_M(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + genAddressReg(instr); + emit(REX_CVTDQ2PD_XMM12); + emit(REX_ADDPD); + emitByte(0xc4 + 8 * instr.dst); + } + + void JitCompilerX86::h_FSUB_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + instr.src %= RegisterCountFlt; + emit(REX_SUBPD); + emitByte(0xc0 + instr.src + 8 * instr.dst); + } + + void JitCompilerX86::h_FSUB_M(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + genAddressReg(instr); + emit(REX_CVTDQ2PD_XMM12); + emit(REX_SUBPD); + emitByte(0xc4 + 8 * instr.dst); + } + + void JitCompilerX86::h_FSCAL_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + emit(REX_XORPS); + emitByte(0xc7 + 8 * instr.dst); + } + + void JitCompilerX86::h_FMUL_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + instr.src %= RegisterCountFlt; + emit(REX_MULPD); + emitByte(0xe0 + instr.src + 8 * instr.dst); + } + + void JitCompilerX86::h_FDIV_M(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + genAddressReg(instr); + emit(REX_CVTDQ2PD_XMM12); + emit(REX_ANDPS_XMM12); + emit(REX_DIVPD); + emitByte(0xe4 + 8 * instr.dst); + } + + void 
JitCompilerX86::h_FSQRT_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + emit(SQRTPD); + emitByte(0xe4 + 9 * instr.dst); + } + + void JitCompilerX86::h_CFROUND(Instruction& instr, int i) { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.src); + int rotate = (13 - (instr.getImm32() & 63)) & 63; + if (rotate != 0) { + emit(ROL_RAX); + emitByte(rotate); + } + emit(AND_OR_MOV_LDMXCSR); + } + + void JitCompilerX86::h_CBRANCH(Instruction& instr, int i) { + int reg = instr.dst; + int target = registerUsage[reg] + 1; + emit(REX_ADD_I); + emitByte(0xc0 + reg); + int shift = instr.getModCond() + ConditionOffset; + uint32_t imm = instr.getImm32() | (1UL << shift); + if (ConditionOffset > 0 || shift > 0) + imm &= ~(1UL << (shift - 1)); + emit32(imm); + emit(REX_TEST); + emitByte(0xc0 + reg); + emit32(ConditionMask << shift); + emit(JZ); + emit32(instructionOffsets[target] - (codePos + 4)); + //mark all registers as used + for (unsigned j = 0; j < RegistersCount; ++j) { + registerUsage[j] = i; + } + } + + void JitCompilerX86::h_ISTORE(Instruction& instr, int i) { + genAddressRegDst(instr); + emit(REX_MOV_MR); + emitByte(0x04 + 8 * instr.src); + emitByte(0x06); + } + + void JitCompilerX86::h_NOP(Instruction& instr, int i) { + emit(NOP1); + } + +#include "instruction_weights.hpp" +#define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x)) + + InstructionGeneratorX86 JitCompilerX86::engine[256] = { + INST_HANDLE(IADD_RS) + INST_HANDLE(IADD_M) + INST_HANDLE(ISUB_R) + INST_HANDLE(ISUB_M) + INST_HANDLE(IMUL_R) + INST_HANDLE(IMUL_M) + INST_HANDLE(IMULH_R) + INST_HANDLE(IMULH_M) + INST_HANDLE(ISMULH_R) + INST_HANDLE(ISMULH_M) + INST_HANDLE(IMUL_RCP) + INST_HANDLE(INEG_R) + INST_HANDLE(IXOR_R) + INST_HANDLE(IXOR_M) + INST_HANDLE(IROR_R) + INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) + INST_HANDLE(FSWAP_R) + INST_HANDLE(FADD_R) + INST_HANDLE(FADD_M) + INST_HANDLE(FSUB_R) + INST_HANDLE(FSUB_M) + INST_HANDLE(FSCAL_R) + INST_HANDLE(FMUL_R) + INST_HANDLE(FDIV_M) + INST_HANDLE(FSQRT_R) + INST_HANDLE(CBRANCH) + INST_HANDLE(CFROUND) + INST_HANDLE(ISTORE) + INST_HANDLE(NOP) + }; + +} diff --git a/randomx/jit_compiler_x86.hpp b/randomx/jit_compiler_x86.hpp new file mode 100644 index 0000000..7829fca --- /dev/null +++ b/randomx/jit_compiler_x86.hpp @@ -0,0 +1,142 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <vector>
+#include "common.hpp"
+
+namespace randomx {
+
+	class Program;
+	class ProgramConfiguration;
+	class SuperscalarProgram;
+	class JitCompilerX86;
+	class Instruction;
+
+	typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int);
+
+	class JitCompilerX86 {
+	public:
+		JitCompilerX86();
+		~JitCompilerX86();
+		void generateProgram(Program&, ProgramConfiguration&);
+		void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);
+		template<size_t N>
+		void generateSuperscalarHash(SuperscalarProgram (&programs)[N], std::vector<uint64_t> &);
+		void generateDatasetInitCode();
+		ProgramFunc* getProgramFunc() {
+			return (ProgramFunc*)code;
+		}
+		DatasetInitFunc* getDatasetInitFunc() {
+			return (DatasetInitFunc*)code;
+		}
+		uint8_t* getCode() {
+			return code;
+		}
+		size_t getCodeSize();
+		void enableWriting();
+		void enableExecution();
+		void enableAll();
+	private:
+		static InstructionGeneratorX86 engine[256];
+		std::vector<int32_t> instructionOffsets;
+		int registerUsage[RegistersCount];
+		uint8_t* code;
+		int32_t codePos;
+
+		void generateProgramPrologue(Program&, ProgramConfiguration&);
+		void generateProgramEpilogue(Program&, ProgramConfiguration&);
+		void genAddressReg(Instruction&, bool);
+		void genAddressRegDst(Instruction&);
+		void genAddressImm(Instruction&);
+		void genSIB(int scale, int index, int base);
+
+		void generateCode(Instruction&, int);
+		void generateSuperscalarCode(Instruction &, std::vector<uint64_t> &);
+
+		void emitByte(uint8_t val) {
+			code[codePos] = val;
+			codePos++;
+		}
+
+		void emit32(uint32_t val) {
+			memcpy(code + codePos, &val, sizeof val);
+			codePos += sizeof val;
+		}
+
+		void emit64(uint64_t val) {
+			memcpy(code + codePos, &val, sizeof val);
+			codePos += sizeof val;
+		}
+
+		template<size_t N>
+		void emit(const uint8_t (&src)[N]) {
+			emit(src, N);
+		}
+
+		void emit(const uint8_t* src, size_t count) {
+			memcpy(code + codePos, src, count);
+			codePos += count;
+		}
+
+		void h_IADD_RS(Instruction&, int);
+		void h_IADD_M(Instruction&, int);
+		void h_ISUB_R(Instruction&, int);
+		void h_ISUB_M(Instruction&, int);
+		void h_IMUL_R(Instruction&, int);
+		void h_IMUL_M(Instruction&, int);
+		void h_IMULH_R(Instruction&, int);
+		void h_IMULH_M(Instruction&, int);
+		void h_ISMULH_R(Instruction&, int);
+		void h_ISMULH_M(Instruction&, int);
+		void h_IMUL_RCP(Instruction&, int);
+		void h_INEG_R(Instruction&, int);
+		void h_IXOR_R(Instruction&, int);
+		void h_IXOR_M(Instruction&, int);
+		void h_IROR_R(Instruction&, int);
+		void h_IROL_R(Instruction&, int);
+		void h_ISWAP_R(Instruction&, int);
+		void h_FSWAP_R(Instruction&, int);
+		void h_FADD_R(Instruction&, int);
+		void h_FADD_M(Instruction&, int);
+		void h_FSUB_R(Instruction&, int);
+		void h_FSUB_M(Instruction&, int);
+		void h_FSCAL_R(Instruction&, int);
+		void h_FMUL_R(Instruction&, int);
+		void h_FDIV_M(Instruction&, int);
+		void h_FSQRT_R(Instruction&, int);
+		void h_CBRANCH(Instruction&, int);
+		void h_CFROUND(Instruction&, int);
+		void h_ISTORE(Instruction&, int);
+		void h_NOP(Instruction&,
int); + }; + +} \ No newline at end of file diff --git a/randomx/jit_compiler_x86_static.S b/randomx/jit_compiler_x86_static.S new file mode 100644 index 0000000..0b02278 --- /dev/null +++ b/randomx/jit_compiler_x86_static.S @@ -0,0 +1,232 @@ +# Copyright (c) 2018-2019, tevador +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the copyright holder nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
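+
+# NB: the symbols exported below are not called like ordinary functions. The
+# C++ side (jit_compiler_x86.cpp) measures each fragment as a pointer
+# difference between consecutive labels and copies the raw bytes into the JIT
+# buffer. A minimal sketch of that pattern, using names from that file:
+#
+#   const int32_t prologueSize = (uint8_t*)&randomx_program_loop_begin
+#                              - (uint8_t*)&randomx_program_prologue;
+#   memcpy(code, (const void*)&randomx_program_prologue, prologueSize);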
+ +.intel_syntax noprefix +#if defined(__APPLE__) +.text +#define DECL(x) _##x +#else +.section .text +#define DECL(x) x +#endif + +#if defined(__WIN32__) || defined(__CYGWIN__) +#define WINABI +#endif + +.global DECL(randomx_prefetch_scratchpad) +.global DECL(randomx_prefetch_scratchpad_end) +.global DECL(randomx_program_prologue) +.global DECL(randomx_program_prologue_first_load) +.global DECL(randomx_program_loop_begin) +.global DECL(randomx_program_loop_load) +.global DECL(randomx_program_start) +.global DECL(randomx_program_read_dataset) +.global DECL(randomx_program_read_dataset_sshash_init) +.global DECL(randomx_program_read_dataset_sshash_fin) +.global DECL(randomx_program_loop_store) +.global DECL(randomx_program_loop_end) +.global DECL(randomx_dataset_init) +.global DECL(randomx_program_epilogue) +.global DECL(randomx_sshash_load) +.global DECL(randomx_sshash_prefetch) +.global DECL(randomx_sshash_end) +.global DECL(randomx_sshash_init) +.global DECL(randomx_program_end) +.global DECL(randomx_reciprocal_fast) + +#include "configuration.h" + +#define RANDOMX_SCRATCHPAD_MASK (RANDOMX_SCRATCHPAD_L3-64) +#define RANDOMX_DATASET_BASE_MASK (RANDOMX_DATASET_BASE_SIZE-64) +#define RANDOMX_CACHE_MASK (RANDOMX_ARGON_MEMORY*16-1) +#define RANDOMX_ALIGN 4096 +#define SUPERSCALAR_OFFSET ((((RANDOMX_ALIGN + 32 * RANDOMX_PROGRAM_SIZE) - 1) / (RANDOMX_ALIGN) + 1) * (RANDOMX_ALIGN)) + +#define db .byte + +DECL(randomx_prefetch_scratchpad): + mov rdx, rax + and eax, RANDOMX_SCRATCHPAD_MASK + prefetcht0 [rsi+rax] + ror rdx, 32 + and edx, RANDOMX_SCRATCHPAD_MASK + prefetcht0 [rsi+rdx] + +DECL(randomx_prefetch_scratchpad_end): + +.balign 64 +DECL(randomx_program_prologue): +#if defined(WINABI) + #include "asm/program_prologue_win64.inc" +#else + #include "asm/program_prologue_linux.inc" +#endif + movapd xmm13, xmmword ptr [mantissaMask+rip] + movapd xmm14, xmmword ptr [exp240+rip] + movapd xmm15, xmmword ptr [scaleMask+rip] + +DECL(randomx_program_prologue_first_load): + xor rax, r8 + xor rax, r8 + mov rdx, rax + and eax, RANDOMX_SCRATCHPAD_MASK + ror rdx, 32 + and edx, RANDOMX_SCRATCHPAD_MASK + jmp DECL(randomx_program_loop_begin) + +.balign 64 + #include "asm/program_xmm_constants.inc" + +.balign 64 +DECL(randomx_program_loop_begin): + nop + +DECL(randomx_program_loop_load): + #include "asm/program_loop_load.inc" + +DECL(randomx_program_start): + nop + +DECL(randomx_program_read_dataset): + #include "asm/program_read_dataset.inc" + +DECL(randomx_program_read_dataset_sshash_init): + #include "asm/program_read_dataset_sshash_init.inc" + +DECL(randomx_program_read_dataset_sshash_fin): + #include "asm/program_read_dataset_sshash_fin.inc" + +DECL(randomx_program_loop_store): + #include "asm/program_loop_store.inc" + +DECL(randomx_program_loop_end): + nop + +.balign 64 +DECL(randomx_dataset_init): + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +#if defined(WINABI) + push rdi + push rsi + mov rdi, qword ptr [rcx] ;# cache->memory + mov rsi, rdx ;# dataset + mov rbp, r8 ;# block index + push r9 ;# max. block index +#else + mov rdi, qword ptr [rdi] ;# cache->memory + ;# dataset in rsi + mov rbp, rdx ;# block index + push rcx ;# max. 
block index +#endif +init_block_loop: + prefetchw byte ptr [rsi] + mov rbx, rbp + .byte 232 ;# 0xE8 = call + .int SUPERSCALAR_OFFSET - (call_offset - DECL(randomx_dataset_init)) +call_offset: + mov qword ptr [rsi+0], r8 + mov qword ptr [rsi+8], r9 + mov qword ptr [rsi+16], r10 + mov qword ptr [rsi+24], r11 + mov qword ptr [rsi+32], r12 + mov qword ptr [rsi+40], r13 + mov qword ptr [rsi+48], r14 + mov qword ptr [rsi+56], r15 + add rbp, 1 + add rsi, 64 + cmp rbp, qword ptr [rsp] + jb init_block_loop + pop rax +#if defined(WINABI) + pop rsi + pop rdi +#endif + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + ret + +.balign 64 +DECL(randomx_program_epilogue): + #include "asm/program_epilogue_store.inc" +#if defined(WINABI) + #include "asm/program_epilogue_win64.inc" +#else + #include "asm/program_epilogue_linux.inc" +#endif + +.balign 64 +DECL(randomx_sshash_load): + #include "asm/program_sshash_load.inc" + +DECL(randomx_sshash_prefetch): + #include "asm/program_sshash_prefetch.inc" + +DECL(randomx_sshash_end): + nop + +.balign 64 +DECL(randomx_sshash_init): + lea r8, [rbx+1] + #include "asm/program_sshash_prefetch.inc" + imul r8, qword ptr [r0_mul+rip] + mov r9, qword ptr [r1_add+rip] + xor r9, r8 + mov r10, qword ptr [r2_add+rip] + xor r10, r8 + mov r11, qword ptr [r3_add+rip] + xor r11, r8 + mov r12, qword ptr [r4_add+rip] + xor r12, r8 + mov r13, qword ptr [r5_add+rip] + xor r13, r8 + mov r14, qword ptr [r6_add+rip] + xor r14, r8 + mov r15, qword ptr [r7_add+rip] + xor r15, r8 + jmp DECL(randomx_program_end) + +.balign 64 + #include "asm/program_sshash_constants.inc" + +.balign 64 +DECL(randomx_program_end): + nop + +DECL(randomx_reciprocal_fast): +#if !defined(WINABI) + mov rcx, rdi +#endif + #include "asm/randomx_reciprocal.inc" diff --git a/randomx/jit_compiler_x86_static.asm b/randomx/jit_compiler_x86_static.asm new file mode 100644 index 0000000..0f97183 --- /dev/null +++ b/randomx/jit_compiler_x86_static.asm @@ -0,0 +1,227 @@ +; Copyright (c) 2018-2019, tevador +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; * Neither the name of the copyright holder nor the +; names of its contributors may be used to endorse or promote products +; derived from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
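+
+; NB: the IFDEF RAX guard below is a MASM idiom for "64-bit assembler only":
+; the register symbol RAX is defined by ml64.exe but not by the 32-bit ml.exe,
+; so a 32-bit build assembles this file into an empty module instead of
+; failing on the 64-bit instructions that follow.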
+ +IFDEF RAX + +_RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE + +PUBLIC randomx_prefetch_scratchpad +PUBLIC randomx_prefetch_scratchpad_end +PUBLIC randomx_program_prologue +PUBLIC randomx_program_prologue_first_load +PUBLIC randomx_program_loop_begin +PUBLIC randomx_program_loop_load +PUBLIC randomx_program_start +PUBLIC randomx_program_read_dataset +PUBLIC randomx_program_read_dataset_sshash_init +PUBLIC randomx_program_read_dataset_sshash_fin +PUBLIC randomx_dataset_init +PUBLIC randomx_program_loop_store +PUBLIC randomx_program_loop_end +PUBLIC randomx_program_epilogue +PUBLIC randomx_sshash_load +PUBLIC randomx_sshash_prefetch +PUBLIC randomx_sshash_end +PUBLIC randomx_sshash_init +PUBLIC randomx_program_end +PUBLIC randomx_reciprocal_fast + +include asm/configuration.asm + +RANDOMX_SCRATCHPAD_MASK EQU (RANDOMX_SCRATCHPAD_L3-64) +RANDOMX_DATASET_BASE_MASK EQU (RANDOMX_DATASET_BASE_SIZE-64) +RANDOMX_CACHE_MASK EQU (RANDOMX_ARGON_MEMORY*16-1) +RANDOMX_ALIGN EQU 4096 +SUPERSCALAR_OFFSET EQU ((((RANDOMX_ALIGN + 32 * RANDOMX_PROGRAM_SIZE) - 1) / (RANDOMX_ALIGN) + 1) * (RANDOMX_ALIGN)) + +randomx_prefetch_scratchpad PROC + mov rdx, rax + and eax, RANDOMX_SCRATCHPAD_MASK + prefetcht0 [rsi+rax] + ror rdx, 32 + and edx, RANDOMX_SCRATCHPAD_MASK + prefetcht0 [rsi+rdx] +randomx_prefetch_scratchpad ENDP + +randomx_prefetch_scratchpad_end PROC +randomx_prefetch_scratchpad_end ENDP + +ALIGN 64 +randomx_program_prologue PROC + include asm/program_prologue_win64.inc + movapd xmm13, xmmword ptr [mantissaMask] + movapd xmm14, xmmword ptr [exp240] + movapd xmm15, xmmword ptr [scaleMask] +randomx_program_prologue ENDP + +randomx_program_prologue_first_load PROC + xor rax, r8 + xor rax, r8 + mov rdx, rax + and eax, RANDOMX_SCRATCHPAD_MASK + ror rdx, 32 + and edx, RANDOMX_SCRATCHPAD_MASK + jmp randomx_program_loop_begin +randomx_program_prologue_first_load ENDP + +ALIGN 64 + include asm/program_xmm_constants.inc + +ALIGN 64 +randomx_program_loop_begin PROC + nop +randomx_program_loop_begin ENDP + +randomx_program_loop_load PROC + include asm/program_loop_load.inc +randomx_program_loop_load ENDP + +randomx_program_start PROC + nop +randomx_program_start ENDP + +randomx_program_read_dataset PROC + include asm/program_read_dataset.inc +randomx_program_read_dataset ENDP + +randomx_program_read_dataset_sshash_init PROC + include asm/program_read_dataset_sshash_init.inc +randomx_program_read_dataset_sshash_init ENDP + +randomx_program_read_dataset_sshash_fin PROC + include asm/program_read_dataset_sshash_fin.inc +randomx_program_read_dataset_sshash_fin ENDP + +randomx_program_loop_store PROC + include asm/program_loop_store.inc +randomx_program_loop_store ENDP + +randomx_program_loop_end PROC + nop +randomx_program_loop_end ENDP + +ALIGN 64 +randomx_dataset_init PROC + push rbx + push rbp + push rdi + push rsi + push r12 + push r13 + push r14 + push r15 + mov rdi, qword ptr [rcx] ;# cache->memory + mov rsi, rdx ;# dataset + mov rbp, r8 ;# block index + push r9 ;# max. 
block index +init_block_loop: + prefetchw byte ptr [rsi] + mov rbx, rbp + db 232 ;# 0xE8 = call + dd SUPERSCALAR_OFFSET - distance + distance equ $ - offset randomx_dataset_init + mov qword ptr [rsi+0], r8 + mov qword ptr [rsi+8], r9 + mov qword ptr [rsi+16], r10 + mov qword ptr [rsi+24], r11 + mov qword ptr [rsi+32], r12 + mov qword ptr [rsi+40], r13 + mov qword ptr [rsi+48], r14 + mov qword ptr [rsi+56], r15 + add rbp, 1 + add rsi, 64 + cmp rbp, qword ptr [rsp] + jb init_block_loop + pop r9 + pop r15 + pop r14 + pop r13 + pop r12 + pop rsi + pop rdi + pop rbp + pop rbx + ret +randomx_dataset_init ENDP + +ALIGN 64 +randomx_program_epilogue PROC + include asm/program_epilogue_store.inc + include asm/program_epilogue_win64.inc +randomx_program_epilogue ENDP + +ALIGN 64 +randomx_sshash_load PROC + include asm/program_sshash_load.inc +randomx_sshash_load ENDP + +randomx_sshash_prefetch PROC + include asm/program_sshash_prefetch.inc +randomx_sshash_prefetch ENDP + +randomx_sshash_end PROC + nop +randomx_sshash_end ENDP + +ALIGN 64 +randomx_sshash_init PROC + lea r8, [rbx+1] + include asm/program_sshash_prefetch.inc + imul r8, qword ptr [r0_mul] + mov r9, qword ptr [r1_add] + xor r9, r8 + mov r10, qword ptr [r2_add] + xor r10, r8 + mov r11, qword ptr [r3_add] + xor r11, r8 + mov r12, qword ptr [r4_add] + xor r12, r8 + mov r13, qword ptr [r5_add] + xor r13, r8 + mov r14, qword ptr [r6_add] + xor r14, r8 + mov r15, qword ptr [r7_add] + xor r15, r8 + jmp randomx_program_end +randomx_sshash_init ENDP + +ALIGN 64 + include asm/program_sshash_constants.inc + +ALIGN 64 +randomx_program_end PROC + nop +randomx_program_end ENDP + +randomx_reciprocal_fast PROC + include asm/randomx_reciprocal.inc +randomx_reciprocal_fast ENDP + +_RANDOMX_JITX86_STATIC ENDS + +ENDIF + +END \ No newline at end of file diff --git a/randomx/jit_compiler_x86_static.hpp b/randomx/jit_compiler_x86_static.hpp new file mode 100644 index 0000000..0a62c98 --- /dev/null +++ b/randomx/jit_compiler_x86_static.hpp @@ -0,0 +1,51 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#pragma once + +extern "C" { + void randomx_prefetch_scratchpad(); + void randomx_prefetch_scratchpad_end(); + void randomx_program_prologue(); + void randomx_program_prologue_first_load(); + void randomx_program_loop_begin(); + void randomx_program_loop_load(); + void randomx_program_start(); + void randomx_program_read_dataset(); + void randomx_program_read_dataset_sshash_init(); + void randomx_program_read_dataset_sshash_fin(); + void randomx_program_loop_store(); + void randomx_program_loop_end(); + void randomx_dataset_init(); + void randomx_program_epilogue(); + void randomx_sshash_load(); + void randomx_sshash_prefetch(); + void randomx_sshash_end(); + void randomx_sshash_init(); + void randomx_program_end(); +} diff --git a/randomx/mingw-std-threads-master/CMakeLists.txt b/randomx/mingw-std-threads-master/CMakeLists.txt new file mode 100644 index 0000000..ea2f559 --- /dev/null +++ b/randomx/mingw-std-threads-master/CMakeLists.txt @@ -0,0 +1,35 @@ +project(mingw_stdthreads) +cmake_minimum_required(VERSION 3.0) + +option(MINGW_STDTHREADS_BUILD_TEST "Build tests") +option(MINGW_STDTHREADS_GENERATE_STDHEADERS "Generate std-like headers") + +string(CONCAT mingw_stdthreads_dir_docstring + "Optional. When generating std-like headers , this variable can be set" + "to manually specify the path to mingw-stdthreads directory containing" + "original library headers.") +set(MINGW_STDTHREADS_DIR "${PROJECT_SOURCE_DIR}" + CACHE PATH ${mingw_stdthreads_dir_docstring}) + +# mingw-stdthreads is a header-only library, so make it a INTERFACE target +add_library(${PROJECT_NAME} INTERFACE) +target_include_directories(${PROJECT_NAME} INTERFACE "${PROJECT_SOURCE_DIR}") + +if(MINGW_STDTHREADS_GENERATE_STDHEADERS) + # Check if we are using gcc or clang + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") + # Add as dependency and generate std headers + add_subdirectory(cmake_stdheaders_generator) + target_link_libraries(${PROJECT_NAME} INTERFACE + cmake_stdheaders_generator) + else() + message(WARNING "Cannot generate std headers with this compiler: " + ${CMAKE_CXX_COMPILER_ID} ". " + "Please fall back to #include ") + endif() +endif() + +# Build tests.exe +if(MINGW_STDTHREADS_BUILD_TEST) + add_subdirectory(tests) +endif() \ No newline at end of file diff --git a/randomx/mingw-std-threads-master/LICENSE b/randomx/mingw-std-threads-master/LICENSE new file mode 100644 index 0000000..ac525cf --- /dev/null +++ b/randomx/mingw-std-threads-master/LICENSE @@ -0,0 +1,24 @@ +Copyright (c) 2016, Mega Limited +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/randomx/mingw-std-threads-master/README.md b/randomx/mingw-std-threads-master/README.md
new file mode 100644
index 0000000..f0bd9fe
--- /dev/null
+++ b/randomx/mingw-std-threads-master/README.md
@@ -0,0 +1,58 @@
+mingw-std-threads
+=================
+
+Implementation of standard C++11 threading classes, which are currently still missing on MinGW GCC.
+
+Target Windows version
+----------------------
+This implementation should work with Windows XP (regardless of service pack), or newer.
+The library automatically detects the version of Windows that is being targeted (at compile time), and selects an implementation that takes advantage of available Windows features.
+In MinGW GCC, the target Windows version may optionally be selected by the command-line option `-D _WIN32_WINNT=...`.
+Use `0x0600` for Windows Vista, or `0x0601` for Windows 7.
+See "[Modifying `WINVER` and `_WIN32_WINNT`](https://docs.microsoft.com/en-us/cpp/porting/modifying-winver-and-win32-winnt)" for more details.
+
+Usage
+-----
+
+This is a header-only library. To use, just include the corresponding `mingw.xxx.h` file, where `xxx` is the name of the standard header you would normally include.
+
+For example, `#include "mingw.thread.h"` replaces `#include <thread>`.
+
+A `CMakeLists.txt` has also been provided. You can add it to your project by using `add_subdirectory()`, and then this library can be added as your targets' dependency by using `target_link_libraries(YOUR_TARGET PRIVATE mingw_stdthreads)`. By default it just adds an include path, allowing you to include headers using angle brackets (for example `#include <mingw.thread.h>`). But you can also provide options to let it generate "std-like" headers (see next paragraph).
+
+Using "std-like" headers
+------------------------
+
+Probably you don't really want to replace all your includes from `#include <header>` to `#include "mingw.header.h"`. So if you are using GCC or clang, here are some ways to make you happy :)
+
+With CMake, you just need to turn on the option `MINGW_STDTHREADS_GENERATE_STDHEADERS` before adding mingw-stdthreads, something like this:
+```CMake
+option(MINGW_STDTHREADS_GENERATE_STDHEADERS "" ON)
+add_subdirectory(mingw_stdthreads)
+target_link_libraries(${TARGET} PRIVATE mingw_stdthreads)
+```
+When CMake generates project files, headers named in the "standard header" way will be generated and added to your include path. Then you can avoid stuff like `mingw.thread.h`, and keep using `#include <thread>` like always. In addition, `MINGW_STDTHREADS_GENERATED_STDHEADERS` will be defined; you can use this macro to check whether those generated headers are actually available.
+
+If you aren't using CMake, you can use one of the three scripts inside the [utility_scripts](utility_scripts) directory to manually generate those "std-like" headers. Note that this requires Microsoft PowerShell, so if you are cross-compiling you will need PowerShell installed.
+
+Compatibility
+-------------
+
+This code has been tested to work with MinGW-w64 5.3.0, but should work with any other MinGW version that has the `std` threading classes missing, has C++11 support for lambda functions and variadic templates, and has working mutex helper classes in `<mutex>`.
+
+Switching from the win32-pthread based implementation
+-----------------------------------------------------
+It seems that recent versions of MinGW-w64 include a Win32 port of pthreads, and have the `std::thread`, `std::mutex`, etc. classes implemented and working based on that compatibility layer.
+That is a somewhat heavier implementation, as it relies on an abstraction layer, so you may still want to use this implementation for efficiency purposes.
+Unfortunately you can't use this library standalone and independent of the system `<mutex>` headers, as it relies on those headers for `std::unique_lock` and other non-trivial utility classes.
+In that case you will need to edit the `c++config.h` file of your MinGW setup and comment out the definition of `_GLIBCXX_HAS_GTHREADS`.
+This will cause the system headers not to define the actual `thread`, `mutex`, etc. classes, but still define the necessary utility classes.
+
+Why MinGW has no threading classes
+----------------------------------
+It seems that for cross-platform threading implementation, the GCC standard library relies on the gthreads/pthreads library.
+If this library is not available, as is the case with MinGW, the classes `std::thread`, `std::mutex`, and `std::condition_variable` are not defined.
+However, various usable helper classes are still defined in the system headers.
+Hence, this implementation does not re-define them, and instead includes those headers.
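To make the Usage section above concrete, here is a minimal sketch of the header-swap style of use; it assumes a MinGW toolchain without gthreads, in which case these headers also export their classes into namespace `std` (see the `#if defined(__MINGW32__) && !defined(_GLIBCXX_HAS_GTHREADS)` blocks in the headers below):

```cpp
// Minimal, illustrative only; build with e.g. g++ -std=c++11 demo.cpp on MinGW.
#include <iostream>

#include "mingw.thread.h" // instead of <thread>
#include "mingw.mutex.h"  // instead of <mutex>

int main()
{
    std::mutex m;
    std::thread t([&m]() {
        std::lock_guard<std::mutex> lock(m);
        std::cout << "hello from a mingw-std-threads thread\n";
    });
    t.join();
    return 0;
}
```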
+ diff --git a/randomx/mingw-std-threads-master/cmake_stdheaders_generator/CMakeLists.txt b/randomx/mingw-std-threads-master/cmake_stdheaders_generator/CMakeLists.txt new file mode 100644 index 0000000..631ec10 --- /dev/null +++ b/randomx/mingw-std-threads-master/cmake_stdheaders_generator/CMakeLists.txt @@ -0,0 +1,78 @@ +cmake_minimum_required(VERSION 3.0) +project(cmake_stdheaders_generator) + +set(output_include_path "${PROJECT_BINARY_DIR}/${PROJECT_NAME}") +message("${PROJECT_NAME}: output_include_path set to ${output_include_path}") + +function(generate_mingw_stdthreads_header header_file_name + mingw_stdthreads_folder) + set(template_file_path "${PROJECT_SOURCE_DIR}/template.cpp") + set(destination_file_path "${output_include_path}/${header_file_name}") + + # Check if compiler is gcc or clang + if (NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") + # Actually this should never happen because it should have already + # been checked in the parent CMakeLists.txt + message(FATAL_ERROR "Unsupported compiler") + endif() + + # Call g++ to retrieve header path + # The -H option will let g++ outputs header dependencies to stderr + set(compiler_arguments + ${template_file_path} + -H + "-DMINGW_STDTHREADS_DETECTING_SYSTEM_HEADER=<${header_file_name}>") + # And content of stderr is saved to variable compiler_output + execute_process(COMMAND "${CMAKE_CXX_COMPILER}" ${compiler_arguments} + ERROR_VARIABLE compiler_output + OUTPUT_QUIET) + + # Get full path to system header + string(REGEX MATCH "[.] ([^\r\n]*)" _ "${compiler_output}") + set(mingw_stdthreads_headers_generator_system_header "${CMAKE_MATCH_1}") + message("Matched: <${mingw_stdthreads_headers_generator_system_header}>") + + # Ensure file exists + if(NOT EXISTS "${mingw_stdthreads_headers_generator_system_header}") + message(FATAL_ERROR "<${header_file_name}>'s path not found, " + "compiler output was:\n${compiler_output}") + endif() + + # Get full path to mingw-stdthreads header + set(mingw_stdthreads_headers_generator_library_header + "${mingw_stdthreads_folder}/mingw.${header_file_name}.h") + + # Normalize paths + file(TO_CMAKE_PATH "${mingw_stdthreads_headers_generator_system_header}" + mingw_stdthreads_headers_generator_system_header) + file(TO_CMAKE_PATH "${mingw_stdthreads_headers_generator_library_header}" + mingw_stdthreads_headers_generator_library_header) + + configure_file("${template_file_path}" "${destination_file_path}") +endfunction() + +if(EXISTS "${MINGW_STDTHREADS_DIR}") + message("${PROJECT_NAME}: MINGW_STDTHREADS_DIR: " + "${MINGW_STDTHREADS_DIR}") +else() + message(FATAL_ERROR "${PROECT_NAME}: MINGW_STDTHREADS_DIR does not " + "exist: ${MINGW_STDTHREADS_DIR}") +endif() + +# +generate_mingw_stdthreads_header(condition_variable "${MINGW_STDTHREADS_DIR}") +# +generate_mingw_stdthreads_header(future "${MINGW_STDTHREADS_DIR}") +# +generate_mingw_stdthreads_header(mutex "${MINGW_STDTHREADS_DIR}") +# +generate_mingw_stdthreads_header(shared_mutex "${MINGW_STDTHREADS_DIR}") +# +generate_mingw_stdthreads_header(thread "${MINGW_STDTHREADS_DIR}") + +# the generated headers are to be considered as a header only library +# so we create an interface target +add_library(${PROJECT_NAME} INTERFACE) +target_compile_definitions(${PROJECT_NAME} INTERFACE + MINGW_STDTHREADS_GENERATED_STDHEADERS) +target_include_directories(${PROJECT_NAME} INTERFACE "${output_include_path}") \ No newline at end of file diff --git a/randomx/mingw-std-threads-master/cmake_stdheaders_generator/template.cpp 
b/randomx/mingw-std-threads-master/cmake_stdheaders_generator/template.cpp new file mode 100644 index 0000000..e7c712c --- /dev/null +++ b/randomx/mingw-std-threads-master/cmake_stdheaders_generator/template.cpp @@ -0,0 +1,11 @@ +#ifdef MINGW_STDTHREADS_DETECTING_SYSTEM_HEADER + #include MINGW_STDTHREADS_DETECTING_SYSTEM_HEADER + static_assert(false, "Prevent compilation") +#else + #pragma once + // both system header and mignw-stdthreads header should already have include + // guards. But we still add a #pragma once just to be safe. + + #include "${mingw_stdthreads_headers_generator_system_header}" + #include "${mingw_stdthreads_headers_generator_library_header}" +#endif \ No newline at end of file diff --git a/randomx/mingw-std-threads-master/mingw.condition_variable.h b/randomx/mingw-std-threads-master/mingw.condition_variable.h new file mode 100644 index 0000000..50c5ebd --- /dev/null +++ b/randomx/mingw-std-threads-master/mingw.condition_variable.h @@ -0,0 +1,564 @@ +/** +* @file condition_variable.h +* @brief std::condition_variable implementation for MinGW +* +* (c) 2013-2016 by Mega Limited, Auckland, New Zealand +* @author Alexander Vassilev +* +* @copyright Simplified (2-clause) BSD License. +* You should have received a copy of the license along with this +* program. +* +* This code is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +* @note +* This file may become part of the mingw-w64 runtime package. If/when this happens, +* the appropriate license will be added, i.e. this code will become dual-licensed, +* and the current BSD 2-clause license will stay. +*/ + +#ifndef MINGW_CONDITIONAL_VARIABLE_H +#define MINGW_CONDITIONAL_VARIABLE_H + +#if !defined(__cplusplus) || (__cplusplus < 201103L) +#error A C++11 compiler is required! +#endif +// Use the standard classes for std::, if available. +#include + +#include +#include +#include + +#include // Detect Windows version. +#if (WINVER < _WIN32_WINNT_VISTA) +#include +#endif +#if (defined(__MINGW32__) && !defined(__MINGW64_VERSION_MAJOR)) +#pragma message "The Windows API that MinGW-w32 provides is not fully compatible\ + with Microsoft's API. We'll try to work around this, but we can make no\ + guarantees. This problem does not exist in MinGW-w64." +#include // No further granularity can be expected. +#else +#if (WINVER < _WIN32_WINNT_VISTA) +#include +#include // For CreateSemaphore +#include +#endif +#include +#endif + +#include "mingw.mutex.h" +#include "mingw.shared_mutex.h" + +#if !defined(_WIN32_WINNT) || (_WIN32_WINNT < 0x0501) +#error To use the MinGW-std-threads library, you will need to define the macro _WIN32_WINNT to be 0x0501 (Windows XP) or higher. +#endif + +namespace mingw_stdthread +{ +#if defined(__MINGW32__ ) && !defined(_GLIBCXX_HAS_GTHREADS) +enum class cv_status { no_timeout, timeout }; +#else +using std::cv_status; +#endif +namespace xp +{ +// Include the XP-compatible condition_variable classes only if actually +// compiling for XP. The XP-compatible classes are slower than the newer +// versions, and depend on features not compatible with Windows Phone 8. 
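// In outline, the XP-level condition_variable_any below works like this:
// waiters bump an atomic counter and block on a shared semaphore; notify_all()
// releases the semaphore once per registered waiter, then waits on an
// auto-reset wake event until the counter drains, and finally consumes any
// leftover semaphore count left behind by waiters that timed out meanwhile.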
+#if (WINVER < _WIN32_WINNT_VISTA) +class condition_variable_any +{ + recursive_mutex mMutex {}; + std::atomic mNumWaiters {0}; + HANDLE mSemaphore; + HANDLE mWakeEvent {}; +public: + using native_handle_type = HANDLE; + native_handle_type native_handle() + { + return mSemaphore; + } + condition_variable_any(const condition_variable_any&) = delete; + condition_variable_any& operator=(const condition_variable_any&) = delete; + condition_variable_any() + : mSemaphore(CreateSemaphoreA(NULL, 0, 0xFFFF, NULL)) + { + if (mSemaphore == NULL) + throw std::system_error(GetLastError(), std::generic_category()); + mWakeEvent = CreateEvent(NULL, FALSE, FALSE, NULL); + if (mWakeEvent == NULL) + { + CloseHandle(mSemaphore); + throw std::system_error(GetLastError(), std::generic_category()); + } + } + ~condition_variable_any() + { + CloseHandle(mWakeEvent); + CloseHandle(mSemaphore); + } +private: + template + bool wait_impl(M& lock, DWORD timeout) + { + { + lock_guard guard(mMutex); + mNumWaiters++; + } + lock.unlock(); + DWORD ret = WaitForSingleObject(mSemaphore, timeout); + + mNumWaiters--; + SetEvent(mWakeEvent); + lock.lock(); + if (ret == WAIT_OBJECT_0) + return true; + else if (ret == WAIT_TIMEOUT) + return false; +//2 possible cases: +//1)The point in notify_all() where we determine the count to +//increment the semaphore with has not been reached yet: +//we just need to decrement mNumWaiters, but setting the event does not hurt +// +//2)Semaphore has just been released with mNumWaiters just before +//we decremented it. This means that the semaphore count +//after all waiters finish won't be 0 - because not all waiters +//woke up by acquiring the semaphore - we woke up by a timeout. +//The notify_all() must handle this gracefully +// + else + { + using namespace std; + throw system_error(make_error_code(errc::protocol_error)); + } + } +public: + template + void wait(M& lock) + { + wait_impl(lock, INFINITE); + } + template + void wait(M& lock, Predicate pred) + { + while(!pred()) + { + wait(lock); + }; + } + + void notify_all() noexcept + { + lock_guard lock(mMutex); //block any further wait requests until all current waiters are unblocked + if (mNumWaiters.load() <= 0) + return; + + ReleaseSemaphore(mSemaphore, mNumWaiters, NULL); + while(mNumWaiters > 0) + { + auto ret = WaitForSingleObject(mWakeEvent, 1000); + if (ret == WAIT_FAILED || ret == WAIT_ABANDONED) + std::terminate(); + } + assert(mNumWaiters == 0); +//in case some of the waiters timed out just after we released the +//semaphore by mNumWaiters, it won't be zero now, because not all waiters +//woke up by acquiring the semaphore. So we must zero the semaphore before +//we accept waiters for the next event +//See _wait_impl for details + while(WaitForSingleObject(mSemaphore, 0) == WAIT_OBJECT_0); + } + void notify_one() noexcept + { + lock_guard lock(mMutex); + int targetWaiters = mNumWaiters.load() - 1; + if (targetWaiters <= -1) + return; + ReleaseSemaphore(mSemaphore, 1, NULL); + while(mNumWaiters > targetWaiters) + { + auto ret = WaitForSingleObject(mWakeEvent, 1000); + if (ret == WAIT_FAILED || ret == WAIT_ABANDONED) + std::terminate(); + } + assert(mNumWaiters == targetWaiters); + } + template + cv_status wait_for(M& lock, + const std::chrono::duration& rel_time) + { + using namespace std::chrono; + auto timeout = duration_cast(rel_time).count(); + DWORD waittime = (timeout < INFINITE) ? ((timeout < 0) ? 
0 : static_cast(timeout)) : (INFINITE - 1); + bool ret = wait_impl(lock, waittime) || (timeout >= INFINITE); + return ret?cv_status::no_timeout:cv_status::timeout; + } + + template + bool wait_for(M& lock, + const std::chrono::duration& rel_time, Predicate pred) + { + return wait_until(lock, std::chrono::steady_clock::now()+rel_time, pred); + } + template + cv_status wait_until (M& lock, + const std::chrono::time_point& abs_time) + { + return wait_for(lock, abs_time - Clock::now()); + } + template + bool wait_until (M& lock, + const std::chrono::time_point& abs_time, + Predicate pred) + { + while (!pred()) + { + if (wait_until(lock, abs_time) == cv_status::timeout) + { + return pred(); + } + } + return true; + } +}; +class condition_variable: condition_variable_any +{ + using base = condition_variable_any; +public: + using base::native_handle_type; + using base::native_handle; + using base::base; + using base::notify_all; + using base::notify_one; + void wait(unique_lock &lock) + { + base::wait(lock); + } + template + void wait(unique_lock& lock, Predicate pred) + { + base::wait(lock, pred); + } + template + cv_status wait_for(unique_lock& lock, const std::chrono::duration& rel_time) + { + return base::wait_for(lock, rel_time); + } + template + bool wait_for(unique_lock& lock, const std::chrono::duration& rel_time, Predicate pred) + { + return base::wait_for(lock, rel_time, pred); + } + template + cv_status wait_until (unique_lock& lock, const std::chrono::time_point& abs_time) + { + return base::wait_until(lock, abs_time); + } + template + bool wait_until (unique_lock& lock, const std::chrono::time_point& abs_time, Predicate pred) + { + return base::wait_until(lock, abs_time, pred); + } +}; +#endif // Compiling for XP +} // Namespace mingw_stdthread::xp + +#if (WINVER >= _WIN32_WINNT_VISTA) +namespace vista +{ +// If compiling for Vista or higher, use the native condition variable. +class condition_variable +{ + static constexpr DWORD kInfinite = 0xffffffffl; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant" + CONDITION_VARIABLE cvariable_ = CONDITION_VARIABLE_INIT; +#pragma GCC diagnostic pop + + friend class condition_variable_any; + +#if STDMUTEX_RECURSION_CHECKS + template + inline static void before_wait (MTX * pmutex) + { + pmutex->mOwnerThread.checkSetOwnerBeforeUnlock(); + } + template + inline static void after_wait (MTX * pmutex) + { + pmutex->mOwnerThread.setOwnerAfterLock(GetCurrentThreadId()); + } +#else + inline static void before_wait (void *) { } + inline static void after_wait (void *) { } +#endif + + bool wait_impl (unique_lock & lock, DWORD time) + { + using mutex_handle_type = typename xp::mutex::native_handle_type; + static_assert(std::is_same::value, + "Native Win32 condition variable requires std::mutex to \ +use native Win32 critical section objects."); + xp::mutex * pmutex = lock.release(); + before_wait(pmutex); + BOOL success = SleepConditionVariableCS(&cvariable_, + pmutex->native_handle(), + time); + after_wait(pmutex); + lock = unique_lock(*pmutex, adopt_lock); + return success; + } + + bool wait_unique (windows7::mutex * pmutex, DWORD time) + { + before_wait(pmutex); + BOOL success = SleepConditionVariableSRW( native_handle(), + pmutex->native_handle(), + time, +// CONDITION_VARIABLE_LOCKMODE_SHARED has a value not specified by +// Microsoft's Dev Center, but is known to be (convertible to) a ULONG. 
To +// ensure that the value passed to this function is not equal to Microsoft's +// constant, we can either use a static_assert, or simply generate an +// appropriate value. + !CONDITION_VARIABLE_LOCKMODE_SHARED); + after_wait(pmutex); + return success; + } + bool wait_impl (unique_lock & lock, DWORD time) + { + windows7::mutex * pmutex = lock.release(); + bool success = wait_unique(pmutex, time); + lock = unique_lock(*pmutex, adopt_lock); + return success; + } +public: + using native_handle_type = PCONDITION_VARIABLE; + native_handle_type native_handle (void) + { + return &cvariable_; + } + + condition_variable (void) = default; + ~condition_variable (void) = default; + + condition_variable (const condition_variable &) = delete; + condition_variable & operator= (const condition_variable &) = delete; + + void notify_one (void) noexcept + { + WakeConditionVariable(&cvariable_); + } + + void notify_all (void) noexcept + { + WakeAllConditionVariable(&cvariable_); + } + + void wait (unique_lock & lock) + { + wait_impl(lock, kInfinite); + } + + template + void wait (unique_lock & lock, Predicate pred) + { + while (!pred()) + wait(lock); + } + + template + cv_status wait_for(unique_lock& lock, + const std::chrono::duration& rel_time) + { + using namespace std::chrono; + auto timeout = duration_cast(rel_time).count(); + DWORD waittime = (timeout < kInfinite) ? ((timeout < 0) ? 0 : static_cast(timeout)) : (kInfinite - 1); + bool result = wait_impl(lock, waittime) || (timeout >= kInfinite); + return result ? cv_status::no_timeout : cv_status::timeout; + } + + template + bool wait_for(unique_lock& lock, + const std::chrono::duration& rel_time, + Predicate pred) + { + return wait_until(lock, + std::chrono::steady_clock::now() + rel_time, + std::move(pred)); + } + template + cv_status wait_until (unique_lock& lock, + const std::chrono::time_point& abs_time) + { + return wait_for(lock, abs_time - Clock::now()); + } + template + bool wait_until (unique_lock& lock, + const std::chrono::time_point& abs_time, + Predicate pred) + { + while (!pred()) + { + if (wait_until(lock, abs_time) == cv_status::timeout) + { + return pred(); + } + } + return true; + } +}; + +class condition_variable_any +{ + static constexpr DWORD kInfinite = 0xffffffffl; + using native_shared_mutex = windows7::shared_mutex; + + condition_variable internal_cv_ {}; +// When available, the SRW-based mutexes should be faster than the +// CriticalSection-based mutexes. Only try_lock will be unavailable in Vista, +// and try_lock is not used by condition_variable_any. + windows7::mutex internal_mutex_ {}; + + template + bool wait_impl (L & lock, DWORD time) + { + unique_lock internal_lock(internal_mutex_); + lock.unlock(); + bool success = internal_cv_.wait_impl(internal_lock, time); + lock.lock(); + return success; + } +// If the lock happens to be called on a native Windows mutex, skip any extra +// contention. + inline bool wait_impl (unique_lock & lock, DWORD time) + { + return internal_cv_.wait_impl(lock, time); + } +// Some shared_mutex functionality is available even in Vista, but it's not +// until Windows 7 that a full implementation is natively possible. The class +// itself is defined, with missing features, at the Vista feature level. 
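// The two overloads below handle shared_mutex locks: release the caller's
// lock, sleep on the native condition variable, then re-adopt the lock on
// wakeup. A unique_lock on the shared mutex goes through
// internal_cv_.wait_unique() (an exclusive SRW wait), while a shared_lock
// passes CONDITION_VARIABLE_LOCKMODE_SHARED to SleepConditionVariableSRW
// directly.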
+ bool wait_impl (unique_lock & lock, DWORD time) + { + native_shared_mutex * pmutex = lock.release(); + bool success = internal_cv_.wait_unique(pmutex, time); + lock = unique_lock(*pmutex, adopt_lock); + return success; + } + bool wait_impl (shared_lock & lock, DWORD time) + { + native_shared_mutex * pmutex = lock.release(); + BOOL success = SleepConditionVariableSRW(native_handle(), + pmutex->native_handle(), time, + CONDITION_VARIABLE_LOCKMODE_SHARED); + lock = shared_lock(*pmutex, adopt_lock); + return success; + } +public: + using native_handle_type = typename condition_variable::native_handle_type; + + native_handle_type native_handle (void) + { + return internal_cv_.native_handle(); + } + + void notify_one (void) noexcept + { + internal_cv_.notify_one(); + } + + void notify_all (void) noexcept + { + internal_cv_.notify_all(); + } + + condition_variable_any (void) = default; + ~condition_variable_any (void) = default; + + template + void wait (L & lock) + { + wait_impl(lock, kInfinite); + } + + template + void wait (L & lock, Predicate pred) + { + while (!pred()) + wait(lock); + } + + template + cv_status wait_for(L& lock, const std::chrono::duration& period) + { + using namespace std::chrono; + auto timeout = duration_cast(period).count(); + DWORD waittime = (timeout < kInfinite) ? ((timeout < 0) ? 0 : static_cast(timeout)) : (kInfinite - 1); + bool result = wait_impl(lock, waittime) || (timeout >= kInfinite); + return result ? cv_status::no_timeout : cv_status::timeout; + } + + template + bool wait_for(L& lock, const std::chrono::duration& period, + Predicate pred) + { + return wait_until(lock, std::chrono::steady_clock::now() + period, + std::move(pred)); + } + template + cv_status wait_until (L& lock, + const std::chrono::time_point& abs_time) + { + return wait_for(lock, abs_time - Clock::now()); + } + template + bool wait_until (L& lock, + const std::chrono::time_point& abs_time, + Predicate pred) + { + while (!pred()) + { + if (wait_until(lock, abs_time) == cv_status::timeout) + { + return pred(); + } + } + return true; + } +}; +} // Namespace vista +#endif +#if WINVER < 0x0600 +using xp::condition_variable; +using xp::condition_variable_any; +#else +using vista::condition_variable; +using vista::condition_variable_any; +#endif +} // Namespace mingw_stdthread + +// Push objects into std, but only if they are not already there. +namespace std +{ +// Because of quirks of the compiler, the common "using namespace std;" +// directive would flatten the namespaces and introduce ambiguity where there +// was none. Direct specification (std::), however, would be unaffected. +// Take the safe option, and include only in the presence of MinGW's win32 +// implementation. +#if defined(__MINGW32__ ) && !defined(_GLIBCXX_HAS_GTHREADS) +using mingw_stdthread::cv_status; +using mingw_stdthread::condition_variable; +using mingw_stdthread::condition_variable_any; +#elif !defined(MINGW_STDTHREAD_REDUNDANCY_WARNING) // Skip repetition +#define MINGW_STDTHREAD_REDUNDANCY_WARNING +#pragma message "This version of MinGW seems to include a win32 port of\ + pthreads, and probably already has C++11 std threading classes implemented,\ + based on pthreads. These classes, found in namespace std, are not overridden\ + by the mingw-std-thread library. If you would still like to use this\ + implementation (as it is more lightweight), use the classes provided in\ + namespace mingw_stdthread." 
+#endif +} +#endif // MINGW_CONDITIONAL_VARIABLE_H diff --git a/randomx/mingw-std-threads-master/mingw.future.h b/randomx/mingw-std-threads-master/mingw.future.h new file mode 100644 index 0000000..0aa4536 --- /dev/null +++ b/randomx/mingw-std-threads-master/mingw.future.h @@ -0,0 +1,1118 @@ +/// \file mingw.future.h +/// \brief Standard-compliant C++ futures for MinGW +/// +/// (c) 2018 by Nathaniel J. McClatchey, San Jose, California +/// \author Nathaniel J. McClatchey, PhD +/// +/// \copyright Simplified (2-clause) BSD License. +/// +/// \note This file may become part of the mingw-w64 runtime package. If/when +/// this happens, the appropriate license will be added, i.e. this code will +/// become dual-licensed, and the current BSD 2-clause license will stay. +/// \note Target Windows version is determined by WINVER, which is determined in +/// from _WIN32_WINNT, which can itself be set by the user. + +#ifndef MINGW_FUTURE_H_ +#define MINGW_FUTURE_H_ + +#if !defined(__cplusplus) || (__cplusplus < 201103L) +#error The MinGW STD Threads library requires a compiler supporting C++11. +#endif + +#include + +#include +#include +#include // For std::pair +#include +#include +#include // For std::hash + +#include "mingw.thread.h" // Start new threads, and use invoke. + +// Mutexes and condition variables are used explicitly. +#include "mingw.mutex.h" +#include "mingw.condition_variable.h" + +#if (defined(__MINGW32__) && !defined(__MINGW64_VERSION_MAJOR)) +#pragma message "The Windows API that MinGW-w32 provides is not fully compatible\ + with Microsoft's API. We'll try to work around this, but we can make no\ + guarantees. This problem does not exist in MinGW-w64." +#include // No further granularity can be expected. +#else +#include +#include +#include +#endif + +// Note: +// std::shared_ptr is the natural choice for this. However, a custom +// implementation removes the need to keep a control block separate from the +// class itself (no need to support weak pointers). + +namespace mingw_stdthread +{ +using std::future_errc; +using std::future_error; +using std::future_status; +using std::launch; +using std::future_category; + +namespace detail +{ +struct Empty { }; + +// Use a class template to allow instantiation of statics in a header-only +// library. Note: Template will only be instantiated once to avoid bloat. +template +struct FutureStatic +{ + enum Type : uint_fast8_t + { + kUndecided = 0x00, + kDeferred = 0x05, + kValue = 0x02, + kException = 0x03, + kSetFlag = 0x02, + kTypeMask = 0x03, + kReadyFlag = 0x04 + }; + + static std::vector > sync_pool; + + static mutex & get_mutex (void const * ptr) + { + std::hash hash_func; + return sync_pool[hash_func(ptr) % sync_pool.size()].first; + } + static condition_variable & get_condition_variable (void const * ptr) + { + std::hash hash_func; + return sync_pool[hash_func(ptr) % sync_pool.size()].second; + } +}; +template +std::vector > FutureStatic::sync_pool (thread::hardware_concurrency() * 2 + 1); + +struct FutureStateBase +{ + inline mutex & get_mutex (void) const + { + return FutureStatic::get_mutex(this); + } + inline condition_variable & get_condition_variable (void) const + { + return FutureStatic::get_condition_variable(this); + } + typedef typename FutureStatic::Type Type; +// Destroys this object. Used for allocator-awareness. 
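// deallocate_this() is virtual so that allocator-backed states (see
// FutureStateAllocated below) can destroy and free themselves through their
// stored allocator, while plain heap-allocated states simply `delete this`.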
+ virtual void deallocate_this (void) noexcept = 0; + virtual ~FutureStateBase (void) = default; + + FutureStateBase (FutureStateBase const &) = delete; + FutureStateBase & operator= (FutureStateBase const &) = delete; + + FutureStateBase(Type t) noexcept + : mReferences(0), mType(t) + { + } + + void increment_references (void) noexcept + { + mReferences.fetch_add(1, std::memory_order_relaxed); + } + + void decrement_references (void) noexcept + { + if (mReferences.fetch_sub(1, std::memory_order_acquire) == 0) + deallocate_this(); + } + + std::atomic mReferences; + std::atomic mType; +}; + +// Reduce compilation time and improve code re-use. +struct FutureBase : public FutureStatic +{ + typedef FutureStatic Base; + FutureStateBase * mState; + + mutex & get_mutex (void) const + { + return FutureStatic::get_mutex(mState); + } + condition_variable & get_condition_variable (void) const + { + return FutureStatic::get_condition_variable(mState); + } + + FutureBase (FutureStateBase * ptr) noexcept + : mState(ptr) + { + } + + FutureBase (FutureBase && source) noexcept + : mState(source.mState) + { + source.mState = nullptr; + } + + ~FutureBase (void) + { + release(); + } + + FutureBase (FutureBase const &) = delete; + FutureBase & operator= (FutureBase const &) = delete; + + bool valid (void) const noexcept + { + return mState != nullptr; + } + +// Releases this object's hold on its state. Requires a specification of +// which state is being used. + inline void release (void) noexcept + { + if (valid()) + mState->decrement_references(); + mState = nullptr; + } + + void wait (std::unique_lock & lock) const + { +#if !defined(NDEBUG) + if (!valid()) + throw future_error(future_errc::no_state); +#endif +// If there's already a value or exception, don't do any extraneous +// synchronization. The `get()` method will do that for us. + if (mState->mType.load(std::memory_order_relaxed) & kReadyFlag) + return; + get_condition_variable().wait(lock, [this](void)->bool { + return mState->mType.load(std::memory_order_relaxed) & kReadyFlag; + }); + } + + template + future_status wait_for (std::chrono::duration const & dur) const + { +#if !defined(NDEBUG) + if (!valid()) + throw future_error(future_errc::no_state); +#endif + auto current_state = mState->mType.load(std::memory_order_relaxed); + if (current_state & kReadyFlag) + return (current_state == kDeferred) ? future_status::deferred : future_status::ready; + std::unique_lock lock { get_mutex() }; + if (get_condition_variable().wait_for(lock, dur, + [this](void)->bool { + return mState->mType.load(std::memory_order_relaxed) & kReadyFlag; + })) + return future_status::ready; + else + return future_status::timeout; + } + + template + future_status wait_until(const std::chrono::time_point& time) const + { + return wait_for(time - Clock::now()); + } +}; + +template +struct FutureState : public FutureStateBase +{ +// The state never needs more than one of these at any one time, so don't +// waste space or allocation time. + union { + struct {} mUndecided; // Included to make the active member unambiguous. 
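// (The active member is tracked by mType in FutureStateBase: the set_value /
// set_exception helpers placement-new the chosen member, and ~FutureState()
// switches on that tag to run the matching destructor.)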
+ T mObject; + std::exception_ptr mException; + std::function mFunction; + }; + + FutureState (void) noexcept + : FutureStateBase(Type::kUndecided), mUndecided() + { + } + + FutureState (std::function && deferred_function) + : FutureStateBase(Type::kDeferred), mFunction(std::move(deferred_function)) + { + } + + void deallocate_this (void) noexcept override + { + delete this; + } + + template + void set_value (Arg && arg) + { + assert(!(mType.load(std::memory_order_relaxed) & Type::kSetFlag)); + new(&mObject) T (std::forward(arg)); + mType.store(Type::kValue | Type::kReadyFlag, std::memory_order_release); + } + template + void set_exception (Arg && arg) + { + assert(!(mType.load(std::memory_order_relaxed) & Type::kSetFlag)); + new(&mException) std::exception_ptr (std::forward(arg)); + mType.store(Type::kException | Type::kReadyFlag, std::memory_order_release); + } +// These overloads set value/exception, but don't make it ready. + template + void set_value (Arg && arg, bool) + { + assert(!(mType.load(std::memory_order_relaxed) & Type::kSetFlag)); + new(&mObject) T (std::forward(arg)); + mType.store(Type::kValue, std::memory_order_release); + } + template + void set_exception (Arg && arg, bool) + { + assert(!(mType.load(std::memory_order_relaxed) & Type::kSetFlag)); + new(&mException) std::exception_ptr (std::forward(arg)); + mType.store(Type::kException, std::memory_order_release); + } + //private: + ~FutureState (void) + { + switch (mType.load(std::memory_order_acquire) & Type::kTypeMask) + { + case Type::kDeferred & Type::kTypeMask: + mFunction.~function(); + break; + case Type::kValue: + mObject.~T(); + break; + case Type::kException: + mException.~exception_ptr(); + break; + default:; + } + } +}; + +template +struct FutureStateAllocated : public FutureState +{ + typedef typename std::allocator_traits::void_pointer void_pointer; + void_pointer mThis; + Alloc mAllocator; + + FutureStateAllocated (Alloc const & alloc, void_pointer const & vptr) noexcept + : FutureState(), mThis(vptr), mAllocator(alloc) + { + } + + FutureStateAllocated (FutureStateAllocated const &) = delete; + FutureStateAllocated & operator= (FutureStateAllocated const &) = delete; + + void deallocate_this (void) noexcept override + { + typedef typename std::allocator_traits::template rebind_traits > allocator_traits; + typename allocator_traits::allocator_type alloc(std::move(mAllocator)); + typedef typename allocator_traits::pointer pointer; + pointer ptr(static_cast(mThis)); + allocator_traits::destroy(alloc, this); + allocator_traits::deallocate(alloc, ptr, 1); + } +}; +} // Namespace "detail" + +#if (defined(__MINGW32__ ) && !defined(_GLIBCXX_HAS_GTHREADS)) +} +namespace std { +#else +template +class future; +template +class shared_future; +template +class promise; +#endif + +template +class future : mingw_stdthread::detail::FutureBase +{ + typedef mingw_stdthread::detail::FutureState state_type; + future (state_type * ptr) noexcept + : FutureBase(ptr) + { + } + + friend class shared_future; + friend class promise; + + template + friend class future; + + template + friend future<__async_result_of<_Fn, _Args...>> async (std::launch, _Fn &&, _Args&&...); + public: + using FutureBase::valid; + using FutureBase::wait_for; + using FutureBase::wait_until; + + future (void) noexcept + : FutureBase(nullptr) + { + } + future & operator= (future && source) noexcept + { +// Check for this atypical behavior rather than creating a nonsensical state. 
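// (Without the self-assignment guard, `f = std::move(f)` would release the
// only reference to the state and leave this future empty.)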
+ if (this != &source) + { + release(); + mState = source.mState; + source.mState = nullptr; + } + return *this; + } + future (future && source) noexcept + : FutureBase(std::move(source)) + { + } + + ~future (void) = default; + + future (future const &) = delete; + future & operator= (future const &) = delete; + + T const & get (void) const + { + wait(); + if (mState->mType.load(std::memory_order_acquire) == (kValue | kReadyFlag)) + return static_cast(mState)->mObject; + else + { + assert(mState->mType.load(std::memory_order_relaxed) == (kException | kReadyFlag)); + std::rethrow_exception(static_cast(mState)->mException); + } + } + + shared_future share (void) noexcept; + + void wait (void) const + { + std::unique_lock lock { get_mutex() }; + FutureBase::wait(lock); + if (mState->mType.load(std::memory_order_acquire) == kDeferred) + { + state_type * ptr = static_cast(mState); + decltype(ptr->mFunction) func = std::move(ptr->mFunction); + ptr->mFunction.~function(); + func(); + ptr->get_condition_variable().notify_all(); + } + } +}; + +template +class shared_future : future +{ + typedef typename future::state_type state_type; + public: + using future::get; + using future::wait; + using future::wait_for; + using future::wait_until; + using future::valid; + + shared_future (void) noexcept : future() + { + } + + shared_future (shared_future && source) noexcept + : future(std::move(source)) + { + } + + shared_future & operator= (shared_future && source) noexcept + { + return future::operator=(std::move(source)); + } + + shared_future (shared_future const & source) noexcept(__cplusplus >= 201703L) + : future(static_cast(source.mState)) + { + future::mState->increment_references(); + } + + shared_future & operator= (shared_future const & source) noexcept(__cplusplus >= 201703L) + { + if (future::mState == source.mState) + return *this; + future::release(); + future::mState = source.mState; + future::mState->increment_references(); + return *this; + } + + shared_future (future && source) noexcept + : future(std::move(source)) + { + } + + shared_future & operator= (future && source) noexcept + { + future::operator=(std::move(source)); + return *this; + } + + ~shared_future (void) = default; +}; + +template +class promise : mingw_stdthread::detail::FutureBase +{ + bool mRetrieved; + typedef mingw_stdthread::detail::FutureState state_type; + void check_before_set (void) const + { + if (!valid()) + throw future_error(future_errc::no_state); + if (mState->mType.load(std::memory_order_relaxed) & kSetFlag) + throw future_error(future_errc::promise_already_satisfied); + } + + void check_abandon (void) + { + if (valid() && !(mState->mType.load(std::memory_order_relaxed) & kSetFlag)) + { + set_exception(std::make_exception_ptr(future_error(future_errc::broken_promise))); + } + } +/// \bug Might throw more exceptions than specified by the standard... +// Need OS support for this... + void make_ready_at_thread_exit (void) + { + static constexpr DWORD kInfinite = 0xffffffffl; +// Need to turn the pseudohandle from GetCurrentThread() into a true handle... + HANDLE thread_handle; + BOOL success = DuplicateHandle(GetCurrentProcess(), + GetCurrentThread(), + GetCurrentProcess(), + &thread_handle, + 0, // Access doesn't matter. Will be duplicated. + FALSE, // No need for this to be inherited. 
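// (Assumption worth noting: DUPLICATE_CLOSE_SOURCE on the GetCurrentThread()
// pseudo-handle is effectively a no-op; only the duplicated handle ever
// needs an explicit CloseHandle.)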
+ DUPLICATE_SAME_ACCESS | DUPLICATE_CLOSE_SOURCE); + if (!success) + throw std::runtime_error("MinGW STD Threads library failed to make a promise ready after thread exit."); + + mState->increment_references(); + bool handle_handled = false; + try { + state_type * ptr = static_cast(mState); + mingw_stdthread::thread watcher_thread ([ptr, thread_handle, &handle_handled](void) + { + { + std::lock_guard guard (ptr->get_mutex()); + handle_handled = true; + } + ptr->get_condition_variable().notify_all(); +// Wait for the original thread to die. + WaitForSingleObject(thread_handle, kInfinite); + CloseHandle(thread_handle); + + { + std::lock_guard guard (ptr->get_mutex()); + ptr->mType.fetch_or(kReadyFlag, std::memory_order_relaxed); + } + ptr->get_condition_variable().notify_all(); + + ptr->decrement_references(); + }); + { + std::unique_lock guard (ptr->get_mutex()); + ptr->get_condition_variable().wait(guard, [&handle_handled](void)->bool + { + return handle_handled; + }); + } + watcher_thread.detach(); + } + catch (...) + { +// Because the original promise is still alive, this can't be the decrement +// destroys it. + mState->decrement_references(); + if (!handle_handled) + CloseHandle(thread_handle); + } + } + + template + future make_future (void) + { + if (!valid()) + throw future_error(future_errc::no_state); + if (mRetrieved) + throw future_error(future_errc::future_already_retrieved); + mState->increment_references(); + mRetrieved = true; + return future(static_cast(mState)); + } + + template + friend class promise; + public: +// Create a promise with an empty state, with the reference counter set to +// indicate that the state is only held by this promise (i.e. not by any +// futures). + promise (void) + : FutureBase(new state_type ()), mRetrieved(false) + { + } + + template + promise (std::allocator_arg_t, Alloc const & alloc) + : FutureBase(nullptr), mRetrieved(false) + { + typedef mingw_stdthread::detail::FutureStateAllocated State; + typedef typename std::allocator_traits::template rebind_traits Traits; + typename Traits::allocator_type rebound_alloc(alloc); + typename Traits::pointer ptr = Traits::allocate(rebound_alloc, 1); + typename Traits::void_pointer vptr = ptr; + State * sptr = std::addressof(*ptr); + Traits::construct(rebound_alloc, sptr, std::move(rebound_alloc), vptr); + mState = static_cast(sptr); + } + + promise (promise && source) noexcept + : FutureBase(std::move(source)), mRetrieved(source.mRetrieved) + { + } + + ~promise (void) + { + check_abandon(); + } + + promise & operator= (promise && source) noexcept + { + if (this == &source) + return *this; + check_abandon(); + release(); + mState = source.mState; + mRetrieved = source.mRetrieved; + source.mState = nullptr; + return *this; + } + + void swap (promise & other) noexcept + { + std::swap(mState, other.mState); + std::swap(mRetrieved, other.mRetrieved); + } + + promise (promise const &) = delete; + promise & operator= (promise const &) = delete; + + future get_future (void) + { + return make_future(); + } + + void set_value (T const & value) + { + { + std::lock_guard lock { get_mutex() }; + check_before_set(); + static_cast(mState)->set_value(value); + } + get_condition_variable().notify_all(); + } + + void set_value (T && value) + { + { + std::lock_guard lock { get_mutex() }; + check_before_set(); + static_cast(mState)->set_value(std::move(value)); + } + get_condition_variable().notify_all(); + } + + void set_value_at_thread_exit (T const & value) + { + { + std::lock_guard lock { get_mutex() }; + 
check_before_set(); + static_cast(mState)->set_value(value, false); + } + make_ready_at_thread_exit(); + } + + void set_value_at_thread_exit (T && value) + { + { + std::lock_guard lock { get_mutex() }; + check_before_set(); + static_cast(mState)->set_value(std::move(value), false); + } + make_ready_at_thread_exit(); + } + + void set_exception (std::exception_ptr eptr) + { + { + std::lock_guard lock { get_mutex() }; + check_before_set(); + static_cast(mState)->set_exception(eptr); + } + get_condition_variable().notify_all(); + } + + void set_exception_at_thread_exit (std::exception_ptr eptr) + { + { + std::lock_guard lock { get_mutex() }; + check_before_set(); + static_cast(mState)->set_exception(eptr, false); + } + make_ready_at_thread_exit(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Reference Specialization // +//////////////////////////////////////////////////////////////////////////////// + +template +class future : future +{ + typedef future Base; + template + friend class shared_future; + template + friend class promise; + + future (typename Base::state_type * state) + : Base(state) + { + } + + template + friend future<__async_result_of<_Fn, _Args...>> async (std::launch, _Fn &&, _Args&&...); + public: + using Base::valid; + using Base::wait_for; + using Base::wait_until; + using Base::wait; + + future (void) noexcept = default; + + inline T& get (void) const + { + return *static_cast(Base::get()); + } + + shared_future share (void) noexcept; +}; + +template +class shared_future : shared_future +{ + typedef shared_future Base; + public: + using Base::wait; + using Base::wait_for; + using Base::wait_until; + using Base::valid; + + inline T& get (void) const + { + return *static_cast(Base::get()); + } + + shared_future (future && source) noexcept + : Base(std::move(source)) + { + } + + shared_future & operator= (future && source) noexcept + { + Base::operator=(std::move(source)); + return *this; + } + + ~shared_future (void) = default; +}; + +template +class promise : private promise +{ + typedef promise Base; + public: + using Base::set_exception; + using Base::set_exception_at_thread_exit; + + promise (void) = default; + template + promise (std::allocator_arg_t arg, Alloc const & alloc) + : Base(arg, alloc) + { + } + + inline void set_value (T & value) + { + typedef typename std::remove_cv::type T_non_cv; + Base::set_value(const_cast(std::addressof(value))); + } + + inline void set_value_at_thread_exit (T & value) + { + typedef typename std::remove_cv::type T_non_cv; + Base::set_value_at_thread_exit(const_cast(std::addressof(value))); + } + + inline future get_future (void) + { + return Base::template make_future(); + } + + void swap (promise & other) noexcept + { + Base::swap(other); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Void Specialization // +//////////////////////////////////////////////////////////////////////////////// + +template<> +class future : future +{ + typedef mingw_stdthread::detail::Empty Empty; + template + friend class shared_future; + template + friend class promise; + + future(future::state_type * state) + : future(state) + { + } + + template + friend future<__async_result_of<_Fn, _Args...>> async (std::launch, _Fn &&, _Args&&...); + + public: + using future::valid; + using future::wait_for; + using future::wait_until; + using future::wait; + + future (void) noexcept = default; + + void get (void) const + { + future::get(); + } + + shared_future share 
(void) noexcept; +}; + +template<> +class shared_future : shared_future +{ + typedef mingw_stdthread::detail::Empty Empty; + public: + using shared_future::wait; + using shared_future::wait_for; + using shared_future::wait_until; + using shared_future::valid; + + void get (void) const + { + shared_future::get(); + } + + shared_future (void) noexcept = default; + + shared_future (shared_future && source) noexcept = default; + + shared_future & operator= (shared_future && source) noexcept = default; + + shared_future (shared_future const & source) noexcept(__cplusplus >= 201703L) = default; + + shared_future & operator= (shared_future const & source) noexcept(__cplusplus >= 201703L) = default; + + shared_future (future && source) noexcept + : shared_future(std::move(source)) + { + } + + shared_future & operator= (future && source) noexcept + { + shared_future::operator=(std::move(source)); + return *this; + } + + ~shared_future (void) = default; +}; + +inline shared_future future::share (void) noexcept +{ + return shared_future(std::move(*this)); +} + +template +shared_future future::share (void) noexcept +{ + return shared_future(std::move(*this)); +} + +template +shared_future future::share (void) noexcept +{ + return shared_future(std::move(*this)); +} + +template<> +class promise : private promise +{ + typedef mingw_stdthread::detail::Empty Empty; + public: + using promise::set_exception; + using promise::set_exception_at_thread_exit; + + promise (void) = default; + template + promise (std::allocator_arg_t arg, Alloc const & alloc) + : promise(arg, alloc) + { + } + + inline void set_value (void) + { + promise::set_value(Empty()); + } + + inline void set_value_at_thread_exit (void) + { + promise::set_value_at_thread_exit(Empty()); + } + + inline future get_future (void) + { + return promise::template make_future(); + } + + void swap (promise & other) noexcept + { + promise::swap(other); + } +}; + + + +template +void swap(promise & lhs, promise & rhs) noexcept +{ + lhs.swap(rhs); +} + +template +struct uses_allocator, Alloc> : std::true_type +{ +}; + +} // Namespace "std" +namespace mingw_stdthread +{ +namespace detail +{ +template +struct StorageHelper +{ + template + static void store_deferred (FutureState * state_ptr, Func && func, Args&&... args) + { + try { + state_ptr->set_value(invoke(std::forward(func), std::forward(args)...)); + } catch (...) { + state_ptr->set_exception(std::current_exception()); + } + } + template + static void store (FutureState * state_ptr, Func && func, Args&&... args) + { + { + std::lock_guard lock { state_ptr->get_mutex() }; + store_deferred(state_ptr, std::forward(func), std::forward(args)...); + } + state_ptr->get_condition_variable().notify_all(); + } +}; + +template +struct StorageHelper +{ + template + static void store_deferred (FutureState * state_ptr, Func && func, Args&&... args) + { + try { + typedef typename std::remove_cv::type Ref_non_cv; + Ref & rf = invoke(std::forward(func), std::forward(args)...); + state_ptr->set_value(const_cast(std::addressof(rf))); + } catch (...) { + state_ptr->set_exception(std::current_exception()); + } + } + template + static void store (FutureState * state_ptr, Func && func, Args&&... args) + { + { + std::lock_guard lock { state_ptr->get_mutex() }; + store_deferred(state_ptr, std::forward(func), std::forward(args)...); + } + state_ptr->get_condition_variable().notify_all(); + } +}; + +template<> +struct StorageHelper +{ + template + static void store_deferred (FutureState * state_ptr, Func && func, Args&&... 
args) + { + try { + invoke(std::forward(func), std::forward(args)...); + state_ptr->set_value(Empty{}); + } catch (...) { + state_ptr->set_exception(std::current_exception()); + } + } + template + static void store (FutureState * state_ptr, Func && func, Args&&... args) + { + { + std::lock_guard lock { state_ptr->get_mutex() }; + store_deferred(state_ptr, std::forward(func), std::forward(args)...); + } + state_ptr->get_condition_variable().notify_all(); + } +}; +} // Namespace "detail" +} // Namespace "mingw_stdthread" +namespace std +{ + + +// Unfortunately, MinGW's locks us into a particular (non-standard) +// signature for async. +template< class Function, class... Args> +/*#if (__cplusplus < 201703L) +std::future::type(std::decay::type...)>::type> +#else +#if (__cplusplus > 201703L) +[[nodiscard]] +#endif +std::future, std::decay_t...>> +#endif*/ +#if (__cplusplus > 201703L) +[[nodiscard]] +#endif +std::future<__async_result_of > + async(Function&& f, Args&&... args) +{ + return async(launch::async | launch::deferred, std::forward(f), std::forward(args)...); +} +template< class Function, class... Args > +/*#if (__cplusplus < 201703L) +std::future::type(std::decay::type...)>::type> +#else +#if (__cplusplus > 201703L) +[[nodiscard]] +#endif +std::future, std::decay_t...> > +#endif*/ +#if (__cplusplus > 201703L) +[[nodiscard]] +#endif +std::future<__async_result_of > + async(std::launch policy, Function&& f, Args&&... args) +{ + typedef __async_result_of result_type; +/*#if (__cplusplus < 201703L) + typedef std::result_of::type(std::decay::type...)>::type result_type; +#else + typedef std::invoke_result_t, std::decay_t...> result_type; +#endif*/ + typedef future future_type; + typedef typename future_type::state_type state_type; + + //auto setter = [] + + state_type * state_ptr = nullptr; + /*if ((policy & std::launch::async) == std::launch::async) + state_ptr = new state_type (); + else + state_ptr = new state_type (std::function(std::bind(std::forward(f), std::forward(args)...)));*/ + + + if ((policy & std::launch::async) == std::launch::async) + { + auto deleter = [](state_type * ptr) { ptr->decrement_references(); }; + state_ptr = new state_type (); + state_ptr->increment_references(); + std::unique_ptr ooptr { state_ptr, deleter }; + mingw_stdthread::thread t ([](decltype(ooptr) ptr, typename std::decay::type f2, typename std::decay::type... 
args2)
+      {
+        typedef mingw_stdthread::detail::StorageHelper<result_type> s_helper;
+        s_helper::store(ptr.get(), f2, args2...);
+      }, std::move(ooptr), std::forward<Function>(f), std::forward<Args>(args)...);
+    t.detach();
+  } else {
+    typedef std::function<result_type(void)> func_type;
+    struct Packed
+    {
+      func_type func;
+      state_type * ptr;
+    };
+    std::shared_ptr<Packed> bound { new Packed { std::bind(std::forward<Function>(f), std::forward<Args>(args)...), nullptr } };
+    state_ptr = new state_type (std::function<void(void)>([bound](void)
+      {
+        typedef mingw_stdthread::detail::StorageHelper<result_type> s_helper;
+        s_helper::store_deferred(bound->ptr, std::move(bound->func));
+      }));
+    bound->ptr = state_ptr;
+  }
+  assert(state_ptr != nullptr);
+  return future<result_type> { state_ptr };
+}
+
+#if (defined(__MINGW32__ ) && !defined(_GLIBCXX_HAS_GTHREADS))
+} // Namespace std
+namespace mingw_stdthread
+{
+using std::future;
+using std::shared_future;
+using std::promise;
+using std::async;
+#else
+} // Namespace mingw_stdthread
+namespace std
+{
+template<class T>
+void swap(mingw_stdthread::promise<T> & lhs, mingw_stdthread::promise<T> & rhs) noexcept
+{
+  lhs.swap(rhs);
+}
+
+template<class T, class Alloc>
+struct uses_allocator<mingw_stdthread::promise<T>, Alloc> : std::true_type
+{
+};
+#endif
+} // Namespace
+
+#endif // MINGW_FUTURE_H_
diff --git a/randomx/mingw-std-threads-master/mingw.invoke.h b/randomx/mingw-std-threads-master/mingw.invoke.h
new file mode 100644
index 0000000..d5c9dd3
--- /dev/null
+++ b/randomx/mingw-std-threads-master/mingw.invoke.h
@@ -0,0 +1,109 @@
+/// \file mingw.invoke.h
+/// \brief Lightweight `invoke` implementation, for C++11 and C++14.
+///
+/// (c) 2018-2019 by Nathaniel J. McClatchey, San Jose, CA, United States
+/// \author Nathaniel J. McClatchey, PhD
+///
+/// \copyright Simplified (2-clause) BSD License.
+///
+/// \note This file may become part of the mingw-w64 runtime package. If/when
+/// this happens, the appropriate license will be added, i.e. this code will
+/// become dual-licensed, and the current BSD 2-clause license will stay.
+
+#ifndef MINGW_INVOKE_H_
+#define MINGW_INVOKE_H_
+
+#include <type_traits>  //  For std::result_of, etc.
+#include <utility>      //  For std::forward
+#include <functional>   //  For std::reference_wrapper
+
+namespace mingw_stdthread
+{
+namespace detail
+{
+//  For compatibility, implement std::invoke for C++11 and C++14
+#if __cplusplus < 201703L
+  template<bool PMemFunc, bool PMemData>
+  struct Invoker
+  {
+    template<class F, class... Args>
+    inline static typename std::result_of<F(Args...)>::type invoke (F&& f, Args&&... args)
+    {
+      return std::forward<F>(f)(std::forward<Args>(args)...);
+    }
+  };
+  template<bool IsObject>
+  struct InvokerHelper;
+
+  template<>
+  struct InvokerHelper<false>
+  {
+    template<class T1>
+    inline static auto get (T1&& t1) -> decltype(*std::forward<T1>(t1))
+    {
+      return *std::forward<T1>(t1);
+    }
+
+    template<class T1>
+    inline static auto get (const std::reference_wrapper<T1>& t1) -> decltype(t1.get())
+    {
+      return t1.get();
+    }
+  };
+
+  template<>
+  struct InvokerHelper<true>
+  {
+    template<class T1>
+    inline static auto get (T1&& t1) -> decltype(std::forward<T1>(t1))
+    {
+      return std::forward<T1>(t1);
+    }
+  };
+
+  template<>
+  struct Invoker<true, false>
+  {
+    template<class T, class F, class T1, class... Args>
+    inline static auto invoke (F T::* f, T1&& t1, Args&&... args) ->\
+      decltype((InvokerHelper<std::is_base_of<T, typename std::decay<T1>::type>::value>::get(std::forward<T1>(t1)).*f)(std::forward<Args>(args)...))
+    {
+      return (InvokerHelper<std::is_base_of<T, typename std::decay<T1>::type>::value>::get(std::forward<T1>(t1)).*f)(std::forward<Args>(args)...);
+    }
+  };
+
+  template<>
+  struct Invoker<false, true>
+  {
+    template<class T, class F, class T1, class... Args>
+    inline static auto invoke (F T::* f, T1&& t1, Args&&... args) ->\
+      decltype(InvokerHelper<std::is_base_of<T, typename std::decay<T1>::type>::value>::get(t1).*f)
+    {
+      return InvokerHelper<std::is_base_of<T, typename std::decay<T1>::type>::value>::get(t1).*f;
+    }
+  };
+
+  template<class F, class... Args>
+  struct InvokeResult
+  {
+    typedef Invoker<std::is_member_function_pointer<typename std::remove_reference<F>::type>::value,
+                    std::is_member_object_pointer<typename std::remove_reference<F>::type>::value &&
+                    (sizeof...(Args) == 1)> invoker;
+    inline static auto invoke (F&& f, Args&&... args) -> decltype(invoker::invoke(std::forward<F>(f), std::forward<Args>(args)...))
+    {
+      return invoker::invoke(std::forward<F>(f), std::forward<Args>(args)...);
+    }
+  };
+
+  template<class F, class... Args>
+  auto invoke (F&& f, Args&&... args) -> decltype(InvokeResult<F, Args...>::invoke(std::forward<F>(f), std::forward<Args>(args)...))
+  {
+    return InvokeResult<F, Args...>::invoke(std::forward<F>(f), std::forward<Args>(args)...);
+  }
+#else
+  using std::invoke;
+#endif
+} // Namespace "detail"
+} // Namespace "mingw_stdthread"
+
+#endif
diff --git a/randomx/mingw-std-threads-master/mingw.mutex.h b/randomx/mingw-std-threads-master/mingw.mutex.h
new file mode 100644
index 0000000..54d6146
--- /dev/null
+++ b/randomx/mingw-std-threads-master/mingw.mutex.h
@@ -0,0 +1,491 @@
+/**
+* @file mingw.mutex.h
+* @brief std::mutex et al implementation for MinGW
+** (c) 2013-2016 by Mega Limited, Auckland, New Zealand
+* @author Alexander Vassilev
+*
+* @copyright Simplified (2-clause) BSD License.
+* You should have received a copy of the license along with this
+* program.
+*
+* This code is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+* @note
+* This file may become part of the mingw-w64 runtime package. If/when this happens,
+* the appropriate license will be added, i.e. this code will become dual-licensed,
+* and the current BSD 2-clause license will stay.
+*/
+
+#ifndef WIN32STDMUTEX_H
+#define WIN32STDMUTEX_H
+
+#if !defined(__cplusplus) || (__cplusplus < 201103L)
+#error A C++11 compiler is required!
+#endif
+// Recursion checks on non-recursive locks have some performance penalty, and
+// the C++ standard does not mandate them. The user might want to explicitly
+// enable or disable such checks. If the user has no preference, enable such
+// checks in debug builds, but not in release builds.
+#ifdef STDMUTEX_RECURSION_CHECKS
+#elif defined(NDEBUG)
+#define STDMUTEX_RECURSION_CHECKS 0
+#else
+#define STDMUTEX_RECURSION_CHECKS 1
+#endif
+
+#include <chrono>
+#include <system_error>
+#include <atomic>
+#include <mutex> //need for call_once()
+
+#if STDMUTEX_RECURSION_CHECKS || !defined(NDEBUG)
+#include <cstdio>
+#endif
+
+#include <sdkddkver.h>  // Detect Windows version.
+
+#if (defined(__MINGW32__) && !defined(__MINGW64_VERSION_MAJOR))
+#pragma message "The Windows API that MinGW-w32 provides is not fully compatible\
+ with Microsoft's API. We'll try to work around this, but we can make no\
+ guarantees. This problem does not exist in MinGW-w64."
+#include <windows.h>    // No further granularity can be expected.
+#else
+#if STDMUTEX_RECURSION_CHECKS
+#include <processthreadsapi.h>  // For GetCurrentThreadId
+#endif
+#include <synchapi.h> // For InitializeCriticalSection, etc.
+#include <errhandlingapi.h> // For GetLastError
+#include <handleapi.h>
+#endif
+
+// Need for the implementation of invoke
+#include "mingw.invoke.h"
+
+#if !defined(_WIN32_WINNT) || (_WIN32_WINNT < 0x0501)
+#error To use the MinGW-std-threads library, you will need to define the macro _WIN32_WINNT to be 0x0501 (Windows XP) or higher.
+#endif
+
+namespace mingw_stdthread
+{
+// The _NonRecursive class has mechanisms that do not play nice with direct
+// manipulation of the native handle. This forward declaration is part of
+// a friend class declaration.
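With mingw.invoke.h in place, the shim can be exercised on its own. A minimal sketch (the `Counter` type is a hypothetical example; assumes the header is on the include path) showing the three dispatch cases the `Invoker` machinery distinguishes:

```
#include <cstdio>
#include "mingw.invoke.h"

struct Counter
{
    int value;
    int add (int x) { return value += x; }
};

int main (void)
{
    Counter c { 0 };
    // Plain callable: routed through the primary Invoker template.
    mingw_stdthread::detail::invoke([](int x) { std::printf("lambda: %d\n", x); }, 7);
    // Pointer to member function: Invoker<true, false>; the first argument
    // supplies the object, either directly or through a pointer.
    mingw_stdthread::detail::invoke(&Counter::add, c, 35);
    mingw_stdthread::detail::invoke(&Counter::add, &c, 7);
    // Pointer to member data with exactly one argument: Invoker<false, true>.
    std::printf("value: %d\n", mingw_stdthread::detail::invoke(&Counter::value, c));
    return 0;
}
```

In C++17 mode the same calls compile against `std::invoke`, since the `#if` branch above simply pulls it in.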
+#if STDMUTEX_RECURSION_CHECKS +namespace vista +{ +class condition_variable; +} +#endif +// To make this namespace equivalent to the thread-related subset of std, +// pull in the classes and class templates supplied by std but not by this +// implementation. +using std::lock_guard; +using std::unique_lock; +using std::adopt_lock_t; +using std::defer_lock_t; +using std::try_to_lock_t; +using std::adopt_lock; +using std::defer_lock; +using std::try_to_lock; + +class recursive_mutex +{ + CRITICAL_SECTION mHandle; +public: + typedef LPCRITICAL_SECTION native_handle_type; + native_handle_type native_handle() {return &mHandle;} + recursive_mutex() noexcept : mHandle() + { + InitializeCriticalSection(&mHandle); + } + recursive_mutex (const recursive_mutex&) = delete; + recursive_mutex& operator=(const recursive_mutex&) = delete; + ~recursive_mutex() noexcept + { + DeleteCriticalSection(&mHandle); + } + void lock() + { + EnterCriticalSection(&mHandle); + } + void unlock() + { + LeaveCriticalSection(&mHandle); + } + bool try_lock() + { + return (TryEnterCriticalSection(&mHandle)!=0); + } +}; + +#if STDMUTEX_RECURSION_CHECKS +struct _OwnerThread +{ +// If this is to be read before locking, then the owner-thread variable must +// be atomic to prevent a torn read from spuriously causing errors. + std::atomic mOwnerThread; + constexpr _OwnerThread () noexcept : mOwnerThread(0) {} + static void on_deadlock (void) + { + using namespace std; + fprintf(stderr, "FATAL: Recursive locking of non-recursive mutex\ + detected. Throwing system exception\n"); + fflush(stderr); + throw system_error(make_error_code(errc::resource_deadlock_would_occur)); + } + DWORD checkOwnerBeforeLock() const + { + DWORD self = GetCurrentThreadId(); + if (mOwnerThread.load(std::memory_order_relaxed) == self) + on_deadlock(); + return self; + } + void setOwnerAfterLock(DWORD id) + { + mOwnerThread.store(id, std::memory_order_relaxed); + } + void checkSetOwnerBeforeUnlock() + { + DWORD self = GetCurrentThreadId(); + if (mOwnerThread.load(std::memory_order_relaxed) != self) + on_deadlock(); + mOwnerThread.store(0, std::memory_order_relaxed); + } +}; +#endif + +// Though the Slim Reader-Writer (SRW) locks used here are not complete until +// Windows 7, implementing partial functionality in Vista will simplify the +// interaction with condition variables. +#if defined(_WIN32) && (WINVER >= _WIN32_WINNT_VISTA) +namespace windows7 +{ +class mutex +{ + SRWLOCK mHandle; +// Track locking thread for error checking. +#if STDMUTEX_RECURSION_CHECKS + friend class vista::condition_variable; + _OwnerThread mOwnerThread {}; +#endif +public: + typedef PSRWLOCK native_handle_type; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant" + constexpr mutex () noexcept : mHandle(SRWLOCK_INIT) { } +#pragma GCC diagnostic pop + mutex (const mutex&) = delete; + mutex & operator= (const mutex&) = delete; + void lock (void) + { +// Note: Undefined behavior if called recursively. +#if STDMUTEX_RECURSION_CHECKS + DWORD self = mOwnerThread.checkOwnerBeforeLock(); +#endif + AcquireSRWLockExclusive(&mHandle); +#if STDMUTEX_RECURSION_CHECKS + mOwnerThread.setOwnerAfterLock(self); +#endif + } + void unlock (void) + { +#if STDMUTEX_RECURSION_CHECKS + mOwnerThread.checkSetOwnerBeforeUnlock(); +#endif + ReleaseSRWLockExclusive(&mHandle); + } +// TryAcquireSRW functions are a Windows 7 feature. 
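The `_OwnerThread` bookkeeping above turns a silent self-deadlock into a thrown `std::system_error`. A minimal sketch of the observable behavior, assuming a MinGW target with `_WIN32_WINNT` set appropriately (e.g. `-D_WIN32_WINNT=0x0600`):

```
// Force the recursion checks on, regardless of NDEBUG.
#define STDMUTEX_RECURSION_CHECKS 1
#include <cstdio>
#include <system_error>
#include "mingw.mutex.h"

int main (void)
{
    mingw_stdthread::mutex m;
    m.lock();
    try {
        // Relocking on the owning thread: checkOwnerBeforeLock() notices the
        // stored thread id and calls on_deadlock(), which throws.
        m.lock();
    } catch (std::system_error const & e) {
        std::printf("caught: %s\n", e.what());
    }
    m.unlock();
    return 0;
}
```

With the checks compiled out (release builds), the same relock is plain undefined behavior on the underlying lock, which is why the default enables them only when NDEBUG is absent.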
+#if (WINVER >= _WIN32_WINNT_WIN7) + bool try_lock (void) + { +#if STDMUTEX_RECURSION_CHECKS + DWORD self = mOwnerThread.checkOwnerBeforeLock(); +#endif + BOOL ret = TryAcquireSRWLockExclusive(&mHandle); +#if STDMUTEX_RECURSION_CHECKS + if (ret) + mOwnerThread.setOwnerAfterLock(self); +#endif + return ret; + } +#endif + native_handle_type native_handle (void) + { + return &mHandle; + } +}; +} // Namespace windows7 +#endif // Compiling for Vista +namespace xp +{ +class mutex +{ + CRITICAL_SECTION mHandle; + std::atomic_uchar mState; +// Track locking thread for error checking. +#if STDMUTEX_RECURSION_CHECKS + friend class vista::condition_variable; + _OwnerThread mOwnerThread {}; +#endif +public: + typedef PCRITICAL_SECTION native_handle_type; + constexpr mutex () noexcept : mHandle(), mState(2) { } + mutex (const mutex&) = delete; + mutex & operator= (const mutex&) = delete; + ~mutex() noexcept + { +// Undefined behavior if the mutex is held (locked) by any thread. +// Undefined behavior if a thread terminates while holding ownership of the +// mutex. + DeleteCriticalSection(&mHandle); + } + void lock (void) + { + unsigned char state = mState.load(std::memory_order_acquire); + while (state) { + if ((state == 2) && mState.compare_exchange_weak(state, 1, std::memory_order_acquire)) + { + InitializeCriticalSection(&mHandle); + mState.store(0, std::memory_order_release); + break; + } + if (state == 1) + { + Sleep(0); + state = mState.load(std::memory_order_acquire); + } + } +#if STDMUTEX_RECURSION_CHECKS + DWORD self = mOwnerThread.checkOwnerBeforeLock(); +#endif + EnterCriticalSection(&mHandle); +#if STDMUTEX_RECURSION_CHECKS + mOwnerThread.setOwnerAfterLock(self); +#endif + } + void unlock (void) + { +#if STDMUTEX_RECURSION_CHECKS + mOwnerThread.checkSetOwnerBeforeUnlock(); +#endif + LeaveCriticalSection(&mHandle); + } + bool try_lock (void) + { + unsigned char state = mState.load(std::memory_order_acquire); + if ((state == 2) && mState.compare_exchange_strong(state, 1, std::memory_order_acquire)) + { + InitializeCriticalSection(&mHandle); + mState.store(0, std::memory_order_release); + } + if (state == 1) + return false; +#if STDMUTEX_RECURSION_CHECKS + DWORD self = mOwnerThread.checkOwnerBeforeLock(); +#endif + BOOL ret = TryEnterCriticalSection(&mHandle); +#if STDMUTEX_RECURSION_CHECKS + if (ret) + mOwnerThread.setOwnerAfterLock(self); +#endif + return ret; + } + native_handle_type native_handle (void) + { + return &mHandle; + } +}; +} // Namespace "xp" +#if (WINVER >= _WIN32_WINNT_WIN7) +using windows7::mutex; +#else +using xp::mutex; +#endif + +class recursive_timed_mutex +{ + static constexpr DWORD kWaitAbandoned = 0x00000080l; + static constexpr DWORD kWaitObject0 = 0x00000000l; + static constexpr DWORD kInfinite = 0xffffffffl; + inline bool try_lock_internal (DWORD ms) noexcept + { + DWORD ret = WaitForSingleObject(mHandle, ms); +#ifndef NDEBUG + if (ret == kWaitAbandoned) + { + using namespace std; + fprintf(stderr, "FATAL: Thread terminated while holding a mutex."); + terminate(); + } +#endif + return (ret == kWaitObject0) || (ret == kWaitAbandoned); + } +protected: + HANDLE mHandle; +// Track locking thread for error checking of non-recursive timed_mutex. For +// standard compliance, this must be defined in same class and at the same +// access-control level as every other variable in the timed_mutex. 
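`xp::mutex` cannot call InitializeCriticalSection from its constexpr constructor, so it defers the call: `mState` encodes 2 = not yet initialized, 1 = initialization in progress, 0 = ready, and the first `lock()`/`try_lock()` claims the 2-to-1 transition with a compare-exchange. The same idiom in isolation (illustrative only, not the library's code):

```
#include <atomic>
#include <cstdio>
#include <new>

// Lazily constructs a T on first use; safe for concurrent first callers.
template<class T>
class LazySlot
{
    alignas(T) unsigned char mStorage[sizeof(T)];
    std::atomic<unsigned char> mState { 2 };  // 2 = uninit, 1 = in progress, 0 = ready
public:
    T & get (void)
    {
        unsigned char state = mState.load(std::memory_order_acquire);
        while (state != 0)
        {
            // Claim the right to initialize (2 -> 1); losers fall through and spin.
            if ((state == 2) && mState.compare_exchange_weak(state, 1, std::memory_order_acquire))
            {
                new (mStorage) T();                         // one-time construction
                mState.store(0, std::memory_order_release); // publish
                break;
            }
            state = mState.load(std::memory_order_acquire);
        }
        return *reinterpret_cast<T *>(mStorage);
    }
};

int main (void)
{
    static LazySlot<int> slot;
    std::printf("%d\n", slot.get());  // constructs the int (to zero) on first call
    return 0;
}
```

The `Sleep(0)` in the library's spin plays the same role as the reload here: yielding until the initializing thread publishes state 0.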
+#if STDMUTEX_RECURSION_CHECKS + friend class vista::condition_variable; + _OwnerThread mOwnerThread {}; +#endif +public: + typedef HANDLE native_handle_type; + native_handle_type native_handle() const {return mHandle;} + recursive_timed_mutex(const recursive_timed_mutex&) = delete; + recursive_timed_mutex& operator=(const recursive_timed_mutex&) = delete; + recursive_timed_mutex(): mHandle(CreateMutex(NULL, FALSE, NULL)) {} + ~recursive_timed_mutex() + { + CloseHandle(mHandle); + } + void lock() + { + DWORD ret = WaitForSingleObject(mHandle, kInfinite); +// If (ret == WAIT_ABANDONED), then the thread that held ownership was +// terminated. Behavior is undefined, but Windows will pass ownership to this +// thread. +#ifndef NDEBUG + if (ret == kWaitAbandoned) + { + using namespace std; + fprintf(stderr, "FATAL: Thread terminated while holding a mutex."); + terminate(); + } +#endif + if ((ret != kWaitObject0) && (ret != kWaitAbandoned)) + { + throw std::system_error(GetLastError(), std::system_category()); + } + } + void unlock() + { + if (!ReleaseMutex(mHandle)) + throw std::system_error(GetLastError(), std::system_category()); + } + bool try_lock() + { + return try_lock_internal(0); + } + template + bool try_lock_for(const std::chrono::duration& dur) + { + using namespace std::chrono; + auto timeout = duration_cast(dur).count(); + while (timeout > 0) + { + constexpr auto kMaxStep = static_cast(kInfinite-1); + auto step = (timeout < kMaxStep) ? timeout : kMaxStep; + if (try_lock_internal(static_cast(step))) + return true; + timeout -= step; + } + return false; + } + template + bool try_lock_until(const std::chrono::time_point& timeout_time) + { + return try_lock_for(timeout_time - Clock::now()); + } +}; + +// Override if, and only if, it is necessary for error-checking. +#if STDMUTEX_RECURSION_CHECKS +class timed_mutex: recursive_timed_mutex +{ +public: + timed_mutex(const timed_mutex&) = delete; + timed_mutex& operator=(const timed_mutex&) = delete; + void lock() + { + DWORD self = mOwnerThread.checkOwnerBeforeLock(); + recursive_timed_mutex::lock(); + mOwnerThread.setOwnerAfterLock(self); + } + void unlock() + { + mOwnerThread.checkSetOwnerBeforeUnlock(); + recursive_timed_mutex::unlock(); + } + template + bool try_lock_for(const std::chrono::duration& dur) + { + DWORD self = mOwnerThread.checkOwnerBeforeLock(); + bool ret = recursive_timed_mutex::try_lock_for(dur); + if (ret) + mOwnerThread.setOwnerAfterLock(self); + return ret; + } + template + bool try_lock_until(const std::chrono::time_point& timeout_time) + { + return try_lock_for(timeout_time - Clock::now()); + } + bool try_lock () + { + return try_lock_for(std::chrono::milliseconds(0)); + } +}; +#else +typedef recursive_timed_mutex timed_mutex; +#endif + +class once_flag +{ +// When available, the SRW-based mutexes should be faster than the +// CriticalSection-based mutexes. Only try_lock will be unavailable in Vista, +// and try_lock is not used by once_flag. +#if (_WIN32_WINNT == _WIN32_WINNT_VISTA) + windows7::mutex mMutex; +#else + mutex mMutex; +#endif + std::atomic_bool mHasRun; + once_flag(const once_flag&) = delete; + once_flag& operator=(const once_flag&) = delete; + template + friend void call_once(once_flag& once, Callable&& f, Args&&... args); +public: + constexpr once_flag() noexcept: mMutex(), mHasRun(false) {} +}; + +template +void call_once(once_flag& flag, Callable&& func, Args&&... 
args)
+{
+    if (flag.mHasRun.load(std::memory_order_acquire))
+        return;
+    lock_guard<decltype(flag.mMutex)> lock(flag.mMutex);
+    if (flag.mHasRun.load(std::memory_order_acquire))
+        return;
+    detail::invoke(std::forward<Callable>(func),std::forward<Args>(args)...);
+    flag.mHasRun.store(true, std::memory_order_release);
+}
+} // Namespace mingw_stdthread
+
+// Push objects into std, but only if they are not already there.
+namespace std
+{
+// Because of quirks of the compiler, the common "using namespace std;"
+// directive would flatten the namespaces and introduce ambiguity where there
+// was none. Direct specification (std::), however, would be unaffected.
+// Take the safe option, and include only in the presence of MinGW's win32
+// implementation.
+#if defined(__MINGW32__ ) && !defined(_GLIBCXX_HAS_GTHREADS)
+using mingw_stdthread::recursive_mutex;
+using mingw_stdthread::mutex;
+using mingw_stdthread::recursive_timed_mutex;
+using mingw_stdthread::timed_mutex;
+using mingw_stdthread::once_flag;
+using mingw_stdthread::call_once;
+#elif !defined(MINGW_STDTHREAD_REDUNDANCY_WARNING) // Skip repetition
+#define MINGW_STDTHREAD_REDUNDANCY_WARNING
+#pragma message "This version of MinGW seems to include a win32 port of\
+ pthreads, and probably already has C++11 std threading classes implemented,\
+ based on pthreads. These classes, found in namespace std, are not overridden\
+ by the mingw-std-thread library. If you would still like to use this\
+ implementation (as it is more lightweight), use the classes provided in\
+ namespace mingw_stdthread."
+#endif
+}
+#endif // WIN32STDMUTEX_H
diff --git a/randomx/mingw-std-threads-master/mingw.shared_mutex.h b/randomx/mingw-std-threads-master/mingw.shared_mutex.h
new file mode 100644
index 0000000..ff1ac65
--- /dev/null
+++ b/randomx/mingw-std-threads-master/mingw.shared_mutex.h
@@ -0,0 +1,503 @@
+/// \file mingw.shared_mutex.h
+/// \brief Standard-compliant shared_mutex for MinGW
+///
+/// (c) 2017 by Nathaniel J. McClatchey, Athens OH, United States
+/// \author Nathaniel J. McClatchey
+///
+/// \copyright Simplified (2-clause) BSD License.
+///
+/// \note This file may become part of the mingw-w64 runtime package. If/when
+/// this happens, the appropriate license will be added, i.e. this code will
+/// become dual-licensed, and the current BSD 2-clause license will stay.
+/// \note Target Windows version is determined by WINVER, which is determined in
+/// <sdkddkver.h> from _WIN32_WINNT, which can itself be set by the user.
+
+// Notes on the namespaces:
+// - The implementation can be accessed directly in the namespace
+// mingw_stdthread.
+// - Objects will be brought into namespace std by a using directive. This
+// will cause objects declared in std (such as MinGW's implementation) to
+// hide this implementation's definitions.
+// - To avoid polluting the namespace with implementation details, all objects
+// to be pushed into std will be placed in mingw_stdthread::visible.
+// The end result is that if MinGW supplies an object, it is automatically
+// used. If MinGW does not supply an object, this implementation's version will
+// instead be used.
+
+#ifndef MINGW_SHARED_MUTEX_H_
+#define MINGW_SHARED_MUTEX_H_
+
+#if !defined(__cplusplus) || (__cplusplus < 201103L)
+#error A C++11 compiler is required!
+#endif
+
+#include <cassert>
+// For descriptive errors.
+#include <system_error>
+// Implementing a shared_mutex without OS support will require atomic read-
+// modify-write capacity.
+#include <atomic>
+// For timing in shared_lock and shared_timed_mutex.
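Stepping back to the `call_once` implementation that closed mingw.mutex.h above: it pairs an atomic fast path with a lock_guard-protected slow path, so after the first completed call every later call costs a single acquire load. A usage sketch (assumes a MinGW target with `_WIN32_WINNT` defined, since mingw.thread.h wraps `_beginthreadex`):

```
#include <cstdio>
#include "mingw.mutex.h"
#include "mingw.thread.h"

mingw_stdthread::once_flag flag;

void init_subsystem (void)
{
    std::printf("initialized exactly once\n");
}

int main (void)
{
    mingw_stdthread::thread a ([] { mingw_stdthread::call_once(flag, init_subsystem); });
    mingw_stdthread::thread b ([] { mingw_stdthread::call_once(flag, init_subsystem); });
    a.join();
    b.join();
    return 0;
}
```

Because `mHasRun` is only stored after the callable returns, a throwing initializer leaves the flag unset and the next caller retries, matching `std::call_once` semantics.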
+#include +#include + +// Use MinGW's shared_lock class template, if it's available. Requires C++14. +// If unavailable (eg. because this library is being used in C++11), then an +// implementation of shared_lock is provided by this header. +#if (__cplusplus >= 201402L) +#include +#endif + +// For defer_lock_t, adopt_lock_t, and try_to_lock_t +#include "mingw.mutex.h" +// For this_thread::yield. +//#include "mingw.thread.h" + +// Might be able to use native Slim Reader-Writer (SRW) locks. +#ifdef _WIN32 +#include // Detect Windows version. +#if (defined(__MINGW32__) && !defined(__MINGW64_VERSION_MAJOR)) +#pragma message "The Windows API that MinGW-w32 provides is not fully compatible\ + with Microsoft's API. We'll try to work around this, but we can make no\ + guarantees. This problem does not exist in MinGW-w64." +#include // No further granularity can be expected. +#else +#include +#endif +#endif + +namespace mingw_stdthread +{ +// Define a portable atomics-based shared_mutex +namespace portable +{ +class shared_mutex +{ + typedef uint_fast16_t counter_type; + std::atomic mCounter {0}; + static constexpr counter_type kWriteBit = 1 << (std::numeric_limits::digits - 1); + +#if STDMUTEX_RECURSION_CHECKS +// Runtime checker for verifying owner threads. Note: Exclusive mode only. + _OwnerThread mOwnerThread {}; +#endif +public: + typedef shared_mutex * native_handle_type; + + shared_mutex () = default; + +// No form of copying or moving should be allowed. + shared_mutex (const shared_mutex&) = delete; + shared_mutex & operator= (const shared_mutex&) = delete; + + ~shared_mutex () + { +// Terminate if someone tries to destroy an owned mutex. + assert(mCounter.load(std::memory_order_relaxed) == 0); + } + + void lock_shared (void) + { + counter_type expected = mCounter.load(std::memory_order_relaxed); + do + { +// Delay if writing or if too many readers are attempting to read. + if (expected >= kWriteBit - 1) + { + using namespace std; + expected = mCounter.load(std::memory_order_relaxed); + continue; + } + if (mCounter.compare_exchange_weak(expected, + static_cast(expected + 1), + std::memory_order_acquire, + std::memory_order_relaxed)) + break; + } + while (true); + } + + bool try_lock_shared (void) + { + counter_type expected = mCounter.load(std::memory_order_relaxed) & static_cast(~kWriteBit); + if (expected + 1 == kWriteBit) + return false; + else + return mCounter.compare_exchange_strong( expected, + static_cast(expected + 1), + std::memory_order_acquire, + std::memory_order_relaxed); + } + + void unlock_shared (void) + { + using namespace std; +#ifndef NDEBUG + if (!(mCounter.fetch_sub(1, memory_order_release) & static_cast(~kWriteBit))) + throw system_error(make_error_code(errc::operation_not_permitted)); +#else + mCounter.fetch_sub(1, memory_order_release); +#endif + } + +// Behavior is undefined if a lock was previously acquired. + void lock (void) + { +#if STDMUTEX_RECURSION_CHECKS + DWORD self = mOwnerThread.checkOwnerBeforeLock(); +#endif + using namespace std; +// Might be able to use relaxed memory order... +// Wait for the write-lock to be unlocked, then claim the write slot. + counter_type current; + while ((current = mCounter.fetch_or(kWriteBit, std::memory_order_acquire)) & kWriteBit); + //this_thread::yield(); +// Wait for readers to finish up. 
+ while (current != kWriteBit) + { + //this_thread::yield(); + current = mCounter.load(std::memory_order_acquire); + } +#if STDMUTEX_RECURSION_CHECKS + mOwnerThread.setOwnerAfterLock(self); +#endif + } + + bool try_lock (void) + { +#if STDMUTEX_RECURSION_CHECKS + DWORD self = mOwnerThread.checkOwnerBeforeLock(); +#endif + counter_type expected = 0; + bool ret = mCounter.compare_exchange_strong(expected, kWriteBit, + std::memory_order_acquire, + std::memory_order_relaxed); +#if STDMUTEX_RECURSION_CHECKS + if (ret) + mOwnerThread.setOwnerAfterLock(self); +#endif + return ret; + } + + void unlock (void) + { +#if STDMUTEX_RECURSION_CHECKS + mOwnerThread.checkSetOwnerBeforeUnlock(); +#endif + using namespace std; +#ifndef NDEBUG + if (mCounter.load(memory_order_relaxed) != kWriteBit) + throw system_error(make_error_code(errc::operation_not_permitted)); +#endif + mCounter.store(0, memory_order_release); + } + + native_handle_type native_handle (void) + { + return this; + } +}; + +} // Namespace portable + +// The native shared_mutex implementation primarily uses features of Windows +// Vista, but the features used for try_lock and try_lock_shared were not +// introduced until Windows 7. To allow limited use while compiling for Vista, +// I define the class without try_* functions in that case. +// Only fully-featured implementations will be placed into namespace std. +#if defined(_WIN32) && (WINVER >= _WIN32_WINNT_VISTA) +namespace vista +{ +class condition_variable_any; +} + +namespace windows7 +{ +// We already #include "mingw.mutex.h". May as well reduce redundancy. +class shared_mutex : windows7::mutex +{ +// Allow condition_variable_any (and only condition_variable_any) to treat a +// shared_mutex as its base class. + friend class vista::condition_variable_any; +public: + using windows7::mutex::native_handle_type; + using windows7::mutex::lock; + using windows7::mutex::unlock; + using windows7::mutex::native_handle; + + void lock_shared (void) + { + AcquireSRWLockShared(native_handle()); + } + + void unlock_shared (void) + { + ReleaseSRWLockShared(native_handle()); + } + +// TryAcquireSRW functions are a Windows 7 feature. 
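`portable::shared_mutex` packs the whole lock state into one integer: the top bit (`kWriteBit`) is the writer flag, the remaining bits count readers. `lock()` publishes the write bit with `fetch_or`, which blocks new readers, then spins until the counter drains to exactly `kWriteBit`. The encoding in isolation (plain C++, illustrative only):

```
#include <cstdint>
#include <cstdio>
#include <limits>

typedef uint_fast16_t counter_type;
static constexpr counter_type kWriteBit =
    static_cast<counter_type>(1) << (std::numeric_limits<counter_type>::digits - 1);

int main (void)
{
    counter_type c = 0;                 // unlocked
    c += 3;                             // three lock_shared() calls
    std::printf("readers=%u writer=%u\n",
                static_cast<unsigned>(c & (kWriteBit - 1)),
                static_cast<unsigned>((c & kWriteBit) != 0));
    c |= kWriteBit;                     // lock(): fetch_or announces a writer
    std::printf("drained=%d\n", static_cast<int>(c == kWriteBit));  // 0: readers remain
    c -= 3;                             // readers release via unlock_shared()
    std::printf("drained=%d\n", static_cast<int>(c == kWriteBit));  // 1: writer owns it
    return 0;
}
```

The guard `expected + 1 == kWriteBit` in `try_lock_shared` is what stops the reader count from ever overflowing into the writer bit.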
+#if (WINVER >= _WIN32_WINNT_WIN7) + bool try_lock_shared (void) + { + return TryAcquireSRWLockShared(native_handle()) != 0; + } + + using windows7::mutex::try_lock; +#endif +}; + +} // Namespace windows7 +#endif // Compiling for Vista +#if (defined(_WIN32) && (WINVER >= _WIN32_WINNT_WIN7)) +using windows7::shared_mutex; +#else +using portable::shared_mutex; +#endif + +class shared_timed_mutex : shared_mutex +{ + typedef shared_mutex Base; +public: + using Base::lock; + using Base::try_lock; + using Base::unlock; + using Base::lock_shared; + using Base::try_lock_shared; + using Base::unlock_shared; + + template< class Clock, class Duration > + bool try_lock_until ( const std::chrono::time_point& cutoff ) + { + do + { + if (try_lock()) + return true; + } + while (std::chrono::steady_clock::now() < cutoff); + return false; + } + + template< class Rep, class Period > + bool try_lock_for (const std::chrono::duration& rel_time) + { + return try_lock_until(std::chrono::steady_clock::now() + rel_time); + } + + template< class Clock, class Duration > + bool try_lock_shared_until ( const std::chrono::time_point& cutoff ) + { + do + { + if (try_lock_shared()) + return true; + } + while (std::chrono::steady_clock::now() < cutoff); + return false; + } + + template< class Rep, class Period > + bool try_lock_shared_for (const std::chrono::duration& rel_time) + { + return try_lock_shared_until(std::chrono::steady_clock::now() + rel_time); + } +}; + +#if __cplusplus >= 201402L +using std::shared_lock; +#else +// If not supplied by shared_mutex (eg. because C++14 is not supported), I +// supply the various helper classes that the header should have defined. +template +class shared_lock +{ + Mutex * mMutex; + bool mOwns; +// Reduce code redundancy + void verify_lockable (void) + { + using namespace std; + if (mMutex == nullptr) + throw system_error(make_error_code(errc::operation_not_permitted)); + if (mOwns) + throw system_error(make_error_code(errc::resource_deadlock_would_occur)); + } +public: + typedef Mutex mutex_type; + + shared_lock (void) noexcept + : mMutex(nullptr), mOwns(false) + { + } + + shared_lock (shared_lock && other) noexcept + : mMutex(other.mutex_), mOwns(other.owns_) + { + other.mMutex = nullptr; + other.mOwns = false; + } + + explicit shared_lock (mutex_type & m) + : mMutex(&m), mOwns(true) + { + mMutex->lock_shared(); + } + + shared_lock (mutex_type & m, defer_lock_t) noexcept + : mMutex(&m), mOwns(false) + { + } + + shared_lock (mutex_type & m, adopt_lock_t) + : mMutex(&m), mOwns(true) + { + } + + shared_lock (mutex_type & m, try_to_lock_t) + : mMutex(&m), mOwns(m.try_lock_shared()) + { + } + + template< class Rep, class Period > + shared_lock( mutex_type& m, const std::chrono::duration& timeout_duration ) + : mMutex(&m), mOwns(m.try_lock_shared_for(timeout_duration)) + { + } + + template< class Clock, class Duration > + shared_lock( mutex_type& m, const std::chrono::time_point& timeout_time ) + : mMutex(&m), mOwns(m.try_lock_shared_until(timeout_time)) + { + } + + shared_lock& operator= (shared_lock && other) noexcept + { + if (&other != this) + { + if (mOwns) + mMutex->unlock_shared(); + mMutex = other.mMutex; + mOwns = other.mOwns; + other.mMutex = nullptr; + other.mOwns = false; + } + return *this; + } + + + ~shared_lock (void) + { + if (mOwns) + mMutex->unlock_shared(); + } + + shared_lock (const shared_lock &) = delete; + shared_lock& operator= (const shared_lock &) = delete; + +// Shared locking + void lock (void) + { + verify_lockable(); + mMutex->lock_shared(); + mOwns = 
true; + } + + bool try_lock (void) + { + verify_lockable(); + mOwns = mMutex->try_lock_shared(); + return mOwns; + } + + template< class Clock, class Duration > + bool try_lock_until( const std::chrono::time_point& cutoff ) + { + verify_lockable(); + do + { + mOwns = mMutex->try_lock_shared(); + if (mOwns) + return mOwns; + } + while (std::chrono::steady_clock::now() < cutoff); + return false; + } + + template< class Rep, class Period > + bool try_lock_for (const std::chrono::duration& rel_time) + { + return try_lock_until(std::chrono::steady_clock::now() + rel_time); + } + + void unlock (void) + { + using namespace std; + if (!mOwns) + throw system_error(make_error_code(errc::operation_not_permitted)); + mMutex->unlock_shared(); + mOwns = false; + } + +// Modifiers + void swap (shared_lock & other) noexcept + { + using namespace std; + swap(mMutex, other.mMutex); + swap(mOwns, other.mOwns); + } + + mutex_type * release (void) noexcept + { + mutex_type * ptr = mMutex; + mMutex = nullptr; + mOwns = false; + return ptr; + } +// Observers + mutex_type * mutex (void) const noexcept + { + return mMutex; + } + + bool owns_lock (void) const noexcept + { + return mOwns; + } + + explicit operator bool () const noexcept + { + return owns_lock(); + } +}; + +template< class Mutex > +void swap( shared_lock& lhs, shared_lock& rhs ) noexcept +{ + lhs.swap(rhs); +} +#endif // C++11 +} // Namespace mingw_stdthread + +namespace std +{ +// Because of quirks of the compiler, the common "using namespace std;" +// directive would flatten the namespaces and introduce ambiguity where there +// was none. Direct specification (std::), however, would be unaffected. +// Take the safe option, and include only in the presence of MinGW's win32 +// implementation. +#if (__cplusplus < 201703L) || (defined(__MINGW32__ ) && !defined(_GLIBCXX_HAS_GTHREADS)) +using mingw_stdthread::shared_mutex; +#endif +#if (__cplusplus < 201402L) || (defined(__MINGW32__ ) && !defined(_GLIBCXX_HAS_GTHREADS)) +using mingw_stdthread::shared_timed_mutex; +using mingw_stdthread::shared_lock; +#elif !defined(MINGW_STDTHREAD_REDUNDANCY_WARNING) // Skip repetition +#define MINGW_STDTHREAD_REDUNDANCY_WARNING +#pragma message "This version of MinGW seems to include a win32 port of\ + pthreads, and probably already has C++ std threading classes implemented,\ + based on pthreads. These classes, found in namespace std, are not overridden\ + by the mingw-std-thread library. If you would still like to use this\ + implementation (as it is more lightweight), use the classes provided in\ + namespace mingw_stdthread." +#endif +} // Namespace std +#endif // MINGW_SHARED_MUTEX_H_ diff --git a/randomx/mingw-std-threads-master/mingw.thread.h b/randomx/mingw-std-threads-master/mingw.thread.h new file mode 100644 index 0000000..bcdd1a3 --- /dev/null +++ b/randomx/mingw-std-threads-master/mingw.thread.h @@ -0,0 +1,360 @@ +/** +* @file mingw.thread.h +* @brief std::thread implementation for MinGW +* (c) 2013-2016 by Mega Limited, Auckland, New Zealand +* @author Alexander Vassilev +* +* @copyright Simplified (2-clause) BSD License. +* You should have received a copy of the license along with this +* program. +* +* This code is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +* @note +* This file may become part of the mingw-w64 runtime package. If/when this happens, +* the appropriate license will be added, i.e. 
this code will become dual-licensed, +* and the current BSD 2-clause license will stay. +*/ + +#ifndef WIN32STDTHREAD_H +#define WIN32STDTHREAD_H + +#if !defined(__cplusplus) || (__cplusplus < 201103L) +#error A C++11 compiler is required! +#endif + +// Use the standard classes for std::, if available. +#include + +#include // For std::size_t +#include // Detect error type. +#include // For std::terminate +#include // For std::system_error +#include // For std::hash +#include // For std::tuple +#include // For sleep timing. +#include // For std::unique_ptr +#include // Stream output for thread ids. +#include // For std::swap, std::forward + +#include "mingw.invoke.h" + +#if (defined(__MINGW32__) && !defined(__MINGW64_VERSION_MAJOR)) +#pragma message "The Windows API that MinGW-w32 provides is not fully compatible\ + with Microsoft's API. We'll try to work around this, but we can make no\ + guarantees. This problem does not exist in MinGW-w64." +#include // No further granularity can be expected. +#else +#include // For WaitForSingleObject +#include // For CloseHandle, etc. +#include // For GetNativeSystemInfo +#include // For GetCurrentThreadId +#endif +#include // For _beginthreadex + +#ifndef NDEBUG +#include +#endif + +#if !defined(_WIN32_WINNT) || (_WIN32_WINNT < 0x0501) +#error To use the MinGW-std-threads library, you will need to define the macro _WIN32_WINNT to be 0x0501 (Windows XP) or higher. +#endif + +// Instead of INVALID_HANDLE_VALUE, _beginthreadex returns 0. +namespace mingw_stdthread +{ +namespace detail +{ + template + struct IntSeq {}; + + template + struct GenIntSeq : GenIntSeq { }; + + template + struct GenIntSeq<0, S...> { typedef IntSeq type; }; + +// Use a template specialization to avoid relying on compiler optimization +// when determining the parameter integer sequence. + template + class ThreadFuncCall; +// We can't define the Call struct in the function - the standard forbids template methods in that case + template + class ThreadFuncCall, Args...> + { + static_assert(sizeof...(S) == sizeof...(Args), "Args must match."); + using Tuple = std::tuple::type...>; + typename std::decay::type mFunc; + Tuple mArgs; + + public: + ThreadFuncCall(Func&& aFunc, Args&&... aArgs) + : mFunc(std::forward(aFunc)), + mArgs(std::forward(aArgs)...) + { + } + + void callFunc() + { + detail::invoke(std::move(mFunc), std::move(std::get(mArgs)) ...); + } + }; + +// Allow construction of threads without exposing implementation. 
+ class ThreadIdTool; +} // Namespace "detail" + +class thread +{ +public: + class id + { + DWORD mId = 0; + friend class thread; + friend class std::hash; + friend class detail::ThreadIdTool; + explicit id(DWORD aId) noexcept : mId(aId){} + public: + id (void) noexcept = default; + friend bool operator==(id x, id y) noexcept {return x.mId == y.mId; } + friend bool operator!=(id x, id y) noexcept {return x.mId != y.mId; } + friend bool operator< (id x, id y) noexcept {return x.mId < y.mId; } + friend bool operator<=(id x, id y) noexcept {return x.mId <= y.mId; } + friend bool operator> (id x, id y) noexcept {return x.mId > y.mId; } + friend bool operator>=(id x, id y) noexcept {return x.mId >= y.mId; } + + template + friend std::basic_ostream<_CharT, _Traits>& + operator<<(std::basic_ostream<_CharT, _Traits>& __out, id __id) + { + if (__id.mId == 0) + { + return __out << "(invalid std::thread::id)"; + } + else + { + return __out << __id.mId; + } + } + }; +private: + static constexpr HANDLE kInvalidHandle = nullptr; + static constexpr DWORD kInfinite = 0xffffffffl; + HANDLE mHandle; + id mThreadId; + + template + static unsigned __stdcall threadfunc(void* arg) + { + std::unique_ptr call(static_cast(arg)); + call->callFunc(); + return 0; + } + + static unsigned int _hardware_concurrency_helper() noexcept + { + SYSTEM_INFO sysinfo; +// This is one of the few functions used by the library which has a nearly- +// equivalent function defined in earlier versions of Windows. Include the +// workaround, just as a reminder that it does exist. +#if defined(_WIN32_WINNT) && (_WIN32_WINNT >= 0x0501) + ::GetNativeSystemInfo(&sysinfo); +#else + ::GetSystemInfo(&sysinfo); +#endif + return sysinfo.dwNumberOfProcessors; + } +public: + typedef HANDLE native_handle_type; + id get_id() const noexcept {return mThreadId;} + native_handle_type native_handle() const {return mHandle;} + thread(): mHandle(kInvalidHandle), mThreadId(){} + + thread(thread&& other) + :mHandle(other.mHandle), mThreadId(other.mThreadId) + { + other.mHandle = kInvalidHandle; + other.mThreadId = id{}; + } + + thread(const thread &other)=delete; + + template + explicit thread(Func&& func, Args&&... args) : mHandle(), mThreadId() + { + using ArgSequence = typename detail::GenIntSeq::type; + using Call = detail::ThreadFuncCall; + auto call = new Call( + std::forward(func), std::forward(args)...); + unsigned id_receiver; + auto int_handle = _beginthreadex(NULL, 0, threadfunc, + static_cast(call), 0, &id_receiver); + if (int_handle == 0) + { + mHandle = kInvalidHandle; + int errnum = errno; + delete call; +// Note: Should only throw EINVAL, EAGAIN, EACCES + throw std::system_error(errnum, std::generic_category()); + } else { + mThreadId.mId = id_receiver; + mHandle = reinterpret_cast(int_handle); + } + } + + bool joinable() const {return mHandle != kInvalidHandle;} + +// Note: Due to lack of synchronization, this function has a race condition +// if called concurrently, which leads to undefined behavior. The same applies +// to all other member functions of this class, but this one is mentioned +// explicitly. 
+ void join() + { + using namespace std; + if (get_id() == id(GetCurrentThreadId())) + throw system_error(make_error_code(errc::resource_deadlock_would_occur)); + if (mHandle == kInvalidHandle) + throw system_error(make_error_code(errc::no_such_process)); + if (!joinable()) + throw system_error(make_error_code(errc::invalid_argument)); + WaitForSingleObject(mHandle, kInfinite); + CloseHandle(mHandle); + mHandle = kInvalidHandle; + mThreadId = id{}; + } + + ~thread() + { + if (joinable()) + { +#ifndef NDEBUG + std::printf("Error: Must join() or detach() a thread before \ +destroying it.\n"); +#endif + std::terminate(); + } + } + thread& operator=(const thread&) = delete; + thread& operator=(thread&& other) noexcept + { + if (joinable()) + { +#ifndef NDEBUG + std::printf("Error: Must join() or detach() a thread before \ +moving another thread to it.\n"); +#endif + std::terminate(); + } + swap(std::forward(other)); + return *this; + } + void swap(thread&& other) noexcept + { + std::swap(mHandle, other.mHandle); + std::swap(mThreadId.mId, other.mThreadId.mId); + } + + static unsigned int hardware_concurrency() noexcept + { + static unsigned int cached = _hardware_concurrency_helper(); + return cached; + } + + void detach() + { + if (!joinable()) + { + using namespace std; + throw system_error(make_error_code(errc::invalid_argument)); + } + if (mHandle != kInvalidHandle) + { + CloseHandle(mHandle); + mHandle = kInvalidHandle; + } + mThreadId = id{}; + } +}; + +namespace detail +{ + class ThreadIdTool + { + public: + static thread::id make_id (DWORD base_id) noexcept + { + return thread::id(base_id); + } + }; +} // Namespace "detail" + +namespace this_thread +{ + inline thread::id get_id() noexcept + { + return detail::ThreadIdTool::make_id(GetCurrentThreadId()); + } + inline void yield() noexcept {Sleep(0);} + template< class Rep, class Period > + void sleep_for( const std::chrono::duration& sleep_duration) + { + static constexpr DWORD kInfinite = 0xffffffffl; + using namespace std::chrono; + using rep = milliseconds::rep; + rep ms = duration_cast(sleep_duration).count(); + while (ms > 0) + { + constexpr rep kMaxRep = static_cast(kInfinite - 1); + auto sleepTime = (ms < kMaxRep) ? ms : kMaxRep; + Sleep(static_cast(sleepTime)); + ms -= sleepTime; + } + } + template + void sleep_until(const std::chrono::time_point& sleep_time) + { + sleep_for(sleep_time-Clock::now()); + } +} +} // Namespace mingw_stdthread + +namespace std +{ +// Because of quirks of the compiler, the common "using namespace std;" +// directive would flatten the namespaces and introduce ambiguity where there +// was none. Direct specification (std::), however, would be unaffected. +// Take the safe option, and include only in the presence of MinGW's win32 +// implementation. +#if defined(__MINGW32__ ) && !defined(_GLIBCXX_HAS_GTHREADS) +using mingw_stdthread::thread; +// Remove ambiguity immediately, to avoid problems arising from the above. +//using std::thread; +namespace this_thread +{ +using namespace mingw_stdthread::this_thread; +} +#elif !defined(MINGW_STDTHREAD_REDUNDANCY_WARNING) // Skip repetition +#define MINGW_STDTHREAD_REDUNDANCY_WARNING +#pragma message "This version of MinGW seems to include a win32 port of\ + pthreads, and probably already has C++11 std threading classes implemented,\ + based on pthreads. These classes, found in namespace std, are not overridden\ + by the mingw-std-thread library. 
If you would still like to use this\ + implementation (as it is more lightweight), use the classes provided in\ + namespace mingw_stdthread." +#endif + +// Specialize hash for this implementation's thread::id, even if the +// std::thread::id already has a hash. +template<> +struct hash +{ + typedef mingw_stdthread::thread::id argument_type; + typedef size_t result_type; + size_t operator() (const argument_type & i) const noexcept + { + return i.mId; + } +}; +} +#endif // WIN32STDTHREAD_H diff --git a/randomx/mingw-std-threads-master/tests/CMakeLists.txt b/randomx/mingw-std-threads-master/tests/CMakeLists.txt new file mode 100644 index 0000000..0561145 --- /dev/null +++ b/randomx/mingw-std-threads-master/tests/CMakeLists.txt @@ -0,0 +1,18 @@ +project(stdthreadtest) +cmake_minimum_required(VERSION 3.0) + +string(CONCAT mingw_stdthreads_tests_compile_options_docstring + "Compiler flags used to compile mingw-stdthreads's tests. By default " + "it's -std=c++11 -Wall -Wextra") +set(MINGW_STDTHREADS_TESTS_COMPILE_OPTIONS "-std=c++11;-Wall;-Wextra" + CACHE STRING ${mingw_stdthreads_tests_compile_options_docstring}) + +set(MINGW_STDTHREADS_TESTS_ADDITIONAL_LINKER_FLAGS "" CACHE STRING + "Optional linker flags to be passed when linking mingw-stdthreads's tests") + +add_executable(${PROJECT_NAME} tests.cpp) +target_compile_options(${PROJECT_NAME} PRIVATE + ${MINGW_STDTHREADS_TESTS_COMPILE_OPTIONS}) +target_link_libraries(${PROJECT_NAME} PRIVATE mingw_stdthreads) +target_link_libraries(${PROJECT_NAME} PRIVATE + ${MINGW_STDTHREADS_TESTS_ADDITIONAL_LINKER_FLAGS}) \ No newline at end of file diff --git a/randomx/mingw-std-threads-master/tests/tests.cpp b/randomx/mingw-std-threads-master/tests/tests.cpp new file mode 100644 index 0000000..6ef71d1 --- /dev/null +++ b/randomx/mingw-std-threads-master/tests/tests.cpp @@ -0,0 +1,450 @@ +#ifndef MINGW_STDTHREADS_GENERATED_STDHEADERS + #include + #include + #include + #include + #include +#else + #include + #include + #include + #include + #include +#endif +#include +#include +#include +#include +#include + +using namespace std; + +int test_int = 42; + +// Pre-declaration to suppress some warnings. 
+void test_call_once(int, char const *); + +int cond = 0; +std::mutex m; +std::shared_mutex sm; +std::condition_variable cv; +std::condition_variable_any cv_any; + +template +void log (char const * fmtString, Args ...args) { + printf(fmtString, args...); + printf("\n"); + fflush(stdout); +} + +void test_call_once(int a, const char* str) +{ + log("test_call_once called with a=%d, str=%s", a, str); + this_thread::sleep_for(std::chrono::milliseconds(500)); +} + +struct TestMove +{ + std::string mStr; + TestMove(const std::string& aStr): mStr(aStr){} + TestMove(TestMove&& other): mStr(other.mStr+" moved") + { printf("%s: Object moved\n", mStr.c_str()); } + TestMove(const TestMove&) : mStr() + { + assert(false && "TestMove: Object COPIED instead of moved"); + } +}; + +template +void test_future_set_value (promise & promise) +{ + promise.set_value(T(test_int)); +} + +template<> +void test_future_set_value (promise & promise) +{ + promise.set_value(); +} + +template +bool test_future_get_value (future & future) +{ + return (future.get() == T(test_int)); +} + +template<> +bool test_future_get_value (future & future) +{ + future.get(); + return true; +} + +template +struct CustomAllocator +{ + CustomAllocator (void) noexcept + { + } + + template + CustomAllocator (CustomAllocator const &) noexcept + { + } + + template + CustomAllocator & operator= (CustomAllocator const &) noexcept + { + return *this; + } + + typedef T value_type; + T * allocate (size_t n) + { + log("Used custom allocator to allocate %zu object(s).", n); + return static_cast(std::malloc(n * sizeof(T))); + } + void deallocate (T * ptr, size_t n) + { + log("Used custom allocator to deallocate %zu object(s).", n); + std::free(ptr); + } +}; + +template +void test_future () +{ + static_assert(is_move_constructible >::value, + "std::promise must be move-constructible."); + static_assert(is_move_assignable >::value, + "std::promise must be move-assignable."); + static_assert(!is_copy_constructible >::value, + "std::promise must not be copy-constructible."); + static_assert(!is_copy_assignable >::value, + "std::promise must not be copy-assignable."); + + static_assert(is_move_constructible >::value, + "std::future must be move-constructible."); + static_assert(is_move_assignable >::value, + "std::future must be move-assignable."); + static_assert(!is_copy_constructible >::value, + "std::future must not be copy-constructible."); + static_assert(!is_copy_assignable >::value, + "std::future must not be copy-assignable."); + + static_assert(is_move_constructible >::value, + "std::shared_future must be move-constructible."); + static_assert(is_move_assignable >::value, + "std::shared_future must be move-assignable."); + static_assert(is_copy_constructible >::value, + "std::shared_future must be copy-constructible."); + static_assert(is_copy_assignable >::value, + "std::shared_future must be copy-assignable."); + + log("\tMaking a few promises, and getting their futures..."); + promise promise_value, promise_exception, promise_broken, promise_late; + + future future_value = promise_value.get_future(); + future future_exception = promise_exception.get_future(); + future future_broken = promise_broken.get_future(); + future future_late = promise_late.get_future(); + + try { + future impossible_future = promise_value.get_future(); + log("WARNING: Promise failed to detect that its future was already retrieved."); + } catch(...) 
{ + log("\tPromise successfully prevented redundant future retrieval."); + } + + log("\tPassing promises to a new thread..."); + thread t ([](promise p_value, promise p_exception, promise, promise p_late) + { + this_thread::sleep_for(std::chrono::seconds(1)); + try { + throw std::runtime_error("Thrown during the thread."); + } catch (...) { + p_late.set_exception_at_thread_exit(std::current_exception()); + } + test_future_set_value(p_value); + try { + throw std::runtime_error("Things happened as expected."); + } catch (...) { + p_exception.set_exception(std::current_exception()); + } + this_thread::sleep_for(std::chrono::seconds(2)); + }, + std::move(promise_value), + std::move(promise_exception), + std::move(promise_broken), + std::move(promise_late)); + t.detach(); + + try { + bool was_expected = test_future_get_value(future_value); + log("\tReceived %sexpected value.", (was_expected ? "" : "un")); + } catch (...) { + log("WARNING: Exception where there should be none!"); + throw; + } + try { + test_future_get_value(future_exception); + log("WARNING: Got a value where there should be an exception!"); + } catch (std::exception & e) { + log("\tReceived an exception (\"%s\") as expected.", e.what()); + } + + log("\tWaiting for the thread to exit..."); + try { + test_future_get_value(future_late); + log("WARNING: Got a value where there should be an exception!"); + } catch (std::exception & e) { + log("\tReceived an exception (\"%s\") as expected.", e.what()); + } + + try { + test_future_get_value(future_broken); + log("WARNING: Got a value where there should be an exception!"); + } catch (std::future_error & e) { + log("\tReceived a future_error (\"%s\") as expected.", e.what()); + } + + log("\tDeferring a function..."); + auto async_deferred = async(launch::deferred, [] (void) -> T + { + std::hash hasher; + log("\t\tDeferred function called on thread %zu", hasher(std::this_thread::get_id())); + if (!is_void::value) + return T(test_int); + }); + log("\tCalling a function asynchronously..."); + auto async_async = async(launch::async, [] (void) -> T + { + std::hash hasher; + log("\t\tAsynchronous function called on thread %zu", hasher(std::this_thread::get_id())); + if (!is_void::value) + return T(test_int); + }); + log("\tLetting the implementation decide..."); + auto async_either = async([] (thread::id other_id) -> T + { + std::hash hasher; + log("\t\tFunction called on thread %zu. Implementation chose %s execution.", hasher(this_thread::get_id()), (this_thread::get_id() == other_id) ? "deferred" : "asynchronous"); + if (!is_void::value) + return T(test_int); + }, this_thread::get_id()); + + log("\tFetching asynchronous result."); + test_future_get_value(async_async); + log("\tFetching deferred result."); + test_future_get_value(async_deferred); + log("\tFetching implementation-defined result."); + test_future_get_value(async_either); + + log("\tTesting async on pointer-to-member-function."); + struct Helper + { + thread::id other_id; + T call (void) const + { + std::hash hasher; + log("\t\tFunction called on thread %zu. Implementation chose %s execution.", hasher(this_thread::get_id()), (this_thread::get_id() == other_id) ? 
"deferred" : "asynchronous"); + if (!is_void::value) + return T(test_int); + } + } test_class { this_thread::get_id() }; + auto async_member = async(Helper::call, test_class); + log("\tFetching result."); + test_future_get_value(async_member); +} + +#define TEST_SL_MV_CPY(ClassName) \ + static_assert(std::is_standard_layout::value, \ + "ClassName does not satisfy concept StandardLayoutType."); \ + static_assert(!std::is_move_constructible::value, \ + "ClassName must not be move-constructible."); \ + static_assert(!std::is_move_assignable::value, \ + "ClassName must not be move-assignable."); \ + static_assert(!std::is_copy_constructible::value, \ + "ClassName must not be copy-constructible."); \ + static_assert(!std::is_copy_assignable::value, \ + "ClassName must not be copy-assignable."); + +int main() +{ +#ifdef MINGW_STDTHREADS_GENERATED_STDHEADERS + std::cout << "Using cmake-generated stdheaders, "; +#endif + static_assert(std::is_trivially_copyable::value, + "thread::id must be trivially copyable."); + + TEST_SL_MV_CPY(mutex) + TEST_SL_MV_CPY(recursive_mutex) + TEST_SL_MV_CPY(timed_mutex) + TEST_SL_MV_CPY(recursive_timed_mutex) + TEST_SL_MV_CPY(shared_mutex) + TEST_SL_MV_CPY(shared_timed_mutex) + TEST_SL_MV_CPY(condition_variable) + TEST_SL_MV_CPY(condition_variable_any) + static_assert(!std::is_move_constructible::value, + "once_flag must not be move-constructible."); + static_assert(!std::is_move_assignable::value, + "once_flag must not be move-assignable."); + static_assert(!std::is_copy_constructible::value, + "once_flag must not be copy-constructible."); + static_assert(!std::is_copy_assignable::value, + "once_flag must not be copy-assignable."); + +// With C++ feature level and target Windows version potentially affecting +// behavior, make this information visible. + { + switch (__cplusplus) + { + case 201103L: std::cout << "Compiled in C++11"; break; + case 201402L: std::cout << "Compiled in C++14"; break; + case 201703L: std::cout << "Compiled in C++17"; break; + default: std::cout << "Compiled in a non-conforming C++ compiler"; + } + std::cout << ", targeting Windows "; + static_assert(WINVER > 0x0500, "Windows NT and earlier are not supported."); + switch (WINVER) + { + case 0x0501: std::cout << "XP"; break; + case 0x0502: std::cout << "Server 2003"; break; + case 0x0600: std::cout << "Vista"; break; + case 0x0601: std::cout << "7"; break; + case 0x0602: std::cout << "8"; break; + case 0x0603: std::cout << "8.1"; break; + case 0x0A00: std::cout << "10"; break; + default: std::cout << "10+"; + } + std::cout << "\n"; + } + + { + log("Testing serialization and hashing for thread::id..."); + std::cout << "Serialization:\t" << this_thread::get_id() << "\n"; + std::hash hasher; + std::cout << "Hash:\t" << hasher(this_thread::get_id()) << "\n"; + } + +// Regression test: Thread must copy any argument that is passed by value. 
+ { + std::vector loop_threads; + std::atomic i_vals_touched [4];// { 0, 0, 0, 0 }; + for (int i = 0; i < 4; ++i) + i_vals_touched[i].store(0, std::memory_order_relaxed); + for (int i = 0; i < 4; ++i) + { + loop_threads.push_back(std::thread([&](int c) + { + log("For-loop test thread got value: %i", c); + i_vals_touched[c].fetch_add(1, std::memory_order_relaxed); + }, i)); + } + for (std::thread & thr : loop_threads) + thr.join(); + for (int i = 0; i < 4; ++i) + { + if (i_vals_touched[i] != 1) + { + log("FATAL: Threads are not copying arguments!"); + return 1; + } + } + } + + std::thread t([](TestMove&& a, const char* b, int c) mutable + { + try + { + log("Worker thread started, sleeping for a while..."); +// Thread might move the string more than once. + assert(a.mStr.substr(0, 15) == "move test moved"); + assert(!strcmp(b, "test message")); + assert(c == -20); + auto move2nd = std::move(a); //test move to final destination + this_thread::sleep_for(std::chrono::milliseconds(1000)); + { + lock_guard lock(m); + cond = 1; + log("Notifying condvar"); + cv.notify_all(); + } + + this_thread::sleep_for(std::chrono::milliseconds(500)); + { + lock_guard lock(sm); + cond = 2; + log("Notifying condvar"); + cv_any.notify_all(); + } + + this_thread::sleep_for(std::chrono::milliseconds(500)); + { + lock_guard lock(sm); + cond = 3; + log("Notifying condvar"); + cv_any.notify_all(); + } + + log("Worker thread finishing"); + } + catch(std::exception& e) + { + printf("EXCEPTION in worker thread: %s\n", e.what()); + } + }, + TestMove("move test"), "test message", -20); + try + { + log("Main thread: Locking mutex, waiting on condvar..."); + { + std::unique_lock lk(m); + cv.wait(lk, []{ return cond >= 1;} ); + log("condvar notified, cond = %d", cond); + assert(lk.owns_lock()); + } + log("Main thread: Locking shared_mutex, waiting on condvar..."); + { + std::unique_lock lk(sm); + cv_any.wait(lk, []{ return cond >= 2;} ); + log("condvar notified, cond = %d", cond); + assert(lk.owns_lock()); + } + log("Main thread: Locking shared_mutex in shared mode, waiting on condvar..."); + { + std::shared_lock lk(sm); + cv_any.wait(lk, []{ return cond >= 3;} ); + log("condvar notified, cond = %d", cond); + assert(lk.owns_lock()); + } + log("Main thread: Waiting on worker join..."); + + t.join(); + log("Main thread: Worker thread joined"); + fflush(stdout); + } + catch(std::exception& e) + { + log("EXCEPTION in main thread: %s", e.what()); + } + once_flag of; + call_once(of, test_call_once, 1, "test"); + call_once(of, test_call_once, 1, "ERROR! Should not be called second time"); + log("Test complete"); + + { + log("Testing implementation of ..."); + test_future(); + test_future(); + test_future(); + test_future(); + test_future(); + test_future(); + log("Testing 's use of allocators. Should allocate, then deallocate."); + promise allocated_promise (std::allocator_arg, CustomAllocator()); + allocated_promise.set_value(7); + } + + return 0; +} diff --git a/randomx/mingw-std-threads-master/utility_scripts/Generate-StdLikeHeaders.ps1 b/randomx/mingw-std-threads-master/utility_scripts/Generate-StdLikeHeaders.ps1 new file mode 100644 index 0000000..5208e27 --- /dev/null +++ b/randomx/mingw-std-threads-master/utility_scripts/Generate-StdLikeHeaders.ps1 @@ -0,0 +1,226 @@ + +<# +.SYNOPSIS + Generate std-like headers which you can use just like standard c++'s ones. + For example include . +.PARAMETER GccPath + Path to GCC. Will try to use the default one from $env:Path if not + specified. 
+.PARAMETER MinGWStdThreadsPath
+    Path to mingw-std-threads folder. Will try to use $PSScriptRoot/.. if not
+    specified.
+.PARAMETER DestinationFolder
+    Destination folder where generated headers will be saved
+.PARAMETER GenerateCompilerWrapperWithFileName
+    If specified, a wrapper batch script for g++ will be generated which
+    automatically adds $DestinationFolder as an include path
+.PARAMETER Interactive
+    Use this switch if you want to pass parameters interactively
+#>
+[CmdletBinding(PositionalBinding = $false)]
+param (
+    # Path of GCC
+    [Parameter(Mandatory = $false,
+        ValueFromPipelineByPropertyName = $true,
+        ParameterSetName = "NonInteractive",
+        HelpMessage = "Path to GCC. Will try to use the default one from `$env:Path if not specified.")]
+    [string]
+    $GccPath,
+
+    # Path of mingw-std-threads
+    [Parameter(Mandatory = $false,
+        ValueFromPipelineByPropertyName = $true,
+        ParameterSetName = "NonInteractive",
+        HelpMessage = "Path to mingw-std-threads folder. Will try to use `$PSScriptRoot/.. if not specified.")]
+    [string]
+    $MinGWStdThreadsPath,
+
+    # Destination folder path
+    [Parameter(Mandatory = $true,
+        ValueFromPipelineByPropertyName = $true,
+        ParameterSetName = "NonInteractive",
+        HelpMessage = "Destination folder where generated headers will be saved")]
+    [ValidateNotNullOrEmpty()]
+    [string]
+    $DestinationFolder,
+
+    # Compiler wrapper path
+    [Parameter(Mandatory = $false,
+        ValueFromPipelineByPropertyName = $true,
+        ParameterSetName = "NonInteractive",
+        HelpMessage = "If specified, will generate a wrapper batch script for g++ which automatically adds `$DestinationFolder as an include path")]
+    [string]
+    $GenerateCompilerWrapperWithFileName,
+
+    # Interactive Switch
+    [Parameter(ParameterSetName = "Interactive")]
+    [switch]
+    $Interactive = $false
+)
+
+# Stop execution when encountering any error (including the Write-Error command)
+$ErrorActionPreference = "Stop";
+
+# headers to be generated
+$headers = @("condition_variable", "future", "mutex", "shared_mutex", "thread")
+
+# ask for user input in interactive mode
+if ($Interactive) {
+    Write-Host "Generate std-like headers which you can use just like the standard C++ ones."
+    Write-Host "Something like `"include <thread>`"."
+
+    $DestinationFolder = Read-Host -Prompt "Destination folder into which headers will be generated"
+    $GccPath = Read-Host -Prompt "Path to GCC, optional. Press Enter to let it be retrieved from PATH"
+    $MinGWStdThreadsPath = Read-Host -Prompt "Path to mingw-std-threads folder, optional. Press Enter to use default value"
+    $GenerateCompilerWrapperWithFileName = Read-Host "Optional path to which a wrapper batch script for g++ will be created. It will automatically use $DestinationFolder as an include path.
Press Enter to skip" +} + +if (-not $GccPath) { + $GccPath = "gcc" +} + +# set default value of $MinGWStdThreadsPath +if (-not $MinGWStdThreadsPath) { + $scriptFilePath = $null + if ($MyInvocation.MyCommand.CommandType -eq "ExternalScript") { + $scriptFilePath = $MyInvocation.MyCommand.Definition + } + else { + $scriptFilePath = [Environment]::GetCommandLineArgs()[0] + } + $MinGWStdThreadsPath = (Get-Item -LiteralPath $scriptFilePath).Directory.Parent.FullName +} + +# Normalize paths +$GccPath = (Get-Command -Name $GccPath).Source +$MinGWStdThreadsPath = Resolve-Path -LiteralPath $MinGWStdThreadsPath +$DestinationFolder = New-Item -Path $DestinationFolder -ItemType "Directory" -Force + +Write-Output "GccPath: $GccPath" +Write-Output "MinGWStdThreadsPath: $MinGWStdThreadsPath" +Write-Output "DestinationFolder: $DestinationFolder" +if ($GenerateCompilerWrapperWithFileName) { + Write-Output "GenerateCompilerWrapperWithFileName: $GenerateCompilerWrapperWithFileName" +} + +# Find path of real headers +Write-Output "Retrieving system header search paths..." + +$readingIncludePath = $false +# Empty array which will later store include paths +$includePaths = @() + +# Launch GCC +$processStartInfo = New-Object -TypeName "System.Diagnostics.ProcessStartInfo" +$processStartInfo.FileName = $GccPath +$processStartInfo.Arguments = "-xc++ -E -v -" +$processStartInfo.RedirectStandardInput = $true +$processStartInfo.RedirectStandardOutput = $true +$processStartInfo.RedirectStandardError = $true +$processStartInfo.UseShellExecute = $false + +$outputLines = @() +$gcc = New-Object -TypeName "System.Diagnostics.Process" +try { + $gcc.StartInfo = $processStartInfo + $gcc.Start() | Out-Null + $gcc.StandardInput.Close() + $gcc.WaitForExit() + $output = $gcc.StandardError.ReadToEnd() + $outputLines = $output -split "[\r\n]" | + ForEach-Object { return $_.Trim() } | + Where-Object { return $_.Length -gt 0 } +} +finally { + $gcc.StandardInput.Dispose() + $gcc.StandardOutput.Dispose() + $gcc.StandardError.Dispose() + $gcc.Dispose() +} + +# Parse Output +foreach ($line in $outputLines) { + if (-not $readingIncludePath) { + if ($line -match "#include <...> search starts here:") { + $readingIncludePath = $true + } + continue + } + + if ($line -match "End of search list.") { + break + } + + Write-Output "Retrieved search path: $line" + $includePaths += $line +} + +if ($includePaths.Count -eq 0) { + Write-Error "Error: didn't find any #inlcude <...> search paths" +} + +# look for std header paths +Write-Output "Searching for standard headers..." +$stdHeaders = @() +# set a label called "nextHeader" to allow continue with outer loop +:nextHeader foreach ($header in $headers) { + # check if mingw-std-threads headers exist + $myHeader = "mingw.$header.h" + $myHeader = Join-Path -Path $MinGWStdThreadsPath -ChildPath $myHeader + if (-not (Test-Path -LiteralPath $myHeader -PathType "Leaf")) { + Write-Error "Error: mingw-std-threads header not found: $myHeader" + } + + foreach ($inludePath in $includePaths) { + $fullPath = Join-Path -Path $inludePath -ChildPath $header + if (Test-Path -LiteralPath $fullPath -PathType "Leaf") { + $fullPath = (Get-Item -LiteralPath $fullPath).FullName + $stdHeaders += $fullPath + Write-Output "Found std header: $fullPath" + # if found matching header, continue with outer loop + continue nextHeader + } + } + + Write-Error "Error: didn't find $header in any search paths" +} + +# generate headers +Write-Output "Generating headers..." 
+foreach ($stdHeader in $stdHeaders) {
+    $headerFileName = (Get-Item -LiteralPath $stdHeader).Name
+    $myHeader = "mingw.$headerFileName.h"
+    $myHeader = Join-Path -Path $MinGWStdThreadsPath -ChildPath $myHeader
+    Write-Output "Generating <$headerFileName> from $myHeader and $stdHeader..."
+
+    # both headers should already have include guards,
+    # but we still add a #pragma once just to be safe
+    $content = "#pragma once`r`n"
+    $content += "#include `"$stdHeader`"`r`n"
+    $content += "#include `"$myHeader`"`r`n";
+
+    $outputFileName = Join-Path -Path $DestinationFolder -ChildPath $headerFileName
+    Write-Output "Writing file: $outputFileName"
+
+    # use .NET's method to output lines to avoid UTF-8 BOM
+    $noBomEncoding = New-Object -TypeName "System.Text.UTF8Encoding" -ArgumentList $false
+    [IO.File]::WriteAllText($outputFileName, $content, $noBomEncoding)
+}
+
+$message = "Successfully generated std-like headers. Use them by adding "
+$message += "`"-I$DestinationFolder`" to your compiler command line parameters"
+Write-Output $message
+
+if ($GenerateCompilerWrapperWithFileName) {
+    $compilerFolder = Split-Path -LiteralPath $GccPath
+    $compiler = Join-Path -Path $compilerFolder -ChildPath "g++"
+    $command = "@echo off`r`n"
+    $command += "$compiler %* `"-I$DestinationFolder`""
+    $wrapper = New-Item -Path $GenerateCompilerWrapperWithFileName -ItemType "File" -Force
+
+    # use .NET's method to output lines to avoid UTF-8 BOM
+    $noBomEncoding = New-Object -TypeName "System.Text.UTF8Encoding" -ArgumentList $false
+    [IO.File]::WriteAllText($wrapper, $command, $noBomEncoding)
+
+    Write-Output "Wrapper batch script successfully generated to $wrapper"
+}
\ No newline at end of file
diff --git a/randomx/mingw-std-threads-master/utility_scripts/generate_std_like_headers.bat b/randomx/mingw-std-threads-master/utility_scripts/generate_std_like_headers.bat
new file mode 100644
index 0000000..9e2440c
--- /dev/null
+++ b/randomx/mingw-std-threads-master/utility_scripts/generate_std_like_headers.bat
@@ -0,0 +1 @@
+powershell -NonInteractive -ExecutionPolicy ByPass -File %~dp0Generate-StdLikeHeaders.ps1 %*
\ No newline at end of file
diff --git a/randomx/mingw-std-threads-master/utility_scripts/generate_std_like_headers_interactive.bat b/randomx/mingw-std-threads-master/utility_scripts/generate_std_like_headers_interactive.bat
new file mode 100644
index 0000000..aad146d
--- /dev/null
+++ b/randomx/mingw-std-threads-master/utility_scripts/generate_std_like_headers_interactive.bat
@@ -0,0 +1 @@
+powershell -ExecutionPolicy ByPass -File %~dp0Generate-StdLikeHeaders.ps1 -Interactive
\ No newline at end of file
diff --git a/randomx/program.hpp b/randomx/program.hpp
new file mode 100644
index 0000000..3c66a8f
--- /dev/null
+++ b/randomx/program.hpp
@@ -0,0 +1,71 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <cstdint>
+#include <iostream>
+#include "common.hpp"
+#include "instruction.hpp"
+#include "blake2/endian.h"
+
+namespace randomx {
+
+	struct ProgramConfiguration {
+		uint64_t eMask[2];
+		uint32_t readReg0, readReg1, readReg2, readReg3;
+	};
+
+	class Program {
+	public:
+		Instruction& operator()(int pc) {
+			return programBuffer[pc];
+		}
+		friend std::ostream& operator<<(std::ostream& os, const Program& p) {
+			p.print(os);
+			return os;
+		}
+		uint64_t getEntropy(int i) {
+			return load64(&entropyBuffer[i]);
+		}
+		uint32_t getSize() {
+			return RANDOMX_PROGRAM_SIZE;
+		}
+	private:
+		void print(std::ostream& os) const {
+			for (int i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) {
+				auto instr = programBuffer[i];
+				os << instr;
+			}
+		}
+		uint64_t entropyBuffer[16];
+		Instruction programBuffer[RANDOMX_PROGRAM_SIZE]; //256 instructions, 64 bits each
+	};
+
+	static_assert(sizeof(Program) % 64 == 0, "Invalid size of class randomx::Program");
+}
diff --git a/randomx/randomx.cpp b/randomx/randomx.cpp
new file mode 100644
index 0000000..d3ecc35
--- /dev/null
+++ b/randomx/randomx.cpp
@@ -0,0 +1,492 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/ +#include "mingw-std-threads-master/mingw.thread.h" //这个时因为使用#include 会报错,mingw中thread不支持std,如果时其他编译器如vs等,就不需要; +#include "randomx.h" +#include "dataset.hpp" +#include "vm_interpreted.hpp" +#include "vm_interpreted_light.hpp" +#include "vm_compiled.hpp" +#include "vm_compiled_light.hpp" +#include "blake2/blake2.h" +#include "cpu.hpp" +#include +#include + + + randomx_flags randomx_get_flags() { + randomx_flags flags = RANDOMX_HAVE_COMPILER ? RANDOMX_FLAG_JIT : RANDOMX_FLAG_DEFAULT; + randomx::Cpu cpu; +#ifdef __OpenBSD__ + if (flags == RANDOMX_FLAG_JIT) { + flags |= RANDOMX_FLAG_SECURE; + } +#endif + if (HAVE_AES && cpu.hasAes()) { + flags |= RANDOMX_FLAG_HARD_AES; + } + if (randomx_argon2_impl_avx2() != nullptr && cpu.hasAvx2()) { + flags |= RANDOMX_FLAG_ARGON2_AVX2; + } + if (randomx_argon2_impl_ssse3() != nullptr && cpu.hasSsse3()) { + flags |= RANDOMX_FLAG_ARGON2_SSSE3; + } + return flags; + } + + randomx_cache *randomx_alloc_cache(randomx_flags flags) { + randomx_cache *cache = nullptr; + auto impl = randomx::selectArgonImpl(flags); + if (impl == nullptr) { + return cache; + } + + try { + cache = new randomx_cache(); + cache->argonImpl = impl; + switch ((int)(flags & (RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES))) { + case RANDOMX_FLAG_DEFAULT: + cache->dealloc = &randomx::deallocCache; + cache->jit = nullptr; + cache->initialize = &randomx::initCache; + cache->datasetInit = &randomx::initDataset; + cache->memory = (uint8_t*)randomx::DefaultAllocator::allocMemory(randomx::CacheSize); //randomx::CacheSize =256MB + break; + + case RANDOMX_FLAG_JIT: + cache->dealloc = &randomx::deallocCache; + cache->jit = new randomx::JitCompiler(); + cache->initialize = &randomx::initCacheCompile; + cache->datasetInit = cache->jit->getDatasetInitFunc(); + cache->memory = (uint8_t*)randomx::DefaultAllocator::allocMemory(randomx::CacheSize); + break; + + case RANDOMX_FLAG_LARGE_PAGES: + cache->dealloc = &randomx::deallocCache; + cache->jit = nullptr; + cache->initialize = &randomx::initCache; + cache->datasetInit = &randomx::initDataset; + cache->memory = (uint8_t*)randomx::LargePageAllocator::allocMemory(randomx::CacheSize); + break; + + case RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES: + cache->dealloc = &randomx::deallocCache; + cache->jit = new randomx::JitCompiler(); + cache->initialize = &randomx::initCacheCompile; + cache->datasetInit = cache->jit->getDatasetInitFunc(); + cache->memory = (uint8_t*)randomx::LargePageAllocator::allocMemory(randomx::CacheSize); + break; + + default: + UNREACHABLE; + } + } + catch (std::exception &ex) { + if (cache != nullptr) { + randomx_release_cache(cache); + cache = nullptr; + } + } + + return cache; + } + + void randomx_init_cache(randomx_cache *cache, const void *key, size_t keySize) { + assert(cache != nullptr); + assert(keySize == 0 || key != nullptr); + + cache->initialize(cache, key, keySize); + + //std::string cacheKey; + //cacheKey.assign((const char *)key, keySize); //将字符串key中keysize个长度的字符赋值给cacheKey; + //if (cache->cacheKey != cacheKey || !cache->isInitialized()) { + // cache->initialize(cache, key, keySize); + // cache->cacheKey = cacheKey; + //} + } + + void randomx_release_cache(randomx_cache* cache) { + assert(cache != nullptr); + if (cache->memory != nullptr) { + cache->dealloc(cache); + } + delete cache; + } + + randomx_dataset *randomx_alloc_dataset(randomx_flags flags) { + + //fail on 32-bit systems if DatasetSize is >= 4 GiB + if (randomx::DatasetSize > std::numeric_limits::max()) { + return nullptr; + } + //printf("xxxxx\n"); + 
+
+	randomx_dataset *dataset = nullptr;
+
+	//try {
+		dataset = new randomx_dataset();
+		if (flags & RANDOMX_FLAG_LARGE_PAGES) {
+			dataset->dealloc = &randomx::deallocDataset<randomx::LargePageAllocator>;
+			dataset->memory = (uint8_t*)randomx::LargePageAllocator::allocMemory(randomx::DatasetSize);
+		}
+		else {
+			dataset->dealloc = &randomx::deallocDataset<randomx::DefaultAllocator>;
+			dataset->memory = (uint8_t*)randomx::DefaultAllocator::allocMemory(randomx::DatasetSize);
+		}
+	//}
+	//catch (std::exception &ex) {
+	//	if (dataset != nullptr) {
+	//		randomx_release_dataset(dataset);
+	//		dataset = nullptr;
+	//	}
+	//}
+
+	return dataset;
+}
+
+constexpr unsigned long DatasetItemCount = randomx::DatasetSize / RANDOMX_DATASET_ITEM_SIZE;
+
+unsigned long randomx_dataset_item_count() {
+	return DatasetItemCount;
+}
+
+void randomx_init_dataset(randomx_dataset *dataset, randomx_cache *cache, unsigned long startItem, unsigned long itemCount) {
+	assert(dataset != nullptr);
+	assert(cache != nullptr);
+	assert(startItem < DatasetItemCount && itemCount <= DatasetItemCount);
+	assert(startItem + itemCount <= DatasetItemCount);
+	cache->datasetInit(cache, dataset->memory + startItem * randomx::CacheLineSize, startItem, startItem + itemCount);
+}
+
+void *randomx_get_dataset_memory(randomx_dataset *dataset) {
+	assert(dataset != nullptr);
+	return dataset->memory;
+}
+
+void randomx_release_dataset(randomx_dataset *dataset) {
+	assert(dataset != nullptr);
+	dataset->dealloc(dataset);
+	delete dataset;
+}
+
+randomx_vm *randomx_create_vm(randomx_flags flags, randomx_cache *cache, randomx_dataset *dataset) {
+	assert(cache != nullptr || (flags & RANDOMX_FLAG_FULL_MEM));
+	assert(cache == nullptr || cache->isInitialized());
+	assert(dataset != nullptr || !(flags & RANDOMX_FLAG_FULL_MEM));
+
+	randomx_vm *vm = nullptr;
+
+	try {
+		switch ((int)(flags & (RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES))) {
+		case RANDOMX_FLAG_DEFAULT:
+			vm = new randomx::InterpretedLightVmDefault();
+			break;
+
+		case RANDOMX_FLAG_FULL_MEM:
+			vm = new randomx::InterpretedVmDefault();
+			break;
+
+		case RANDOMX_FLAG_JIT:
+			if (flags & RANDOMX_FLAG_SECURE) {
+				vm = new randomx::CompiledLightVmDefaultSecure();
+			}
+			else {
+				vm = new randomx::CompiledLightVmDefault();
+			}
+			break;
+
+		case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT:
+			if (flags & RANDOMX_FLAG_SECURE) {
+				vm = new randomx::CompiledVmDefaultSecure();
+			}
+			else {
+				vm = new randomx::CompiledVmDefault();
+			}
+			break;
+
+		case RANDOMX_FLAG_HARD_AES:
+			vm = new randomx::InterpretedLightVmHardAes();
+			break;
+
+		case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_HARD_AES:
+			vm = new randomx::InterpretedVmHardAes();
+			break;
+
+		case RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES:
+			if (flags & RANDOMX_FLAG_SECURE) {
+				vm = new randomx::CompiledLightVmHardAesSecure();
+			}
+			else {
+				vm = new randomx::CompiledLightVmHardAes();
+			}
+			break;
+
+		case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES:
+			if (flags & RANDOMX_FLAG_SECURE) {
+				vm = new randomx::CompiledVmHardAesSecure();
+			}
+			else {
+				vm = new randomx::CompiledVmHardAes();
+			}
+			break;
+
+		case RANDOMX_FLAG_LARGE_PAGES:
+			vm = new randomx::InterpretedLightVmLargePage();
+			break;
+
+		case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_LARGE_PAGES:
+			vm = new randomx::InterpretedVmLargePage();
+			break;
+
+		case RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES:
+			if (flags & RANDOMX_FLAG_SECURE) {
+				vm = new randomx::CompiledLightVmLargePageSecure();
+			}
+			else {
+				vm = new randomx::CompiledLightVmLargePage();
+			}
+			break;
+
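+		// (The remaining cases follow the same naming scheme: Compiled* = JIT,
+		// *Light* = cache only (no full dataset), *HardAes = hardware AES,
+		// *LargePage* = large-page scratchpad, *Secure = W^X-safe JIT pages.)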
case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES: + if (flags & RANDOMX_FLAG_SECURE) { + vm = new randomx::CompiledVmLargePageSecure(); + } + else { + vm = new randomx::CompiledVmLargePage(); + } + break; + + case RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES: + vm = new randomx::InterpretedLightVmLargePageHardAes(); + break; + + case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES: + vm = new randomx::InterpretedVmLargePageHardAes(); + break; + + case RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES: + if (flags & RANDOMX_FLAG_SECURE) { + vm = new randomx::CompiledLightVmLargePageHardAesSecure(); + } + else { + vm = new randomx::CompiledLightVmLargePageHardAes(); + } + break; + + case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES: + if (flags & RANDOMX_FLAG_SECURE) { + vm = new randomx::CompiledVmLargePageHardAesSecure(); + } + else { + vm = new randomx::CompiledVmLargePageHardAes(); + } + break; + + default: + UNREACHABLE; + } + + if(cache != nullptr) { + // printf("cachedddddddddddd\n"); yes + vm->setCache(cache); + vm->cacheKey = cache->cacheKey; + } + + if(dataset != nullptr){ + // printf("datasetdddddddddddd\n"); no + vm->setDataset(dataset); + } + + vm->allocate(); //allocate the scratchpad + } + catch (std::exception &ex) { + delete vm; + vm = nullptr; + } + + return vm; + } + + void randomx_vm_set_cache(randomx_vm *machine, randomx_cache* cache) { + assert(machine != nullptr); + assert(cache != nullptr && cache->isInitialized()); + if (machine->cacheKey != cache->cacheKey) { + machine->setCache(cache); + machine->cacheKey = cache->cacheKey; + } + } + + void randomx_vm_set_dataset(randomx_vm *machine, randomx_dataset *dataset) { + assert(machine != nullptr); + assert(dataset != nullptr); + machine->setDataset(dataset); + } + + void randomx_destroy_vm(randomx_vm *machine) { + assert(machine != nullptr); + delete machine; + } + + void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output) { + assert(machine != nullptr); + assert(inputSize == 0 || input != nullptr); + assert(output != nullptr); + alignas(16) uint64_t tempHash[8]; + int blakeResult = blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); + assert(blakeResult == 0); + machine->initScratchpad(&tempHash); + machine->resetRoundingMode(); + for (int chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) { //RANDOMX_PROGRAM_COUNT =8 + machine->run(&tempHash); + blakeResult = blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0); + assert(blakeResult == 0); + } + machine->run(&tempHash); + machine->getFinalResult(output, RANDOMX_HASH_SIZE); + } + + +/* +int main(int argc, char** argv) { + + const uint8_t myKey[] ={ 0x67,0x0f,0x0b,0x99,0x1d,0xc3,0xfe,0x80,0x56,0x04,0xea,0xc3,0x79,0x35,0x1d,0x9a,0xb5,0x21,0xef,0xac,0x60,0x95,0xf2,0x6b,0xca,0xa3,0xa8,0x56,0x83,0x89,0x77,0x99}; + + const uint8_t myInput[] = { 0x0e,0x0e,0xe4,0xf3,0xf4,0xff,0x05,0x81,0xd4,0x1a,0x87,0xb3,0xa3,0xd7,0xb6,0x40,0x24,0x8d,0x9f,0x34,0x86,0x07,0xe9,0x67,0x55,0x5c,0xce,0x2f,0x40,0xe6,0x5e,0x5e,0x40,0x45,0x46,0x56,0x9e,0xb3,0xe6, + //0xee,0x85,0x00,0x00, + 0xe6,0x23,0x00,0x00, + 0xcb,0x84,0x81,0x0b,0xc0,0x28,0xa5,0x76,0xec,0xd2,0x0b,0xf9,0xee,0xee,0x43,0x78,0x9c,0x3d,0x55,0xe5,0x54,0xe3,0x05,0xb3,0x46,0x02,0x09,0x64,0x43,0x21,0xd2,0x9e,0x28}; + + uint8_t hash[RANDOMX_HASH_SIZE]; + int initThreadCount =16; + randomx_cache* cache = 
randomx_alloc_cache(RANDOMX_FLAG_DEFAULT);
+
+	randomx_init_cache(cache, myKey, sizeof myKey);
+
+	uint32_t datasetItemCount = randomx_dataset_item_count();
+
+	printf("datasetItemCount=%d\n", datasetItemCount);
+
+	randomx_dataset* dataset = randomx_alloc_dataset(RANDOMX_FLAG_DEFAULT);
+
+	std::vector<std::thread> threads;
+	auto perThread = datasetItemCount / initThreadCount;
+	auto remainder = datasetItemCount % initThreadCount;
+	uint32_t startItem = 0;
+	for (int i = 0; i < initThreadCount; ++i) {
+		auto count = perThread + (i == initThreadCount - 1 ? remainder : 0);
+		threads.push_back(std::thread(&randomx_init_dataset, dataset, cache, startItem, count));
+		startItem += count;
+	}
+	for (unsigned i = 0; i < threads.size(); ++i) {
+		threads[i].join();
+	}
+
+	randomx_release_cache(cache);
+
+	randomx_vm* vm = randomx_create_vm(RANDOMX_FLAG_FULL_MEM, nullptr, dataset);
+
+	randomx_calculate_hash(vm, &myInput, sizeof myInput, hash);
+
+	randomx_destroy_vm(vm);
+
+	randomx_release_dataset(dataset);
+
+	for (unsigned i = 0; i < RANDOMX_HASH_SIZE; ++i)
+		printf("%02x", hash[i] & 0xff);
+
+	return 0;
+}
+//8a48e5f9db45ab79d98574c4d81954fe6ac63842214aff73c244b26330b7c9
+*/
+
+/*
+int main() {
+
+	const uint8_t myKey[] ={ 0x67,0x0f,0x0b,0x99,0x1d,0xc3,0xfe,0x80,0x56,0x04,0xea,0xc3,0x79,0x35,0x1d,0x9a,0xb5,0x21,0xef,0xac,0x60,0x95,0xf2,0x6b,0xca,0xa3,0xa8,0x56,0x83,0x89,0x77,0x99};
+
+	const uint8_t myInput[] = { 0x0e,0x0e,0xe4,0xf3,0xf4,0xff,0x05,0x81,0xd4,0x1a,0x87,0xb3,0xa3,0xd7,0xb6,0x40,0x24,0x8d,0x9f,0x34,0x86,0x07,0xe9,0x67,0x55,0x5c,0xce,0x2f,0x40,0xe6,0x5e,0x5e,0x40,0x45,0x46,0x56,0x9e,0xb3,0xe6,
+	//0xee,0x85,0x00,0x00,
+	0xe6,0x23,0x00,0x00,
+	0xcb,0x84,0x81,0x0b,0xc0,0x28,0xa5,0x76,0xec,0xd2,0x0b,0xf9,0xee,0xee,0x43,0x78,0x9c,0x3d,0x55,0xe5,0x54,0xe3,0x05,0xb3,0x46,0x02,0x09,0x64,0x43,0x21,0xd2,0x9e,0x28};
+
+	char hash[RANDOMX_HASH_SIZE];
+
+	//randomx_flags flags = randomx_get_flags();
+	randomx_flags flags = RANDOMX_FLAG_DEFAULT;
+	randomx_cache *myCache = randomx_alloc_cache(flags);
+
+	randomx_init_cache(myCache, &myKey, sizeof myKey);
+
+	randomx_vm *myMachine = randomx_create_vm(flags, myCache, NULL);
+
+	randomx_calculate_hash(myMachine, &myInput, sizeof myInput, hash);
+
+	randomx_destroy_vm(myMachine);
+	randomx_release_cache(myCache);
+
+	for (unsigned i = 0; i < RANDOMX_HASH_SIZE; ++i)
+		printf("%02x", hash[i] & 0xff);
+
+	printf("\n");
+
+	return 0;
+}
+*/
+
+
+randomx_cache* cache;
+randomx_vm* vm = nullptr;
+
+int main(){
+
+	const uint8_t myKey[] ={ 0x67,0x0f,0x0b,0x99,0x1d,0xc3,0xfe,0x80,0x56,0x04,0xea,0xc3,0x79,0x35,0x1d,0x9a,0xb5,0x21,0xef,0xac,0x60,0x95,0xf2,0x6b,0xca,0xa3,0xa8,0x56,0x83,0x89,0x77,0x99};
+
+	const uint8_t myInput[] = { 0x0e,0x0e,0xe4,0xf3,0xf4,0xff,0x05,0x81,0xd4,0x1a,0x87,0xb3,0xa3,0xd7,0xb6,0x40,0x24,0x8d,0x9f,0x34,0x86,0x07,0xe9,0x67,0x55,0x5c,0xce,0x2f,0x40,0xe6,0x5e,0x5e,0x40,0x45,0x46,0x56,0x9e,0xb3,0xe6,
+	0xe6,0x23,0x00,0x00,
+	0xcb,0x84,0x81,0x0b,0xc0,0x28,0xa5,0x76,0xec,0xd2,0x0b,0xf9,0xee,0xee,0x43,0x78,0x9c,0x3d,0x55,0xe5,0x54,0xe3,0x05,0xb3,0x46,0x02,0x09,0x64,0x43,0x21,0xd2,0x9e,0x28};
+
+
+	// const uint8_t myKey[] ={146, 6, 71, 248, 241, 11, 139, 72, 70, 73, 173, 248, 53, 153, 197, 184, 107, 186, 19, 126, 126, 178, 46, 149, 221, 135, 57, 217, 133, 40, 246, 119};
+	// const uint8_t myInput[] = {0, 0, 0, 14, 246, 237, 44, 156, 4, 131, 10, 137, 157, 56, 143, 188, 94, 194, 80, 172, 219, 123, 75, 112, 250, 36, 34, 195, 214, 232, 2, 195, 72, 210, 201, 0, 0, 0, 0, 0, 128, 7, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+	uint8_t hash[RANDOMX_HASH_SIZE];
+	cache = randomx_alloc_cache(RANDOMX_FLAG_DEFAULT);
+	randomx_init_cache(cache, myKey, sizeof(myKey));
+
+	vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT, cache, nullptr);
+	// randomx_vm_set_cache(vm, cache);
+
+	randomx_calculate_hash(vm, myInput, sizeof(myInput), hash);
+	for (unsigned i = 0; i < RANDOMX_HASH_SIZE; ++i)
+		// printf("%02d ", hash[i] & 0xff);
+		printf("%02x", hash[i] & 0xff);
+	printf("\n");
+
+	// release the VM and cache before exiting
+	randomx_destroy_vm(vm);
+	randomx_release_cache(cache);
+
+	//assert(equalsHex(hash, "1a7151b1367507ded1e9af0b97da8ae23ec84e9f352eb731eab8f0f060710300"));
+
+	return 0;
+}
+
+
+ //g++ aes_hash.cpp allocator.cpp argon2_avx2.c argon2_core.c argon2_ref.c argon2_ssse3.c assembly_generator_x86.cpp blake2_generator.cpp bytecode_machine.cpp cpu.cpp dataset.cpp instruction.cpp instructions_portable.cpp randomx.cpp reciprocal.c soft_aes.cpp superscalar.cpp virtual_machine.cpp virtual_memory.cpp vm_compiled.cpp vm_compiled_light.cpp vm_interpreted.cpp vm_interpreted_light.cpp ./blake2/blake2b.c jit_compiler_x86.cpp jit_compiler_x86_static.S
diff --git a/randomx/randomx.h b/randomx/randomx.h
new file mode 100644
index 0000000..c06002b
--- /dev/null
+++ b/randomx/randomx.h
@@ -0,0 +1,245 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef RANDOMX_H
+#define RANDOMX_H
+
+#include <stddef.h>
+
+#define RANDOMX_HASH_SIZE 32
+#define RANDOMX_DATASET_ITEM_SIZE 64
+
+#ifndef RANDOMX_EXPORT
+#define RANDOMX_EXPORT
+#endif
+
+typedef enum {
+  RANDOMX_FLAG_DEFAULT = 0,
+  RANDOMX_FLAG_LARGE_PAGES = 1,
+  RANDOMX_FLAG_HARD_AES = 2,
+  RANDOMX_FLAG_FULL_MEM = 4,
+  RANDOMX_FLAG_JIT = 8,
+  RANDOMX_FLAG_SECURE = 16,
+  RANDOMX_FLAG_ARGON2_SSSE3 = 32,
+  RANDOMX_FLAG_ARGON2_AVX2 = 64,
+  RANDOMX_FLAG_ARGON2 = 96
+} randomx_flags;
+
+typedef struct randomx_dataset randomx_dataset;
+typedef struct randomx_cache randomx_cache;
+typedef struct randomx_vm randomx_vm;
+
+
+#if defined(__cplusplus)
+
+#ifdef __cpp_constexpr
+#define CONSTEXPR constexpr
+#else
+#define CONSTEXPR
+#endif
+
+inline CONSTEXPR randomx_flags operator |(randomx_flags a, randomx_flags b) {
+	return static_cast<randomx_flags>(static_cast<int>(a) | static_cast<int>(b));
+}
+inline CONSTEXPR randomx_flags operator &(randomx_flags a, randomx_flags b) {
+	return static_cast<randomx_flags>(static_cast<int>(a) & static_cast<int>(b));
+}
+inline randomx_flags& operator |=(randomx_flags& a, randomx_flags b) {
+	return a = a | b;
+}
+
+extern "C" {
+#endif
+
+/**
+ * @return The recommended flags to be used on the current machine.
+ *         Does not include:
+ *            RANDOMX_FLAG_LARGE_PAGES
+ *            RANDOMX_FLAG_FULL_MEM
+ *            RANDOMX_FLAG_SECURE
+ *         These flags must be added manually if desired.
+ *         On OpenBSD RANDOMX_FLAG_SECURE is enabled by default in JIT mode as W^X is enforced by the OS.
+ */
+RANDOMX_EXPORT randomx_flags randomx_get_flags(void);
+
+/**
+ * Creates a randomx_cache structure and allocates memory for RandomX Cache.
+ *
+ * @param flags is any combination of these 2 flags (each flag can be set or not set):
+ *        RANDOMX_FLAG_LARGE_PAGES - allocate memory in large pages
+ *        RANDOMX_FLAG_JIT - create cache structure with JIT compilation support; this makes
+ *        subsequent Dataset initialization faster
+ *        Optionally, one of these two flags may be selected:
+ *        RANDOMX_FLAG_ARGON2_SSSE3 - optimized Argon2 for CPUs with the SSSE3 instruction set;
+ *        makes subsequent cache initialization faster
+ *        RANDOMX_FLAG_ARGON2_AVX2 - optimized Argon2 for CPUs with the AVX2 instruction set;
+ *        makes subsequent cache initialization faster
+ *
+ * @return Pointer to an allocated randomx_cache structure.
+ *         Returns NULL if:
+ *         (1) memory allocation fails
+ *         (2) the RANDOMX_FLAG_JIT is set and JIT compilation is not supported on the current platform
+ *         (3) an invalid or unsupported RANDOMX_FLAG_ARGON2 value is set
+ */
+RANDOMX_EXPORT randomx_cache *randomx_alloc_cache(randomx_flags flags);
+
+/**
+ * Initializes the cache memory and SuperscalarHash using the provided key value.
+ * Does nothing if called again with the same key value.
+ *
+ * @param cache is a pointer to a previously allocated randomx_cache structure. Must not be NULL.
+ * @param key is a pointer to memory which contains the key value. Must not be NULL.
+ * @param keySize is the number of bytes of the key.
+*/
+RANDOMX_EXPORT void randomx_init_cache(randomx_cache *cache, const void *key, size_t keySize);
+
+/**
+ * Releases all memory occupied by the randomx_cache structure.
+ *
+ * @param cache is a pointer to a previously allocated randomx_cache structure.
+*/
+RANDOMX_EXPORT void randomx_release_cache(randomx_cache* cache);
+
+/**
+ * Creates a randomx_dataset structure and allocates memory for RandomX Dataset.
+ *
+ * @param flags is the initialization flags. Only one flag is supported (can be set or not set):
+ *        RANDOMX_FLAG_LARGE_PAGES - allocate memory in large pages
+ *
+ * @return Pointer to an allocated randomx_dataset structure.
+ *         NULL is returned if memory allocation fails.
+ */
+RANDOMX_EXPORT randomx_dataset *randomx_alloc_dataset(randomx_flags flags);
+
+/**
+ * Gets the number of items contained in the dataset.
+ *
+ * @return the number of items contained in the dataset.
+*/
+RANDOMX_EXPORT unsigned long randomx_dataset_item_count(void);
+
+/**
+ * Initializes dataset items.
+ *
+ * Note: In order to use the Dataset, all items from 0 to (randomx_dataset_item_count() - 1) must be initialized.
+ * This may be done by several calls to this function using non-overlapping item sequences.
+ *
+ * @param dataset is a pointer to a previously allocated randomx_dataset structure. Must not be NULL.
+ * @param cache is a pointer to a previously allocated and initialized randomx_cache structure. Must not be NULL.
+ * @param startItem is the item number where initialization should start.
+ * @param itemCount is the number of items that should be initialized.
+*/
+RANDOMX_EXPORT void randomx_init_dataset(randomx_dataset *dataset, randomx_cache *cache, unsigned long startItem, unsigned long itemCount);
+
+/**
+ * Returns a pointer to the internal memory buffer of the dataset structure. The size
+ * of the internal memory buffer is randomx_dataset_item_count() * RANDOMX_DATASET_ITEM_SIZE.
+ *
+ * @param dataset is a pointer to a previously allocated randomx_dataset structure. Must not be NULL.
+ *
+ * @return Pointer to the internal memory buffer of the dataset structure.
+*/
+RANDOMX_EXPORT void *randomx_get_dataset_memory(randomx_dataset *dataset);
+
+/**
+ * Releases all memory occupied by the randomx_dataset structure.
+ *
+ * @param dataset is a pointer to a previously allocated randomx_dataset structure.
+*/
+RANDOMX_EXPORT void randomx_release_dataset(randomx_dataset *dataset);
+
+/**
+ * Creates and initializes a RandomX virtual machine.
+ *
+ * @param flags is any combination of these 5 flags (each flag can be set or not set):
+ *        RANDOMX_FLAG_LARGE_PAGES - allocate scratchpad memory in large pages
+ *        RANDOMX_FLAG_HARD_AES - virtual machine will use hardware accelerated AES
+ *        RANDOMX_FLAG_FULL_MEM - virtual machine will use the full dataset
+ *        RANDOMX_FLAG_JIT - virtual machine will use a JIT compiler
+ *        RANDOMX_FLAG_SECURE - when combined with RANDOMX_FLAG_JIT, the JIT pages are never
+ *        writable and executable at the same time (W^X policy)
+ *        The numeric values of the first 4 flags are ordered so that a higher value will provide
+ *        faster hash calculation and a lower numeric value will provide higher portability.
+ *        Using RANDOMX_FLAG_DEFAULT (all flags not set) works on all platforms, but is the slowest.
+ * @param cache is a pointer to an initialized randomx_cache structure. Can be
+ *        NULL if RANDOMX_FLAG_FULL_MEM is set.
+ * @param dataset is a pointer to a randomx_dataset structure. Can be NULL
+ *        if RANDOMX_FLAG_FULL_MEM is not set.
+ *
+ * @return Pointer to an initialized randomx_vm structure.
+ *         Returns NULL if:
+ *         (1) Scratchpad memory allocation fails.
+ *         (2) The requested initialization flags are not supported on the current platform.
+ * (3) cache parameter is NULL and RANDOMX_FLAG_FULL_MEM is not set + * (4) dataset parameter is NULL and RANDOMX_FLAG_FULL_MEM is set +*/ +RANDOMX_EXPORT randomx_vm *randomx_create_vm(randomx_flags flags, randomx_cache *cache, randomx_dataset *dataset); + +/** + * Reinitializes a virtual machine with a new Cache. This function should be called anytime + * the Cache is reinitialized with a new key. Does nothing if called with a Cache containing + * the same key value as already set. + * + * @param machine is a pointer to a randomx_vm structure that was initialized + * without RANDOMX_FLAG_FULL_MEM. Must not be NULL. + * @param cache is a pointer to an initialized randomx_cache structure. Must not be NULL. +*/ +RANDOMX_EXPORT void randomx_vm_set_cache(randomx_vm *machine, randomx_cache* cache); + +/** + * Reinitializes a virtual machine with a new Dataset. + * + * @param machine is a pointer to a randomx_vm structure that was initialized + * with RANDOMX_FLAG_FULL_MEM. Must not be NULL. + * @param dataset is a pointer to an initialized randomx_dataset structure. Must not be NULL. +*/ +RANDOMX_EXPORT void randomx_vm_set_dataset(randomx_vm *machine, randomx_dataset *dataset); + +/** + * Releases all memory occupied by the randomx_vm structure. + * + * @param machine is a pointer to a previously created randomx_vm structure. +*/ +RANDOMX_EXPORT void randomx_destroy_vm(randomx_vm *machine); + +/** + * Calculates a RandomX hash value. + * + * @param machine is a pointer to a randomx_vm structure. Must not be NULL. + * @param input is a pointer to memory to be hashed. Must not be NULL. + * @param inputSize is the number of bytes to be hashed. + * @param output is a pointer to memory where the hash will be stored. Must not + * be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing. +*/ +RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/randomx/reciprocal.c b/randomx/reciprocal.c new file mode 100644 index 0000000..22620f5 --- /dev/null +++ b/randomx/reciprocal.c @@ -0,0 +1,80 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <assert.h>
+#include "reciprocal.h"
+
+/*
+	Calculates rcp = 2**x / divisor for highest integer x such that rcp < 2**64.
+	divisor must not be 0 or a power of 2
+
+	Worked example: for divisor = 3, the highest valid x is 65, since
+	2**65 / 3 = 12297829382473034410 is below 2**64 while 2**66 / 3 is not;
+	so randomx_reciprocal(3) returns 12297829382473034410.
+
+	Equivalent x86 assembly (divisor in rcx):
+
+	mov edx, 1
+	mov r8, rcx
+	xor eax, eax
+	bsr rcx, rcx
+	shl rdx, cl
+	div r8
+	ret
+
+*/
+uint64_t randomx_reciprocal(uint64_t divisor) {
+
+	assert(divisor != 0);
+
+	const uint64_t p2exp63 = 1ULL << 63;
+
+	uint64_t quotient = p2exp63 / divisor, remainder = p2exp63 % divisor;
+
+	unsigned bsr = 0; //highest set bit in divisor
+
+	for (uint64_t bit = divisor; bit > 0; bit >>= 1)
+		bsr++;
+
+	for (unsigned shift = 0; shift < bsr; shift++) {
+		if (remainder >= divisor - remainder) {
+			quotient = quotient * 2 + 1;
+			remainder = remainder * 2 - divisor;
+		}
+		else {
+			quotient = quotient * 2;
+			remainder = remainder * 2;
+		}
+	}
+
+	return quotient;
+}
+
+#if !RANDOMX_HAVE_FAST_RECIPROCAL
+
+uint64_t randomx_reciprocal_fast(uint64_t divisor) {
+	return randomx_reciprocal(divisor);
+}
+
+#endif
diff --git a/randomx/reciprocal.h b/randomx/reciprocal.h
new file mode 100644
index 0000000..8858df2
--- /dev/null
+++ b/randomx/reciprocal.h
@@ -0,0 +1,48 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <stdint.h>
+
+#if defined(_M_X64) || defined(__x86_64__)
+#define RANDOMX_HAVE_FAST_RECIPROCAL 1
+#else
+#define RANDOMX_HAVE_FAST_RECIPROCAL 0
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+uint64_t randomx_reciprocal(uint64_t);
+uint64_t randomx_reciprocal_fast(uint64_t);
+
+#if defined(__cplusplus)
+}
+#endif
diff --git a/randomx/soft_aes.cpp b/randomx/soft_aes.cpp
new file mode 100644
index 0000000..3e82fa2
--- /dev/null
+++ b/randomx/soft_aes.cpp
@@ -0,0 +1,364 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/ + +#include "soft_aes.h" + +alignas(16) const uint8_t sbox[256] = { + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16, +}; + +alignas(16) const uint32_t lutEnc0[256] = { + 0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6, 0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591, + 0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56, 0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec, + 0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa, 0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb, + 0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45, 0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b, + 0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c, 0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83, + 0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9, 0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a, + 0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d, 0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f, + 0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df, 0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea, + 0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34, 0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b, + 0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d, 0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413, + 0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1, 0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6, + 0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972, 0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85, + 0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed, 0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511, + 0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe, 0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b, + 0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05, 0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1, + 0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142, 0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf, + 0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3, 0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e, + 0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a, 0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6, + 0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3, 0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b, + 
0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428, 0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad, + 0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14, 0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8, + 0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4, 0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2, + 0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda, 0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949, + 0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf, 0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810, + 0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c, 0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697, + 0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e, 0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f, + 0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc, 0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c, + 0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969, 0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27, + 0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122, 0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433, + 0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9, 0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5, + 0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a, 0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0, + 0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e, 0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c, +}; + +alignas(16) const uint32_t lutEnc1[256] = { + 0x6363c6a5, 0x7c7cf884, 0x7777ee99, 0x7b7bf68d, 0xf2f2ff0d, 0x6b6bd6bd, 0x6f6fdeb1, 0xc5c59154, + 0x30306050, 0x01010203, 0x6767cea9, 0x2b2b567d, 0xfefee719, 0xd7d7b562, 0xabab4de6, 0x7676ec9a, + 0xcaca8f45, 0x82821f9d, 0xc9c98940, 0x7d7dfa87, 0xfafaef15, 0x5959b2eb, 0x47478ec9, 0xf0f0fb0b, + 0xadad41ec, 0xd4d4b367, 0xa2a25ffd, 0xafaf45ea, 0x9c9c23bf, 0xa4a453f7, 0x7272e496, 0xc0c09b5b, + 0xb7b775c2, 0xfdfde11c, 0x93933dae, 0x26264c6a, 0x36366c5a, 0x3f3f7e41, 0xf7f7f502, 0xcccc834f, + 0x3434685c, 0xa5a551f4, 0xe5e5d134, 0xf1f1f908, 0x7171e293, 0xd8d8ab73, 0x31316253, 0x15152a3f, + 0x0404080c, 0xc7c79552, 0x23234665, 0xc3c39d5e, 0x18183028, 0x969637a1, 0x05050a0f, 0x9a9a2fb5, + 0x07070e09, 0x12122436, 0x80801b9b, 0xe2e2df3d, 0xebebcd26, 0x27274e69, 0xb2b27fcd, 0x7575ea9f, + 0x0909121b, 0x83831d9e, 0x2c2c5874, 0x1a1a342e, 0x1b1b362d, 0x6e6edcb2, 0x5a5ab4ee, 0xa0a05bfb, + 0x5252a4f6, 0x3b3b764d, 0xd6d6b761, 0xb3b37dce, 0x2929527b, 0xe3e3dd3e, 0x2f2f5e71, 0x84841397, + 0x5353a6f5, 0xd1d1b968, 0x00000000, 0xededc12c, 0x20204060, 0xfcfce31f, 0xb1b179c8, 0x5b5bb6ed, + 0x6a6ad4be, 0xcbcb8d46, 0xbebe67d9, 0x3939724b, 0x4a4a94de, 0x4c4c98d4, 0x5858b0e8, 0xcfcf854a, + 0xd0d0bb6b, 0xefefc52a, 0xaaaa4fe5, 0xfbfbed16, 0x434386c5, 0x4d4d9ad7, 0x33336655, 0x85851194, + 0x45458acf, 0xf9f9e910, 0x02020406, 0x7f7ffe81, 0x5050a0f0, 0x3c3c7844, 0x9f9f25ba, 0xa8a84be3, + 0x5151a2f3, 0xa3a35dfe, 0x404080c0, 0x8f8f058a, 0x92923fad, 0x9d9d21bc, 0x38387048, 0xf5f5f104, + 0xbcbc63df, 0xb6b677c1, 0xdadaaf75, 0x21214263, 0x10102030, 0xffffe51a, 0xf3f3fd0e, 0xd2d2bf6d, + 0xcdcd814c, 0x0c0c1814, 0x13132635, 0xececc32f, 0x5f5fbee1, 0x979735a2, 0x444488cc, 0x17172e39, + 0xc4c49357, 0xa7a755f2, 0x7e7efc82, 0x3d3d7a47, 0x6464c8ac, 0x5d5dbae7, 0x1919322b, 0x7373e695, + 0x6060c0a0, 0x81811998, 0x4f4f9ed1, 0xdcdca37f, 0x22224466, 0x2a2a547e, 0x90903bab, 0x88880b83, + 0x46468cca, 0xeeeec729, 0xb8b86bd3, 0x1414283c, 0xdedea779, 0x5e5ebce2, 0x0b0b161d, 0xdbdbad76, + 0xe0e0db3b, 0x32326456, 0x3a3a744e, 0x0a0a141e, 0x494992db, 0x06060c0a, 0x2424486c, 0x5c5cb8e4, + 0xc2c29f5d, 0xd3d3bd6e, 0xacac43ef, 0x6262c4a6, 0x919139a8, 0x959531a4, 0xe4e4d337, 0x7979f28b, + 0xe7e7d532, 0xc8c88b43, 0x37376e59, 0x6d6ddab7, 0x8d8d018c, 0xd5d5b164, 
0x4e4e9cd2, 0xa9a949e0, + 0x6c6cd8b4, 0x5656acfa, 0xf4f4f307, 0xeaeacf25, 0x6565caaf, 0x7a7af48e, 0xaeae47e9, 0x08081018, + 0xbaba6fd5, 0x7878f088, 0x25254a6f, 0x2e2e5c72, 0x1c1c3824, 0xa6a657f1, 0xb4b473c7, 0xc6c69751, + 0xe8e8cb23, 0xdddda17c, 0x7474e89c, 0x1f1f3e21, 0x4b4b96dd, 0xbdbd61dc, 0x8b8b0d86, 0x8a8a0f85, + 0x7070e090, 0x3e3e7c42, 0xb5b571c4, 0x6666ccaa, 0x484890d8, 0x03030605, 0xf6f6f701, 0x0e0e1c12, + 0x6161c2a3, 0x35356a5f, 0x5757aef9, 0xb9b969d0, 0x86861791, 0xc1c19958, 0x1d1d3a27, 0x9e9e27b9, + 0xe1e1d938, 0xf8f8eb13, 0x98982bb3, 0x11112233, 0x6969d2bb, 0xd9d9a970, 0x8e8e0789, 0x949433a7, + 0x9b9b2db6, 0x1e1e3c22, 0x87871592, 0xe9e9c920, 0xcece8749, 0x5555aaff, 0x28285078, 0xdfdfa57a, + 0x8c8c038f, 0xa1a159f8, 0x89890980, 0x0d0d1a17, 0xbfbf65da, 0xe6e6d731, 0x424284c6, 0x6868d0b8, + 0x414182c3, 0x999929b0, 0x2d2d5a77, 0x0f0f1e11, 0xb0b07bcb, 0x5454a8fc, 0xbbbb6dd6, 0x16162c3a, +}; + +alignas(16) const uint32_t lutEnc2[256] = { + 0x63c6a563, 0x7cf8847c, 0x77ee9977, 0x7bf68d7b, 0xf2ff0df2, 0x6bd6bd6b, 0x6fdeb16f, 0xc59154c5, + 0x30605030, 0x01020301, 0x67cea967, 0x2b567d2b, 0xfee719fe, 0xd7b562d7, 0xab4de6ab, 0x76ec9a76, + 0xca8f45ca, 0x821f9d82, 0xc98940c9, 0x7dfa877d, 0xfaef15fa, 0x59b2eb59, 0x478ec947, 0xf0fb0bf0, + 0xad41ecad, 0xd4b367d4, 0xa25ffda2, 0xaf45eaaf, 0x9c23bf9c, 0xa453f7a4, 0x72e49672, 0xc09b5bc0, + 0xb775c2b7, 0xfde11cfd, 0x933dae93, 0x264c6a26, 0x366c5a36, 0x3f7e413f, 0xf7f502f7, 0xcc834fcc, + 0x34685c34, 0xa551f4a5, 0xe5d134e5, 0xf1f908f1, 0x71e29371, 0xd8ab73d8, 0x31625331, 0x152a3f15, + 0x04080c04, 0xc79552c7, 0x23466523, 0xc39d5ec3, 0x18302818, 0x9637a196, 0x050a0f05, 0x9a2fb59a, + 0x070e0907, 0x12243612, 0x801b9b80, 0xe2df3de2, 0xebcd26eb, 0x274e6927, 0xb27fcdb2, 0x75ea9f75, + 0x09121b09, 0x831d9e83, 0x2c58742c, 0x1a342e1a, 0x1b362d1b, 0x6edcb26e, 0x5ab4ee5a, 0xa05bfba0, + 0x52a4f652, 0x3b764d3b, 0xd6b761d6, 0xb37dceb3, 0x29527b29, 0xe3dd3ee3, 0x2f5e712f, 0x84139784, + 0x53a6f553, 0xd1b968d1, 0x00000000, 0xedc12ced, 0x20406020, 0xfce31ffc, 0xb179c8b1, 0x5bb6ed5b, + 0x6ad4be6a, 0xcb8d46cb, 0xbe67d9be, 0x39724b39, 0x4a94de4a, 0x4c98d44c, 0x58b0e858, 0xcf854acf, + 0xd0bb6bd0, 0xefc52aef, 0xaa4fe5aa, 0xfbed16fb, 0x4386c543, 0x4d9ad74d, 0x33665533, 0x85119485, + 0x458acf45, 0xf9e910f9, 0x02040602, 0x7ffe817f, 0x50a0f050, 0x3c78443c, 0x9f25ba9f, 0xa84be3a8, + 0x51a2f351, 0xa35dfea3, 0x4080c040, 0x8f058a8f, 0x923fad92, 0x9d21bc9d, 0x38704838, 0xf5f104f5, + 0xbc63dfbc, 0xb677c1b6, 0xdaaf75da, 0x21426321, 0x10203010, 0xffe51aff, 0xf3fd0ef3, 0xd2bf6dd2, + 0xcd814ccd, 0x0c18140c, 0x13263513, 0xecc32fec, 0x5fbee15f, 0x9735a297, 0x4488cc44, 0x172e3917, + 0xc49357c4, 0xa755f2a7, 0x7efc827e, 0x3d7a473d, 0x64c8ac64, 0x5dbae75d, 0x19322b19, 0x73e69573, + 0x60c0a060, 0x81199881, 0x4f9ed14f, 0xdca37fdc, 0x22446622, 0x2a547e2a, 0x903bab90, 0x880b8388, + 0x468cca46, 0xeec729ee, 0xb86bd3b8, 0x14283c14, 0xdea779de, 0x5ebce25e, 0x0b161d0b, 0xdbad76db, + 0xe0db3be0, 0x32645632, 0x3a744e3a, 0x0a141e0a, 0x4992db49, 0x060c0a06, 0x24486c24, 0x5cb8e45c, + 0xc29f5dc2, 0xd3bd6ed3, 0xac43efac, 0x62c4a662, 0x9139a891, 0x9531a495, 0xe4d337e4, 0x79f28b79, + 0xe7d532e7, 0xc88b43c8, 0x376e5937, 0x6ddab76d, 0x8d018c8d, 0xd5b164d5, 0x4e9cd24e, 0xa949e0a9, + 0x6cd8b46c, 0x56acfa56, 0xf4f307f4, 0xeacf25ea, 0x65caaf65, 0x7af48e7a, 0xae47e9ae, 0x08101808, + 0xba6fd5ba, 0x78f08878, 0x254a6f25, 0x2e5c722e, 0x1c38241c, 0xa657f1a6, 0xb473c7b4, 0xc69751c6, + 0xe8cb23e8, 0xdda17cdd, 0x74e89c74, 0x1f3e211f, 0x4b96dd4b, 0xbd61dcbd, 0x8b0d868b, 0x8a0f858a, + 0x70e09070, 0x3e7c423e, 0xb571c4b5, 0x66ccaa66, 
0x4890d848, 0x03060503, 0xf6f701f6, 0x0e1c120e, + 0x61c2a361, 0x356a5f35, 0x57aef957, 0xb969d0b9, 0x86179186, 0xc19958c1, 0x1d3a271d, 0x9e27b99e, + 0xe1d938e1, 0xf8eb13f8, 0x982bb398, 0x11223311, 0x69d2bb69, 0xd9a970d9, 0x8e07898e, 0x9433a794, + 0x9b2db69b, 0x1e3c221e, 0x87159287, 0xe9c920e9, 0xce8749ce, 0x55aaff55, 0x28507828, 0xdfa57adf, + 0x8c038f8c, 0xa159f8a1, 0x89098089, 0x0d1a170d, 0xbf65dabf, 0xe6d731e6, 0x4284c642, 0x68d0b868, + 0x4182c341, 0x9929b099, 0x2d5a772d, 0x0f1e110f, 0xb07bcbb0, 0x54a8fc54, 0xbb6dd6bb, 0x162c3a16, +}; + +alignas(16) const uint32_t lutEnc3[256] = { + 0xc6a56363, 0xf8847c7c, 0xee997777, 0xf68d7b7b, 0xff0df2f2, 0xd6bd6b6b, 0xdeb16f6f, 0x9154c5c5, + 0x60503030, 0x02030101, 0xcea96767, 0x567d2b2b, 0xe719fefe, 0xb562d7d7, 0x4de6abab, 0xec9a7676, + 0x8f45caca, 0x1f9d8282, 0x8940c9c9, 0xfa877d7d, 0xef15fafa, 0xb2eb5959, 0x8ec94747, 0xfb0bf0f0, + 0x41ecadad, 0xb367d4d4, 0x5ffda2a2, 0x45eaafaf, 0x23bf9c9c, 0x53f7a4a4, 0xe4967272, 0x9b5bc0c0, + 0x75c2b7b7, 0xe11cfdfd, 0x3dae9393, 0x4c6a2626, 0x6c5a3636, 0x7e413f3f, 0xf502f7f7, 0x834fcccc, + 0x685c3434, 0x51f4a5a5, 0xd134e5e5, 0xf908f1f1, 0xe2937171, 0xab73d8d8, 0x62533131, 0x2a3f1515, + 0x080c0404, 0x9552c7c7, 0x46652323, 0x9d5ec3c3, 0x30281818, 0x37a19696, 0x0a0f0505, 0x2fb59a9a, + 0x0e090707, 0x24361212, 0x1b9b8080, 0xdf3de2e2, 0xcd26ebeb, 0x4e692727, 0x7fcdb2b2, 0xea9f7575, + 0x121b0909, 0x1d9e8383, 0x58742c2c, 0x342e1a1a, 0x362d1b1b, 0xdcb26e6e, 0xb4ee5a5a, 0x5bfba0a0, + 0xa4f65252, 0x764d3b3b, 0xb761d6d6, 0x7dceb3b3, 0x527b2929, 0xdd3ee3e3, 0x5e712f2f, 0x13978484, + 0xa6f55353, 0xb968d1d1, 0x00000000, 0xc12ceded, 0x40602020, 0xe31ffcfc, 0x79c8b1b1, 0xb6ed5b5b, + 0xd4be6a6a, 0x8d46cbcb, 0x67d9bebe, 0x724b3939, 0x94de4a4a, 0x98d44c4c, 0xb0e85858, 0x854acfcf, + 0xbb6bd0d0, 0xc52aefef, 0x4fe5aaaa, 0xed16fbfb, 0x86c54343, 0x9ad74d4d, 0x66553333, 0x11948585, + 0x8acf4545, 0xe910f9f9, 0x04060202, 0xfe817f7f, 0xa0f05050, 0x78443c3c, 0x25ba9f9f, 0x4be3a8a8, + 0xa2f35151, 0x5dfea3a3, 0x80c04040, 0x058a8f8f, 0x3fad9292, 0x21bc9d9d, 0x70483838, 0xf104f5f5, + 0x63dfbcbc, 0x77c1b6b6, 0xaf75dada, 0x42632121, 0x20301010, 0xe51affff, 0xfd0ef3f3, 0xbf6dd2d2, + 0x814ccdcd, 0x18140c0c, 0x26351313, 0xc32fecec, 0xbee15f5f, 0x35a29797, 0x88cc4444, 0x2e391717, + 0x9357c4c4, 0x55f2a7a7, 0xfc827e7e, 0x7a473d3d, 0xc8ac6464, 0xbae75d5d, 0x322b1919, 0xe6957373, + 0xc0a06060, 0x19988181, 0x9ed14f4f, 0xa37fdcdc, 0x44662222, 0x547e2a2a, 0x3bab9090, 0x0b838888, + 0x8cca4646, 0xc729eeee, 0x6bd3b8b8, 0x283c1414, 0xa779dede, 0xbce25e5e, 0x161d0b0b, 0xad76dbdb, + 0xdb3be0e0, 0x64563232, 0x744e3a3a, 0x141e0a0a, 0x92db4949, 0x0c0a0606, 0x486c2424, 0xb8e45c5c, + 0x9f5dc2c2, 0xbd6ed3d3, 0x43efacac, 0xc4a66262, 0x39a89191, 0x31a49595, 0xd337e4e4, 0xf28b7979, + 0xd532e7e7, 0x8b43c8c8, 0x6e593737, 0xdab76d6d, 0x018c8d8d, 0xb164d5d5, 0x9cd24e4e, 0x49e0a9a9, + 0xd8b46c6c, 0xacfa5656, 0xf307f4f4, 0xcf25eaea, 0xcaaf6565, 0xf48e7a7a, 0x47e9aeae, 0x10180808, + 0x6fd5baba, 0xf0887878, 0x4a6f2525, 0x5c722e2e, 0x38241c1c, 0x57f1a6a6, 0x73c7b4b4, 0x9751c6c6, + 0xcb23e8e8, 0xa17cdddd, 0xe89c7474, 0x3e211f1f, 0x96dd4b4b, 0x61dcbdbd, 0x0d868b8b, 0x0f858a8a, + 0xe0907070, 0x7c423e3e, 0x71c4b5b5, 0xccaa6666, 0x90d84848, 0x06050303, 0xf701f6f6, 0x1c120e0e, + 0xc2a36161, 0x6a5f3535, 0xaef95757, 0x69d0b9b9, 0x17918686, 0x9958c1c1, 0x3a271d1d, 0x27b99e9e, + 0xd938e1e1, 0xeb13f8f8, 0x2bb39898, 0x22331111, 0xd2bb6969, 0xa970d9d9, 0x07898e8e, 0x33a79494, + 0x2db69b9b, 0x3c221e1e, 0x15928787, 0xc920e9e9, 0x8749cece, 0xaaff5555, 0x50782828, 0xa57adfdf, + 0x038f8c8c, 0x59f8a1a1, 
0x09808989, 0x1a170d0d, 0x65dabfbf, 0xd731e6e6, 0x84c64242, 0xd0b86868, + 0x82c34141, 0x29b09999, 0x5a772d2d, 0x1e110f0f, 0x7bcbb0b0, 0xa8fc5454, 0x6dd6bbbb, 0x2c3a1616, +}; + +alignas(16) const uint32_t lutDec0[256] = { + 0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a, 0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b, + 0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5, 0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5, + 0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d, 0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b, + 0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295, 0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e, + 0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927, 0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d, + 0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362, 0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9, + 0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52, 0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566, + 0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3, 0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed, + 0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e, 0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4, + 0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4, 0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd, + 0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d, 0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060, + 0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967, 0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879, + 0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000, 0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c, + 0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36, 0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624, + 0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b, 0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c, + 0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12, 0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14, + 0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3, 0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b, + 0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8, 0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684, + 0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7, 0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177, + 0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947, 0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322, + 0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498, 0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f, + 0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54, 0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382, + 0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf, 0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb, + 0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83, 0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef, + 0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029, 0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235, + 0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733, 0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117, + 0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4, 0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546, + 0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb, 0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d, + 0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb, 0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a, + 0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773, 0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478, + 0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2, 0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff, + 0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664, 0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0, +}; + +alignas(16) const uint32_t lutDec1[256] = { + 0xa7f45150, 0x65417e53, 0xa4171ac3, 0x5e273a96, 0x6bab3bcb, 0x459d1ff1, 0x58faacab, 0x03e34b93, + 0xfa302055, 0x6d76adf6, 0x76cc8891, 
0x4c02f525, 0xd7e54ffc, 0xcb2ac5d7, 0x44352680, 0xa362b58f, + 0x5ab1de49, 0x1bba2567, 0x0eea4598, 0xc0fe5de1, 0x752fc302, 0xf04c8112, 0x97468da3, 0xf9d36bc6, + 0x5f8f03e7, 0x9c921595, 0x7a6dbfeb, 0x595295da, 0x83bed42d, 0x217458d3, 0x69e04929, 0xc8c98e44, + 0x89c2756a, 0x798ef478, 0x3e58996b, 0x71b927dd, 0x4fe1beb6, 0xad88f017, 0xac20c966, 0x3ace7db4, + 0x4adf6318, 0x311ae582, 0x33519760, 0x7f536245, 0x7764b1e0, 0xae6bbb84, 0xa081fe1c, 0x2b08f994, + 0x68487058, 0xfd458f19, 0x6cde9487, 0xf87b52b7, 0xd373ab23, 0x024b72e2, 0x8f1fe357, 0xab55662a, + 0x28ebb207, 0xc2b52f03, 0x7bc5869a, 0x0837d3a5, 0x872830f2, 0xa5bf23b2, 0x6a0302ba, 0x8216ed5c, + 0x1ccf8a2b, 0xb479a792, 0xf207f3f0, 0xe2694ea1, 0xf4da65cd, 0xbe0506d5, 0x6234d11f, 0xfea6c48a, + 0x532e349d, 0x55f3a2a0, 0xe18a0532, 0xebf6a475, 0xec830b39, 0xef6040aa, 0x9f715e06, 0x106ebd51, + 0x8a213ef9, 0x06dd963d, 0x053eddae, 0xbde64d46, 0x8d5491b5, 0x5dc47105, 0xd406046f, 0x155060ff, + 0xfb981924, 0xe9bdd697, 0x434089cc, 0x9ed96777, 0x42e8b0bd, 0x8b890788, 0x5b19e738, 0xeec879db, + 0x0a7ca147, 0x0f427ce9, 0x1e84f8c9, 0x00000000, 0x86800983, 0xed2b3248, 0x70111eac, 0x725a6c4e, + 0xff0efdfb, 0x38850f56, 0xd5ae3d1e, 0x392d3627, 0xd90f0a64, 0xa65c6821, 0x545b9bd1, 0x2e36243a, + 0x670a0cb1, 0xe757930f, 0x96eeb4d2, 0x919b1b9e, 0xc5c0804f, 0x20dc61a2, 0x4b775a69, 0x1a121c16, + 0xba93e20a, 0x2aa0c0e5, 0xe0223c43, 0x171b121d, 0x0d090e0b, 0xc78bf2ad, 0xa8b62db9, 0xa91e14c8, + 0x19f15785, 0x0775af4c, 0xdd99eebb, 0x607fa3fd, 0x2601f79f, 0xf5725cbc, 0x3b6644c5, 0x7efb5b34, + 0x29438b76, 0xc623cbdc, 0xfcedb668, 0xf1e4b863, 0xdc31d7ca, 0x85634210, 0x22971340, 0x11c68420, + 0x244a857d, 0x3dbbd2f8, 0x32f9ae11, 0xa129c76d, 0x2f9e1d4b, 0x30b2dcf3, 0x52860dec, 0xe3c177d0, + 0x16b32b6c, 0xb970a999, 0x489411fa, 0x64e94722, 0x8cfca8c4, 0x3ff0a01a, 0x2c7d56d8, 0x903322ef, + 0x4e4987c7, 0xd138d9c1, 0xa2ca8cfe, 0x0bd49836, 0x81f5a6cf, 0xde7aa528, 0x8eb7da26, 0xbfad3fa4, + 0x9d3a2ce4, 0x9278500d, 0xcc5f6a9b, 0x467e5462, 0x138df6c2, 0xb8d890e8, 0xf7392e5e, 0xafc382f5, + 0x805d9fbe, 0x93d0697c, 0x2dd56fa9, 0x1225cfb3, 0x99acc83b, 0x7d1810a7, 0x639ce86e, 0xbb3bdb7b, + 0x7826cd09, 0x18596ef4, 0xb79aec01, 0x9a4f83a8, 0x6e95e665, 0xe6ffaa7e, 0xcfbc2108, 0xe815efe6, + 0x9be7bad9, 0x366f4ace, 0x099fead4, 0x7cb029d6, 0xb2a431af, 0x233f2a31, 0x94a5c630, 0x66a235c0, + 0xbc4e7437, 0xca82fca6, 0xd090e0b0, 0xd8a73315, 0x9804f14a, 0xdaec41f7, 0x50cd7f0e, 0xf691172f, + 0xd64d768d, 0xb0ef434d, 0x4daacc54, 0x0496e4df, 0xb5d19ee3, 0x886a4c1b, 0x1f2cc1b8, 0x5165467f, + 0xea5e9d04, 0x358c015d, 0x7487fa73, 0x410bfb2e, 0x1d67b35a, 0xd2db9252, 0x5610e933, 0x47d66d13, + 0x61d79a8c, 0x0ca1377a, 0x14f8598e, 0x3c13eb89, 0x27a9ceee, 0xc961b735, 0xe51ce1ed, 0xb1477a3c, + 0xdfd29c59, 0x73f2553f, 0xce141879, 0x37c773bf, 0xcdf753ea, 0xaafd5f5b, 0x6f3ddf14, 0xdb447886, + 0xf3afca81, 0xc468b93e, 0x3424382c, 0x40a3c25f, 0xc31d1672, 0x25e2bc0c, 0x493c288b, 0x950dff41, + 0x01a83971, 0xb30c08de, 0xe4b4d89c, 0xc1566490, 0x84cb7b61, 0xb632d570, 0x5c6c4874, 0x57b8d042, +}; + +alignas(16) const uint32_t lutDec2[256] = { + 0xf45150a7, 0x417e5365, 0x171ac3a4, 0x273a965e, 0xab3bcb6b, 0x9d1ff145, 0xfaacab58, 0xe34b9303, + 0x302055fa, 0x76adf66d, 0xcc889176, 0x02f5254c, 0xe54ffcd7, 0x2ac5d7cb, 0x35268044, 0x62b58fa3, + 0xb1de495a, 0xba25671b, 0xea45980e, 0xfe5de1c0, 0x2fc30275, 0x4c8112f0, 0x468da397, 0xd36bc6f9, + 0x8f03e75f, 0x9215959c, 0x6dbfeb7a, 0x5295da59, 0xbed42d83, 0x7458d321, 0xe0492969, 0xc98e44c8, + 0xc2756a89, 0x8ef47879, 0x58996b3e, 0xb927dd71, 0xe1beb64f, 0x88f017ad, 0x20c966ac, 0xce7db43a, + 0xdf63184a, 
0x1ae58231, 0x51976033, 0x5362457f, 0x64b1e077, 0x6bbb84ae, 0x81fe1ca0, 0x08f9942b, + 0x48705868, 0x458f19fd, 0xde94876c, 0x7b52b7f8, 0x73ab23d3, 0x4b72e202, 0x1fe3578f, 0x55662aab, + 0xebb20728, 0xb52f03c2, 0xc5869a7b, 0x37d3a508, 0x2830f287, 0xbf23b2a5, 0x0302ba6a, 0x16ed5c82, + 0xcf8a2b1c, 0x79a792b4, 0x07f3f0f2, 0x694ea1e2, 0xda65cdf4, 0x0506d5be, 0x34d11f62, 0xa6c48afe, + 0x2e349d53, 0xf3a2a055, 0x8a0532e1, 0xf6a475eb, 0x830b39ec, 0x6040aaef, 0x715e069f, 0x6ebd5110, + 0x213ef98a, 0xdd963d06, 0x3eddae05, 0xe64d46bd, 0x5491b58d, 0xc471055d, 0x06046fd4, 0x5060ff15, + 0x981924fb, 0xbdd697e9, 0x4089cc43, 0xd967779e, 0xe8b0bd42, 0x8907888b, 0x19e7385b, 0xc879dbee, + 0x7ca1470a, 0x427ce90f, 0x84f8c91e, 0x00000000, 0x80098386, 0x2b3248ed, 0x111eac70, 0x5a6c4e72, + 0x0efdfbff, 0x850f5638, 0xae3d1ed5, 0x2d362739, 0x0f0a64d9, 0x5c6821a6, 0x5b9bd154, 0x36243a2e, + 0x0a0cb167, 0x57930fe7, 0xeeb4d296, 0x9b1b9e91, 0xc0804fc5, 0xdc61a220, 0x775a694b, 0x121c161a, + 0x93e20aba, 0xa0c0e52a, 0x223c43e0, 0x1b121d17, 0x090e0b0d, 0x8bf2adc7, 0xb62db9a8, 0x1e14c8a9, + 0xf1578519, 0x75af4c07, 0x99eebbdd, 0x7fa3fd60, 0x01f79f26, 0x725cbcf5, 0x6644c53b, 0xfb5b347e, + 0x438b7629, 0x23cbdcc6, 0xedb668fc, 0xe4b863f1, 0x31d7cadc, 0x63421085, 0x97134022, 0xc6842011, + 0x4a857d24, 0xbbd2f83d, 0xf9ae1132, 0x29c76da1, 0x9e1d4b2f, 0xb2dcf330, 0x860dec52, 0xc177d0e3, + 0xb32b6c16, 0x70a999b9, 0x9411fa48, 0xe9472264, 0xfca8c48c, 0xf0a01a3f, 0x7d56d82c, 0x3322ef90, + 0x4987c74e, 0x38d9c1d1, 0xca8cfea2, 0xd498360b, 0xf5a6cf81, 0x7aa528de, 0xb7da268e, 0xad3fa4bf, + 0x3a2ce49d, 0x78500d92, 0x5f6a9bcc, 0x7e546246, 0x8df6c213, 0xd890e8b8, 0x392e5ef7, 0xc382f5af, + 0x5d9fbe80, 0xd0697c93, 0xd56fa92d, 0x25cfb312, 0xacc83b99, 0x1810a77d, 0x9ce86e63, 0x3bdb7bbb, + 0x26cd0978, 0x596ef418, 0x9aec01b7, 0x4f83a89a, 0x95e6656e, 0xffaa7ee6, 0xbc2108cf, 0x15efe6e8, + 0xe7bad99b, 0x6f4ace36, 0x9fead409, 0xb029d67c, 0xa431afb2, 0x3f2a3123, 0xa5c63094, 0xa235c066, + 0x4e7437bc, 0x82fca6ca, 0x90e0b0d0, 0xa73315d8, 0x04f14a98, 0xec41f7da, 0xcd7f0e50, 0x91172ff6, + 0x4d768dd6, 0xef434db0, 0xaacc544d, 0x96e4df04, 0xd19ee3b5, 0x6a4c1b88, 0x2cc1b81f, 0x65467f51, + 0x5e9d04ea, 0x8c015d35, 0x87fa7374, 0x0bfb2e41, 0x67b35a1d, 0xdb9252d2, 0x10e93356, 0xd66d1347, + 0xd79a8c61, 0xa1377a0c, 0xf8598e14, 0x13eb893c, 0xa9ceee27, 0x61b735c9, 0x1ce1ede5, 0x477a3cb1, + 0xd29c59df, 0xf2553f73, 0x141879ce, 0xc773bf37, 0xf753eacd, 0xfd5f5baa, 0x3ddf146f, 0x447886db, + 0xafca81f3, 0x68b93ec4, 0x24382c34, 0xa3c25f40, 0x1d1672c3, 0xe2bc0c25, 0x3c288b49, 0x0dff4195, + 0xa8397101, 0x0c08deb3, 0xb4d89ce4, 0x566490c1, 0xcb7b6184, 0x32d570b6, 0x6c48745c, 0xb8d04257, +}; + +alignas(16) const uint32_t lutDec3[256] = { + 0x5150a7f4, 0x7e536541, 0x1ac3a417, 0x3a965e27, 0x3bcb6bab, 0x1ff1459d, 0xacab58fa, 0x4b9303e3, + 0x2055fa30, 0xadf66d76, 0x889176cc, 0xf5254c02, 0x4ffcd7e5, 0xc5d7cb2a, 0x26804435, 0xb58fa362, + 0xde495ab1, 0x25671bba, 0x45980eea, 0x5de1c0fe, 0xc302752f, 0x8112f04c, 0x8da39746, 0x6bc6f9d3, + 0x03e75f8f, 0x15959c92, 0xbfeb7a6d, 0x95da5952, 0xd42d83be, 0x58d32174, 0x492969e0, 0x8e44c8c9, + 0x756a89c2, 0xf478798e, 0x996b3e58, 0x27dd71b9, 0xbeb64fe1, 0xf017ad88, 0xc966ac20, 0x7db43ace, + 0x63184adf, 0xe582311a, 0x97603351, 0x62457f53, 0xb1e07764, 0xbb84ae6b, 0xfe1ca081, 0xf9942b08, + 0x70586848, 0x8f19fd45, 0x94876cde, 0x52b7f87b, 0xab23d373, 0x72e2024b, 0xe3578f1f, 0x662aab55, + 0xb20728eb, 0x2f03c2b5, 0x869a7bc5, 0xd3a50837, 0x30f28728, 0x23b2a5bf, 0x02ba6a03, 0xed5c8216, + 0x8a2b1ccf, 0xa792b479, 0xf3f0f207, 0x4ea1e269, 0x65cdf4da, 0x06d5be05, 0xd11f6234, 
0xc48afea6, + 0x349d532e, 0xa2a055f3, 0x0532e18a, 0xa475ebf6, 0x0b39ec83, 0x40aaef60, 0x5e069f71, 0xbd51106e, + 0x3ef98a21, 0x963d06dd, 0xddae053e, 0x4d46bde6, 0x91b58d54, 0x71055dc4, 0x046fd406, 0x60ff1550, + 0x1924fb98, 0xd697e9bd, 0x89cc4340, 0x67779ed9, 0xb0bd42e8, 0x07888b89, 0xe7385b19, 0x79dbeec8, + 0xa1470a7c, 0x7ce90f42, 0xf8c91e84, 0x00000000, 0x09838680, 0x3248ed2b, 0x1eac7011, 0x6c4e725a, + 0xfdfbff0e, 0x0f563885, 0x3d1ed5ae, 0x3627392d, 0x0a64d90f, 0x6821a65c, 0x9bd1545b, 0x243a2e36, + 0x0cb1670a, 0x930fe757, 0xb4d296ee, 0x1b9e919b, 0x804fc5c0, 0x61a220dc, 0x5a694b77, 0x1c161a12, + 0xe20aba93, 0xc0e52aa0, 0x3c43e022, 0x121d171b, 0x0e0b0d09, 0xf2adc78b, 0x2db9a8b6, 0x14c8a91e, + 0x578519f1, 0xaf4c0775, 0xeebbdd99, 0xa3fd607f, 0xf79f2601, 0x5cbcf572, 0x44c53b66, 0x5b347efb, + 0x8b762943, 0xcbdcc623, 0xb668fced, 0xb863f1e4, 0xd7cadc31, 0x42108563, 0x13402297, 0x842011c6, + 0x857d244a, 0xd2f83dbb, 0xae1132f9, 0xc76da129, 0x1d4b2f9e, 0xdcf330b2, 0x0dec5286, 0x77d0e3c1, + 0x2b6c16b3, 0xa999b970, 0x11fa4894, 0x472264e9, 0xa8c48cfc, 0xa01a3ff0, 0x56d82c7d, 0x22ef9033, + 0x87c74e49, 0xd9c1d138, 0x8cfea2ca, 0x98360bd4, 0xa6cf81f5, 0xa528de7a, 0xda268eb7, 0x3fa4bfad, + 0x2ce49d3a, 0x500d9278, 0x6a9bcc5f, 0x5462467e, 0xf6c2138d, 0x90e8b8d8, 0x2e5ef739, 0x82f5afc3, + 0x9fbe805d, 0x697c93d0, 0x6fa92dd5, 0xcfb31225, 0xc83b99ac, 0x10a77d18, 0xe86e639c, 0xdb7bbb3b, + 0xcd097826, 0x6ef41859, 0xec01b79a, 0x83a89a4f, 0xe6656e95, 0xaa7ee6ff, 0x2108cfbc, 0xefe6e815, + 0xbad99be7, 0x4ace366f, 0xead4099f, 0x29d67cb0, 0x31afb2a4, 0x2a31233f, 0xc63094a5, 0x35c066a2, + 0x7437bc4e, 0xfca6ca82, 0xe0b0d090, 0x3315d8a7, 0xf14a9804, 0x41f7daec, 0x7f0e50cd, 0x172ff691, + 0x768dd64d, 0x434db0ef, 0xcc544daa, 0xe4df0496, 0x9ee3b5d1, 0x4c1b886a, 0xc1b81f2c, 0x467f5165, + 0x9d04ea5e, 0x015d358c, 0xfa737487, 0xfb2e410b, 0xb35a1d67, 0x9252d2db, 0xe9335610, 0x6d1347d6, + 0x9a8c61d7, 0x377a0ca1, 0x598e14f8, 0xeb893c13, 0xceee27a9, 0xb735c961, 0xe1ede51c, 0x7a3cb147, + 0x9c59dfd2, 0x553f73f2, 0x1879ce14, 0x73bf37c7, 0x53eacdf7, 0x5f5baafd, 0xdf146f3d, 0x7886db44, + 0xca81f3af, 0xb93ec468, 0x382c3424, 0xc25f40a3, 0x1672c31d, 0xbc0c25e2, 0x288b493c, 0xff41950d, + 0x397101a8, 0x08deb30c, 0xd89ce4b4, 0x6490c156, 0x7b6184cb, 0xd570b632, 0x48745c6c, 0xd04257b8, +}; + +rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key) { + uint32_t s0, s1, s2, s3; + + s0 = rx_vec_i128_w(in); + s1 = rx_vec_i128_z(in); + s2 = rx_vec_i128_y(in); + s3 = rx_vec_i128_x(in); + + rx_vec_i128 out = rx_set_int_vec_i128( + (lutEnc0[s0 & 0xff] ^ lutEnc1[(s3 >> 8) & 0xff] ^ lutEnc2[(s2 >> 16) & 0xff] ^ lutEnc3[s1 >> 24]), + (lutEnc0[s1 & 0xff] ^ lutEnc1[(s0 >> 8) & 0xff] ^ lutEnc2[(s3 >> 16) & 0xff] ^ lutEnc3[s2 >> 24]), + (lutEnc0[s2 & 0xff] ^ lutEnc1[(s1 >> 8) & 0xff] ^ lutEnc2[(s0 >> 16) & 0xff] ^ lutEnc3[s3 >> 24]), + (lutEnc0[s3 & 0xff] ^ lutEnc1[(s2 >> 8) & 0xff] ^ lutEnc2[(s1 >> 16) & 0xff] ^ lutEnc3[s0 >> 24]) + ); + + return rx_xor_vec_i128(out, key); +} + +rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key) { + uint32_t s0, s1, s2, s3; + + s0 = rx_vec_i128_w(in); + s1 = rx_vec_i128_z(in); + s2 = rx_vec_i128_y(in); + s3 = rx_vec_i128_x(in); + + rx_vec_i128 out = rx_set_int_vec_i128( + (lutDec0[s0 & 0xff] ^ lutDec1[(s1 >> 8) & 0xff] ^ lutDec2[(s2 >> 16) & 0xff] ^ lutDec3[s3 >> 24]), + (lutDec0[s1 & 0xff] ^ lutDec1[(s2 >> 8) & 0xff] ^ lutDec2[(s3 >> 16) & 0xff] ^ lutDec3[s0 >> 24]), + (lutDec0[s2 & 0xff] ^ lutDec1[(s3 >> 8) & 0xff] ^ lutDec2[(s0 >> 16) & 0xff] ^ lutDec3[s1 >> 24]), + (lutDec0[s3 & 0xff] ^ lutDec1[(s0 >> 8) & 0xff] ^ 
lutDec2[(s1 >> 16) & 0xff] ^ lutDec3[s2 >> 24])
+	);
+
+	return rx_xor_vec_i128(out, key);
+}
diff --git a/randomx/soft_aes.h b/randomx/soft_aes.h
new file mode 100644
index 0000000..254f8d6
--- /dev/null
+++ b/randomx/soft_aes.h
@@ -0,0 +1,46 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <stdint.h>
+#include "intrin_portable.h"
+
+rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key);
+
+rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key);
+
+template<bool soft>
+inline rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key) {
+	return soft ? soft_aesenc(in, key) : rx_aesenc_vec_i128(in, key);
+}
+
+template<bool soft>
+inline rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key) {
+	return soft ? soft_aesdec(in, key) : rx_aesdec_vec_i128(in, key);
+}
\ No newline at end of file
diff --git a/randomx/superscalar.cpp b/randomx/superscalar.cpp
new file mode 100644
index 0000000..4e9fd78
--- /dev/null
+++ b/randomx/superscalar.cpp
@@ -0,0 +1,903 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "configuration.h"
+#include "program.hpp"
+#include "blake2/endian.h"
+#include <map>
+#include <sstream>
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include "superscalar.hpp"
+#include "intrin_portable.h"
+#include "reciprocal.h"
+#include "common.hpp"
+
+namespace randomx {
+
+	static bool isMultiplication(SuperscalarInstructionType type) {
+		return type == SuperscalarInstructionType::IMUL_R || type == SuperscalarInstructionType::IMULH_R || type == SuperscalarInstructionType::ISMULH_R || type == SuperscalarInstructionType::IMUL_RCP;
+	}
+
+	//uOPs (micro-ops) are represented only by the execution port they can go to
+	namespace ExecutionPort {
+		using type = int;
+		constexpr type Null = 0;
+		constexpr type P0 = 1;
+		constexpr type P1 = 2;
+		constexpr type P5 = 4;
+		constexpr type P01 = P0 | P1;
+		constexpr type P05 = P0 | P5;
+		constexpr type P015 = P0 | P1 | P5;
+	}
+
+	//Macro-operation as output of the x86 decoder
+	//Usually one macro-op = one x86 instruction, but 2 instructions are sometimes fused into 1 macro-op
+	//Macro-op can consist of 1 or 2 uOPs.
+	class MacroOp {
+	public:
+		MacroOp(const char* name, int size)
+			: name_(name), size_(size), latency_(0), uop1_(ExecutionPort::Null), uop2_(ExecutionPort::Null) {}
+		MacroOp(const char* name, int size, int latency, ExecutionPort::type uop)
+			: name_(name), size_(size), latency_(latency), uop1_(uop), uop2_(ExecutionPort::Null) {}
+		MacroOp(const char* name, int size, int latency, ExecutionPort::type uop1, ExecutionPort::type uop2)
+			: name_(name), size_(size), latency_(latency), uop1_(uop1), uop2_(uop2) {}
+		MacroOp(const MacroOp& parent, bool dependent)
+			: name_(parent.name_), size_(parent.size_), latency_(parent.latency_), uop1_(parent.uop1_), uop2_(parent.uop2_), dependent_(dependent) {}
+		const char* getName() const {
+			return name_;
+		}
+		int getSize() const {
+			return size_;
+		}
+		int getLatency() const {
+			return latency_;
+		}
+		ExecutionPort::type getUop1() const {
+			return uop1_;
+		}
+		ExecutionPort::type getUop2() const {
+			return uop2_;
+		}
+		bool isSimple() const {
+			return uop2_ == ExecutionPort::Null;
+		}
+		bool isEliminated() const {
+			return uop1_ == ExecutionPort::Null;
+		}
+		bool isDependent() const {
+			return dependent_;
+		}
+		static const MacroOp Add_rr;
+		static const MacroOp Add_ri;
+		static const MacroOp Lea_sib;
+		static const MacroOp Sub_rr;
+		static const MacroOp Imul_rr;
+		static const MacroOp Imul_r;
+		static const MacroOp Mul_r;
+		static const MacroOp Mov_rr;
+		static const MacroOp Mov_ri64;
+		static const MacroOp Xor_rr;
+		static const MacroOp Xor_ri;
+		static const MacroOp Ror_rcl;
+		static const MacroOp Ror_ri;
+		static const MacroOp TestJz_fused;
+		static const MacroOp Xor_self;
+		static const MacroOp Cmp_ri;
+		static const MacroOp Setcc_r;
+	private:
+		const char* name_;
+		int size_;
+		int latency_;
+		ExecutionPort::type uop1_;
+		ExecutionPort::type uop2_;
+		bool dependent_ = false;
+	};
+
+	//Size: 3 bytes
+	const MacroOp MacroOp::Add_rr = MacroOp("add r,r", 3, 1, ExecutionPort::P015);
+	const
MacroOp MacroOp::Sub_rr = MacroOp("sub r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Xor_rr = MacroOp("xor r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 4, ExecutionPort::P1, ExecutionPort::P5); + const MacroOp MacroOp::Mul_r = MacroOp("mul r", 3, 4, ExecutionPort::P1, ExecutionPort::P5); + const MacroOp MacroOp::Mov_rr = MacroOp("mov r,r", 3); + + //Size: 4 bytes + const MacroOp MacroOp::Lea_sib = MacroOp("lea r,r+r*s", 4, 1, ExecutionPort::P01); + const MacroOp MacroOp::Imul_rr = MacroOp("imul r,r", 4, 3, ExecutionPort::P1); + const MacroOp MacroOp::Ror_ri = MacroOp("ror r,i", 4, 1, ExecutionPort::P05); + + //Size: 7 bytes (can be optionally padded with nop to 8 or 9 bytes) + const MacroOp MacroOp::Add_ri = MacroOp("add r,i", 7, 1, ExecutionPort::P015); + const MacroOp MacroOp::Xor_ri = MacroOp("xor r,i", 7, 1, ExecutionPort::P015); + + //Size: 10 bytes + const MacroOp MacroOp::Mov_ri64 = MacroOp("mov rax,i64", 10, 1, ExecutionPort::P015); + + //Unused: + const MacroOp MacroOp::Ror_rcl = MacroOp("ror r,cl", 3, 1, ExecutionPort::P0, ExecutionPort::P5); + const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3); + const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015); + const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05); + const MacroOp MacroOp::TestJz_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5); + + const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr }; + const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr }; + const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) }; + + class SuperscalarInstructionInfo { + public: + const char* getName() const { + return name_; + } + int getSize() const { + return ops_.size(); + } + bool isSimple() const { + return getSize() == 1; + } + int getLatency() const { + return latency_; + } + const MacroOp& getOp(int index) const { + return ops_[index]; + } + SuperscalarInstructionType getType() const { + return type_; + } + int getResultOp() const { + return resultOp_; + } + int getDstOp() const { + return dstOp_; + } + int getSrcOp() const { + return srcOp_; + } + static const SuperscalarInstructionInfo ISUB_R; + static const SuperscalarInstructionInfo IXOR_R; + static const SuperscalarInstructionInfo IADD_RS; + static const SuperscalarInstructionInfo IMUL_R; + static const SuperscalarInstructionInfo IROR_C; + static const SuperscalarInstructionInfo IADD_C7; + static const SuperscalarInstructionInfo IXOR_C7; + static const SuperscalarInstructionInfo IADD_C8; + static const SuperscalarInstructionInfo IXOR_C8; + static const SuperscalarInstructionInfo IADD_C9; + static const SuperscalarInstructionInfo IXOR_C9; + static const SuperscalarInstructionInfo IMULH_R; + static const SuperscalarInstructionInfo ISMULH_R; + static const SuperscalarInstructionInfo IMUL_RCP; + static const SuperscalarInstructionInfo NOP; + private: + const char* name_; + SuperscalarInstructionType type_; + std::vector ops_; + int latency_; + int resultOp_ = 0; + int dstOp_ = 0; + int srcOp_; + + SuperscalarInstructionInfo(const char* name) + : name_(name), type_(SuperscalarInstructionType::INVALID), latency_(0) {} + SuperscalarInstructionInfo(const char* name, SuperscalarInstructionType type, const MacroOp& op, int srcOp) + : name_(name), type_(type), latency_(op.getLatency()), srcOp_(srcOp) { + ops_.push_back(MacroOp(op)); + } + template + 
SuperscalarInstructionInfo(const char* name, SuperscalarInstructionType type, const MacroOp(&arr)[N], int resultOp, int dstOp, int srcOp) + : name_(name), type_(type), latency_(0), resultOp_(resultOp), dstOp_(dstOp), srcOp_(srcOp) { + for (unsigned i = 0; i < N; ++i) { + ops_.push_back(MacroOp(arr[i])); + latency_ += ops_.back().getLatency(); + } + static_assert(N > 1, "Invalid array size"); + } + }; + + const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISUB_R = SuperscalarInstructionInfo("ISUB_R", SuperscalarInstructionType::ISUB_R, MacroOp::Sub_rr, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_R = SuperscalarInstructionInfo("IXOR_R", SuperscalarInstructionType::IXOR_R, MacroOp::Xor_rr, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_RS = SuperscalarInstructionInfo("IADD_RS", SuperscalarInstructionType::IADD_RS, MacroOp::Lea_sib, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_R = SuperscalarInstructionInfo("IMUL_R", SuperscalarInstructionType::IMUL_R, MacroOp::Imul_rr, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IROR_C = SuperscalarInstructionInfo("IROR_C", SuperscalarInstructionType::IROR_C, MacroOp::Ror_ri, -1); + + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C7 = SuperscalarInstructionInfo("IADD_C7", SuperscalarInstructionType::IADD_C7, MacroOp::Add_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C7 = SuperscalarInstructionInfo("IXOR_C7", SuperscalarInstructionType::IXOR_C7, MacroOp::Xor_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C8 = SuperscalarInstructionInfo("IADD_C8", SuperscalarInstructionType::IADD_C8, MacroOp::Add_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C8 = SuperscalarInstructionInfo("IXOR_C8", SuperscalarInstructionType::IXOR_C8, MacroOp::Xor_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C9 = SuperscalarInstructionInfo("IADD_C9", SuperscalarInstructionType::IADD_C9, MacroOp::Add_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C9 = SuperscalarInstructionInfo("IXOR_C9", SuperscalarInstructionType::IXOR_C9, MacroOp::Xor_ri, -1); + + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMULH_R = SuperscalarInstructionInfo("IMULH_R", SuperscalarInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISMULH_R = SuperscalarInstructionInfo("ISMULH_R", SuperscalarInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_RCP = SuperscalarInstructionInfo("IMUL_RCP", SuperscalarInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1); + + const SuperscalarInstructionInfo SuperscalarInstructionInfo::NOP = SuperscalarInstructionInfo("NOP"); + + //these are some of the options how to split a 16-byte window into 3 or 4 x86 instructions. + //RandomX uses instructions with a native size of 3 (sub, xor, mul, mov), 4 (lea, mul), 7 (xor, add immediate) or 10 bytes (mov 64-bit immediate). + //Slots with sizes of 8 or 9 bytes need to be padded with a nop instruction. 
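+	//Note: each of the six configurations below fills the entire 16-byte
+	//fetch window: 4+8+4 = 7+3+3+3 = 3+7+3+3 = 4+9+3 = 4+4+4+4 = 3+3+10 = 16.
+	//The 8- and 9-byte slots exist only to keep the window full; the byte(s)
+	//beyond the native 7-byte encoding are the nop padding mentioned above.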
+ const int buffer0[] = { 4, 8, 4 }; + const int buffer1[] = { 7, 3, 3, 3 }; + const int buffer2[] = { 3, 7, 3, 3 }; + const int buffer3[] = { 4, 9, 3 }; + const int buffer4[] = { 4, 4, 4, 4 }; + const int buffer5[] = { 3, 3, 10 }; + + class DecoderBuffer { + public: + static const DecoderBuffer Default; + template + DecoderBuffer(const char* name, int index, const int(&arr)[N]) + : name_(name), index_(index), counts_(arr), opsCount_(N) {} + const int* getCounts() const { + return counts_; + } + int getSize() const { + return opsCount_; + } + int getIndex() const { + return index_; + } + const char* getName() const { + return name_; + } + const DecoderBuffer* fetchNext(SuperscalarInstructionType instrType, int cycle, int mulCount, Blake2Generator& gen) const { + //If the current RandomX instruction is "IMULH", the next fetch configuration must be 3-3-10 + //because the full 128-bit multiplication instruction is 3 bytes long and decodes to 2 uOPs on Intel CPUs. + //Intel CPUs can decode at most 4 uOPs per cycle, so this requires a 2-1-1 configuration for a total of 3 macro ops. + if (instrType == SuperscalarInstructionType::IMULH_R || instrType == SuperscalarInstructionType::ISMULH_R) + return &decodeBuffer3310; + + //To make sure that the multiplication port is saturated, a 4-4-4-4 configuration is generated if the number of multiplications + //is lower than the number of cycles. + if (mulCount < cycle + 1) + return &decodeBuffer4444; + + //If the current RandomX instruction is "IMUL_RCP", the next buffer must begin with a 4-byte slot for multiplication. + if(instrType == SuperscalarInstructionType::IMUL_RCP) + return (gen.getByte() & 1) ? &decodeBuffer484 : &decodeBuffer493; + + //Default: select a random fetch configuration. + return fetchNextDefault(gen); + } + private: + const char* name_; + int index_; + const int* counts_; + int opsCount_; + DecoderBuffer() : index_(-1) {} + static const DecoderBuffer decodeBuffer484; + static const DecoderBuffer decodeBuffer7333; + static const DecoderBuffer decodeBuffer3733; + static const DecoderBuffer decodeBuffer493; + static const DecoderBuffer decodeBuffer4444; + static const DecoderBuffer decodeBuffer3310; + static const DecoderBuffer* decodeBuffers[4]; + const DecoderBuffer* fetchNextDefault(Blake2Generator& gen) const { + return decodeBuffers[gen.getByte() & 3]; + } + }; + + const DecoderBuffer DecoderBuffer::decodeBuffer484 = DecoderBuffer("4,8,4", 0, buffer0); + const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer("7,3,3,3", 1, buffer1); + const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 2, buffer2); + const DecoderBuffer DecoderBuffer::decodeBuffer493 = DecoderBuffer("4,9,3", 3, buffer3); + const DecoderBuffer DecoderBuffer::decodeBuffer4444 = DecoderBuffer("4,4,4,4", 4, buffer4); + const DecoderBuffer DecoderBuffer::decodeBuffer3310 = DecoderBuffer("3,3,10", 5, buffer5); + + const DecoderBuffer* DecoderBuffer::decodeBuffers[4] = { + &DecoderBuffer::decodeBuffer484, + &DecoderBuffer::decodeBuffer7333, + &DecoderBuffer::decodeBuffer3733, + &DecoderBuffer::decodeBuffer493, + }; + + const DecoderBuffer DecoderBuffer::Default = DecoderBuffer(); + + const SuperscalarInstructionInfo* slot_3[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R }; + const SuperscalarInstructionInfo* slot_3L[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R, &SuperscalarInstructionInfo::IMULH_R, &SuperscalarInstructionInfo::ISMULH_R }; + const 
SuperscalarInstructionInfo* slot_4[] = { &SuperscalarInstructionInfo::IROR_C, &SuperscalarInstructionInfo::IADD_RS }; + const SuperscalarInstructionInfo* slot_7[] = { &SuperscalarInstructionInfo::IXOR_C7, &SuperscalarInstructionInfo::IADD_C7 }; + const SuperscalarInstructionInfo* slot_8[] = { &SuperscalarInstructionInfo::IXOR_C8, &SuperscalarInstructionInfo::IADD_C8 }; + const SuperscalarInstructionInfo* slot_9[] = { &SuperscalarInstructionInfo::IXOR_C9, &SuperscalarInstructionInfo::IADD_C9 }; + const SuperscalarInstructionInfo* slot_10 = &SuperscalarInstructionInfo::IMUL_RCP; + + static bool selectRegister(std::vector& availableRegisters, Blake2Generator& gen, int& reg) { + int index; + if (availableRegisters.size() == 0) + return false; + + if (availableRegisters.size() > 1) { + index = gen.getUInt32() % availableRegisters.size(); + } + else { + index = 0; + } + reg = availableRegisters[index]; + return true; + } + + class RegisterInfo { + public: + RegisterInfo() : latency(0), lastOpGroup(SuperscalarInstructionType::INVALID), lastOpPar(-1), value(0) {} + int latency; + SuperscalarInstructionType lastOpGroup; + int lastOpPar; + int value; + }; + + //"SuperscalarInstruction" consists of one or more macro-ops + class SuperscalarInstruction { + public: + void toInstr(Instruction& instr) { //translate to a RandomX instruction format + instr.opcode = (int)getType(); + instr.dst = dst_; + instr.src = src_ >= 0 ? src_ : dst_; + instr.setMod(mod_); + instr.setImm32(imm32_); + } + + void createForSlot(Blake2Generator& gen, int slotSize, int fetchType, bool isLast, bool isFirst) { + switch (slotSize) + { + case 3: + //if this is the last slot, we can also select "IMULH" instructions + if (isLast) { + create(slot_3L[gen.getByte() & 3], gen); + } + else { + create(slot_3[gen.getByte() & 1], gen); + } + break; + case 4: + //if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions + if (fetchType == 4 && !isLast) { + create(&SuperscalarInstructionInfo::IMUL_R, gen); + } + else { + create(slot_4[gen.getByte() & 1], gen); + } + break; + case 7: + create(slot_7[gen.getByte() & 1], gen); + break; + case 8: + create(slot_8[gen.getByte() & 1], gen); + break; + case 9: + create(slot_9[gen.getByte() & 1], gen); + break; + case 10: + create(slot_10, gen); + break; + default: + UNREACHABLE; + } + } + + void create(const SuperscalarInstructionInfo* info, Blake2Generator& gen) { + info_ = info; + reset(); + switch (info->getType()) + { + case SuperscalarInstructionType::ISUB_R: { + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IADD_RS; + groupParIsSource_ = true; + } break; + + case SuperscalarInstructionType::IXOR_R: { + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IXOR_R; + groupParIsSource_ = true; + } break; + + case SuperscalarInstructionType::IADD_RS: { + mod_ = gen.getByte(); + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IADD_RS; + groupParIsSource_ = true; + } break; + + case SuperscalarInstructionType::IMUL_R: { + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IMUL_R; + groupParIsSource_ = true; + } break; + + case SuperscalarInstructionType::IROR_C: { + mod_ = 0; + do { + imm32_ = gen.getByte() & 63; + } while (imm32_ == 0); + opGroup_ = SuperscalarInstructionType::IROR_C; + opGroupPar_ = -1; + } break; + + case SuperscalarInstructionType::IADD_C7: + case SuperscalarInstructionType::IADD_C8: + case SuperscalarInstructionType::IADD_C9: { + mod_ = 0; + imm32_ = gen.getUInt32(); + opGroup_ = 
SuperscalarInstructionType::IADD_C7; + opGroupPar_ = -1; + } break; + + case SuperscalarInstructionType::IXOR_C7: + case SuperscalarInstructionType::IXOR_C8: + case SuperscalarInstructionType::IXOR_C9: { + mod_ = 0; + imm32_ = gen.getUInt32(); + opGroup_ = SuperscalarInstructionType::IXOR_C7; + opGroupPar_ = -1; + } break; + + case SuperscalarInstructionType::IMULH_R: { + canReuse_ = true; + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IMULH_R; + opGroupPar_ = gen.getUInt32(); + } break; + + case SuperscalarInstructionType::ISMULH_R: { + canReuse_ = true; + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::ISMULH_R; + opGroupPar_ = gen.getUInt32(); + } break; + + case SuperscalarInstructionType::IMUL_RCP: { + mod_ = 0; + do { + imm32_ = gen.getUInt32(); + } while (isZeroOrPowerOf2(imm32_)); + opGroup_ = SuperscalarInstructionType::IMUL_RCP; + opGroupPar_ = -1; + } break; + + default: + break; + } + } + + bool selectDestination(int cycle, bool allowChainedMul, RegisterInfo (®isters)[8], Blake2Generator& gen) { + /*if (allowChainedMultiplication && opGroup_ == SuperscalarInstructionType::IMUL_R) + std::cout << "Selecting destination with chained MUL enabled" << std::endl;*/ + std::vector availableRegisters; + //Conditions for the destination register: + // * value must be ready at the required cycle + // * cannot be the same as the source register unless the instruction allows it + // - this avoids optimizable instructions such as "xor r, r" or "sub r, r" + // * register cannot be multiplied twice in a row unless allowChainedMul is true + // - this avoids accumulation of trailing zeroes in registers due to excessive multiplication + // - allowChainedMul is set to true if an attempt to find source/destination registers failed (this is quite rare, but prevents a catastrophic failure of the generator) + // * either the last instruction applied to the register or its source must be different than this instruction + // - this avoids optimizable instruction sequences such as "xor r1, r2; xor r1, r2" or "ror r, C1; ror r, C2" or "add r, C1; add r, C2" + // * register r5 cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction) + for (unsigned i = 0; i < 8; ++i) { + if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (allowChainedMul || opGroup_ != SuperscalarInstructionType::IMUL_R || registers[i].lastOpGroup != SuperscalarInstructionType::IMUL_R) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != RegisterNeedsDisplacement)) + availableRegisters.push_back(i); + } + return selectRegister(availableRegisters, gen, dst_); + } + + bool selectSource(int cycle, RegisterInfo(®isters)[8], Blake2Generator& gen) { + std::vector availableRegisters; + //all registers that are ready at the cycle + for (unsigned i = 0; i < 8; ++i) { + if (registers[i].latency <= cycle) + availableRegisters.push_back(i); + } + //if there are only 2 available registers for IADD_RS and one of them is r5, select it as the source because it cannot be the destination + if (availableRegisters.size() == 2 && info_->getType() == SuperscalarInstructionType::IADD_RS) { + if (availableRegisters[0] == RegisterNeedsDisplacement || availableRegisters[1] == RegisterNeedsDisplacement) { + opGroupPar_ = src_ = RegisterNeedsDisplacement; + return true; + } + } + if (selectRegister(availableRegisters, gen, src_)) { + if (groupParIsSource_) + opGroupPar_ = src_; + return 
true; + } + return false; + } + + SuperscalarInstructionType getType() { + return info_->getType(); + } + int getSource() { + return src_; + } + int getDestination() { + return dst_; + } + SuperscalarInstructionType getGroup() { + return opGroup_; + } + int getGroupPar() { + return opGroupPar_; + } + + const SuperscalarInstructionInfo& getInfo() const { + return *info_; + } + + static const SuperscalarInstruction Null; + + private: + const SuperscalarInstructionInfo* info_; + int src_ = -1; + int dst_ = -1; + int mod_; + uint32_t imm32_; + SuperscalarInstructionType opGroup_; + int opGroupPar_; + bool canReuse_ = false; + bool groupParIsSource_ = false; + + void reset() { + src_ = dst_ = -1; + canReuse_ = groupParIsSource_ = false; + } + + SuperscalarInstruction(const SuperscalarInstructionInfo* info) : info_(info) { + } + }; + + const SuperscalarInstruction SuperscalarInstruction::Null = SuperscalarInstruction(&SuperscalarInstructionInfo::NOP); + + constexpr int CYCLE_MAP_SIZE = RANDOMX_SUPERSCALAR_LATENCY + 4; + constexpr int LOOK_FORWARD_CYCLES = 4; + constexpr int MAX_THROWAWAY_COUNT = 256; + + template + static int scheduleUop(ExecutionPort::type uop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle) { + //The scheduling here is done optimistically by checking port availability in order P5 -> P0 -> P1 to not overload + //port P1 (multiplication) by instructions that can go to any port. + for (; cycle < CYCLE_MAP_SIZE; ++cycle) { + if ((uop & ExecutionPort::P5) != 0 && !portBusy[cycle][2]) { + if (commit) { + if (trace) std::cout << "; P5 at cycle " << cycle << std::endl; + portBusy[cycle][2] = uop; + } + return cycle; + } + if ((uop & ExecutionPort::P0) != 0 && !portBusy[cycle][0]) { + if (commit) { + if (trace) std::cout << "; P0 at cycle " << cycle << std::endl; + portBusy[cycle][0] = uop; + } + return cycle; + } + if ((uop & ExecutionPort::P1) != 0 && !portBusy[cycle][1]) { + if (commit) { + if (trace) std::cout << "; P1 at cycle " << cycle << std::endl; + portBusy[cycle][1] = uop; + } + return cycle; + } + } + return -1; + } + + template + static int scheduleMop(const MacroOp& mop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle, int depCycle) { + //if this macro-op depends on the previous one, increase the starting cycle if needed + //this handles an explicit dependency chain in IMUL_RCP + if (mop.isDependent()) { + cycle = std::max(cycle, depCycle); + } + //move instructions are eliminated and don't need an execution unit + if (mop.isEliminated()) { + if (commit) + if (trace) std::cout << "; (eliminated)" << std::endl; + return cycle; + } + else if (mop.isSimple()) { + //this macro-op has only one uOP + return scheduleUop(mop.getUop1(), portBusy, cycle); + } + else { + //macro-ops with 2 uOPs are scheduled conservatively by requiring both uOPs to execute in the same cycle + for (; cycle < CYCLE_MAP_SIZE; ++cycle) { + + int cycle1 = scheduleUop(mop.getUop1(), portBusy, cycle); + int cycle2 = scheduleUop(mop.getUop2(), portBusy, cycle); + + if (cycle1 >= 0 && cycle1 == cycle2) { + if (commit) { + scheduleUop(mop.getUop1(), portBusy, cycle1); + scheduleUop(mop.getUop2(), portBusy, cycle2); + } + return cycle1; + } + } + } + + return -1; + } + + void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen) { + + ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3]; + memset(portBusy, 0, sizeof(portBusy)); + RegisterInfo registers[8]; + + const DecoderBuffer* decodeBuffer = &DecoderBuffer::Default; + SuperscalarInstruction currentInstruction = 
SuperscalarInstruction::Null; + int macroOpIndex = 0; + int codeSize = 0; + int macroOpCount = 0; + int cycle = 0; + int depCycle = 0; + int retireCycle = 0; + bool portsSaturated = false; + int programSize = 0; + int mulCount = 0; + int decodeCycle; + int throwAwayCount = 0; + + //decode instructions for RANDOMX_SUPERSCALAR_LATENCY cycles or until an execution port is saturated. + //Each decode cycle decodes 16 bytes of x86 code. + //Since a decode cycle produces on average 3.45 macro-ops and there are only 3 ALU ports, execution ports are always + //saturated first. The cycle limit is present only to guarantee loop termination. + //Program size is limited to SuperscalarMaxSize instructions. + for (decodeCycle = 0; decodeCycle < RANDOMX_SUPERSCALAR_LATENCY && !portsSaturated && programSize < SuperscalarMaxSize; ++decodeCycle) { + + //select a decode configuration + decodeBuffer = decodeBuffer->fetchNext(currentInstruction.getType(), decodeCycle, mulCount, gen); + if (trace) std::cout << "; ------------- fetch cycle " << cycle << " (" << decodeBuffer->getName() << ")" << std::endl; + + int bufferIndex = 0; + + //fill all instruction slots in the current decode buffer + while (bufferIndex < decodeBuffer->getSize()) { + int topCycle = cycle; + + //if we have issued all macro-ops for the current RandomX instruction, create a new instruction + if (macroOpIndex >= currentInstruction.getInfo().getSize()) { + if (portsSaturated || programSize >= SuperscalarMaxSize) + break; + //select an instruction so that the first macro-op fits into the current slot + currentInstruction.createForSlot(gen, decodeBuffer->getCounts()[bufferIndex], decodeBuffer->getIndex(), decodeBuffer->getSize() == bufferIndex + 1, bufferIndex == 0); + macroOpIndex = 0; + if (trace) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; + } + const MacroOp& mop = currentInstruction.getInfo().getOp(macroOpIndex); + if (trace) std::cout << mop.getName() << " "; + + //calculate the earliest cycle when this macro-op (all of its uOPs) can be scheduled for execution + int scheduleCycle = scheduleMop(mop, portBusy, cycle, depCycle); + if (scheduleCycle < 0) { + if (trace) std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << cycle << ")" << std::endl; + //__debugbreak(); + portsSaturated = true; + break; + } + + //find a source register (if applicable) that will be ready when this instruction executes + if (macroOpIndex == currentInstruction.getInfo().getSrcOp()) { + int forward; + //if no suitable operand is ready, look up to LOOK_FORWARD_CYCLES forward + for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectSource(scheduleCycle, registers, gen); ++forward) { + if (trace) std::cout << "; src STALL at cycle " << cycle << std::endl; + ++scheduleCycle; + ++cycle; + } + //if no register was found, throw the instruction away and try another one + if (forward == LOOK_FORWARD_CYCLES) { + if (throwAwayCount < MAX_THROWAWAY_COUNT) { + throwAwayCount++; + macroOpIndex = currentInstruction.getInfo().getSize(); + if (trace) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + //cycle = topCycle; + continue; + } + //abort this decode buffer + if (trace) std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - source registers not available for operation " << currentInstruction.getInfo().getName() << std::endl; + currentInstruction = SuperscalarInstruction::Null; + break; + } + if (trace) 
std::cout << "; src = r" << currentInstruction.getSource() << std::endl; + } + //find a destination register that will be ready when this instruction executes + if (macroOpIndex == currentInstruction.getInfo().getDstOp()) { + int forward; + for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectDestination(scheduleCycle, throwAwayCount > 0, registers, gen); ++forward) { + if (trace) std::cout << "; dst STALL at cycle " << cycle << std::endl; + ++scheduleCycle; + ++cycle; + } + if (forward == LOOK_FORWARD_CYCLES) { //throw instruction away + if (throwAwayCount < MAX_THROWAWAY_COUNT) { + throwAwayCount++; + macroOpIndex = currentInstruction.getInfo().getSize(); + if (trace) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + //cycle = topCycle; + continue; + } + //abort this decode buffer + if (trace) std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - destination registers not available" << std::endl; + currentInstruction = SuperscalarInstruction::Null; + break; + } + if (trace) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; + } + throwAwayCount = 0; + + //recalculate when the instruction can be scheduled for execution based on operand availability + scheduleCycle = scheduleMop(mop, portBusy, scheduleCycle, scheduleCycle); + + if (scheduleCycle < 0) { + if (trace) std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << scheduleCycle << ")" << std::endl; + portsSaturated = true; + break; + } + + //calculate when the result will be ready + depCycle = scheduleCycle + mop.getLatency(); + + //if this instruction writes the result, modify register information + // RegisterInfo.latency - which cycle the register will be ready + // RegisterInfo.lastOpGroup - the last operation that was applied to the register + // RegisterInfo.lastOpPar - the last operation source value (-1 = constant, 0-7 = register) + if (macroOpIndex == currentInstruction.getInfo().getResultOp()) { + int dst = currentInstruction.getDestination(); + RegisterInfo& ri = registers[dst]; + retireCycle = depCycle; + ri.latency = retireCycle; + ri.lastOpGroup = currentInstruction.getGroup(); + ri.lastOpPar = currentInstruction.getGroupPar(); + if (trace) std::cout << "; RETIRED at cycle " << retireCycle << std::endl; + } + codeSize += mop.getSize(); + bufferIndex++; + macroOpIndex++; + macroOpCount++; + + //terminating condition + if (scheduleCycle >= RANDOMX_SUPERSCALAR_LATENCY) { + portsSaturated = true; + } + cycle = topCycle; + + //when all macro-ops of the current instruction have been issued, add the instruction into the program + if (macroOpIndex >= currentInstruction.getInfo().getSize()) { + currentInstruction.toInstr(prog(programSize++)); + mulCount += isMultiplication(currentInstruction.getType()); + } + } + ++cycle; + } + + double ipc = (macroOpCount / (double)retireCycle); + + memset(prog.asicLatencies, 0, sizeof(prog.asicLatencies)); + + //Calculate ASIC latency: + //Assumes 1 cycle latency for all operations and unlimited parallelization. + for (int i = 0; i < programSize; ++i) { + Instruction& instr = prog(i); + int latDst = prog.asicLatencies[instr.dst] + 1; + int latSrc = instr.dst != instr.src ? 
prog.asicLatencies[instr.src] + 1 : 0; + prog.asicLatencies[instr.dst] = std::max(latDst, latSrc); + } + + //address register is the register with the highest ASIC latency + int asicLatencyMax = 0; + int addressReg = 0; + for (int i = 0; i < 8; ++i) { + if (prog.asicLatencies[i] > asicLatencyMax) { + asicLatencyMax = prog.asicLatencies[i]; + addressReg = i; + } + prog.cpuLatencies[i] = registers[i].latency; + } + + prog.setSize(programSize); + prog.setAddressRegister(addressReg); + + prog.cpuLatency = retireCycle; + prog.asicLatency = asicLatencyMax; + prog.codeSize = codeSize; + prog.macroOps = macroOpCount; + prog.decodeCycles = decodeCycle; + prog.ipc = ipc; + prog.mulCount = mulCount; + + + /*if(INFO) std::cout << "; ALU port utilization:" << std::endl; + if (INFO) std::cout << "; (* = in use, _ = idle)" << std::endl; + + int portCycles = 0; + for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { + std::cout << "; " << std::setw(3) << i << " "; + for (int j = 0; j < 3; ++j) { + std::cout << (portBusy[i][j] ? '*' : '_'); + portCycles += !!portBusy[i][j]; + } + std::cout << std::endl; + }*/ + } + + void executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector *reciprocals) { + for (unsigned j = 0; j < prog.getSize(); ++j) { + Instruction& instr = prog(j); + switch ((SuperscalarInstructionType)instr.opcode) + { + case SuperscalarInstructionType::ISUB_R: + r[instr.dst] -= r[instr.src]; + break; + case SuperscalarInstructionType::IXOR_R: + r[instr.dst] ^= r[instr.src]; + break; + case SuperscalarInstructionType::IADD_RS: + r[instr.dst] += r[instr.src] << instr.getModShift(); + break; + case SuperscalarInstructionType::IMUL_R: + r[instr.dst] *= r[instr.src]; + break; + case SuperscalarInstructionType::IROR_C: + r[instr.dst] = rotr(r[instr.dst], instr.getImm32()); + break; + case SuperscalarInstructionType::IADD_C7: + case SuperscalarInstructionType::IADD_C8: + case SuperscalarInstructionType::IADD_C9: + r[instr.dst] += signExtend2sCompl(instr.getImm32()); + break; + case SuperscalarInstructionType::IXOR_C7: + case SuperscalarInstructionType::IXOR_C8: + case SuperscalarInstructionType::IXOR_C9: + r[instr.dst] ^= signExtend2sCompl(instr.getImm32()); + break; + case SuperscalarInstructionType::IMULH_R: + r[instr.dst] = mulh(r[instr.dst], r[instr.src]); + break; + case SuperscalarInstructionType::ISMULH_R: + r[instr.dst] = smulh(r[instr.dst], r[instr.src]); + break; + case SuperscalarInstructionType::IMUL_RCP: + if (reciprocals != nullptr) + r[instr.dst] *= (*reciprocals)[instr.getImm32()]; + else + r[instr.dst] *= randomx_reciprocal(instr.getImm32()); + break; + default: + UNREACHABLE; + } + } + } +} diff --git a/randomx/superscalar.hpp b/randomx/superscalar.hpp new file mode 100644 index 0000000..bc101c4 --- /dev/null +++ b/randomx/superscalar.hpp @@ -0,0 +1,60 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include +#include "superscalar_program.hpp" +#include "blake2_generator.hpp" + +namespace randomx { + // Intel Ivy Bridge reference + enum class SuperscalarInstructionType { //uOPs (decode) execution ports latency code size + ISUB_R = 0, //1 p015 1 3 (sub) + IXOR_R = 1, //1 p015 1 3 (xor) + IADD_RS = 2, //1 p01 1 4 (lea) + IMUL_R = 3, //1 p1 3 4 (imul) + IROR_C = 4, //1 p05 1 4 (ror) + IADD_C7 = 5, //1 p015 1 7 (add) + IXOR_C7 = 6, //1 p015 1 7 (xor) + IADD_C8 = 7, //1+0 p015 1 7+1 (add+nop) + IXOR_C8 = 8, //1+0 p015 1 7+1 (xor+nop) + IADD_C9 = 9, //1+0 p015 1 7+2 (add+nop) + IXOR_C9 = 10, //1+0 p015 1 7+2 (xor+nop) + IMULH_R = 11, //1+2+1 0+(p1,p5)+0 3 3+3+3 (mov+mul+mov) + ISMULH_R = 12, //1+2+1 0+(p1,p5)+0 3 3+3+3 (mov+imul+mov) + IMUL_RCP = 13, //1+1 p015+p1 4 10+4 (mov+imul) + + COUNT = 14, + INVALID = -1 + }; + + void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen); + void executeSuperscalar(uint64_t(&r)[8], SuperscalarProgram& prog, std::vector *reciprocals = nullptr); +} \ No newline at end of file diff --git a/randomx/superscalar_program.hpp b/randomx/superscalar_program.hpp new file mode 100644 index 0000000..7bcd484 --- /dev/null +++ b/randomx/superscalar_program.hpp @@ -0,0 +1,84 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include "instruction.hpp" +#include "common.hpp" + +namespace randomx { + + class SuperscalarProgram { + public: + Instruction& operator()(int pc) { + return programBuffer[pc]; + } + friend std::ostream& operator<<(std::ostream& os, const SuperscalarProgram& p) { + p.print(os); + return os; + } + uint32_t getSize() { + return size; + } + void setSize(uint32_t val) { + size = val; + } + int getAddressRegister() { + return addrReg; + } + void setAddressRegister(int val) { + addrReg = val; + } + + Instruction programBuffer[SuperscalarMaxSize]; + uint32_t size +#ifndef NDEBUG + = 0 +#endif + ; + int addrReg; + double ipc; + int codeSize; + int macroOps; + int decodeCycles; + int cpuLatency; + int asicLatency; + int mulCount; + int cpuLatencies[8]; + int asicLatencies[8]; + private: + void print(std::ostream& os) const { + for (unsigned i = 0; i < size; ++i) { + auto instr = programBuffer[i]; + os << instr; + } + } + }; + +} \ No newline at end of file diff --git a/randomx/virtual_machine.cpp b/randomx/virtual_machine.cpp new file mode 100644 index 0000000..f8010d5 --- /dev/null +++ b/randomx/virtual_machine.cpp @@ -0,0 +1,141 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+
+#include
+#include
+#include
+#include "virtual_machine.hpp"
+#include "common.hpp"
+#include "aes_hash.hpp"
+#include "blake2/blake2.h"
+#include "intrin_portable.h"
+#include "allocator.hpp"
+
+randomx_vm::~randomx_vm() {
+
+}
+
+void randomx_vm::resetRoundingMode() {
+	rx_reset_float_state();
+}
+
+namespace randomx {
+
+	static inline uint64_t getSmallPositiveFloatBits(uint64_t entropy) {
+		auto exponent = entropy >> 59; //0..31
+		auto mantissa = entropy & mantissaMask;
+		exponent += exponentBias;
+		exponent &= exponentMask;
+		exponent <<= mantissaSize;
+		return exponent | mantissa;
+	}
+
+	static inline uint64_t getStaticExponent(uint64_t entropy) {
+		auto exponent = constExponentBits;
+		exponent |= (entropy >> (64 - staticExponentBits)) << dynamicExponentBits;
+		exponent <<= mantissaSize;
+		return exponent;
+	}
+
+	static inline uint64_t getFloatMask(uint64_t entropy) {
+		constexpr uint64_t mask22bit = (1ULL << 22) - 1;
+		return (entropy & mask22bit) | getStaticExponent(entropy);
+	}
+
+}
+
+void randomx_vm::initialize() {
+	store64(&reg.a[0].lo, randomx::getSmallPositiveFloatBits(program.getEntropy(0)));
+	store64(&reg.a[0].hi, randomx::getSmallPositiveFloatBits(program.getEntropy(1)));
+	store64(&reg.a[1].lo, randomx::getSmallPositiveFloatBits(program.getEntropy(2)));
+	store64(&reg.a[1].hi, randomx::getSmallPositiveFloatBits(program.getEntropy(3)));
+	store64(&reg.a[2].lo, randomx::getSmallPositiveFloatBits(program.getEntropy(4)));
+	store64(&reg.a[2].hi, randomx::getSmallPositiveFloatBits(program.getEntropy(5)));
+	store64(&reg.a[3].lo, randomx::getSmallPositiveFloatBits(program.getEntropy(6)));
+	store64(&reg.a[3].hi, randomx::getSmallPositiveFloatBits(program.getEntropy(7)));
+	mem.ma = program.getEntropy(8) & randomx::CacheLineAlignMask;
+	mem.mx = program.getEntropy(10);
+	auto addressRegisters = program.getEntropy(12);
+	config.readReg0 = 0 + (addressRegisters & 1);
+	addressRegisters >>= 1;
+	config.readReg1 = 2 + (addressRegisters & 1);
+	addressRegisters >>= 1;
+	config.readReg2 = 4 + (addressRegisters & 1);
+	addressRegisters >>= 1;
+	config.readReg3 = 6 + (addressRegisters & 1);
+	datasetOffset = (program.getEntropy(13) % (randomx::DatasetExtraItems + 1)) * randomx::CacheLineSize;
+	store64(&config.eMask[0], randomx::getFloatMask(program.getEntropy(14)));
+	store64(&config.eMask[1], randomx::getFloatMask(program.getEntropy(15)));
+}
+
+namespace randomx {
+
+	alignas(16) volatile static rx_vec_i128 aesDummy;
+
+	template
+	VmBase::~VmBase() {
+		Allocator::freeMemory(scratchpad, ScratchpadSize);
+	}
+
+	template
+	void VmBase::allocate() {
+		if (datasetPtr == nullptr)
+			throw std::invalid_argument("Cache/Dataset not set");
+		if (!softAes) { //if hardware AES is not supported, it's better to fail now than to return a ticking bomb
+			printf("allocate scratchpad !!\n");
+			rx_vec_i128 tmp = rx_load_vec_i128((const rx_vec_i128*)&aesDummy);
+			tmp = rx_aesenc_vec_i128(tmp, tmp);
+			rx_store_vec_i128((rx_vec_i128*)&aesDummy, tmp);
+		}
+		scratchpad = (uint8_t*)Allocator::allocMemory(ScratchpadSize);
+	}
+
+	template
+	void VmBase::getFinalResult(void* out, size_t outSize) {
+		printf("virtual_machine getFinalResult---\n");
+		hashAes1Rx4(scratchpad, ScratchpadSize, &reg.a);
+		blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
+	}
+
+	template
+	void VmBase::initScratchpad(void* seed) {
+		printf("virtual_machine initScratchpad---\n");
+		fillAes1Rx4(seed, ScratchpadSize, scratchpad);
+	}
+
+	template
+	void VmBase::generateProgram(void* seed) {
+		printf("virtual_machine generateProgram---\n");
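+		// fillAes4Rx4 below expands the seed into the whole Program struct:
+		// the entropy words consumed by initialize() above, followed by the
+		// packed instruction buffer, generated with a 4-round AES generator.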
+		fillAes4Rx4(seed, sizeof(program), &program);
+	}
+
+	template class VmBase, false>;
+	template class VmBase, true>;
+	template class VmBase;
+	template class VmBase;
+}
\ No newline at end of file
diff --git a/randomx/virtual_machine.hpp b/randomx/virtual_machine.hpp
new file mode 100644
index 0000000..d662c89
--- /dev/null
+++ b/randomx/virtual_machine.hpp
@@ -0,0 +1,85 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	notice, this list of conditions and the following disclaimer in the
+	documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	names of its contributors may be used to endorse or promote products
+	derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include
+#include "common.hpp"
+#include "program.hpp"
+
+/* Global namespace for C binding */
+class randomx_vm {
+public:
+	virtual ~randomx_vm() = 0;
+	virtual void allocate() = 0;
+	virtual void getFinalResult(void* out, size_t outSize) = 0;
+	virtual void setDataset(randomx_dataset* dataset) { }
+	virtual void setCache(randomx_cache* cache) { }
+	virtual void initScratchpad(void* seed) = 0;
+	virtual void run(void* seed) = 0;
+	void resetRoundingMode();
+	randomx::RegisterFile *getRegisterFile() {
+		return &reg;
+	}
+	const void* getScratchpad() {
+		return scratchpad;
+	}
+	const randomx::Program& getProgram()
+	{
+		return program;
+	}
+protected:
+	void initialize();
+	alignas(64) randomx::Program program;
+	alignas(64) randomx::RegisterFile reg;
+	alignas(16) randomx::ProgramConfiguration config;
+	randomx::MemoryRegisters mem;
+	uint8_t* scratchpad = nullptr;
+	union {
+		randomx_cache* cachePtr = nullptr;
+		randomx_dataset* datasetPtr;
+	};
+	uint64_t datasetOffset;
+public:
+	std::string cacheKey;
+};
+
+namespace randomx {
+
+	template
+	class VmBase : public randomx_vm {
+	public:
+		~VmBase() override;
+		void allocate() override;
+		void initScratchpad(void* seed) override;
+		void getFinalResult(void* out, size_t outSize) override;
+	protected:
+		void generateProgram(void* seed);
+	};
+
+}
diff --git a/randomx/virtual_memory.cpp b/randomx/virtual_memory.cpp
new file mode 100644
index 0000000..128ddd2
--- /dev/null
+++ b/randomx/virtual_memory.cpp
@@ -0,0 +1,163 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "virtual_memory.hpp" + +#include +#include + +#if defined(_WIN32) || defined(__CYGWIN__) + #include +#else + #ifdef __APPLE__ + #include + #endif + #include + #include + #ifndef MAP_ANONYMOUS + #define MAP_ANONYMOUS MAP_ANON + #endif + #define PAGE_READONLY PROT_READ + #define PAGE_READWRITE (PROT_READ | PROT_WRITE) + #define PAGE_EXECUTE_READ (PROT_READ | PROT_EXEC) + #define PAGE_EXECUTE_READWRITE (PROT_READ | PROT_WRITE | PROT_EXEC) +#endif + +#if defined(_WIN32) || defined(__CYGWIN__) +std::string getErrorMessage(const char* function) { + LPSTR messageBuffer = nullptr; + size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); + std::string message(messageBuffer, size); + LocalFree(messageBuffer); + return std::string(function) + std::string(": ") + message; +} + +void setPrivilege(const char* pszPrivilege, BOOL bEnable) { + HANDLE hToken; + TOKEN_PRIVILEGES tp; + BOOL status; + DWORD error; + + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken)) + throw std::runtime_error(getErrorMessage("OpenProcessToken")); + + if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid)) + throw std::runtime_error(getErrorMessage("LookupPrivilegeValue")); + + tp.PrivilegeCount = 1; + + if (bEnable) + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + else + tp.Privileges[0].Attributes = 0; + + status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); + + error = GetLastError(); + if (!status || (error != ERROR_SUCCESS)) + throw std::runtime_error(getErrorMessage("AdjustTokenPrivileges")); + + if (!CloseHandle(hToken)) + throw std::runtime_error(getErrorMessage("CloseHandle")); +} +#endif + +void* allocMemoryPages(std::size_t bytes) { + void* mem; +#if defined(_WIN32) || defined(__CYGWIN__) + mem = VirtualAlloc(nullptr, bytes, MEM_COMMIT, PAGE_READWRITE); + if (mem == nullptr) + throw 
std::runtime_error(getErrorMessage("allocMemoryPages - VirtualAlloc")); +#else + mem = mmap(nullptr, bytes, PAGE_READWRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (mem == MAP_FAILED) + throw std::runtime_error("allocMemoryPages - mmap failed"); +#endif + return mem; +} + +static inline void pageProtect(void* ptr, std::size_t bytes, int rules) { +#if defined(_WIN32) || defined(__CYGWIN__) + DWORD oldp; + if (!VirtualProtect(ptr, bytes, (DWORD)rules, &oldp)) { + throw std::runtime_error(getErrorMessage("VirtualProtect")); + } +#else + if (-1 == mprotect(ptr, bytes, rules)) + throw std::runtime_error("mprotect failed"); +#endif +} + +void setPagesRW(void* ptr, std::size_t bytes) { + pageProtect(ptr, bytes, PAGE_READWRITE); +} + +void setPagesRX(void* ptr, std::size_t bytes) { + pageProtect(ptr, bytes, PAGE_EXECUTE_READ); +} + +void setPagesRWX(void* ptr, std::size_t bytes) { + pageProtect(ptr, bytes, PAGE_EXECUTE_READWRITE); +} + +void* allocLargePagesMemory(std::size_t bytes) { + void* mem; + +#if defined(_WIN32) || defined(__CYGWIN__) + setPrivilege("SeLockMemoryPrivilege", 1); + auto pageMinimum = GetLargePageMinimum(); + if (pageMinimum > 0) + mem = VirtualAlloc(NULL, alignSize(bytes, pageMinimum), MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE); + else + throw std::runtime_error("allocLargePagesMemory - Large pages are not supported"); + if (mem == nullptr) + throw std::runtime_error(getErrorMessage("allocLargePagesMemory - VirtualAlloc")); +#else + #ifdef __APPLE__ + mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); + #elif defined(__FreeBSD__) + mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER, -1, 0); + #elif defined(__OpenBSD__) + mem = MAP_FAILED; // OpenBSD does not support huge pages + #else + mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0); + #endif + if (mem == MAP_FAILED) + throw std::runtime_error("allocLargePagesMemory - mmap failed"); +#endif + return mem; +} + +void freePagedMemory(void* ptr, std::size_t bytes) { +#if defined(_WIN32) || defined(__CYGWIN__) + VirtualFree(ptr, 0, MEM_RELEASE); +#else + munmap(ptr, bytes); +#endif +} diff --git a/randomx/virtual_memory.hpp b/randomx/virtual_memory.hpp new file mode 100644 index 0000000..9e8bc29 --- /dev/null +++ b/randomx/virtual_memory.hpp @@ -0,0 +1,42 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include + +constexpr std::size_t alignSize(std::size_t pos, std::size_t align) { + return ((pos - 1) / align + 1) * align; +} + +void* allocMemoryPages(std::size_t); +void setPagesRW(void*, std::size_t); +void setPagesRX(void*, std::size_t); +void setPagesRWX(void*, std::size_t); +void* allocLargePagesMemory(std::size_t); +void freePagedMemory(void*, std::size_t); diff --git a/randomx/vm_compiled.cpp b/randomx/vm_compiled.cpp new file mode 100644 index 0000000..911f9e1 --- /dev/null +++ b/randomx/vm_compiled.cpp @@ -0,0 +1,82 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include "vm_compiled.hpp" +#include "common.hpp" + +namespace randomx { + + static_assert(sizeof(MemoryRegisters) == 2 * sizeof(addr_t) + sizeof(uintptr_t), "Invalid alignment of struct randomx::MemoryRegisters"); + static_assert(sizeof(RegisterFile) == 256, "Invalid alignment of struct randomx::RegisterFile"); + + template + CompiledVm::CompiledVm() { + if (!secureJit) { + compiler.enableAll(); //make JIT buffer both writable and executable + } + } + + template + void CompiledVm::setDataset(randomx_dataset* dataset) { + printf("CompiledVm setDataset\n"); + datasetPtr = dataset; + } + + template + void CompiledVm::run(void* seed) { + //printf("CompiledVm run\n"); + VmBase::generateProgram(seed); + randomx_vm::initialize(); + if (secureJit) { + compiler.enableWriting(); + } + compiler.generateProgram(program, config); + if (secureJit) { + compiler.enableExecution(); + } + mem.memory = datasetPtr->memory + datasetOffset; + execute(); + } + + template + void CompiledVm::execute() { +#ifdef __aarch64__ + memcpy(reg.f, config.eMask, sizeof(config.eMask)); +#endif + compiler.getProgramFunc()(reg, mem, scratchpad, RANDOMX_PROGRAM_ITERATIONS); + } + + template class CompiledVm, false, false>; + template class CompiledVm, true, false>; + template class CompiledVm; + template class CompiledVm; + template class CompiledVm, false, true>; + template class CompiledVm, true, true>; + template class CompiledVm; + template class CompiledVm; +} \ No newline at end of file diff --git a/randomx/vm_compiled.hpp b/randomx/vm_compiled.hpp new file mode 100644 index 0000000..f7ceb0a --- /dev/null +++ b/randomx/vm_compiled.hpp @@ -0,0 +1,77 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#pragma once + +#include +#include +#include "virtual_machine.hpp" +#include "jit_compiler.hpp" +#include "allocator.hpp" +#include "dataset.hpp" + +namespace randomx { + + template + class CompiledVm : public VmBase { + public: + void* operator new(size_t size) { + void* ptr = AlignedAllocator::allocMemory(size); + if (ptr == nullptr) + throw std::bad_alloc(); + return ptr; + } + void operator delete(void* ptr) { + AlignedAllocator::freeMemory(ptr, sizeof(CompiledVm)); + } + CompiledVm(); + void setDataset(randomx_dataset* dataset) override; + void run(void* seed) override; + + using VmBase::mem; + using VmBase::program; + using VmBase::config; + using VmBase::reg; + using VmBase::scratchpad; + using VmBase::datasetPtr; + using VmBase::datasetOffset; + protected: + void execute(); + + JitCompiler compiler; + }; + + using CompiledVmDefault = CompiledVm, true, false>; + using CompiledVmHardAes = CompiledVm, false, false>; + using CompiledVmLargePage = CompiledVm; + using CompiledVmLargePageHardAes = CompiledVm; + using CompiledVmDefaultSecure = CompiledVm, true, true>; + using CompiledVmHardAesSecure = CompiledVm, false, true>; + using CompiledVmLargePageSecure = CompiledVm; + using CompiledVmLargePageHardAesSecure = CompiledVm; +} diff --git a/randomx/vm_compiled_light.cpp b/randomx/vm_compiled_light.cpp new file mode 100644 index 0000000..3093398 --- /dev/null +++ b/randomx/vm_compiled_light.cpp @@ -0,0 +1,72 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include "vm_compiled_light.hpp" +#include "common.hpp" +#include + +namespace randomx { + + template + void CompiledLightVm::setCache(randomx_cache* cache) { + printf("CompiledLight cache to vm\n"); + cachePtr = cache; + mem.memory = cache->memory; + if (secureJit) { + compiler.enableWriting(); + } + compiler.generateSuperscalarHash(cache->programs, cache->reciprocalCache); + if (secureJit) { + compiler.enableExecution(); + } + } + + template + void CompiledLightVm::run(void* seed) { + printf("CompiledLight run\n"); + VmBase::generateProgram(seed); + randomx_vm::initialize(); + if (secureJit) { //secureJit = false + compiler.enableWriting(); + } + compiler.generateProgramLight(program, config, datasetOffset); + if (secureJit) { + compiler.enableExecution(); + } + CompiledVm::execute(); + } + + template class CompiledLightVm, false, false>; + template class CompiledLightVm, true, false>; + template class CompiledLightVm; + template class CompiledLightVm; + template class CompiledLightVm, false, true>; + template class CompiledLightVm, true, true>; + template class CompiledLightVm; + template class CompiledLightVm; +} \ No newline at end of file diff --git a/randomx/vm_compiled_light.hpp b/randomx/vm_compiled_light.hpp new file mode 100644 index 0000000..bed4ce1 --- /dev/null +++ b/randomx/vm_compiled_light.hpp @@ -0,0 +1,68 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#pragma once + +#include +#include "vm_compiled.hpp" + +namespace randomx { + + template + class CompiledLightVm : public CompiledVm { + public: + void* operator new(size_t size) { + void* ptr = AlignedAllocator::allocMemory(size); + if (ptr == nullptr) + throw std::bad_alloc(); + return ptr; + } + void operator delete(void* ptr) { + AlignedAllocator::freeMemory(ptr, sizeof(CompiledLightVm)); + } + void setCache(randomx_cache* cache) override; + void setDataset(randomx_dataset* dataset) override { } + void run(void* seed) override; + + using CompiledVm::mem; + using CompiledVm::compiler; + using CompiledVm::program; + using CompiledVm::config; + using CompiledVm::cachePtr; + using CompiledVm::datasetOffset; + }; + + using CompiledLightVmDefault = CompiledLightVm, true, false>; + using CompiledLightVmHardAes = CompiledLightVm, false, false>; + using CompiledLightVmLargePage = CompiledLightVm; + using CompiledLightVmLargePageHardAes = CompiledLightVm; + using CompiledLightVmDefaultSecure = CompiledLightVm, true, true>; + using CompiledLightVmHardAesSecure = CompiledLightVm, false, true>; + using CompiledLightVmLargePageSecure = CompiledLightVm; + using CompiledLightVmLargePageHardAesSecure = CompiledLightVm; +} \ No newline at end of file diff --git a/randomx/vm_interpreted.cpp b/randomx/vm_interpreted.cpp new file mode 100644 index 0000000..e5ec304 --- /dev/null +++ b/randomx/vm_interpreted.cpp @@ -0,0 +1,162 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "vm_interpreted.hpp"
+#include "dataset.hpp"
+#include "intrin_portable.h"
+#include "reciprocal.h"
+
+namespace randomx {
+
+	template
+	void InterpretedVm::setDataset(randomx_dataset* dataset) {
+		printf("Interpreted setDataset\n");
+		datasetPtr = dataset;
+		mem.memory = dataset->memory;
+	}
+
+	template
+	void InterpretedVm::run(void* seed) {
+		printf("Interpreted run\n");
+		VmBase::generateProgram(seed);
+		randomx_vm::initialize();
+		execute();
+	}
+
+	template
+	void InterpretedVm::execute() {
+
+		NativeRegisterFile nreg;
+		printf("InterpretedVm execute\n");
+		for(unsigned i = 0; i < RegisterCountFlt; ++i) //RegisterCountFlt = 4
+			nreg.a[i] = rx_load_vec_f128(&reg.a[i].lo);
+
+		//printf("nreg.r[0]=%016llx\n",nreg.r[0]);
+		//printf("nreg.r[1]=%016llx\n",nreg.r[1]);
+		//printf("nreg.r[2]=%016llx\n",nreg.r[2]);
+		//printf("nreg.r[3]=%016llx\n",nreg.r[3]);
+		//printf("nreg.r[4]=%016llx\n",nreg.r[4]);
+		//printf("nreg.r[5]=%016llx\n",nreg.r[5]);
+		//printf("nreg.r[6]=%016llx\n",nreg.r[6]);
+		//printf("nreg.r[7]=%016llx\n",nreg.r[7]);
+
+		//the 8 integer registers nreg.r all start out as zero;
+
+		compileProgram(program, bytecode, nreg);
+
+		uint32_t spAddr0 = mem.mx;
+		uint32_t spAddr1 = mem.ma;
+
+		for(unsigned ic = 0; ic < RANDOMX_PROGRAM_ITERATIONS; ++ic) { //2048 RANDOMX_PROGRAM_ITERATIONS
+			uint64_t spMix = nreg.r[config.readReg0] ^ nreg.r[config.readReg1];
+			//printf("ic= %0d,spAddr0= %0d,spAddr1=%0d,spMix=%0d,config.readReg0= %0d,config.readReg1=%0d\n",ic,spAddr0,spAddr1,spMix,config.readReg0,config.readReg1);
+			spAddr0 ^= spMix;
+			spAddr0 &= ScratchpadL3Mask64;
+			spAddr1 ^= spMix >> 32;
+			spAddr1 &= ScratchpadL3Mask64; //1 cycle
+
+			//printf("ic= %0d,spAddr0= %08lx,spAddr1=%08lx",ic,spAddr0,spAddr1);
+			for (unsigned i = 0; i < RegistersCount; ++i) //executed in parallel; the read takes one cycle
+				nreg.r[i] ^= load64(scratchpad + spAddr0 + 8 * i);
+
+			for (unsigned i = 0; i < RegisterCountFlt; ++i) //executed in parallel; the read takes one cycle, and the RAM read/write bus could be made 128 bits wide here
+				nreg.f[i] = rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * i);
+
+			for (unsigned i = 0; i < RegisterCountFlt; ++i) //executed in parallel; takes two cycles
+				nreg.e[i] = maskRegisterExponentMantissa(config, rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * (RegisterCountFlt + i)));
+
+			executeBytecode(bytecode, scratchpad, config);
+
+			mem.mx ^= nreg.r[config.readReg2] ^ nreg.r[config.readReg3];
+			mem.mx &= CacheLineAlignMask;
+
+			// printf("ic= %0d,datasetOffset= %08lx,mem.ma=%08lx\n",ic,datasetOffset,mem.ma);
+			//datasetPrefetch(datasetOffset + mem.mx);
+			datasetRead(datasetOffset + mem.ma, nreg.r); //fetch data from memory to refill the 8 general-purpose registers nreg.r; ~150 cycles, reading 64 contiguous bytes;
+			std::swap(mem.mx, mem.ma);
+
+			//if(ic== (RANDOMX_PROGRAM_ITERATIONS-1)){for (int i = 0; i < RegistersCount; ++i) printf("nreg.r[%d]= %016llx\n",i,nreg.r[i]);}
+
+			for (unsigned i = 0; i < RegistersCount; ++i)
+				store64(scratchpad + spAddr1 + 8 * i, nreg.r[i]);
+
+			for (unsigned i = 0; i < RegisterCountFlt; ++i)
+				nreg.f[i] = rx_xor_vec_f128(nreg.f[i], nreg.e[i]);
+
+			for (unsigned i = 0; i < RegisterCountFlt; ++i)
+				rx_store_vec_f128((double*)(scratchpad + spAddr0 + 16 * i), nreg.f[i]); //RAM writes cost no cycle by themselves, but with limited channels the two store steps above are counted as one cycle
+
+			spAddr0 = 0;
+			spAddr1 = 0;
+		}
+
+		for (unsigned i = 0; i < RegistersCount; ++i)
+			store64(&reg.r[i], nreg.r[i]);
+
+		for (unsigned i = 0; i < RegisterCountFlt; ++i)
+			rx_store_vec_f128(&reg.f[i].lo, nreg.f[i]);
+
+		for (unsigned i = 0; i < RegisterCountFlt; ++i)
+			rx_store_vec_f128(&reg.e[i].lo, nreg.e[i]);
+	}
+
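+	// Summary of one iteration above: spMix (readReg0 ^ readReg1) picks the
+	// scratchpad addresses, r0..r7 are XORed with 64 bytes at spAddr0 and
+	// f0..f3/e0..e3 are loaded from spAddr1, the 256 decoded instructions run,
+	// one 64-byte dataset line is XORed into r0..r7, and the registers are
+	// written back before both addresses are cleared for the next iteration.
+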
+	template
+	void InterpretedVm::datasetRead(uint64_t address, int_reg_t(&r)[RegistersCount]) {
+		//printf("vm_interpreted datasetRead\n");
+		uint64_t* datasetLine = (uint64_t*)(mem.memory + address);
+		for (int i = 0; i < RegistersCount; ++i)
+			r[i] ^= datasetLine[i];
+
+		//printf("address= %016llx\n",address);
+		//for (int i = 0; i < RegistersCount; ++i) printf("datasetLine[%d]= %016llx\n",i,datasetLine[i]);
+		//for (int i = 0; i < RegistersCount; ++i) printf("r[%d]= %016llx\n",i,r[i]);
+
+		//uint64_t* datasetLine = (uint64_t*)(mem.memory + 1);
+		//for (int i = 0; i < RegistersCount; ++i){
+		//	printf("datasetLine[%d]=%0llx\n",i,datasetLine[i]);
+		//	r[i] ^= datasetLine[i];
+		//}
+	}
+
+	template
+	void InterpretedVm::datasetPrefetch(uint64_t address) {
+		//printf("vm_interpreted datasetPrefetch\n");
+		rx_prefetch_nta(mem.memory + address);
+	}
+
+	template class InterpretedVm, false>;
+	template class InterpretedVm, true>;
+	template class InterpretedVm;
+	template class InterpretedVm;
+}
\ No newline at end of file
diff --git a/randomx/vm_interpreted.hpp b/randomx/vm_interpreted.hpp
new file mode 100644
index 0000000..f119b6c
--- /dev/null
+++ b/randomx/vm_interpreted.hpp
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2018-2019, tevador
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	notice, this list of conditions and the following disclaimer in the
+	documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	names of its contributors may be used to endorse or promote products
+	derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/ + +#pragma once + +#include +#include +#include "common.hpp" +#include "virtual_machine.hpp" +#include "bytecode_machine.hpp" +#include "intrin_portable.h" +#include "allocator.hpp" + +namespace randomx { + + template + class InterpretedVm : public VmBase, public BytecodeMachine { + public: + using VmBase::mem; + using VmBase::scratchpad; + using VmBase::program; + using VmBase::config; + using VmBase::reg; + using VmBase::datasetPtr; + using VmBase::datasetOffset; + void* operator new(size_t size) { + void* ptr = AlignedAllocator::allocMemory(size); + if (ptr == nullptr) + throw std::bad_alloc(); + return ptr; + } + void operator delete(void* ptr) { + AlignedAllocator::freeMemory(ptr, sizeof(InterpretedVm)); + } + void run(void* seed) override; + void setDataset(randomx_dataset* dataset) override; + protected: + virtual void datasetRead(uint64_t blockNumber, int_reg_t(&r)[RegistersCount]); + virtual void datasetPrefetch(uint64_t blockNumber); + private: + void execute(); + + InstructionByteCode bytecode[RANDOMX_PROGRAM_SIZE]; //RANDOMX_PROGRAM_SIZE = 256 + }; + + using InterpretedVmDefault = InterpretedVm, true>; + using InterpretedVmHardAes = InterpretedVm, false>; + using InterpretedVmLargePage = InterpretedVm; + using InterpretedVmLargePageHardAes = InterpretedVm; +} \ No newline at end of file diff --git a/randomx/vm_interpreted_light.cpp b/randomx/vm_interpreted_light.cpp new file mode 100644 index 0000000..6f6bab6 --- /dev/null +++ b/randomx/vm_interpreted_light.cpp @@ -0,0 +1,56 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include "vm_interpreted_light.hpp" +#include "dataset.hpp" + +namespace randomx { + + template + void InterpretedLightVm::setCache(randomx_cache* cache) { + printf("InterpretedLight cache -> vm\n"); + cachePtr = cache; + mem.memory = cache->memory; + } + + template + void InterpretedLightVm::datasetRead(uint64_t address, int_reg_t(&r)[8]) { + uint32_t itemNumber = address / CacheLineSize; + int_reg_t rl[8]; + + initDatasetItem(cachePtr, (uint8_t*)rl, itemNumber); + + for (unsigned q = 0; q < 8; ++q) + r[q] ^= rl[q]; + } + + template class InterpretedLightVm, false>; + template class InterpretedLightVm, true>; + template class InterpretedLightVm; + template class InterpretedLightVm; +} diff --git a/randomx/vm_interpreted_light.hpp b/randomx/vm_interpreted_light.hpp new file mode 100644 index 0000000..02d678f --- /dev/null +++ b/randomx/vm_interpreted_light.hpp @@ -0,0 +1,61 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#pragma once + +#include +#include "vm_interpreted.hpp" + +namespace randomx { + + template + class InterpretedLightVm : public InterpretedVm { + public: + using VmBase::mem; + using VmBase::cachePtr; + void* operator new(size_t size) { + void* ptr = AlignedAllocator::allocMemory(size); + if (ptr == nullptr) + throw std::bad_alloc(); + return ptr; + } + void operator delete(void* ptr) { + AlignedAllocator::freeMemory(ptr, sizeof(InterpretedLightVm)); + } + void setDataset(randomx_dataset* dataset) override { } + void setCache(randomx_cache* cache) override; + protected: + void datasetRead(uint64_t address, int_reg_t(&r)[8]) override; + void datasetPrefetch(uint64_t address) override { } + }; + + using InterpretedLightVmDefault = InterpretedLightVm, true>; + using InterpretedLightVmHardAes = InterpretedLightVm, false>; + using InterpretedLightVmLargePage = InterpretedLightVm; + using InterpretedLightVmLargePageHardAes = InterpretedLightVm; +} diff --git a/sha3x/sha3x.c b/sha3x/sha3x.c new file mode 100644 index 0000000..1fefd45 --- /dev/null +++ b/sha3x/sha3x.c @@ -0,0 +1,220 @@ +/** libkeccak-tiny +* +* A single-file implementation of SHA-3 and SHAKE. +* +* Implementor: David Leon Gil +* License: CC0, attribution kindly requested. Blame taken too, +* but not liability. +*/ +//#include "sha3.h" + +#include +#include +#include +#include + +/******** The Keccak-f[1600] permutation ********/ + +/*** Constants. ***/ +static const uint8_t rho[24] = \ + { 1, 3, 6, 10, 15, 21, + 28, 36, 45, 55, 2, 14, + 27, 41, 56, 8, 25, 43, + 62, 18, 39, 61, 20, 44}; +static const uint8_t pi[24] = \ + {10, 7, 11, 17, 18, 3, + 5, 16, 8, 21, 24, 4, + 15, 23, 19, 13, 12, 2, + 20, 14, 22, 9, 6, 1}; +static const uint64_t RC[24] = \ + {1ULL, 0x8082ULL, 0x800000000000808aULL, 0x8000000080008000ULL, + 0x808bULL, 0x80000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL, + 0x8aULL, 0x88ULL, 0x80008009ULL, 0x8000000aULL, + 0x8000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, 0x8000000000008003ULL, + 0x8000000000008002ULL, 0x8000000000000080ULL, 0x800aULL, 0x800000008000000aULL, + 0x8000000080008081ULL, 0x8000000000008080ULL, 0x80000001ULL, 0x8000000080008008ULL}; + +/*** Helper macros to unroll the permutation. ***/ + +#define rol(x, s) (((x) << s) | ((x) >> (64 - s))) +#define Plen 200 + +static inline void keccakf(void* state) +{ + uint64_t* a = (uint64_t*)state; + uint64_t b[5] = {0}; + uint64_t t = 0; + uint8_t x, y; + +for (int i = 0; i < 24; i++) { + +//----------------------- Theta---------------- + for (int x =0 ; x<5; x=x+1) + { + b[x] = 0; + for (int y = 0 ;y<5*5; y=y+5) + { + b[x] ^= a[x + y]; + } + } +/* + if(i==0) + { + printf("the b is:\n"); + for (int i=0; i < 5; ++i) { + printf("%08llx ",b[i]); + } + printf("\n"); + } +*/ + for (int x= 0;x<5;x=x+1) + { + for(int y =0;y<5*5;y=y+5) + { + a[y + x] ^= b[(x + 4) % 5] ^ rol(b[(x + 1) % 5], 1); + } + } + + +//--------------------------------------- +// Rho and pi + t = a[1]; + x = 0; + + for(int x=0;x<24;x=x+1) + { + b[0] = a[pi[x]]; + a[pi[x]] = rol(t, rho[x]); + t = b[0]; + } + + // Chi + for (int y=0;y<5*5 ;y=y+5) + { + for(int x=0 ; x <5 ;x=x+1) + { + b[x] = a[y + x]; + } + for(int x =0 ;x <5 ;x=x+1) + { + a[y + x] = b[x] ^ ((~b[(x + 1) % 5]) & b[(x + 2) % 5]); + } + } +/* + if(i==0) + { + printf("the a is:\n"); + for (int i=0; i < 25; ++i) { + printf("%08llx ",a[i]); + } + printf("\n"); + } +*/ + // Iota + a[0] ^= RC[i]; + } +} + +/** The sponge-based hash construction. 
**/
+static inline int hash(uint8_t* out, size_t outlen, const uint8_t* in, size_t inlen, size_t rate, uint8_t delim) {
+  if ((out == NULL) || ((in == NULL) && inlen != 0) || (rate >= Plen)) {
+    return -1;
+  }
+  uint8_t a[Plen] = {0};
+
+  //printf("outlen=%lu,inlen=%lu,rate=%lu ,delim=%d\n",outlen,inlen,rate,delim);
+  // Absorb input.
+// foldP(in, inlen, xorin);
+  while (inlen >= rate) {
+    for (size_t i = 0; i < rate; i += 1)
+    {
+      a[i] ^= in[i];
+    }
+    keccakf(a);
+    in += rate;
+    inlen -= rate;
+  }
+  // Xor in the DS and pad frame.
+  a[inlen] ^= delim;
+  a[rate - 1] ^= 0x80;
+/*
+  printf("the a is:\n");
+  for (int i=0; i < Plen; ++i) {
+    printf("%02x ",a[i]);
+  }
+  printf("\n");
+*/
+  // Xor in the last block.
+// xorin(a, in, inlen);
+  for (size_t i = 0; i < inlen; i += 1)
+  {
+    a[i] ^= in[i];
+  }
+/*
+  printf("the a is:\n");
+  for (int i=0; i < Plen; ++i) {
+    printf("%02x",a[i]);
+  }
+  printf("\n");
+*/
+  // Apply P
+  keccakf(a);
+/*
+  printf("the a is:\n");
+  for (int i=0; i < Plen; ++i) {
+    printf("%02x ",a[i]);
+  }
+  printf("\n");
*/
+  // Squeeze output.
+// foldP(out, outlen, setout);
+  while (outlen >= rate) {
+    for (size_t i = 0; i < rate; i += 1)
+    {
+      out[i] = a[i];
+    }
+    keccakf(a);
+    out += rate;
+    outlen -= rate;
+  }
+  //setout(a, out, outlen);
+  for (size_t i = 0; i < outlen; i += 1)
+  {
+    out[i] = a[i];
+  }
+  memset(a, 0, Plen);
+  return 0;
+}
+
+int sha3_256(uint8_t* out, size_t outlen, const uint8_t* in, size_t inlen)
+{
+  if (outlen > 32) {
+    return -1;
+  }
+  return hash(out, outlen, in, inlen, 200 - (256 / 4), 0x06); //SHA-3 domain byte; plain Keccak would use 0x01
+}
+
+
+int main(int argc, char** argv) {
+uint8_t md[32];
+uint8_t in[41] = {
+	4, 0, 0, 0, 0, 0, 0, 0,
+	105, 242, 153, 192, 55, 88, 19, 131, 21, 206, 122, 77, 20, 64, 137, 83, 244, 231, 6, 255, 198, 10, 176, 73, 146, 100, 29, 3, 9, 24, 152, 82,
+	1
+};
+sha3_256(md, 32, in, sizeof(in));
+sha3_256(md, 32, md, sizeof(md));
+sha3_256(md, 32, md, sizeof(md));
+
+for (int i=0; i < 32; ++i) {
+	printf("%02x",md[i]);
+}
+printf("\n");
+}
+
+
+// nonce:[4, 0, 0, 0, 0, 0, 0, 0]
+// mining_hash:FixedHash([105, 242, 153, 192, 55, 88, 19, 131, 21, 206, 122, 77, 20, 64, 137, 83, 244, 231, 6, 255, 198, 10, 176, 73, 146, 100, 29, 3, 9, 24, 152, 82])
+// pow_data:[1]
\ No newline at end of file
diff --git a/sha3x/sha3x.h b/sha3x/sha3x.h
new file mode 100644
index 0000000..d3a7173
--- /dev/null
+++ b/sha3x/sha3x.h
@@ -0,0 +1,35 @@
+#ifndef SHA3X_H
+#define SHA3X_H
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * SHA3-256 hash function
+ * @param out output buffer that receives the 32-byte hash result
+ * @param outlen output length; must be <= 32
+ * @param in input data
+ * @param inlen length of the input data
+ * @return 0 on success, -1 on failure
+ */
+int sha3_256(uint8_t* out, size_t outlen, const uint8_t* in, size_t inlen);
+
+/**
+ * SHA3X triple-hash function (the SHA3X algorithm used by the Tari project)
+ * @param out output buffer that receives the 32-byte hash result
+ * @param outlen output length; must be <= 32
+ * @param in input data
+ * @param inlen length of the input data
+ * @return 0 on success, -1 on failure
+ */
+int sha3x_hash(uint8_t* out, size_t outlen, const uint8_t* in, size_t inlen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // SHA3X_H
\ No newline at end of file
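
Note: `sha3x.h` declares `sha3x_hash`, but `sha3x.c` only defines `sha3_256` plus a `main` that applies it three times. A minimal sketch of the missing definition, assuming the triple-hash chaining shown in `main` (hash the input once, then re-hash the 32-byte digest twice) is the intended SHA3X behavior:

```
#include <stdint.h>
#include <string.h>
#include "sha3x.h"

/* Sketch only: SHA3X as three chained SHA3-256 passes, mirroring main() in
   sha3x.c. The first pass consumes the raw input (nonce || mining_hash ||
   pow_data); the next two re-hash the 32-byte digest. */
int sha3x_hash(uint8_t* out, size_t outlen, const uint8_t* in, size_t inlen)
{
	uint8_t md[32];
	if (out == NULL || outlen > 32)
		return -1;
	if (sha3_256(md, 32, in, inlen) != 0)
		return -1;
	if (sha3_256(md, 32, md, sizeof(md)) != 0)
		return -1;
	if (sha3_256(md, 32, md, sizeof(md)) != 0)
		return -1;
	memcpy(out, md, outlen);
	return 0;
}
```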