#include #include // snprintf #include "../src/stdint.h" #include // strstr #include // std::stringstream #include "controller_ocl.h" #include "gf16_global.h" // GF16_POLYNOMIAL // for viewing compiled code, uncomment //#define DUMP_ASM // uncomment to prefer using the workgroup multiple instead of the max workgroup size // it seems like using the max width on GCN is usually ideal, but on Fermi and older, width is too wide that it hurts perf //#define OCL_PREFER_WORKGROUP_MULTIPLE 8 #ifdef DUMP_ASM # include // std::ofstream #endif #define STRINGIFY(...) #__VA_ARGS__ #define STR_HELPER(x) #x #define STR(x) STR_HELPER(x) const static char _ocl_defines[] = "#define GF16_POLYNOMIAL " STR(GF16_POLYNOMIAL) "\n" "#define SHIFT_TOP_BIT(x) ((x) >> (sizeof(x)*8-1))\n" // shift down top bit to bottom //"#define WORKAROUND_ALIAS_BUG\n" // have seen POCL not handle pointer alias writes properly // can use 24-bit multiply if the slice size fits within 24 bits (rownum guaranteed to be within 16 bits) // TODO: may be possible to do this is if stride is less than 16M *elements*, but we don't have byte addressing, so may not be possible "#if MAX_SLICE_SIZE < 16777216\n" " #define RBUF_REF(row, stride, col) mad24((uint)(row), (uint)(stride), (uint)(col))\n" "#else\n" " #define RBUF_REF(row, stride, col) ((row) * (stride) + (col))\n" "#endif\n" "#if MAX_SLICE_SIZE < 16777216 && MAX_SLICE_SIZE*NUM_OUTPUTS <= 0xffffffffUL\n" " #define WBUF_REF(row, stride, col) mad24((uint)(row), (uint)(stride), (uint)(col))\n" "#else\n" " #define WBUF_REF(row, stride, col) ((row) * (stride) + (col))\n" "#endif\n" "#define COBUF_REF(row, col) ((row) * SUBMIT_INPUTS + (col))\n" // if we're dealing with < 4GB at a time, there's no need for 64-bit memory references "#if MAX_SLICE_SIZE*SUBMIT_INPUTS > 0xffffffffUL || MAX_SLICE_SIZE*NUM_OUTPUTS > 0xffffffffUL\n" " typedef size_t memsize_t;\n" "#else\n" // TODO: check - size_t seems to faster than uint? //" typedef uint memsize_t;\n" " typedef size_t memsize_t;\n" "#endif\n" "#if VECT_WIDTH == 1\n" " #define VECT_ONE 0x0001u\n" " #define SPREAD_POLY(v) (((short)(v) >> 15) & GF16_POLYNOMIAL)\n" " #define nat_ushort4 ushort4\n" " #define nat_uint ushort\n" " #define nat_int short\n" " #define NAT_BITS 16\n" "#elif VECT_WIDTH == 2\n" " #define VECT_ONE 0x00010001ul\n" " #define SPREAD_POLY(v) mul24(((v) >> 15) & VECT_ONE, GF16_POLYNOMIAL & 0xffff)\n" " #define nat_ushort4 uint2\n" " #define nat_uint uint\n" " #define nat_int int\n" " #define NAT_BITS 32\n" "#elif VECT_WIDTH == 4 || VECT_WIDTH == 8\n" " #define VECT_ONE (ulong)0x0001000100010001ull\n" // Nvidia driver on Linux for 8400GS seems to need the explicit ulong cast " #define SPREAD_POLY(v) ((((v) >> 15) & VECT_ONE) * (GF16_POLYNOMIAL & 0xffff))\n" " #define nat_ushort4 ulong\n" " #define nat_uint ulong\n" " #define nat_int long\n" " #define NAT_BITS 64\n" "#else\n" " #error Unsupported vector width\n" "#endif\n" "#if VECT_WIDTH == 8\n" " #define val_t ulong2\n" "#else\n" " #define val_t nat_uint\n" "#endif\n" "#ifndef EX_TABLE_ARGS\n" " #define EX_TABLE_ARGS\n" " #define EX_TABLE_ARGS_DECL\n" "#endif\n" "#ifndef EX_TABLE_KARGS_DECL\n" " #define EX_TABLE_KARGS_DECL EX_TABLE_ARGS_DECL\n" "#endif\n" // experimental flag which probably isn't worth it - only usable in LOG methods, which doesn't have many coeffs to look up //"#define COPY_COEFF\n" // only used for `nolut` kernel: copies coefficients from global memory to local before using "#ifdef COPY_COEFF\n" " #define COEFF_TABLE_TYPE __local\n" "#else\n" " #define COEFF_TABLE_TYPE __global\n" "#endif\n" // TODO: explore using uint4 for reads to maximise bandwidth "#define READ_SRC(src, idx) src[idx]\n"; const static char _ocl_method_by2[] = "#define GF_MULTIPLY gf16_multiply_by2\n" STRINGIFY( // `a` is a packed ushort, `b` is just a ushort val_t gf16_multiply_by2(val_t a, nat_uint b) { nat_int btmp = b << (NAT_BITS - 16); val_t res = SHIFT_TOP_BIT(btmp) & a; ) "\n#pragma unroll\n" STRINGIFY( for(int i=0; i<15; i++) { val_t poly = SPREAD_POLY(res); res = ((res + res) & ~VECT_ONE) ^ poly; btmp <<= 1; res ^= SHIFT_TOP_BIT(btmp) & a; } return res; } ); const static char _ocl_method_lh[] = "#define LUT_REF(name, idx) ((__local ushort*)(name) + (idx)*512)\n" "#define LUT_REF_X2(name, idx) ((__local uint*)(name) + (idx)*512)\n" "#define LUT_DECLARATION(name, size) __local ushort name[512 * (size)] __attribute__ ((aligned (VECT_WIDTH*2)))\n" "#define LUT_DECLARATION_X2(name, size) __local uint name[512 * (size)] __attribute__ ((aligned (VECT_WIDTH*2)))\n" "#define LUT_DECLARATION_X2_1(name, src) __local ushort* name = (__local ushort*)(src)\n" "#define LUT_GENERATE compute_lhtable\n" "#define LUT_GENERATE_X2 compute_lhtable_x2\n" "#define LUT_GENERATE_HAS_BARRIER\n" "#define LUT_MULTIPLY lhtable_multiply\n" "#define LUT_MULTIPLY_X2 lhtable_multiply_x2\n" STRINGIFY( ushort gf16_multiply_256(ushort v) { return (v << 8) ^ mul256poly[v >> 8]; } // always writes 4 entries to table // requires val < 256 (used for lhtable calculation) void gf16_multiply_write_lh(__local nat_uint* table, nat_uint val, nat_uint coeff, nat_ushort4 coeff256) { nat_int b = coeff << (NAT_BITS - 16); ) "\n#if VECT_WIDTH >= 2\n" STRINGIFY( nat_uint prod = 0; // do a clmul, then use a lookup to do the reduction ) "\n#pragma unroll\n" STRINGIFY( for(uint i=1; i<256; i<<=1) prod ^= mul24((uint)(val & i), (uint)coeff); prod ^= mul256poly[prod >> 16]; // align to top to be compatible with old code (also conveniently discards upper bits) prod <<= NAT_BITS - 16; ) "\n#else\n" STRINGIFY( // align a and b to the top so that the top-bit can be replicated with a single arithmetic right-shift nat_int a = val << (NAT_BITS - 8); nat_int prod = SHIFT_TOP_BIT(a) & b; ) "\n#pragma unroll\n" STRINGIFY( for(int i=0; i<7; i++) { nat_uint poly = SHIFT_TOP_BIT(prod) & ((nat_uint)(GF16_POLYNOMIAL & 0xffff) << (NAT_BITS-16)); prod = (prod << 1) ^ poly; a <<= 1; prod ^= SHIFT_TOP_BIT(a) & b; } ) "\n#endif\n" STRINGIFY( // multiply b by 2 (aligned to top) nat_uint b2 = (b << 1) ^ (SHIFT_TOP_BIT(b) & ((nat_uint)(GF16_POLYNOMIAL & 0xffff) << (NAT_BITS-16))); // multiply prod by 256 ushort prod256 = gf16_multiply_256((nat_uint)prod >> (NAT_BITS - 16)); // write out to subsequent products ) "\n#if VECT_WIDTH == 1\n" STRINGIFY( table[val/VECT_WIDTH] = prod; table[val/VECT_WIDTH + 1] = prod ^ b; table[val/VECT_WIDTH + 2] = prod ^ b2; table[val/VECT_WIDTH + 3] = prod ^ b2 ^ b; table[val/VECT_WIDTH + 256/VECT_WIDTH] = prod256; table[val/VECT_WIDTH + 256/VECT_WIDTH + 1] = prod256 ^ coeff256.s1; table[val/VECT_WIDTH + 256/VECT_WIDTH + 2] = prod256 ^ coeff256.s2; table[val/VECT_WIDTH + 256/VECT_WIDTH + 3] = prod256 ^ coeff256.s3; ) "\n#elif VECT_WIDTH == 2\n" STRINGIFY( // add `b` to second element table[val/VECT_WIDTH] = (prod ^ b) | ((nat_uint)prod >> 16); table[val/VECT_WIDTH + 1] = (prod ^ b ^ b2) | (((nat_uint)prod ^ b2) >> 16); table[val/VECT_WIDTH + 256/VECT_WIDTH] = upsample(prod256, prod256) ^ coeff256.s0; table[val/VECT_WIDTH + 256/VECT_WIDTH + 1] = upsample(prod256, prod256) ^ coeff256.s1; ) "\n#elif VECT_WIDTH == 4\n" STRINGIFY( nat_uint res = (prod ^ b2) | ((nat_uint)prod >> 32); table[val/VECT_WIDTH] = (res ^ b ^ ((nat_uint)b >> 32)) | (res >> 16); table[val/VECT_WIDTH + 256/VECT_WIDTH] = (prod256 * VECT_ONE) ^ coeff256; ) "\n#else\n" " #error Vector width not implemented\n" "#endif\n" STRINGIFY( } nat_ushort4 lh_compute_coeff256(nat_uint coeff) { nat_ushort4 result; // compute coeff*0, coeff*256, coeff*512, coeff*768 ushort coeff256 = gf16_multiply_256(coeff); ushort coeff512 = ((nat_uint)coeff256 << 1) ^ (-((nat_uint)coeff256 >> 15) & GF16_POLYNOMIAL); ) "\n#if VECT_WIDTH == 1\n" STRINGIFY( //result[0] = 0; // unused result.s1 = coeff256; result.s2 = coeff512; result.s3 = coeff256 ^ coeff512; ) "\n#elif VECT_WIDTH == 2\n" STRINGIFY( result.s0 = (uint)coeff256 << 16; result.s1 = upsample((ushort)(coeff256 ^ coeff512), coeff512); ) "\n#elif VECT_WIDTH == 4\n" STRINGIFY( result = upsample(upsample((ushort)(coeff256 ^ coeff512), coeff512), (uint)coeff256 << 16); ) "\n#else\n" " #error Vector width not implemented\n" "#endif\n" STRINGIFY( return result; } void compute_lhtable(__local ushort* table, __global const ushort* restrict coeffs, const ushort numCoeff) { barrier(CLK_LOCAL_MEM_FENCE); ) "\n#ifdef WORKAROUND_ALIAS_BUG\n" STRINGIFY( if(((ulong)table & (VECT_WIDTH*2-1)) != 0) { // guaranteed to be false table[0] = 0; // never executed but needed for some reason } else ) "\n#endif\n" STRINGIFY( nat_uint localOffset = get_local_id(0)*4; // assume workgroup size is a power of 2 ) "\n#if COL_GROUP_SIZE*4 > 256\n" STRINGIFY( // process multiple coefficients at a time for(nat_uint coeffBase=0; coeffBase>8) | 0x1000100; return upsample( (ushort)(table[(val>>16) & 0xff] ^ table[(val_h>>16) & 0x1ff]), (ushort)(table[val & 0xff] ^ table[val_h & 0x1ff]) ); ) "\n#else\n" STRINGIFY( nat_uint result = table[val & 0xff] ^ table[((val>>8) & 0xff) + 256]; ) "\n#pragma unroll\n" STRINGIFY( for(nat_uint v=1; v>= 16; result |= (nat_uint)(table[val & 0xff] ^ table[((val>>8) & 0xff) + 256]) << (v*16); } return result; ) "\n#endif\n" STRINGIFY( } ) "\n#if VECT_WIDTH==2 && defined(WRITE_GRP2)\n" STRINGIFY( // LookupGrp2 stuff inline uint lh_mul256_x2(uint val) { uint val256 = upsample( mul256poly[val >> 24], mul256poly[(val >> 8) & 0xff] ); return ((val & 0xff00ff) << 8) ^ val256; } // writes two (low+high) entries void gf16_multiply_write_lh_x2(__local nat_uint* table, nat_uint val, uint coeffPair, uint coeff256Pair) { // align `a` to the top so that the top-bit can be replicated with a single arithmetic right-shift nat_int a = val << (NAT_BITS - 8); uint prod = SHIFT_TOP_BIT(a) & coeffPair; ) "\n#pragma unroll\n" STRINGIFY( for(int i=0; i<7; i++) { uint poly = SPREAD_POLY(prod); prod = ((prod + prod) & ~VECT_ONE) ^ poly; a <<= 1; prod ^= SHIFT_TOP_BIT(a) & coeffPair; } uint coeff2Pair = ((coeffPair + coeffPair) & ~VECT_ONE) ^ SPREAD_POLY(coeffPair); table[val] = prod; table[val+1] = prod ^ coeffPair; table[val+2] = prod ^ coeff2Pair; table[val+3] = prod ^ coeff2Pair ^ coeffPair; uint coeff512Pair = ((coeff256Pair + coeff256Pair) & ~VECT_ONE) ^ SPREAD_POLY(coeff256Pair); uint prod256 = lh_mul256_x2(prod); table[val + 256] = prod256; table[val + 257] = prod256 ^ coeff256Pair; table[val + 258] = prod256 ^ coeff512Pair; table[val + 259] = prod256 ^ coeff512Pair ^ coeff256Pair; /* dynamic number of entries idea: for(int i=0; i 256\n" STRINGIFY( // process multiple coefficients at a time for(nat_uint coeffBase=0; coeffBase>8) | 0x1000100; return (uint2)( table[val & 0xff] ^ table[val_h & 0x1ff], table[(val>>16) & 0xff] ^ table[val_h>>16] ); } ) "\n#endif\n" STRINGIFY( ); const static char _ocl_method_ll[] = "#define LUT_REF(name, idx) ((__local ushort*)(name) + (idx)*256)\n" "#define LUT_DECLARATION(name, size) __local ushort name[256 * (size)] __attribute__ ((aligned (VECT_WIDTH*2)))\n" "#define LUT_GENERATE compute_lltable\n" "#define LUT_GENERATE_HAS_BARRIER\n" "#define LUT_MULTIPLY lltable_multiply\n" STRINGIFY( ushort gf16_multiply_256(ushort v) { return (v << 8) ^ mul256poly[v >> 8]; } // as gf16_multiply_write_lh, but doesn't write 256x entries void gf16_multiply_write_ll(__local nat_uint* table, nat_uint val, nat_uint coeff) { // align a and b to the top so that the top-bit can be replicated with a single arithmetic right-shift nat_int b = coeff << (NAT_BITS - 16); ) "\n#if VECT_WIDTH >= 2\n" STRINGIFY( nat_uint prod = 0; ) "\n#pragma unroll\n" STRINGIFY( for(uint i=1; i<256; i<<=1) prod ^= mul24((uint)(val & i), (uint)coeff); prod ^= mul256poly[prod >> 16]; prod <<= NAT_BITS - 16; ) "\n#else\n" STRINGIFY( nat_int a = val << (NAT_BITS - 8); nat_int prod = SHIFT_TOP_BIT(a) & b; ) "\n#pragma unroll\n" STRINGIFY( for(int i=0; i<7; i++) { nat_uint poly = SHIFT_TOP_BIT(prod) & ((nat_uint)(GF16_POLYNOMIAL & 0xffff) << (NAT_BITS-16)); prod = (prod << 1) ^ poly; a <<= 1; prod ^= SHIFT_TOP_BIT(a) & b; } ) "\n#endif\n" STRINGIFY( // multiply by 2 (aligned to top) nat_uint b2 = (b << 1) ^ (SHIFT_TOP_BIT(b) & ((nat_uint)(GF16_POLYNOMIAL & 0xffff) << (NAT_BITS-16))); // write out to subsequent products ) "\n#if VECT_WIDTH == 1\n" STRINGIFY( table[val/VECT_WIDTH] = prod; table[val/VECT_WIDTH + 1] = prod ^ b; table[val/VECT_WIDTH + 2] = prod ^ b2; table[val/VECT_WIDTH + 3] = prod ^ b2 ^ b; ) "\n#elif VECT_WIDTH == 2\n" STRINGIFY( // add `b` to second element table[val/VECT_WIDTH] = (prod ^ b) | ((nat_uint)prod >> 16); table[val/VECT_WIDTH + 1] = (prod ^ b ^ b2) | (((nat_uint)prod ^ b2) >> 16); ) "\n#elif VECT_WIDTH == 4\n" STRINGIFY( nat_uint res = (prod ^ b2) | ((nat_uint)prod >> 32); table[val/VECT_WIDTH] = (res ^ b ^ ((nat_uint)b >> 32)) | (res >> 16); ) "\n#else\n" " #error Vector width not implemented\n" "#endif\n" STRINGIFY( } void compute_lltable(__local ushort* table, __global const ushort* restrict coeffs, const ushort numCoeff) { barrier(CLK_LOCAL_MEM_FENCE); ) "\n#ifdef WORKAROUND_ALIAS_BUG\n" STRINGIFY( if(((ulong)table & (VECT_WIDTH*2-1)) != 0) { // guaranteed to be false table[0] = 0; // never executed but needed for some reason } else ) "\n#endif\n" STRINGIFY( nat_uint localOffset = get_local_id(0)*4; // assume workgroup size is a power of 2 ) "\n#if COL_GROUP_SIZE*4 > 256\n" STRINGIFY( // process multiple coefficients at a time for(nat_uint coeffBase=0; coeffBase>16) & 0xff] ^ gf16_multiply_256(table[(val>>24) & 0xff])), (ushort)(table[val & 0xff] ^ gf16_multiply_256(table[(val>>8) & 0xff])) ); ) "\n#else\n" STRINGIFY( nat_uint result = table[val & 0xff] ^ gf16_multiply_256(table[(val>>8) & 0xff]); ) "\n#pragma unroll\n" STRINGIFY( for(nat_uint v=1; v>= 16; result |= (nat_uint)(table[val & 0xff] ^ gf16_multiply_256(table[(val>>8) & 0xff])) << (v*16); } return result; ) "\n#endif\n" STRINGIFY( } ); const static char _ocl_method_shuffle[] = "#undef READ_SRC\n" "#define READ_SRC gf16_shuffle_read\n" "#define WRITE_DST_OVERRIDE(dst, idx, val) gf16_shuffle_write(dst, idx, val)\n" "#define WRITE_DST_ADD_OVERRIDE(dst, idx, val) gf16_shuffle_add(dst, idx, val)\n" "#undef val_t\n" "#if VECT_WIDTH==1\n" " #define val_t uchar4\n" " #define val2_t ushort2\n" " #define nat_ucharX uchar2\n" "#elif VECT_WIDTH==2\n" " #define val_t uchar8\n" " #define val2_t ushort4\n" " #define nat_ucharX uchar4\n" "#elif VECT_WIDTH==4\n" " #define val_t uchar16\n" " #define val2_t ushort8\n" " #define nat_ucharX uchar8\n" "#else\n" " #error Unsupported vector width\n" "#endif\n" "#define LUT_REF(name, idx) ((__private uchar4*)(name) + (idx)*16)\n" "#define LUT_DECLARATION(name, size) __private uchar4 name[16 * (size)]\n" "#define LUT_GENERATE compute_shuffletable\n" "#define LUT_MULTIPLY shuffle_multiply\n" STRINGIFY( ushort gf16_mult2(const ushort v) { return (v << 1) ^ (-(v >= 0x8000) & (GF16_POLYNOMIAL & 0xffff)); } void compute_shuffletable(__private uchar4* table, __global const ushort* restrict coeffs, const ushort numCoeff) { //for(uint i=get_local_id(0); i>8, val2>>8, val3>>8); uchar4 ri; ) "\n#if GF16_POLYNOMIAL != 0x1100b\n#error Unsupported polynomial\n#endif\n" "#define MUL4(p, c) " "ri = tbl[p+1] >> (uchar)6;" "tbl[c+1] = ((tbl[p+1] << (uchar)2) | (tbl[p] >> (uchar)6)) ^ (ri << (uchar)4);" "tbl[c] = (tbl[p] << (uchar)2) ^ shuffle((uchar4)((uchar)0, (uchar)0xb, (uchar)(0xb<<1), (uchar)((0xb<<1)^0xb)), ri)\n" STRINGIFY( MUL4(0, 2); MUL4(2, 4); MUL4(4, 6); MUL4(6, 8); MUL4(8, 10); MUL4(10, 12); MUL4(12, 14); ) "\n#undef MUL4\n" STRINGIFY( } } val_t gf16_shuffle_read(__global const val_t* restrict src, const size_t idx) { const val_t data = src[idx]; return (val_t)(data.even, data.odd); } void gf16_shuffle_write(__global val_t* dst, const memsize_t offs, const val_t val) { *(__global val2_t*)(dst + offs) = upsample(val.hi, val.lo); } void gf16_shuffle_add(__global val_t* dst, const memsize_t offs, const val_t val) { *(__global val2_t*)(dst + offs) ^= upsample(val.hi, val.lo); } inline val_t shuffle_multiply(__private const uchar4* table, val_t val) { nat_ucharX resLo = shuffle(table[0], val.lo) ^ shuffle(table[8], val.hi); nat_ucharX resHi = shuffle(table[1], val.lo) ^ shuffle(table[9], val.hi); resLo ^= shuffle(table[2], val.lo >>(uchar)2) ^ shuffle(table[10], val.hi >>(uchar)2); resHi ^= shuffle(table[3], val.lo >>(uchar)2) ^ shuffle(table[11], val.hi >>(uchar)2); resLo ^= shuffle(table[4], val.lo >>(uchar)4) ^ shuffle(table[12], val.hi >>(uchar)4); resHi ^= shuffle(table[5], val.lo >>(uchar)4) ^ shuffle(table[13], val.hi >>(uchar)4); resLo ^= shuffle(table[6], val.lo >>(uchar)6) ^ shuffle(table[14], val.hi >>(uchar)6); resHi ^= shuffle(table[7], val.lo >>(uchar)6) ^ shuffle(table[15], val.hi >>(uchar)6); return (val_t)(resLo, resHi); } ); const static char _ocl_method_log[] = "#define GF_MULTIPLY(a, b) gf16_multiply_log(a, b EX_TABLE_ARGS)\n" "#undef READ_SRC\n" "#define READ_SRC(src, idx) gf16_log_src(src[idx] EX_TABLE_ARGS)\n" "#if defined(OCL_METHOD_LOG_TINY) || defined(OCL_METHOD_LOG_SMALL)\n" " #define LOG_SRC(n) gf16_log_small(n EX_TABLE_ARGS)\n" "#else\n" " #define LOG_SRC(n) ((LOG_MEM_TYPE ushort*)gf16_log)[n]\n" "#endif\n" STRINGIFY( ushort gf16_log_small(nat_uint val EX_TABLE_ARGS_DECL) { if(val == 0) return 65535; ) "\n#ifdef OCL_METHOD_LOG_SMALL\n" " #if __OPENCL_C_VERSION__ >= 200\n" " nat_uint log = ctz(val);\n" " #else\n" " nat_uint log = clz((nat_uint)((val-1)^val)) ^ (NAT_BITS-1);\n" " #endif\n" STRINGIFY( val >>= log; log += ((LOG_MEM_TYPE ushort*)gf16_log)[val >> 1]; ) "\n#else\n" STRINGIFY( nat_uint log = 0; uint prep; nat_uint shift; LOG_MEM_TYPE uint* gf16_log_prep = (gf16_log + 16384/2); prep = gf16_log_prep[val & 0x7ff]; shift = prep >> 16; log += shift; val = (val >> shift) ^ (prep & 0xffff); // (repeat above) prep = gf16_log_prep[val & 0x7ff]; shift = prep >> 16; log += shift; val = (val >> shift) ^ (prep & 0xffff); ushort log2 = ((LOG_MEM_TYPE ushort*)gf16_log)[val >> 2]; ) "\n#if VECT_WIDTH == 1\n" STRINGIFY( log += add_sat(log, log2) == (ushort)65535; log += log2; ) "\n#else\n" STRINGIFY( log += log2; if(log >= 65535) log -= 65535; //if (log == 12345678 && val == 12345678) return 0; // this is an impossible condition, but for some reason on some AMD cards, it makes the val==1 case work (i.e. without it, log(1) may result in 1 instead of 0) // unable to find the reason for the above issue (though have managed to cause different results with different ways to do the same thing) - I'm filing it as a compiler bug ) "\n#endif\n" "#endif\n" STRINGIFY( return log; } val_t gf16_log_src(val_t val EX_TABLE_ARGS_DECL) { ) "\n#if VECT_WIDTH==2\n" STRINGIFY( return upsample(LOG_SRC(val >> 16), LOG_SRC(val & 0xffff)); ) "\n#else\n" STRINGIFY( val_t res = LOG_SRC(val & 0xffff); ) "\n#pragma unroll\n" STRINGIFY( for(int v=1; v>shift) & 0xffff) << shift; } return res; ) "\n#endif\n" STRINGIFY( } nat_uint gfmat_calc_coeff_log(nat_uint recBlock, ushort inCoeff) { uint result = inCoeff * (ushort)recBlock; // calc 'result %= 65535' result = (result >> 16) + (result & 65535); result += result >> 16; return result & 65535; } val_t gf16_multiply_log(val_t a, nat_uint b EX_TABLE_ARGS_DECL) { val_t result = 0; ) "\n#pragma unroll\n" STRINGIFY( for(int v=0; v> 16) + (va & 0xffff); ) "\n#endif\n" STRINGIFY( ) "\n#ifdef OCL_METHOD_EXP_SMALL\n" STRINGIFY( uint vatmp = gf16_antilog[va >> 3]; vatmp <<= va & 7; vatmp ^= gf16_antilog[8192 + (vatmp >> 16)]; va = vatmp & 0xffff; ) "\n#elif defined(OCL_METHOD_EXP_TINY)\n" STRINGIFY( uint vatmp = gf16_antilog[va >> 4]; vatmp <<= va & 15; vatmp ^= gf16_antilog[4096 + (vatmp >> 24)] << 8; vatmp &= 0xffffff; vatmp ^= gf16_antilog[4096 + (vatmp >> 16)]; va = vatmp & 0xffff; ) "\n#else\n" STRINGIFY( va = gf16_antilog[va]; ) "\n#endif\n" STRINGIFY( result |= (isZero ? 0 : va) << (v*16); a >>= 16; } return result; } ); /* // probably not that useful of an idea const static char _ocl_method_splitmul[] = "#define GF_MULTIPLY gf16_multiply_splitmul\n" STRINGIFY( val_t gf16_multiply_splitmul(val_t a, nat_uint b) { val_t result = 0; ushort ba = (b>>12)*16; ushort bb = ((b>>8) & 0xf)*16; ushort bc = ((b>>4) & 0xf)*16; ushort bd = (b & 0xf)*16; ) "\n#pragma unroll\n" STRINGIFY( for(int v=0; v> 8) & 0xff; ushort ay = a & 0xff; ushort val = mul_split1[bd + ay] ^ gf16_mul16(mul_split1[bc + ay]) ^ mul_split256[bb + ay] ^ mul_split256[bd + ax] ^ gf16_mul16(mul_split256[ba + ay] ^ mul_split256[bc + ax]) ^ mul_split4107[bb + ax] ^ gf16_mul16(mul_split4107[ba + ax]) result |= val << (v*16); a >>= 16; } return result; } ); */ // kernel sources // defined during compile: MUL_ONLY, numInputs // strategy which caches several inputs, dot-products to multiple outputs, and doesn't require a LUT per coefficient #define KERNEL_NOLUT_MULGROUP \ "(__global val_t* restrict dst, __global const val_t* restrict src, COEFF_TABLE_TYPE const ushort* restrict coeff, const nat_uint outBlk, const nat_uint numOutputs, const ushort _numInputs, __local val_t* cache EX_TABLE_ARGS_DECL) { \n"\ " const memsize_t len = get_global_size(0)*COL_GROUP_ITERS; \n"\ " const memsize_t globalCol = get_global_id(0)*COL_GROUP_ITERS; \n"\ " const uint col = get_local_id(0)*COL_GROUP_ITERS; \n"\ " __global const val_t* srcBase = src + globalCol; \n"\ " \n"\ " val_t result[OUTPUT_GROUPING][COL_GROUP_ITERS]; \n"\ " val_t val[COL_GROUP_ITERS]; \n"\ " #pragma unroll \n"\ " for(nat_uint iter=0; iter> 16; \n"\ " curCoeff &= 0xffff; \n"\ " } \n"\ " #else \n"\ " #pragma unroll \n"\ " for (nat_uint o = 0; o < numOutputs; o++) { \n"\ " curCoeff = GET_COEFF(outBlk+o, 0u); \n"\ " #pragma unroll \n"\ " for(nat_uint iter=0; iter> 16; \n"\ " curCoeff &= 0xffff; \n"\ " } \n"\ " #else \n"\ " #pragma unroll \n"\ " for (nat_uint o = 0; o < numOutputs; o++) { \n"\ " curCoeff = GET_COEFF(outBlk+o, i); \n"\ " #pragma unroll \n"\ " for(nat_uint iter=0; iter OUTPUT_GROUPING\n" STRINGIFY( // first iteration -> copy loaded data to cache //barrier(CLK_LOCAL_MEM_FENCE); // probably not needed? thread only accesses the same value that it writes KERNFN(mulgroup_cache)(dst, src, lcoeff, outBlk, OUTPUT_GROUPING, numInputs, cache EX_TABLE_ARGS); outBlk += OUTPUT_GROUPING; //barrier(CLK_LOCAL_MEM_FENCE); for (; outBlk < OUTPUTS_PER_THREAD - OUTPUT_GROUPING + 1; outBlk += OUTPUT_GROUPING) { KERNFN(mulgroup_read)(dst, src, lcoeff, outBlk, OUTPUT_GROUPING, numInputs, cache EX_TABLE_ARGS); } ) "\n#endif\n" "#if OUTPUTS_PER_THREAD == OUTPUT_GROUPING || OUTPUTS_PER_THREAD % OUTPUT_GROUPING > 0\n" STRINGIFY( ) "\n#if OUTPUTS_PER_THREAD > OUTPUT_GROUPING\n" STRINGIFY( KERNFN(mulgroup_read)(dst, src, lcoeff, outBlk, (OUTPUTS_PER_THREAD == OUTPUT_GROUPING ? OUTPUTS_PER_THREAD : OUTPUTS_PER_THREAD % OUTPUT_GROUPING), numInputs, cache EX_TABLE_ARGS); ) "\n#else\n" STRINGIFY( KERNFN(mulgroup_nocache)(dst, src, lcoeff, outBlk, (OUTPUTS_PER_THREAD == OUTPUT_GROUPING ? OUTPUTS_PER_THREAD : OUTPUTS_PER_THREAD % OUTPUT_GROUPING), numInputs, (__local val_t*)0 EX_TABLE_ARGS); ) "\n#endif\n" STRINGIFY( ) "\n#endif\n" STRINGIFY( } else { ) "\n#define OUTPUTS_THIS_THREAD (NUM_OUTPUTS % OUTPUTS_PER_THREAD)\n" STRINGIFY( ) "\n#if OUTPUTS_THIS_THREAD > OUTPUT_GROUPING\n" STRINGIFY( // first iteration -> copy loaded data to cache //barrier(CLK_LOCAL_MEM_FENCE); KERNFN(mulgroup_cache)(dst, src, lcoeff, outBlk, OUTPUT_GROUPING, numInputs, cache EX_TABLE_ARGS); outBlk += OUTPUT_GROUPING; //barrier(CLK_LOCAL_MEM_FENCE); for (; outBlk < OUTPUTS_THIS_THREAD - OUTPUT_GROUPING + 1; outBlk += OUTPUT_GROUPING) { KERNFN(mulgroup_read)(dst, src, lcoeff, outBlk, OUTPUT_GROUPING, numInputs, cache EX_TABLE_ARGS); } ) "\n#endif\n#if OUTPUTS_THIS_THREAD == OUTPUT_GROUPING || OUTPUTS_THIS_THREAD % OUTPUT_GROUPING > 0\n" STRINGIFY( ) "\n#if OUTPUTS_THIS_THREAD > OUTPUT_GROUPING\n" STRINGIFY( KERNFN(mulgroup_read)(dst, src, lcoeff, outBlk, (OUTPUTS_THIS_THREAD == OUTPUT_GROUPING ? OUTPUTS_THIS_THREAD : OUTPUTS_THIS_THREAD % OUTPUT_GROUPING), numInputs, cache EX_TABLE_ARGS); ) "\n#else\n" STRINGIFY( KERNFN(mulgroup_nocache)(dst, src, lcoeff, outBlk, (OUTPUTS_THIS_THREAD == OUTPUT_GROUPING ? OUTPUTS_THIS_THREAD : OUTPUTS_THIS_THREAD % OUTPUT_GROUPING), numInputs, (__local val_t*)0 EX_TABLE_ARGS); ) "\n#endif\n" STRINGIFY( ) "\n#endif\n" STRINGIFY( } }) "\n#undef WRITE_DST\n"; // strategy which caches several inputs, uses dot-product operations to a single output, and a LUT per coefficient const static char _ocl_kernel_cachelut[] = "#ifdef WRITE_DST_OVERRIDE\n" " #ifdef MUL_ONLY\n" " #define WRITE_DST WRITE_DST_OVERRIDE\n" " #else\n" " #define WRITE_DST WRITE_DST_ADD_OVERRIDE\n" " #endif\n" "#elif defined(MUL_ONLY)\n" " #define WRITE_DST(dst, idx, val) dst[idx] = val\n" "#else\n" " #define WRITE_DST(dst, idx, val) dst[idx] ^= val\n" "#endif\n" STRINGIFY({ const uint colLocal = get_local_id(0); const memsize_t colGlobal = colLocal + get_group_id(0)*COL_GROUP_SIZE*COL_GROUP_ITERS; const memsize_t len = get_global_size(0) * COL_GROUP_ITERS; // compute output ) "\n#if NUM_OUTPUTS <= OUTPUTS_PER_THREAD\n" STRINGIFY( const nat_uint outputsThisThread = NUM_OUTPUTS; const nat_uint outBase = 0; __global val_t* dstBase = dst + colGlobal; ) "\n#ifdef WRITE_GRP2\n" STRINGIFY( __global val_t* dstBase2 = dst + colGlobal*2; ) "\n#endif\n" STRINGIFY( ) "\n#else\n" STRINGIFY( const nat_uint outBase = get_global_id(1) * OUTPUTS_PER_THREAD; ) "\n#if NUM_OUTPUTS % OUTPUTS_PER_THREAD == 0\n" STRINGIFY( const nat_uint outputsThisThread = OUTPUTS_PER_THREAD; ) "\n#else\n" STRINGIFY( const nat_uint outputsThisThread = (get_global_id(1)+1 == get_global_size(1) ? NUM_OUTPUTS % OUTPUTS_PER_THREAD : OUTPUTS_PER_THREAD); ) "\n#endif\n" STRINGIFY( __global val_t* dstBase = dst + WBUF_REF(outBase, len, colGlobal); ) "\n#ifdef WRITE_GRP2\n" STRINGIFY( __global val_t* dstBase2 = dst + WBUF_REF(outBase, len, colGlobal*2); ) "\n#endif\n" STRINGIFY( ) "\n#endif\n" STRINGIFY( __global const val_t* srcBase = src + colGlobal; ) "\n#ifdef WRITE_GRP2\n" STRINGIFY( LUT_DECLARATION_X2(lut_table_x2, SUBMIT_INPUTS); LUT_DECLARATION_X2_1(lut_table, lut_table_x2); ) "\n#else\n" STRINGIFY( LUT_DECLARATION(lut_table, SUBMIT_INPUTS); ) "\n#endif\n" STRINGIFY( // only one output -> no need to cache (this also handles the special case for *_X2) // TODO: check if this is worth it if(outputsThisThread == 1) { LUT_GENERATE(lut_table, coeff + COBUF_REF(outBase, 0u), numInputs); for(uint iter=0; iter copy loaded data to cache uint iterBase = iter*COL_GROUP_SIZE; val_t val = READ_SRC(srcBase, 0u); cache[0][colLocal+iterBase] = val; ) "\n#ifdef WRITE_GRP2\n" STRINGIFY( uint2 result = LUT_MULTIPLY_X2(LUT_REF_X2(lut_table_x2, 0u), val); ) "\n#else\n" STRINGIFY( val_t result = LUT_MULTIPLY(LUT_REF(lut_table, 0u), val); ) "\n#endif\n" STRINGIFY( __global const val_t* srcIterBase = srcBase; ) "\n#ifdef numInputs\n#pragma unroll\n#endif\n" STRINGIFY( for (nat_uint input = 1; input < numInputs; input++) { srcIterBase += len; val = READ_SRC(srcIterBase, 0u); // copy global input to local cache to be used in subsequent iterations // TODO: investigate using async_work_group_copy cache[input][colLocal + iterBase] = val; ) "\n#ifdef WRITE_GRP2\n" STRINGIFY( result ^= LUT_MULTIPLY_X2(LUT_REF_X2(lut_table_x2, input), val); ) "\n#else\n" STRINGIFY( result ^= LUT_MULTIPLY(LUT_REF(lut_table, input), val); ) "\n#endif\n" STRINGIFY( } ) "\n#ifdef WRITE_GRP2\n" STRINGIFY( WRITE_DST(dstBase2, iterBase*2, result[0]); WRITE_DST(dstBase2, iterBase*2+1, result[1]); ) "\n#else\n" STRINGIFY( WRITE_DST(dstBase, iterBase, result); ) "\n#endif\n" STRINGIFY( srcBase += COL_GROUP_SIZE; } //barrier(CLK_LOCAL_MEM_FENCE); // subsequent iterations: use cache ) "\n#ifdef WRITE_GRP2\n" STRINGIFY( nat_uint output=2; for(; output OUTPUT_GROUPING\n" STRINGIFY( for (; outBlk < OUTPUTS_PER_THREAD - OUTPUT_GROUPING + 1; outBlk += OUTPUT_GROUPING) { KERNFN(mulgroup)(dst, src, coeff, lut_table, outBlk, OUTPUT_GROUPING, numInputs); } ) "\n#endif\n#if OUTPUTS_PER_THREAD == OUTPUT_GROUPING || OUTPUTS_PER_THREAD % OUTPUT_GROUPING > 0\n" STRINGIFY({ ) "\n#define NUM_OUTPUTS_LEFT (OUTPUTS_PER_THREAD == OUTPUT_GROUPING ? OUTPUTS_PER_THREAD : OUTPUTS_PER_THREAD % OUTPUT_GROUPING)\n" STRINGIFY( KERNFN(mulgroup)(dst, src, coeff, lut_table, outBlk, NUM_OUTPUTS_LEFT, numInputs); ) "\n#undef NUM_OUTPUTS_LEFT\n" STRINGIFY( }) "\n#endif\n" STRINGIFY( } else { ) "\n#define OUTPUTS_THIS_THREAD (NUM_OUTPUTS % OUTPUTS_PER_THREAD)\n" STRINGIFY( ) "\n#if OUTPUTS_THIS_THREAD > OUTPUT_GROUPING\n" STRINGIFY( for (; outBlk < OUTPUTS_THIS_THREAD - OUTPUT_GROUPING + 1; outBlk += OUTPUT_GROUPING) { KERNFN(mulgroup)(dst, src, coeff, lut_table, outBlk, OUTPUT_GROUPING, numInputs); } ) "\n#endif\n#if OUTPUTS_THIS_THREAD == OUTPUT_GROUPING || OUTPUTS_THIS_THREAD % OUTPUT_GROUPING > 0\n" STRINGIFY({ ) "\n#define NUM_OUTPUTS_LEFT (OUTPUTS_THIS_THREAD == OUTPUT_GROUPING ? OUTPUTS_THIS_THREAD : OUTPUTS_THIS_THREAD % OUTPUT_GROUPING)\n" STRINGIFY( KERNFN(mulgroup)(dst, src, coeff, lut_table, outBlk, NUM_OUTPUTS_LEFT, numInputs); ) "\n#undef NUM_OUTPUTS_LEFT\n" STRINGIFY( }) "\n#endif\n" STRINGIFY( } }) "\n#undef WRITE_DST\n"; #undef STRINGIFY #undef STR_HELPER #undef STR #define CEIL_DIV(a, b) (((a) + (b)-1) / (b)) // ensure value is a power of 2, rounding down if needed static inline size_t round_down_pow2(size_t v) { if((v & (v-1)) == 0) return v; // is a power of 2 (shortcut exists because this is the common case) // find target number via a float conversion // (usage of float over double does mean that this can be wrong for very large numbers, but we're not expecting these) union { float f; uint32_t u; } tmp; tmp.f = (float)v; // convert to float tmp.u &= 0xff800000; // discard mantissa return (size_t)tmp.f; // convert back to int // alternative approach, avoiding flot<>int conversion, but using bit-scans /* #ifdef _MSC_VER unsigned long idx; _BitScanReverse(&idx, v); return 1 << idx; #else int idx = __builtin_clzl(v); return 1 << ((sizeof(unsigned long)-1) ^ idx); #endif */ } GF16OCL_MethodInfo PAR2ProcOCL::info(Galois16OCLMethods method) { GF16OCL_MethodInfo ret{ method, Galois16OCLMethodsText[static_cast(method)], 8, 1, true }; switch(method) { case GF16OCL_BY2: ret.idealInBatch = 16; break; case GF16OCL_SHUFFLE: ret.idealInBatch = 4; ret.usesOutGrouping = false; break; case GF16OCL_LOG: case GF16OCL_LOG_SMALL: case GF16OCL_LOG_SMALL2: case GF16OCL_LOG_TINY: case GF16OCL_LOG_SMALL_LMEM: case GF16OCL_LOG_TINY_LMEM: ret.idealInBatch = 8; break; case GF16OCL_LOOKUP_GRP2: ret.usesOutGrouping = false; ret.idealInBatch = 4; ret.idealIters = 2; break; case GF16OCL_LOOKUP: case GF16OCL_LOOKUP_HALF: ret.usesOutGrouping = false; ret.idealInBatch = 4; ret.idealIters = 4; break; case GF16OCL_LOOKUP_NOCACHE: case GF16OCL_LOOKUP_HALF_NOCACHE: ret.idealInBatch = 4; ret.idealIters = 8; // generally little reason not to do multiple iters break; case GF16OCL_LOOKUP_GRP2_NOCACHE: ret.idealInBatch = 6; ret.idealIters = 2; break; default: break; // prevent compiler warning } return ret; } // _sliceSize must be divisible by 2 bool PAR2ProcOCL::setup_kernels(Galois16OCLMethods method, unsigned targetInputBatch, unsigned targetIters, unsigned targetGrouping, bool outputSequential) { // TODO: get device info // CL_DEVICE_HOST_UNIFIED_MEMORY (deprecated?) // CL_DEVICE_ADDRESS_BITS (max referencable memory) unsigned infoShortVecSize = device.getInfo(); // seems to usually be the same as CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT if(infoShortVecSize < 4) infoShortVecSize = 2; // assume all GPUs do 32-bit math efficiently (various nVidia platforms return 1, but 2 runs faster) if(infoShortVecSize >= 8 && method == GF16OCL_BY2) infoShortVecSize = 8; // allow BY2 to use VECT_WIDTH=8 else if(infoShortVecSize >= 4) infoShortVecSize = 4; // other than BY2, we currently only support vect-width=1,2,4 wgSize = device.getInfo(); if(device.getInfo() == 1 && device.getInfo() == 0x10de /*Nvidia*/ && wgSize > 32) { // this size is too large for Nvidia cards; we'll assume their max width is ideal for 8-bit ints, so divide it by 4 wgSize /= infoShortVecSize*2; } #ifdef OCL_PREFER_WORKGROUP_MULTIPLE size_t wgSizeMultiple = getWGSize(context, device) * OCL_PREFER_WORKGROUP_MULTIPLE; if(wgSizeMultiple && wgSizeMultiple < wgSize) wgSize = wgSizeMultiple; #endif if(wgSize > 8192) wgSize = 8192; // sanity check size_t deviceAvailConstSize = device.getInfo(); size_t deviceLocalSize = device.getInfo() - 128; // subtract a little to allow some spare space if the device needs it //size_t deviceGlobalSize = device.getInfo(); // TODO: check sliceSizeCksum*numOutputs ? unsigned numOutputs = outputExponents.size(); unsigned outputsPerThread = numOutputs; // currently, process all outputs on every kernel invocation std::stringstream sourceStream; auto methInfo = info(method); inputBatchSize = methInfo.idealInBatch; unsigned groupIterations = methInfo.idealIters; unsigned threadWordSize = infoShortVecSize*2; unsigned sizePerWorkGroup = threadWordSize * wgSize; outputsPerGroup = 8; // for nolut kernel; have seen some cards prefer '4' (on older cards?) const char* kernelCode; const char* kernelFuncs = NULL; const char* methodCode; const char* oclVerArg = ""; // method selector coeffType = GF16OCL_COEFF_NORMAL; switch(method) { case GF16OCL_BY2: while(1) { if(inputBatchSize*sizePerWorkGroup > deviceLocalSize) inputBatchSize = (unsigned)(deviceLocalSize / sizePerWorkGroup); if(inputBatchSize < 1 && deviceLocalSize >= 2048) { // try reducing workgroup if too large if(wgSize > 256) { wgSize = 256; sizePerWorkGroup = threadWordSize * wgSize; inputBatchSize = 16; continue; } } break; } if(targetInputBatch) inputBatchSize = targetInputBatch; kernelCode = _ocl_kernel_nolut; kernelFuncs = _ocl_kernel_nolut_funcs; methodCode = _ocl_method_by2; break; case GF16OCL_SHUFFLE: //case GF16OCL_SHUFFLE2: oclVerArg = "-cl-std=CL1.1"; infoShortVecSize = 2; // currently only implemented for 32-bit on GPUs threadWordSize = infoShortVecSize*4; // process two words, so double per thread word size sizePerWorkGroup = threadWordSize * wgSize; if(targetInputBatch) inputBatchSize = targetInputBatch; // TODO: compute ideal iteration count while(1) { groupIterations = (unsigned)round_down_pow2( deviceLocalSize / (sizePerWorkGroup * inputBatchSize) ); if(groupIterations < 1 && deviceLocalSize >= 2048) { // try reducing workgroup if too large if(wgSize > 256) { wgSize = 256; sizePerWorkGroup = threadWordSize * wgSize; continue; } // try reducing input batch size if(inputBatchSize > 2) { inputBatchSize = 2; continue; } // try reducing workgroup again if(wgSize > 64) { wgSize = 64; sizePerWorkGroup = threadWordSize * wgSize; continue; } } break; } kernelCode = _ocl_kernel_cachelut; methodCode = _ocl_method_shuffle; break; case GF16OCL_LOG: case GF16OCL_LOG_SMALL: case GF16OCL_LOG_SMALL2: case GF16OCL_LOG_TINY: case GF16OCL_LOG_SMALL_LMEM: case GF16OCL_LOG_TINY_LMEM: coeffType = outputSequential ? GF16OCL_COEFF_LOG_SEQ : GF16OCL_COEFF_LOG; { size_t reqLocalMem = 0; if(method == GF16OCL_LOG_SMALL_LMEM) reqLocalMem = 16384+256; if(method == GF16OCL_LOG_TINY_LMEM) reqLocalMem = 8192+512; if(deviceLocalSize < reqLocalMem) return false; deviceLocalSize -= reqLocalMem; } while(1) { if(inputBatchSize*sizePerWorkGroup > deviceLocalSize) inputBatchSize = (unsigned)(deviceLocalSize / sizePerWorkGroup); if(inputBatchSize < 1 && deviceLocalSize >= 2048) { // try reducing workgroup if too large if(wgSize > 256) { wgSize = 256; sizePerWorkGroup = threadWordSize * wgSize; inputBatchSize = 8; continue; } } break; } if(targetInputBatch) inputBatchSize = targetInputBatch; kernelCode = _ocl_kernel_nolut; kernelFuncs = _ocl_kernel_nolut_funcs; sourceStream << "#define OCL_METHOD_LOG\n"; if(method == GF16OCL_LOG_SMALL || method == GF16OCL_LOG_SMALL2 || method == GF16OCL_LOG_SMALL_LMEM) sourceStream << "#define OCL_METHOD_EXP_SMALL\n"; if(method == GF16OCL_LOG_TINY || method == GF16OCL_LOG_TINY_LMEM) sourceStream << "#define OCL_METHOD_EXP_TINY\n"; if(method == GF16OCL_LOG_SMALL2) sourceStream << "#define OCL_METHOD_LOG_TINY\n"; methodCode = _ocl_method_log; break; case GF16OCL_LOOKUP: case GF16OCL_LOOKUP_HALF: case GF16OCL_LOOKUP_NOCACHE: case GF16OCL_LOOKUP_HALF_NOCACHE: case GF16OCL_LOOKUP_GRP2: case GF16OCL_LOOKUP_GRP2_NOCACHE: default: if(method == GF16OCL_LOOKUP_GRP2 || method == GF16OCL_LOOKUP_GRP2_NOCACHE) { if(method == GF16OCL_LOOKUP_GRP2) outputsPerThread += outputsPerThread & 1; // outputsPerThread must be a multiple of two sourceStream << "#define WRITE_GRP2 1\n"; infoShortVecSize = 2; // only supported on 32b // recompute the two variables dependent on infoShortVecSize threadWordSize = infoShortVecSize*2; sizePerWorkGroup = threadWordSize * wgSize; outputsInterleaved = 2; } if(method == GF16OCL_LOOKUP_NOCACHE || method == GF16OCL_LOOKUP_HALF_NOCACHE || method == GF16OCL_LOOKUP_GRP2_NOCACHE) { unsigned tables = (unsigned)(deviceLocalSize / (method == GF16OCL_LOOKUP_HALF_NOCACHE ? 512 : 1024)); if(tables >= 96) inputBatchSize = 8; else if(tables >= 60) inputBatchSize = 6; else if(tables >= 32) inputBatchSize = 4; else inputBatchSize = 2; outputsPerGroup = tables / inputBatchSize; if(outputsPerGroup > 16) outputsPerGroup = 16; if(method == GF16OCL_LOOKUP_GRP2_NOCACHE && outputsPerGroup > 1) { outputsPerGroup &= ~1; // must be a multiple of 2 if outputs are written in pairs targetGrouping &= ~1; } if(targetInputBatch) inputBatchSize = targetInputBatch; kernelCode = _ocl_kernel_lut; kernelFuncs = _ocl_kernel_lut_funcs; } else { if(targetInputBatch) inputBatchSize = targetInputBatch; // TODO: compute ideal iteration count unsigned tableSize = method == GF16OCL_LOOKUP_HALF ? 512 : 1024; if(method == GF16OCL_LOOKUP_GRP2) tableSize *= 2; while(1) { groupIterations = (unsigned)round_down_pow2( (deviceLocalSize - inputBatchSize*tableSize) / (sizePerWorkGroup * inputBatchSize) ); if(groupIterations < 1 && deviceLocalSize >= 8192) { // maybe the workgroup is too big if(wgSize > 256) { wgSize = 256; sizePerWorkGroup = threadWordSize * wgSize; continue; } // try reducing input batch size if(inputBatchSize > 2) { inputBatchSize = 2; continue; } } break; } kernelCode = _ocl_kernel_cachelut; } methodCode = (method == GF16OCL_LOOKUP_HALF || method == GF16OCL_LOOKUP_HALF_NOCACHE) ? _ocl_method_ll : _ocl_method_lh; // write multiply-by-256 reduction table sourceStream << "__constant ushort mul256poly[256] = {0"; for(int i=1; i<256; i++) { int n = i<<8; for(int b=0; b<8; b++) n = (n << 1) ^ (-(n>>15) & GF16_POLYNOMIAL); sourceStream << ',' << (n & 0xffff); } sourceStream << "};\n"; break; } if(inputBatchSize < 1) return false; minInBatchSize = inputBatchSize; if(targetIters) groupIterations = targetIters; if(targetGrouping) outputsPerGroup = targetGrouping; if(groupIterations < 1 || outputsPerGroup < 1) return false; // for very small slices, scale down iterations/wgSize if(sizePerWorkGroup * groupIterations > sliceSizeCksum) { // scale down iterations first if(groupIterations > 1) { groupIterations = (unsigned)CEIL_DIV(sliceSizeCksum, sizePerWorkGroup); if(groupIterations < 1) groupIterations = 1; } // if iterations cannot be scaled down, scale down workgroup size next unsigned minWgSize = 1; if(kernelCode == _ocl_kernel_lut || kernelCode == _ocl_kernel_cachelut) minWgSize = 128; // if computing lookup tables, don't scale down the workgroup below this amount if(groupIterations == 1 && sizePerWorkGroup > sliceSizeCksum && wgSize > minWgSize) { wgSize = CEIL_DIV(sliceSizeCksum, threadWordSize); if(wgSize < minWgSize) wgSize = minWgSize; } } if(method == GF16OCL_LOOKUP || method == GF16OCL_LOOKUP_HALF || method == GF16OCL_LOOKUP_NOCACHE || method == GF16OCL_LOOKUP_HALF_NOCACHE || method == GF16OCL_LOOKUP_GRP2 || method == GF16OCL_LOOKUP_GRP2_NOCACHE) { // lookup method assumes workgroup is a power of two wgSize = round_down_pow2(wgSize); } sizePerWorkGroup = threadWordSize * wgSize; // code generation for log methods uint16_t* tblLog = NULL; uint16_t* tblAntiLog = NULL; unsigned tblAntiLogSize = 0, tblLogSize = 0; if(coeffType != GF16OCL_COEFF_NORMAL) { // construct log/exp table and embed into OpenCL source if(method == GF16OCL_LOG_SMALL2) { tblLog = new uint16_t[tblLogSize = 16384 + 2048*2]; } else if(0) { // half-log idea tblLog = new uint16_t[tblLogSize = 32768]; } else { tblLog = new uint16_t[tblLogSize = 65536]; tblLog[0] = 65535; // special value to represent 0 } int n = 1; if(method == GF16OCL_LOG) { tblAntiLogSize = 65536; tblAntiLog = new uint16_t[tblAntiLogSize]; for (int exp = 0; exp < 65535; exp++) { tblAntiLog[exp] = n; tblLog[n] = exp; n = (n << 1) ^ (-(n>>15) & GF16_POLYNOMIAL); } tblAntiLog[65535] = tblAntiLog[0]; // saves having to wrap around 65535 to 0 } else { int bitsCut = (method == GF16OCL_LOG_SMALL || method == GF16OCL_LOG_SMALL2 || method == GF16OCL_LOG_SMALL_LMEM ? 3 : 4); if(bitsCut == 3) tblAntiLogSize = 8192+128; else tblAntiLogSize = 4096+256; tblAntiLog = new uint16_t[tblAntiLogSize]; for (int exp = 0; exp < 65535; exp++) { if((exp & ((1<> bitsCut] = n; if(method == GF16OCL_LOG_SMALL2) { if((n & 3) == 3) tblLog[n >> 2] = exp; } else if(0) { // half-log idea if((n & 1) == 1) tblLog[n >> 1] = exp; } else tblLog[n] = exp; n = (n << 1) ^ (-(n>>15) & GF16_POLYNOMIAL); } // add reduction table uint16_t* tblALogReduction = tblAntiLog + (uint32_t)(1u << (uint32_t)(16-bitsCut)); tblALogReduction[0] = 0; for (int i = 1; i < (1<<(bitsCut+4)); i++) { n = i << (12-bitsCut); for (int j = 0; j < (bitsCut+4); j++) n = (n << 1) ^ (-(n>>15) & GF16_POLYNOMIAL); tblALogReduction[i] = n; } // add log prep table if(method == GF16OCL_LOG_SMALL2) { uint16_t* tblLogPrep = tblLog + 16384; for (int i = 0; i < 2048; i++) { int iters = 0, r = 0; n = i; while((n&3) != 3 && iters < 10) { if(n & 1) { n ^= 0x1100b; r ^= 0x1100b; } n >>= 1; r >>= 1; iters++; } // assume little endian here tblLogPrep[i*2] = r; tblLogPrep[i*2+1] = iters; } } } // try to fit antiLog table first, then log table if(deviceAvailConstSize >= tblAntiLogSize*2) { deviceAvailConstSize -= tblAntiLogSize*2; sourceStream << "__constant ushort gf16_antilog"; if(method == GF16OCL_LOG_SMALL_LMEM || method == GF16OCL_LOG_TINY_LMEM) sourceStream << "_src"; sourceStream << "[" << tblAntiLogSize << "] __attribute__ ((aligned (VECT_WIDTH*2))) = {" << tblAntiLog[0]; for (unsigned j = 1; j < tblAntiLogSize;) { unsigned amount = tblAntiLogSize - j; if(amount > 8192) amount = 8192; for (unsigned i = 0; i < amount; i++) sourceStream << ',' << tblAntiLog[j+i]; sourceStream << '\n'; // have seen some OpenCL compilers barf at really long lines, so break things up j += amount; } sourceStream << "};\n"; delete[] tblAntiLog; tblAntiLog = NULL; } if(deviceAvailConstSize >= tblLogSize*2) { sourceStream << "#define LOG_MEM_TYPE __constant\n"; // gf16_log is defined as uint to ensure it's 4-byte aligned sourceStream << "__constant uint gf16_log[" << (tblLogSize/2) << "] = {" << ((tblLog[1]<<16) | tblLog[0]); for (unsigned j = 2; j < tblLogSize;) { unsigned amount = tblLogSize - j; if(amount > 8192) amount = 8192; for(unsigned i=0; i(1, device), paramsDump); } catch(cl::Error const& err) { printError("Build Error", err); if(err.err() == CL_BUILD_PROGRAM_FAILURE || err.err() == CL_COMPILE_PROGRAM_FAILURE || err.err() == CL_LINK_PROGRAM_FAILURE) { std::cerr << program.getBuildInfo(device) << std::endl; } if(tblLog) delete[] tblLog; if(tblAntiLog) delete[] tblAntiLog; return false; } std::vector binaries = program.getInfo(); for(unsigned i=0; i(1, device), params); } catch(cl::Error const& err) { printError("Build Error", err); #ifndef GF16OCL_NO_OUTPUT if(err.err() == CL_BUILD_PROGRAM_FAILURE || err.err() == CL_COMPILE_PROGRAM_FAILURE || err.err() == CL_LINK_PROGRAM_FAILURE) { std::cerr << program.getBuildInfo(device) << std::endl; } #endif if(tblLog) delete[] tblLog; if(tblAntiLog) delete[] tblAntiLog; return false; } // variant kernels for first (mul only) & last (misaligned input count) iterations kernelMul = cl::Kernel(program, "gf16_ocl_kernel_first"); kernelMulLast = cl::Kernel(program, "gf16_ocl_kernel_only"); kernelMulAddLast = cl::Kernel(program, "gf16_ocl_kernel_last"); kernelMulAdd = cl::Kernel(program, "gf16_ocl_kernel"); cl_mem_flags bufferFlags = CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR; if(oclPlatVersion >= 1002) bufferFlags |= CL_MEM_HOST_WRITE_ONLY; for(auto& area : staging) { area.input = cl::Buffer(context, bufferFlags, inputBatchSize*sliceSizeAligned); if(coeffType != GF16OCL_COEFF_NORMAL) { area.coeffs = cl::Buffer(context, bufferFlags, inputBatchSize*sizeof(uint16_t)); area.procCoeffs.resize(inputBatchSize); } else { area.coeffs = cl::Buffer(context, bufferFlags, inputBatchSize*numOutputs*sizeof(uint16_t)); area.procCoeffs.resize(inputBatchSize*numOutputs); } } // TODO: check if supports CPU shared memory and use host mem (CL_MEM_USE_HOST_PTR) instead? avoids a transfer when retrieving output; downside is that currently our API works by the caller supplying a buffer to write into, so would be a big change // TODO: need to consider CL_DEVICE_MAX_MEM_ALLOC_SIZE and perhaps break this into multiple allocations buffer_output = cl::Buffer(context, CL_MEM_READ_WRITE | (oclPlatVersion>=1002 ? CL_MEM_HOST_READ_ONLY : 0), numOutputs*sliceSizeAligned); workGroupRange = cl::NDRange(wgSize, 1); // attach arguments to kernels kernelMul.setArg(0, buffer_output); kernelMulLast.setArg(0, buffer_output); kernelMulAddLast.setArg(0, buffer_output); kernelMulAdd.setArg(0, buffer_output); extra_buffers.clear(); cl_mem_flags transBufferFlags = CL_MEM_READ_ONLY; if(oclPlatVersion >= 1002) transBufferFlags |= CL_MEM_HOST_WRITE_ONLY; if(coeffType == GF16OCL_COEFF_LOG) { buffer_outExp = cl::Buffer(context, transBufferFlags | CL_MEM_ALLOC_HOST_PTR, numOutputs*sizeof(uint16_t)); kernelMul.setArg(3, buffer_outExp); kernelMulAdd.setArg(3, buffer_outExp); kernelMulLast.setArg(4, buffer_outExp); kernelMulAddLast.setArg(4, buffer_outExp); } else { buffer_outExp = cl::Buffer(); } // if we couldn't embed log tables directly into the source, transfer them now if(tblLog) { const cl::Buffer bufLog(context, transBufferFlags, tblLogSize*2); extra_buffers.push_back(bufLog); kernelMul.setArg(4, bufLog); kernelMulAdd.setArg(4, bufLog); kernelMulLast.setArg(5, bufLog); kernelMulAddLast.setArg(5, bufLog); if(tblAntiLog) { extra_buffers.push_back(cl::Buffer(context, transBufferFlags, tblAntiLogSize*2)); const cl::Buffer& bufALog = extra_buffers.back(); kernelMul.setArg(5, bufALog); kernelMulAdd.setArg(5, bufALog); kernelMulLast.setArg(6, bufALog); kernelMulAddLast.setArg(6, bufALog); std::vector enqueueEvents(2); queue.enqueueWriteBuffer(bufLog, CL_FALSE, 0, tblLogSize*2, tblLog, NULL, &enqueueEvents[0]); queue.enqueueWriteBuffer(bufALog, CL_FALSE, 0, tblAntiLogSize*2, tblAntiLog, NULL, &enqueueEvents[1]); cl::Event::waitForEvents(enqueueEvents); delete[] tblLog; delete[] tblAntiLog; } else { queue.enqueueWriteBuffer(bufLog, CL_TRUE, 0, tblLogSize*2, tblLog); delete[] tblLog; } } // for some implementations, errors don't show up during build, but do during execute, so try a dummy run kernelMul.setArg(1, staging[0].input); kernelMul.setArg(2, staging[0].coeffs); try { queue.enqueueNDRangeKernel(kernelMul, cl::NullRange, workGroupRange, workGroupRange); } catch(cl::Error const& err) { printError("Execute Error", err); return false; } return true; }