
#include "../src/hedley.h"

/*
 * Declares the gf16_xor entry points that exist once per suffix `v`; the
 * macro is instantiated below for sse2, avx2 and avx512 and then undefined.
 *
 * For each suffix this declares:
 *   - gf16_xor_jit_init_<v>(polynomial, jitOptStrat) and
 *     gf16_xor_jit_init_mut_<v>(void), both returning void*;
 *   - the packed prepare/finish routines with checksum ("cksum") and their
 *     partial ("packsum") counterparts, which additionally take partOffset
 *     and partLen;
 *   - gf16_xor_jit_muladd_<v> and its _prefetch variant, which takes one
 *     extra `prefetch` pointer;
 *   - the int flag gf16_xor_available_<v>.
 *
 * gf16_xor_jit_init_mut_<v> is declared with an explicit (void) parameter
 * list so that it is a real prototype in C (an empty "()" would leave the
 * parameters unspecified and let calls with stray arguments compile).
 *
 * Note that gf16_xor_finish_partial_packsum_<v> takes a non-const `src`,
 * unlike gf16_xor_finish_packed_cksum_<v>.
 *
 * The last declaration in the macro deliberately has no trailing semicolon;
 * it is supplied at each FUNCS(...) use.
 */
#define FUNCS(v) \
	void* gf16_xor_jit_init_##v(int polynomial, int jitOptStrat); \
	void* gf16_xor_jit_init_mut_##v(void); \
	void gf16_xor_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \
	void gf16_xor_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); \
	int gf16_xor_finish_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \
	int gf16_xor_finish_partial_packsum_##v(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); \
	void gf16_xor_jit_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \
	void gf16_xor_jit_muladd_prefetch_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); \
	extern int gf16_xor_available_##v

FUNCS(sse2);
FUNCS(avx2);
FUNCS(avx512);

#undef FUNCS

#ifdef PARPAR_INVERT_SUPPORT
/*
 * Additional per-suffix declarations, visible only when
 * PARPAR_INVERT_SUPPORT is defined. Instantiated for sse2, avx2 and avx512:
 *   - gf16_xor_prepare_<isa>(dst, src, srcLen)
 *   - gf16_xor_finish_<isa>(dst, len)
 *   - gf16_xor_jit_mul_<isa>(scratch, dst, src, len, coefficient, mutScratch)
 *
 * Unlike the muladd declarations, dst/src of prepare and jit_mul are not
 * restrict-qualified here.
 *
 * The macro body ends without a semicolon; each use below provides it.
 */
#define FUNCS(isa) \
	void gf16_xor_prepare_##isa(void *dst, const void *src, size_t srcLen); \
	void gf16_xor_finish_##isa(void *HEDLEY_RESTRICT dst, size_t len); \
	void gf16_xor_jit_mul_##isa( \
		const void *HEDLEY_RESTRICT scratch, \
		void *dst, const void *src, size_t len, \
		uint16_t coefficient, \
		void *HEDLEY_RESTRICT mutScratch)

FUNCS(sse2);
FUNCS(avx2);
FUNCS(avx512);

#undef FUNCS
#endif /* PARPAR_INVERT_SUPPORT */

#ifdef PARPAR_INCLUDE_BASIC_OPS
/*
 * Packed prepare/finish declarations without the "cksum" suffix, visible
 * only when PARPAR_INCLUDE_BASIC_OPS is defined. Instantiated for sse2,
 * avx2 and avx512. Both return void.
 *
 * The macro body ends without a semicolon; each use below provides it.
 */
#define FUNCS(isa) \
	void gf16_xor_prepare_packed_##isa( \
		void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, \
		size_t srcLen, size_t sliceLen, \
		unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \
	void gf16_xor_finish_packed_##isa( \
		void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, \
		size_t sliceLen, \
		unsigned numOutputs, unsigned outputNum, size_t chunkLen)

FUNCS(sse2);
FUNCS(avx2);
FUNCS(avx512);

#undef FUNCS
#endif /* PARPAR_INCLUDE_BASIC_OPS */


/*
 * Multi-region muladd declarations; only avx512 variants are declared here.
 *
 * In the first form `src` is a pointer to an array of const source pointers
 * and an `offset` is passed alongside it; `coefficients` points to uint16_t
 * values (presumably one per region -- confirm against the implementation).
 */
void gf16_xor_jit_muladd_multi_avx512(
	const void *HEDLEY_RESTRICT scratch,
	unsigned regions,
	size_t offset,
	void *HEDLEY_RESTRICT dst,
	const void *const *HEDLEY_RESTRICT src,
	size_t len,
	const uint16_t *HEDLEY_RESTRICT coefficients,
	void *HEDLEY_RESTRICT mutScratch);

/*
 * Packed form: `src` is a single pointer, and both `packRegions` and
 * `regions` counts are passed instead of an offset.
 */
void gf16_xor_jit_muladd_multi_packed_avx512(
	const void *HEDLEY_RESTRICT scratch,
	unsigned packRegions,
	unsigned regions,
	void *HEDLEY_RESTRICT dst,
	const void *HEDLEY_RESTRICT src,
	size_t len,
	const uint16_t *HEDLEY_RESTRICT coefficients,
	void *HEDLEY_RESTRICT mutScratch);

/*
 * Not suffix-specific: a single declaration shared by all variants.
 * NOTE(review): presumably tears down the scratch object returned by
 * gf16_xor_jit_init_<v> -- confirm against the implementation.
 */
void gf16_xor_jit_uninit(void *scratch);

// Non-JIT variants. Only sse2 versions are declared; the init routine takes
// just the polynomial (no JIT strategy argument).
void *gf16_xor_init_sse2(int polynomial);

#ifdef PARPAR_INVERT_SUPPORT
// Plain multiply; declared only when PARPAR_INVERT_SUPPORT is defined.
// dst/src are not restrict-qualified here.
void gf16_xor_mul_sse2(
	const void *HEDLEY_RESTRICT scratch,
	void *dst, const void *src, size_t len,
	uint16_t coefficient,
	void *HEDLEY_RESTRICT mutScratch);
#endif

// Same parameter list as gf16_xor_mul_sse2, but with restrict-qualified dst/src.
void gf16_xor_muladd_sse2(
	const void *HEDLEY_RESTRICT scratch,
	void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len,
	uint16_t coefficient,
	void *HEDLEY_RESTRICT mutScratch);


// JIT strategies for current processor
// Small integer codes, 0 through 3. Kept as preprocessor macros, so they
// remain usable in #if conditionals.
// NOTE(review): presumably the values accepted by the `jitOptStrat` argument
// of gf16_xor_jit_init_<v> -- confirm against callers.
#define GF16_XOR_JIT_STRAT_NONE 0
#define GF16_XOR_JIT_STRAT_COPYNT 1
#define GF16_XOR_JIT_STRAT_COPY 2
#define GF16_XOR_JIT_STRAT_CLR 3


#ifdef PARPAR_INVERT_SUPPORT
/*
 * Word-replacement helpers, declared only when PARPAR_INVERT_SUPPORT is
 * defined. All three share one signature and return a uint16_t.
 * NOTE(review): presumably they store newValue at word `index` of `data`
 * and return the previous value, with the 16/32/64 in the name selecting
 * the data layout -- confirm against the implementation.
 */
uint16_t gf16_xor16_replace_word(void *data, size_t index, uint16_t newValue);
uint16_t gf16_xor32_replace_word(void *data, size_t index, uint16_t newValue);
uint16_t gf16_xor64_replace_word(void *data, size_t index, uint16_t newValue);
#endif /* PARPAR_INVERT_SUPPORT */
