#include #include "../src/stdint.h" #include // free / calloc #include #include "controller_ocl.h" std::vector PAR2ProcOCL::platforms; int PAR2ProcOCL::load_runtime() { if(load_opencl()) { return 1; } try { cl::Platform::get(&platforms); if(platforms.size() == 0) return 2; } catch(cl::Error const&) { return 2; } return 0; } void PAR2ProcOCL::printError(const char* where, cl::Error const& err) { #ifndef GF16OCL_NO_OUTPUT std::cerr << "OpenCL " << where << ": "; switch(err.err()) { #define _PRINT_ERR(x) case x: std::cerr << #x; break _PRINT_ERR(CL_DEVICE_NOT_FOUND); _PRINT_ERR(CL_DEVICE_NOT_AVAILABLE); _PRINT_ERR(CL_COMPILER_NOT_AVAILABLE); _PRINT_ERR(CL_MEM_OBJECT_ALLOCATION_FAILURE); _PRINT_ERR(CL_OUT_OF_RESOURCES); _PRINT_ERR(CL_OUT_OF_HOST_MEMORY); _PRINT_ERR(CL_PROFILING_INFO_NOT_AVAILABLE); _PRINT_ERR(CL_MEM_COPY_OVERLAP); _PRINT_ERR(CL_IMAGE_FORMAT_MISMATCH); _PRINT_ERR(CL_IMAGE_FORMAT_NOT_SUPPORTED); _PRINT_ERR(CL_BUILD_PROGRAM_FAILURE); _PRINT_ERR(CL_MAP_FAILURE); #ifdef CL_VERSION_1_1 _PRINT_ERR(CL_MISALIGNED_SUB_BUFFER_OFFSET); _PRINT_ERR(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); #endif #ifdef CL_VERSION_1_2 _PRINT_ERR(CL_COMPILE_PROGRAM_FAILURE); _PRINT_ERR(CL_LINKER_NOT_AVAILABLE); _PRINT_ERR(CL_LINK_PROGRAM_FAILURE); _PRINT_ERR(CL_DEVICE_PARTITION_FAILED); _PRINT_ERR(CL_KERNEL_ARG_INFO_NOT_AVAILABLE); #endif _PRINT_ERR(CL_INVALID_VALUE); _PRINT_ERR(CL_INVALID_DEVICE_TYPE); _PRINT_ERR(CL_INVALID_PLATFORM); _PRINT_ERR(CL_INVALID_DEVICE); _PRINT_ERR(CL_INVALID_CONTEXT); _PRINT_ERR(CL_INVALID_QUEUE_PROPERTIES); _PRINT_ERR(CL_INVALID_COMMAND_QUEUE); _PRINT_ERR(CL_INVALID_HOST_PTR); _PRINT_ERR(CL_INVALID_MEM_OBJECT); _PRINT_ERR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); _PRINT_ERR(CL_INVALID_IMAGE_SIZE); _PRINT_ERR(CL_INVALID_SAMPLER); _PRINT_ERR(CL_INVALID_BINARY); _PRINT_ERR(CL_INVALID_BUILD_OPTIONS); _PRINT_ERR(CL_INVALID_PROGRAM); _PRINT_ERR(CL_INVALID_PROGRAM_EXECUTABLE); _PRINT_ERR(CL_INVALID_KERNEL_NAME); _PRINT_ERR(CL_INVALID_KERNEL_DEFINITION); _PRINT_ERR(CL_INVALID_KERNEL); _PRINT_ERR(CL_INVALID_ARG_INDEX); _PRINT_ERR(CL_INVALID_ARG_VALUE); _PRINT_ERR(CL_INVALID_ARG_SIZE); _PRINT_ERR(CL_INVALID_KERNEL_ARGS); _PRINT_ERR(CL_INVALID_WORK_DIMENSION); _PRINT_ERR(CL_INVALID_WORK_GROUP_SIZE); _PRINT_ERR(CL_INVALID_WORK_ITEM_SIZE); _PRINT_ERR(CL_INVALID_GLOBAL_OFFSET); _PRINT_ERR(CL_INVALID_EVENT_WAIT_LIST); _PRINT_ERR(CL_INVALID_EVENT); _PRINT_ERR(CL_INVALID_OPERATION); _PRINT_ERR(CL_INVALID_GL_OBJECT); _PRINT_ERR(CL_INVALID_BUFFER_SIZE); _PRINT_ERR(CL_INVALID_MIP_LEVEL); _PRINT_ERR(CL_INVALID_GLOBAL_WORK_SIZE); #ifdef CL_VERSION_1_1 _PRINT_ERR(CL_INVALID_PROPERTY); #endif #ifdef CL_VERSION_1_2 _PRINT_ERR(CL_INVALID_IMAGE_DESCRIPTOR); _PRINT_ERR(CL_INVALID_COMPILER_OPTIONS); _PRINT_ERR(CL_INVALID_LINKER_OPTIONS); _PRINT_ERR(CL_INVALID_DEVICE_PARTITION_COUNT); #endif #undef _PRINT_ERR default: std::cerr << err.err(); } std::cerr << std::endl; #endif } static int parse_ocl_version(const std::string& ver) { // OpenCL gives the string "OpenCL x.y [extra]" const char* p = ver.c_str() + 7; int major = 0, minor = 0; int* majmin = &major; while(1) { if(*p == '.') majmin = &minor; else if(*p >= '0' && *p <= '9') { *majmin *= 10; *majmin += *p - '0'; } else break; p++; } // note that we assume 1.20 is newer than 1.2, i.e. not interpreted as a floating point value return major * 1000 + minor; } PAR2ProcOCL::PAR2ProcOCL(IF_LIBUV(uv_loop_t* _loop,) int platformId, int deviceId, int stagingAreas) : IPAR2ProcBackend(IF_LIBUV(_loop)), staging(stagingAreas), allocatedSliceSize(0), transferThread(PAR2ProcOCL::transfer_slice) { _initSuccess = false; transferThread.name = "ocl_transfer"; #ifdef USE_LIBUV #define ERROR_EXIT { loop = nullptr; return; } #else #define ERROR_EXIT return; #endif if(!getDevice(device, platformId, deviceId)) ERROR_EXIT context = cl::Context(device); // we only support little-endian hosts and devices #if !defined(_MSC_VER) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ERROR_EXIT #endif if(device.getInfo() != CL_TRUE) ERROR_EXIT try { queue = cl::CommandQueue(context, device, 0); _initSuccess = true; } catch(cl::Error const& err) { printError("Create Queue Error", err); } #undef ERROR_EXIT queueEvents.reserve(2); _deviceId = deviceId; oclDevVersion = parse_ocl_version(device.getInfo()); cl::Platform platform; getPlatform(platform, platformId); oclPlatVersion = parse_ocl_version(platform.getInfo()); } PAR2ProcOCL::~PAR2ProcOCL() { deinit(); } // _sliceSize must be divisible by 2 void PAR2ProcOCL::setSliceSize(size_t _sliceSize) { sliceSize = _sliceSize; if(gf) sliceSizeCksum = _sliceSize + gf->info().cksumSize; } bool PAR2ProcOCL::init(Galois16OCLMethods method, unsigned targetInputBatch, unsigned targetIters, unsigned targetGrouping, Galois16Methods cksumMethod) { if(!_initSuccess) return false; outputExponents.clear(); if(method == GF16OCL_AUTO) method = default_method(); _setupMethod = method; _setupTargetInputBatch = targetInputBatch; _setupTargetIters = targetIters; _setupTargetGrouping = targetGrouping; reset_state(); coeffType = GF16OCL_COEFF_NORMAL; statBatchesStarted = 0; if(!gf || gfMethod != cksumMethod) gf.reset(new Galois16Mul(cksumMethod)); gfMethod = cksumMethod; sliceSizeCksum = sliceSize + gf->info().cksumSize; outputsInterleaved = 1; return true; } bool PAR2ProcOCL::setRecoverySlices(unsigned _numOutputs, const uint16_t* outputExp) { if(_numOutputs == 0) { outputExponents.clear(); return true; } // check if coeffs are sequential bool coeffIsSeq = false; if(coeffType != GF16OCL_COEFF_LOG && outputExp) { coeffIsSeq = true; uint16_t coeffBase = outputExp[0]; // we don't support _numOutputs < 1 for(unsigned i=1; i<_numOutputs; i++) { if(outputExp[i] != coeffBase+i) { coeffIsSeq = false; break; } } } if(_numOutputs != outputExponents.size() || (coeffType == GF16OCL_COEFF_LOG_SEQ && !coeffIsSeq)) { // need to re-init as the code assumes a fixed number of outputs outputExponents = std::vector(_numOutputs); if(!setup_kernels(_setupMethod, _setupTargetInputBatch, _setupTargetIters, _setupTargetGrouping, coeffIsSeq)) return false; } assert(outputExp || coeffType == GF16OCL_COEFF_NORMAL); // TODO: if outputs not specified, and a Log method is specified, need to re-init away from that method if(outputExp) memcpy(outputExponents.data(), outputExp, outputExponents.size()*sizeof(uint16_t)); if(coeffType == GF16OCL_COEFF_LOG) { cl::Event writeEvent; queue.enqueueWriteBuffer(buffer_outExp, CL_FALSE, 0, outputExponents.size()*sizeof(uint16_t), outputExponents.data(), &queueEvents, &writeEvent); queueEvents.clear(); queueEvents.reserve(2); queueEvents.push_back(writeEvent); } else if(coeffType == GF16OCL_COEFF_LOG_SEQ) { kernelMul.setArg(3, outputExponents[0]); kernelMulAdd.setArg(3, outputExponents[0]); kernelMulLast.setArg(4, outputExponents[0]); kernelMulAddLast.setArg(4, outputExponents[0]); } return true; } void PAR2ProcOCL::reset_state() { currentStagingInputs = 0; currentStagingArea = 0; stagingActiveCount = 0; for(auto& area : staging) { area.event = cl::Event(); // clear any existing event area.setIsActive(false); } processingAdd = false; } void PAR2ProcOCL::_deinit() { queue.finish(); queueEvents.clear(); } void PAR2ProcOCL::processing_finished() { IF_LIBUV(endSignalled = false); } std::vector PAR2ProcOCL::availableMethods(int platformId, int deviceId) { std::vector ret; if(platformId == -1 || deviceId == -1) { // we assume all methods are available ret.push_back(GF16OCL_LOOKUP); ret.push_back(GF16OCL_LOOKUP_HALF); ret.push_back(GF16OCL_LOOKUP_NOCACHE); ret.push_back(GF16OCL_LOOKUP_HALF_NOCACHE); ret.push_back(GF16OCL_LOOKUP_GRP2); ret.push_back(GF16OCL_LOOKUP_GRP2_NOCACHE); /* log methods are known to fail on some platforms, so disable for now TODO: debug these and enable ret.push_back(GF16OCL_LOG); ret.push_back(GF16OCL_LOG_SMALL); ret.push_back(GF16OCL_LOG_SMALL2); ret.push_back(GF16OCL_LOG_TINY); ret.push_back(GF16OCL_LOG_SMALL_LMEM); ret.push_back(GF16OCL_LOG_TINY_LMEM); */ //ret.push_back(GF16OCL_SHUFFLE); ret.push_back(GF16OCL_BY2); } else { if(platformId >= (int)platforms.size()) return ret; std::vector devices; platforms[platformId].getDevices(CL_DEVICE_TYPE_ALL, &devices); if(deviceId >= (int)devices.size()) return ret; cl::Context context(devices[deviceId]); // TODO: actually check capabilities ret.push_back(GF16OCL_LOOKUP); ret.push_back(GF16OCL_LOOKUP_HALF); ret.push_back(GF16OCL_LOOKUP_NOCACHE); ret.push_back(GF16OCL_LOOKUP_HALF_NOCACHE); ret.push_back(GF16OCL_LOOKUP_GRP2); // TODO: consider restricting to platforms with 32b word size (currently it just forces 32b word size) ret.push_back(GF16OCL_LOOKUP_GRP2_NOCACHE); /* log methods are known to fail on some platforms, so disable for now ret.push_back(GF16OCL_LOG); ret.push_back(GF16OCL_LOG_SMALL); ret.push_back(GF16OCL_LOG_SMALL2); ret.push_back(GF16OCL_LOG_TINY); ret.push_back(GF16OCL_LOG_SMALL_LMEM); ret.push_back(GF16OCL_LOG_TINY_LMEM); */ ret.push_back(GF16OCL_BY2); } return ret; } Galois16OCLMethods PAR2ProcOCL::default_method() const { // TODO: determine best method return GF16OCL_LOOKUP; } // reduce slice size bool PAR2ProcOCL::setCurrentSliceSize(size_t newSliceSize) { setSliceSize(newSliceSize); if(sliceSizeCksum > allocatedSliceSize) { // need to re-init everything auto outExp = outputExponents; if(!init(_setupMethod, _setupTargetInputBatch, _setupTargetIters, _setupTargetGrouping)) return false; if(!outExp.empty()) { if(!setRecoverySlices(outExp.size(), outExp.data())) return false; } return true; } size_t sliceGroups = (sliceSizeCksum+bytesPerGroup -1) / bytesPerGroup; processRange = cl::NDRange(sliceGroups * wgSize, processRange[1]); sliceSizeAligned = sliceGroups*bytesPerGroup; return true; } struct transfer_data_ocl { bool finish; // false = prepare, true = finish PAR2ProcOCL* parent; void* local; cl::Buffer* remote; size_t remoteOffset; size_t sliceLen, totalLen; Galois16Mul* gf; // prepare specific size_t srcLen; unsigned submitInBufs; unsigned inBufId; int oclPlatVersion; NOTIFY_DECL(cbPrep, promPrep); // finish specific NOTIFY_BOOL_DECL(cbOut, promOut); int cksumSuccess; unsigned grpSize; unsigned region; }; #ifdef USE_LIBUV void PAR2ProcOCL::_notifySent(void* _req) { auto req = static_cast(_req); pendingInCallbacks--; if(req->cbPrep && req->local) req->cbPrep(); delete req; // handle possibility of _notifySent being called after the last _notifyProc if(endSignalled && isEmpty() && progressCb) progressCb(0); // we got this callback after processing has finished -> need to flag to front-end that we're now done } #endif PAR2ProcBackendAddResult PAR2ProcOCL::canAdd() const { if(staging[currentStagingArea].getIsActive()) return PROC_ADD_FULL; return stagingActiveCount_get() < staging.size()-1 ? PROC_ADD_OK : PROC_ADD_OK_BUSY; } #ifndef USE_LIBUV void PAR2ProcOCL::waitForAdd() { IPAR2ProcBackend::_waitForAdd(staging[currentStagingArea]); } #endif void PAR2ProcOCL::transfer_slice(ThreadMessageQueue& q) { struct transfer_data_ocl* data; while((data = static_cast(q.pop())) != NULL) { // TODO: consider doing a single mapping for the entire slice (if not, consider async mapping) if(data->finish) { void* remote = data->parent->queue.enqueueMapBuffer(*(data->remote), CL_TRUE, CL_MAP_READ, data->remoteOffset, data->totalLen*data->grpSize); if(data->grpSize == 2) { // TODO: look at a way to avoid unnecessary map/unmap calls data->cksumSuccess = data->gf->finish_grp2_cksum(data->local, remote, data->sliceLen, data->region & (data->grpSize-1)); } else { data->cksumSuccess = data->gf->copy_cksum_check(data->local, remote, data->sliceLen); } data->parent->queue.enqueueUnmapMemObject(*(data->remote), remote); NOTIFY_DONE(data, _queueRecv, data->promOut, data->cksumSuccess); } else { if(data->local) { void* remote = data->parent->queue.enqueueMapBuffer(*(data->remote), CL_TRUE, data->oclPlatVersion>=1002 ? CL_MAP_WRITE_INVALIDATE_REGION : CL_MAP_WRITE, data->remoteOffset, data->totalLen); data->gf->copy_cksum(remote, data->local, data->srcLen, data->sliceLen); data->parent->queue.enqueueUnmapMemObject(*(data->remote), remote); } if(data->submitInBufs) { // queue async compute data->parent->run_kernel(data->inBufId, data->submitInBufs); } // signal main thread that prepare has completed NOTIFY_DONE(data, _queueSent, data->promPrep); } IF_NOT_LIBUV(delete data); } } FUTURE_RETURN_T PAR2ProcOCL::addInput(const void* buffer, size_t size, uint16_t inputNum, bool flush IF_LIBUV(, const PAR2ProcPlainCb& cb)) { IF_NOT_LIBUV(return) _addInput(buffer, size, inputNum, flush IF_LIBUV(, cb)); } FUTURE_RETURN_T PAR2ProcOCL::addInput(const void* buffer, size_t size, const uint16_t* coeffs, bool flush IF_LIBUV(, const PAR2ProcPlainCb& cb)) { IF_NOT_LIBUV(return) _addInput(buffer, size, coeffs, flush IF_LIBUV(, cb)); } template FUTURE_RETURN_T PAR2ProcOCL::_addInput(const void* buffer, size_t size, T inputNumOrCoeffs, bool flush IF_LIBUV(, const PAR2ProcPlainCb& cb)) { auto& area = staging[currentStagingArea]; // detect if busy assert(!area.getIsActive()); set_coeffs(area, currentStagingInputs, inputNumOrCoeffs); struct transfer_data_ocl* data = new struct transfer_data_ocl; data->finish = false; data->local = (void*)buffer; data->srcLen = size; data->parent = this; data->remote = &area.input; data->remoteOffset = currentStagingInputs * sliceSizeAligned; data->sliceLen = sliceSize; data->totalLen = sliceSizeCksum; data->gf = gf.get(); data->oclPlatVersion = oclPlatVersion; IF_LIBUV(data->cbPrep = cb); currentStagingInputs++; data->submitInBufs = (flush || currentStagingInputs == inputBatchSize || ( // allow submitting early if there's no active processing stagingActiveCount_get() == 0 && staging.size() > 1 && currentStagingInputs >= minInBatchSize )) ? currentStagingInputs : 0; data->inBufId = currentStagingArea; if(data->submitInBufs) { stagingActiveCount_inc(); area.setIsActive(true); // lock this buffer until processing is complete statBatchesStarted++; currentStagingInputs = 0; if(++currentStagingArea == staging.size()) currentStagingArea = 0; } IF_LIBUV(pendingInCallbacks++); IF_NOT_LIBUV(auto future = data->promPrep.get_future()); transferThread.send(data); IF_NOT_LIBUV(return future); } void PAR2ProcOCL::dummyInput(uint16_t inputNum, bool flush) { auto& area = staging[currentStagingArea]; // detect if busy assert(!area.getIsActive()); set_coeffs(area, currentStagingInputs, inputNum); currentStagingInputs++; if(currentStagingInputs == inputBatchSize || flush || (stagingActiveCount_get() == 0 && staging.size() > 1 && currentStagingInputs >= minInBatchSize)) { stagingActiveCount_inc(); area.setIsActive(true); // lock this buffer until processing is complete statBatchesStarted++; run_kernel(currentStagingArea, currentStagingInputs); currentStagingInputs = 0; if(++currentStagingArea == staging.size()) currentStagingArea = 0; } } bool PAR2ProcOCL::fillInput(const void* buffer) { void* remote = queue.enqueueMapBuffer(staging[currentStagingArea].input, CL_TRUE, oclPlatVersion>=1002 ? CL_MAP_WRITE_INVALIDATE_REGION : CL_MAP_WRITE, currentStagingInputs * sliceSizeAligned, sliceSizeAligned); gf->copy_cksum(remote, buffer, sliceSize, sliceSize); queue.enqueueUnmapMemObject(staging[currentStagingArea].input, remote); if(++currentStagingInputs == inputBatchSize) { currentStagingInputs = 0; if(++currentStagingArea == staging.size()) { currentStagingArea = 0; return true; // all filled } } return false; } #include "gfmat_coeff.h" void PAR2ProcOCL::set_coeffs(PAR2ProcOCLStaging& area, unsigned idx, uint16_t inputNum) { uint16_t inputLog = gfmat_input_log(inputNum); auto& coeffs = area.procCoeffs; if(coeffType != GF16OCL_COEFF_NORMAL) { coeffs[idx] = inputLog; } else { for(unsigned i=0; ifinish = false; data->local = NULL; data->parent = this; data->submitInBufs = currentStagingInputs; data->inBufId = currentStagingArea; data->gf = gf.get(); data->oclPlatVersion = oclPlatVersion; stagingActiveCount_inc(); staging[currentStagingArea].setIsActive(true); // lock this buffer until processing is complete statBatchesStarted++; currentStagingInputs = 0; if(++currentStagingArea == staging.size()) currentStagingArea = 0; IF_LIBUV(pendingInCallbacks++); transferThread.send(data); } struct recv_data { PAR2ProcOCL* parent; NOTIFY_BOOL_DECL(cb, prom); }; #ifdef USE_LIBUV void PAR2ProcOCL::_notifyRecv(void* _req) { auto req = static_cast(_req); pendingOutCallbacks--; req->cbOut(req->cksumSuccess); delete req; if(pendingOutCallbacks < 1 && deinitCallback) deinit(deinitCallback); } #endif FUTURE_RETURN_BOOL_T PAR2ProcOCL::getOutput(unsigned index, void* output IF_LIBUV(, const PAR2ProcOutputCb& cb)) { struct transfer_data_ocl* data = new struct transfer_data_ocl; data->finish = true; data->parent = this; data->remote = &buffer_output; data->remoteOffset = index*sliceSizeAligned; data->sliceLen = sliceSize; data->totalLen = sliceSizeAligned; data->gf = gf.get(); data->oclPlatVersion = oclPlatVersion; data->local = output; data->region = index; data->grpSize = 1; if(outputsInterleaved > 1 && index < ((unsigned)outputExponents.size() & ~(outputsInterleaved-1))) { data->grpSize = outputsInterleaved; data->remoteOffset = (index & ~(outputsInterleaved-1))*sliceSizeAligned; } #ifdef USE_LIBUV data->cbOut = cb; pendingOutCallbacks++; #else auto future = data->promOut.get_future(); #endif transferThread.send(data); IF_NOT_LIBUV(return future); } typedef struct PAR2ProcBackendBaseComputeReq compute_req; #ifdef USE_LIBUV void PAR2ProcOCL::_notifyProc(void* _req) { auto req = static_cast(_req); stagingActiveCount_dec(); staging[req->procIdx].setIsActive(false); // if add was blocked, allow adds to continue - calling application will need to listen to this event to know to continue if(progressCb) progressCb(req->numInputs); delete req; } #endif static void CL_CALLBACK kernel_event_callback(cl_event, cl_int, void* _req) { auto req = static_cast(_req); #ifdef USE_LIBUV req->parent->_queueProc.notify(req); #else req->parent->stagingActiveCount_dec(); req->parent->_setAreaActive(req->procIdx, false); delete req; #endif } void PAR2ProcOCL::run_kernel(unsigned buf, unsigned numInputs) { auto& area = staging[buf]; // transfer coefficient list cl::Event coeffEvent; queue.enqueueWriteBuffer(area.coeffs, CL_FALSE, 0, inputBatchSize * (coeffType!=GF16OCL_COEFF_NORMAL ? 1 : outputExponents.size()) * sizeof(uint16_t), area.procCoeffs.data(), NULL, &coeffEvent); queueEvents.push_back(coeffEvent); // invoke kernel cl::Kernel* kernel; if(numInputs == inputBatchSize) { kernel = processingAdd ? &kernelMulAdd : &kernelMul; } else { // incomplete kernel -> need to pass in number of inputs to read from kernel = processingAdd ? &kernelMulAddLast : &kernelMulLast; kernel->setArg(3, numInputs); } processingAdd = true; kernel->setArg(1, area.input); kernel->setArg(2, area.coeffs); // TODO: try-catch to detect errors (e.g. out of resources) queue.enqueueNDRangeKernel(*kernel, cl::NullRange, processRange, workGroupRange, &queueEvents, &area.event); queueEvents.clear(); // for coeffType!=GF16OCL_COEFF_NORMAL, assume that the outputs are transferred by the time we enqueue the next kernel // when kernel finishes auto* req = new compute_req; req->numInputs = numInputs; req->procIdx = buf; req->parent = this; area.event.setCallback(CL_COMPLETE, kernel_event_callback, req); queue.flush(); // ensure the kernel is executed } bool PAR2ProcOCL::getPlatform(cl::Platform& platform, int platformId) { if(platformId == -1) { try { platform = cl::Platform::getDefault(); } catch(cl::Error const&) { return false; } } else { if(platformId >= (int)platforms.size()) return false; platform = platforms[platformId]; } return true; } std::vector PAR2ProcOCL::getPlatforms() { std::vector ret(platforms.size()); for(unsigned i=0; i(); } return ret; } int PAR2ProcOCL::defaultPlatformId() { cl::Platform platform; try { platform = cl::Platform::getDefault(); } catch(cl::Error const&) { return -1; } const auto vendor = platform.getInfo(); const auto name = platform.getInfo(); int id = 0; for(const auto& plat : platforms) { if(vendor == plat.getInfo() && name == plat.getInfo()) return id; id++; } return -1; } int PAR2ProcOCL::defaultDeviceId(int platformId) { cl::Platform platform; if(!getPlatform(platform, platformId)) return -1; std::vector devices; platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); // match first GPU device that's available int firstAvailable = -1; for(unsigned i=0; i() && device.getInfo() && device.getInfo()) { if(firstAvailable == -1) firstAvailable = i; if(device.getInfo() == CL_DEVICE_TYPE_GPU) return i; // found first GPU } } return firstAvailable; } // based off getWGsizes function from clinfo size_t GF16OCL_DeviceInfo::getWGSize(const cl::Context& context, const cl::Device& device) { try { cl::Program program(context, "#define GWO(type) global type* restrict\n" "#define GRO(type) global const type* restrict\n" "#define BODY int i = get_global_id(0); out[i] = in1[i] + in2[i]\n" "#define _KRN(T, N) kernel void sum##N(GWO(T##N) out, GRO(T##N) in1, GRO(T##N) in2) { BODY; }\n" "#define KRN(N) _KRN(int, N)\n" "KRN()\n/* KRN(2)\nKRN(4)\nKRN(8)\nKRN(16) */\n", true); cl::Kernel kern(program, "sum"); return kern.getWorkGroupInfo(device); } catch(cl::Error const&) { return 0; } } GF16OCL_DeviceInfo::GF16OCL_DeviceInfo(int _id, const cl::Device& device) { id = _id; name = device.getInfo(); vendorId = device.getInfo(); type = device.getInfo(); available = device.getInfo() && device.getInfo(); supported = available && device.getInfo(); memory = device.getInfo(); maxWorkGroup = (unsigned)device.getInfo(); computeUnits = device.getInfo(); globalCache = device.getInfo(); constantMemory = device.getInfo(); localMemory = device.getInfo(); localMemoryIsGlobal = device.getInfo() == CL_GLOBAL; maxAllocation = device.getInfo(); unifiedMemory = device.getInfo(); workGroupMultiple = (unsigned)getWGSize(device, device); } std::vector PAR2ProcOCL::getDevices(int platformId) { cl::Platform platform; if(!getPlatform(platform, platformId)) return std::vector(); std::vector devices; try { platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); } catch(cl::Error const&) { return {}; } std::vector ret; ret.reserve(devices.size()); for(unsigned i=0; i devices; if(deviceId == -1) { // first try GPU try { platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); } catch(cl::Error const&) {} // if no GPU device found, try anything else if(devices.size() < 1) { try { platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); } catch(cl::Error const&) {} } if(devices.size() < 1) return false; // no devices! // match first device that's available bool found = false; for(unsigned i=0; i() && device.getInfo() && device.getInfo()) { found = true; break; } } if(!found) return false; } else { platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); if(deviceId >= (int)devices.size()) return false; device = devices[deviceId]; } return true; }