16 # include <sys/types.h>
30 # if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(__CYGWIN__) || \
34 # elif defined(__linux__)
37 # elif defined(__APPLE__)
46 # include <mach/mach_time.h>
59 GPUEnv OpenclDevice::gpuEnv;
61 bool OpenclDevice::deviceIsSelected =
false;
62 ds_device OpenclDevice::selectedDevice;
64 int OpenclDevice::isInited = 0;
66 static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
68 static const l_uint32 lmask32[] = {
69 0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000, 0xfe000000, 0xff000000,
70 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000, 0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000,
71 0xffff8000, 0xffffc000, 0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
72 0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc, 0xfffffffe, 0xffffffff};
74 static const l_uint32 rmask32[] = {
75 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
76 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
77 0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
78 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff};
80 static cl_mem pixsCLBuffer, pixdCLBuffer,
82 static cl_mem pixThBuffer;
83 static cl_int clStatus;
84 static KernelEnv rEnv;
86 # define DS_TAG_VERSION "<version>"
87 # define DS_TAG_VERSION_END "</version>"
88 # define DS_TAG_DEVICE "<device>"
89 # define DS_TAG_DEVICE_END "</device>"
90 # define DS_TAG_SCORE "<score>"
91 # define DS_TAG_SCORE_END "</score>"
92 # define DS_TAG_DEVICE_TYPE "<type>"
93 # define DS_TAG_DEVICE_TYPE_END "</type>"
94 # define DS_TAG_DEVICE_NAME "<name>"
95 # define DS_TAG_DEVICE_NAME_END "</name>"
96 # define DS_TAG_DEVICE_DRIVER_VERSION "<driver>"
97 # define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>"
99 # define DS_DEVICE_NATIVE_CPU_STRING "native_cpu"
101 # define DS_DEVICE_NAME_LENGTH 256
103 enum ds_evaluation_type { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY };
106 std::vector<ds_device> devices;
107 unsigned int numDevices;
113 DS_INVALID_PROFILE = 1000,
115 DS_INVALID_PERF_EVALUATOR_TYPE,
116 DS_INVALID_PERF_EVALUATOR,
117 DS_PERF_EVALUATOR_ERROR,
119 DS_UNKNOWN_DEVICE_TYPE,
120 DS_PROFILE_FILE_ERROR,
121 DS_SCORE_SERIALIZER_ERROR,
122 DS_SCORE_DESERIALIZER_ERROR
129 typedef ds_status (*ds_perf_evaluator)(ds_device *device,
void *data);
132 typedef ds_status (*ds_score_release)(TessDeviceScore *score);
134 static ds_status releaseDSProfile(ds_profile *profile, ds_score_release sr) {
135 ds_status status = DS_SUCCESS;
136 if (profile !=
nullptr) {
139 for (i = 0; i < profile->numDevices; i++) {
140 free(profile->devices[i].oclDeviceName);
141 free(profile->devices[i].oclDriverVersion);
142 status = sr(profile->devices[i].score);
143 if (status != DS_SUCCESS)
152 static ds_status initDSProfile(ds_profile **p,
const char *version) {
154 cl_uint numPlatforms;
155 std::vector<cl_platform_id> platforms;
156 std::vector<cl_device_id> devices;
157 ds_status status = DS_SUCCESS;
162 return DS_INVALID_PROFILE;
164 ds_profile *profile =
new ds_profile;
166 memset(profile, 0,
sizeof(ds_profile));
168 clGetPlatformIDs(0,
nullptr, &numPlatforms);
170 if (numPlatforms > 0) {
171 platforms.reserve(numPlatforms);
172 clGetPlatformIDs(numPlatforms, &platforms[0],
nullptr);
176 for (i = 0; i < numPlatforms; i++) {
178 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0,
nullptr, &num);
182 if (numDevices > 0) {
183 devices.reserve(numDevices);
186 profile->numDevices = numDevices + 1;
187 profile->devices.reserve(profile->numDevices);
188 memset(&profile->devices[0], 0, profile->numDevices *
sizeof(ds_device));
191 for (i = 0; i < numPlatforms; i++) {
194 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &num);
195 for (j = 0; j < num; j++, next++) {
196 char buffer[DS_DEVICE_NAME_LENGTH];
199 profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
200 profile->devices[next].oclDeviceID = devices[j];
202 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME, DS_DEVICE_NAME_LENGTH,
204 length = strlen(buffer);
205 profile->devices[next].oclDeviceName = (
char *)malloc(length + 1);
206 memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
208 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION, DS_DEVICE_NAME_LENGTH,
210 length = strlen(buffer);
211 profile->devices[next].oclDriverVersion = (
char *)malloc(length + 1);
212 memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
215 profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
216 profile->version = version;
222 static ds_status profileDevices(ds_profile *profile,
const ds_evaluation_type type,
223 ds_perf_evaluator evaluator,
void *evaluatorData,
224 unsigned int *numUpdates) {
225 ds_status status = DS_SUCCESS;
227 unsigned int updates = 0;
229 if (profile ==
nullptr) {
230 return DS_INVALID_PROFILE;
232 if (evaluator ==
nullptr) {
233 return DS_INVALID_PERF_EVALUATOR;
236 for (i = 0; i < profile->numDevices; i++) {
237 ds_status evaluatorStatus;
240 case DS_EVALUATE_NEW_ONLY:
241 if (profile->devices[i].score !=
nullptr)
244 case DS_EVALUATE_ALL:
245 evaluatorStatus = evaluator(&profile->devices[i], evaluatorData);
246 if (evaluatorStatus != DS_SUCCESS) {
247 status = evaluatorStatus;
253 return DS_INVALID_PERF_EVALUATOR_TYPE;
258 *numUpdates = updates;
262 static const char *findString(
const char *contentStart,
const char *contentEnd,
263 const char *
string) {
265 const char *currentPosition;
266 const char *found =
nullptr;
267 stringLength = strlen(
string);
268 currentPosition = contentStart;
269 for (currentPosition = contentStart; currentPosition < contentEnd; currentPosition++) {
270 if (*currentPosition ==
string[0]) {
271 if (currentPosition + stringLength < contentEnd) {
272 if (strncmp(currentPosition,
string, stringLength) == 0) {
273 found = currentPosition;
282 static ds_status readProFile(
const char *fileName,
char **content,
size_t *contentSize) {
285 ds_status status = DS_SUCCESS;
286 FILE *input = fopen(fileName,
"rb");
287 if (input ==
nullptr) {
288 status = DS_FILE_ERROR;
290 fseek(input, 0L, SEEK_END);
291 auto pos = std::ftell(input);
295 char *binary =
new char[size];
296 if (fread(binary,
sizeof(
char), size, input) != size) {
297 status = DS_FILE_ERROR;
309 typedef ds_status (*ds_score_deserializer)(ds_device *device,
const uint8_t *serializedScore,
310 unsigned int serializedScoreSize);
312 static ds_status readProfileFromFile(ds_profile *profile, ds_score_deserializer deserializer,
314 ds_status status = DS_SUCCESS;
318 if (profile ==
nullptr)
319 return DS_INVALID_PROFILE;
321 status = readProFile(
file, &contentStart, &contentSize);
322 if (status == DS_SUCCESS) {
323 const char *currentPosition;
324 const char *dataStart;
327 const char *contentEnd = contentStart + contentSize;
328 currentPosition = contentStart;
331 dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
332 if (dataStart ==
nullptr) {
333 status = DS_PROFILE_FILE_ERROR;
336 dataStart += strlen(DS_TAG_VERSION);
338 dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
339 if (dataEnd ==
nullptr) {
340 status = DS_PROFILE_FILE_ERROR;
344 size_t versionStringLength = strlen(profile->version);
345 if (versionStringLength + dataStart != dataEnd ||
346 strncmp(profile->version, dataStart, versionStringLength) != 0) {
348 status = DS_PROFILE_FILE_ERROR;
351 currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
357 const char *deviceTypeStart;
358 const char *deviceTypeEnd;
359 ds_device_type deviceType;
361 const char *deviceNameStart;
362 const char *deviceNameEnd;
364 const char *deviceScoreStart;
365 const char *deviceScoreEnd;
367 const char *deviceDriverStart;
368 const char *deviceDriverEnd;
370 dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
371 if (dataStart ==
nullptr) {
375 dataStart += strlen(DS_TAG_DEVICE);
376 dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
377 if (dataEnd ==
nullptr) {
378 status = DS_PROFILE_FILE_ERROR;
383 deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
384 if (deviceTypeStart ==
nullptr) {
385 status = DS_PROFILE_FILE_ERROR;
388 deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
389 deviceTypeEnd = findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
390 if (deviceTypeEnd ==
nullptr) {
391 status = DS_PROFILE_FILE_ERROR;
394 memcpy(&deviceType, deviceTypeStart,
sizeof(ds_device_type));
397 if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
398 deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
399 if (deviceNameStart ==
nullptr) {
400 status = DS_PROFILE_FILE_ERROR;
403 deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
404 deviceNameEnd = findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
405 if (deviceNameEnd ==
nullptr) {
406 status = DS_PROFILE_FILE_ERROR;
410 deviceDriverStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
411 if (deviceDriverStart ==
nullptr) {
412 status = DS_PROFILE_FILE_ERROR;
415 deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
417 findString(deviceDriverStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION_END);
418 if (deviceDriverEnd ==
nullptr) {
419 status = DS_PROFILE_FILE_ERROR;
424 for (i = 0; i < profile->numDevices; i++) {
425 if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
426 size_t actualDeviceNameLength;
427 size_t driverVersionLength;
429 actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
430 driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
431 if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
432 deviceDriverStart + driverVersionLength == deviceDriverEnd &&
433 strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
434 actualDeviceNameLength) == 0 &&
435 strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
436 driverVersionLength) == 0) {
437 deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
438 deviceScoreStart += strlen(DS_TAG_SCORE);
439 deviceScoreEnd = findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
440 status = deserializer(&profile->devices[i], (
const unsigned char *)deviceScoreStart,
441 deviceScoreEnd - deviceScoreStart);
442 if (status != DS_SUCCESS) {
448 }
else if (deviceType == DS_DEVICE_NATIVE_CPU) {
449 for (i = 0; i < profile->numDevices; i++) {
450 if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
451 deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
452 if (deviceScoreStart ==
nullptr) {
453 status = DS_PROFILE_FILE_ERROR;
456 deviceScoreStart += strlen(DS_TAG_SCORE);
457 deviceScoreEnd = findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
458 status = deserializer(&profile->devices[i], (
const unsigned char *)deviceScoreStart,
459 deviceScoreEnd - deviceScoreStart);
460 if (status != DS_SUCCESS) {
468 currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
472 delete[] contentStart;
476 typedef ds_status (*ds_score_serializer)(ds_device *device, uint8_t **serializedScore,
477 unsigned int *serializedScoreSize);
478 static ds_status writeProfileToFile(ds_profile *profile, ds_score_serializer serializer,
480 ds_status status = DS_SUCCESS;
482 if (profile ==
nullptr)
483 return DS_INVALID_PROFILE;
485 FILE *profileFile = fopen(
file,
"wb");
486 if (profileFile ==
nullptr) {
487 status = DS_FILE_ERROR;
492 fwrite(DS_TAG_VERSION,
sizeof(
char), strlen(DS_TAG_VERSION), profileFile);
493 fwrite(profile->version,
sizeof(
char), strlen(profile->version), profileFile);
494 fwrite(DS_TAG_VERSION_END,
sizeof(
char), strlen(DS_TAG_VERSION_END), profileFile);
495 fwrite(
"\n",
sizeof(
char), 1, profileFile);
497 for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
498 uint8_t *serializedScore;
499 unsigned int serializedScoreSize;
501 fwrite(DS_TAG_DEVICE,
sizeof(
char), strlen(DS_TAG_DEVICE), profileFile);
503 fwrite(DS_TAG_DEVICE_TYPE,
sizeof(
char), strlen(DS_TAG_DEVICE_TYPE), profileFile);
504 fwrite(&profile->devices[i].type,
sizeof(ds_device_type), 1, profileFile);
505 fwrite(DS_TAG_DEVICE_TYPE_END,
sizeof(
char), strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
507 switch (profile->devices[i].type) {
508 case DS_DEVICE_NATIVE_CPU: {
519 case DS_DEVICE_OPENCL_DEVICE: {
520 fwrite(DS_TAG_DEVICE_NAME,
sizeof(
char), strlen(DS_TAG_DEVICE_NAME), profileFile);
521 fwrite(profile->devices[i].oclDeviceName,
sizeof(
char),
522 strlen(profile->devices[i].oclDeviceName), profileFile);
523 fwrite(DS_TAG_DEVICE_NAME_END,
sizeof(
char), strlen(DS_TAG_DEVICE_NAME_END), profileFile);
525 fwrite(DS_TAG_DEVICE_DRIVER_VERSION,
sizeof(
char), strlen(DS_TAG_DEVICE_DRIVER_VERSION),
527 fwrite(profile->devices[i].oclDriverVersion,
sizeof(
char),
528 strlen(profile->devices[i].oclDriverVersion), profileFile);
529 fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END,
sizeof(
char),
530 strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
533 status = DS_UNKNOWN_DEVICE_TYPE;
537 fwrite(DS_TAG_SCORE,
sizeof(
char), strlen(DS_TAG_SCORE), profileFile);
538 status = serializer(&profile->devices[i], &serializedScore, &serializedScoreSize);
539 if (status == DS_SUCCESS && serializedScore !=
nullptr && serializedScoreSize > 0) {
540 fwrite(serializedScore,
sizeof(
char), serializedScoreSize, profileFile);
541 delete[] serializedScore;
543 fwrite(DS_TAG_SCORE_END,
sizeof(
char), strlen(DS_TAG_SCORE_END), profileFile);
544 fwrite(DS_TAG_DEVICE_END,
sizeof(
char), strlen(DS_TAG_DEVICE_END), profileFile);
545 fwrite(
"\n",
sizeof(
char), 1, profileFile);
553 static void legalizeFileName(
char *fileName) {
555 const char *invalidChars =
"/\?:*\"><| ";
557 for (
unsigned i = 0; i < strlen(invalidChars); i++) {
559 invalidStr[0] = invalidChars[i];
560 invalidStr[1] =
'\0';
566 for (
char *pos = strstr(fileName, invalidStr); pos !=
nullptr;
567 pos = strstr(pos + 1, invalidStr)) {
575 static void populateGPUEnvFromDevice(GPUEnv *gpuInfo, cl_device_id device) {
578 gpuInfo->mnIsUserCreated = 1;
580 gpuInfo->mpDevID = device;
581 gpuInfo->mpArryDevsID =
new cl_device_id[1];
582 gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
583 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE,
sizeof(cl_device_type),
584 &gpuInfo->mDevType, &size);
585 CHECK_OPENCL(clStatus,
"populateGPUEnv::getDeviceInfo(TYPE)");
587 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM,
sizeof(cl_platform_id),
588 &gpuInfo->mpPlatformID, &size);
589 CHECK_OPENCL(clStatus,
"populateGPUEnv::getDeviceInfo(PLATFORM)");
591 cl_context_properties props[3];
592 props[0] = CL_CONTEXT_PLATFORM;
593 props[1] = (cl_context_properties)gpuInfo->mpPlatformID;
595 gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID,
nullptr,
nullptr, &clStatus);
596 CHECK_OPENCL(clStatus,
"populateGPUEnv::createContext");
598 cl_command_queue_properties queueProperties = 0;
599 gpuInfo->mpCmdQueue =
600 clCreateCommandQueue(gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus);
601 CHECK_OPENCL(clStatus,
"populateGPUEnv::createCommandQueue");
604 int OpenclDevice::LoadOpencl() {
606 HINSTANCE HOpenclDll =
nullptr;
607 void *OpenclDll =
nullptr;
609 OpenclDll =
static_cast<HINSTANCE
>(HOpenclDll);
610 OpenclDll = LoadLibrary(
"openCL.dll");
611 if (!
static_cast<HINSTANCE
>(OpenclDll)) {
612 fprintf(stderr,
"[OD] Load opencl.dll failed!\n");
613 FreeLibrary(
static_cast<HINSTANCE
>(OpenclDll));
616 fprintf(stderr,
"[OD] Load opencl.dll successful!\n");
620 int OpenclDevice::SetKernelEnv(KernelEnv *envInfo) {
621 envInfo->mpkContext = gpuEnv.mpContext;
622 envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
623 envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
628 static cl_mem allocateZeroCopyBuffer(
const KernelEnv &rEnv, l_uint32 *hostbuffer,
size_t nElements,
629 cl_mem_flags flags, cl_int *pStatus) {
630 cl_mem membuffer = clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags),
631 nElements *
sizeof(l_uint32), hostbuffer, pStatus);
636 static Image mapOutputCLBuffer(
const KernelEnv &rEnv, cl_mem clbuffer, Image pixd, Image pixs,
637 int elements, cl_mem_flags flags,
bool memcopy =
false,
641 if ((pixd = pixCreateTemplate(pixs)) ==
nullptr)
644 if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) ==
650 (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
651 elements *
sizeof(l_uint32), 0,
nullptr,
nullptr,
nullptr);
654 memcpy(pixGetData(pixd), pValues, elements *
sizeof(l_uint32));
656 pixSetData(pixd, pValues);
659 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0,
nullptr,
nullptr);
662 clFinish(rEnv.mpkCmdQueue);
668 void OpenclDevice::releaseMorphCLBuffers() {
669 if (pixdCLIntermediate !=
nullptr)
670 clReleaseMemObject(pixdCLIntermediate);
671 if (pixsCLBuffer !=
nullptr)
672 clReleaseMemObject(pixsCLBuffer);
673 if (pixdCLBuffer !=
nullptr)
674 clReleaseMemObject(pixdCLBuffer);
675 if (pixThBuffer !=
nullptr)
676 clReleaseMemObject(pixThBuffer);
677 pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer =
nullptr;
680 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Image pixs) {
683 if (pixThBuffer !=
nullptr) {
684 pixsCLBuffer = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
687 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
688 sizeof(l_uint32) * wpl * h, 0,
nullptr,
nullptr);
691 l_uint32 *srcdata =
reinterpret_cast<l_uint32 *
>(malloc(wpl * h *
sizeof(l_uint32)));
692 memcpy(srcdata, pixGetData(pixs), wpl * h *
sizeof(l_uint32));
694 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h, CL_MEM_USE_HOST_PTR, &clStatus);
697 pixdCLBuffer = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
700 allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
702 return (
int)clStatus;
705 int OpenclDevice::InitEnv() {
709 if (1 == LoadOpencl())
715 InitOpenclRunEnv_DeviceSelection(0);
719 int OpenclDevice::ReleaseOpenclRunEnv() {
720 ReleaseOpenclEnv(&gpuEnv);
727 inline int OpenclDevice::AddKernelConfig(
int kCount,
const char *kName) {
729 ASSERT_HOST(strlen(kName) <
sizeof(gpuEnv.mArrykernelNames[kCount - 1]));
730 strcpy(gpuEnv.mArrykernelNames[kCount - 1], kName);
731 gpuEnv.mnKernelCount++;
735 int OpenclDevice::RegistOpenclKernel() {
736 if (!gpuEnv.mnIsUserCreated)
737 memset(&gpuEnv, 0,
sizeof(gpuEnv));
739 gpuEnv.mnFileCount = 0;
740 gpuEnv.mnKernelCount = 0UL;
742 AddKernelConfig(1,
"oclAverageSub1");
746 int OpenclDevice::InitOpenclRunEnv_DeviceSelection(
int argc) {
749 ds_device bestDevice_DS = getDeviceSelection();
750 cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
752 if (selectedDeviceIsOpenCL()) {
755 populateGPUEnvFromDevice(&gpuEnv, bestDevice);
756 gpuEnv.mnFileCount = 0;
757 gpuEnv.mnKernelCount = 0UL;
758 CompileKernelFile(&gpuEnv,
"");
768 OpenclDevice::OpenclDevice() {
772 OpenclDevice::~OpenclDevice() {
776 int OpenclDevice::ReleaseOpenclEnv(GPUEnv *gpuInfo) {
784 for (i = 0; i < gpuEnv.mnFileCount; i++) {
785 if (gpuEnv.mpArryPrograms[i]) {
786 clStatus = clReleaseProgram(gpuEnv.mpArryPrograms[i]);
787 CHECK_OPENCL(clStatus,
"clReleaseProgram");
788 gpuEnv.mpArryPrograms[i] =
nullptr;
791 if (gpuEnv.mpCmdQueue) {
792 clReleaseCommandQueue(gpuEnv.mpCmdQueue);
793 gpuEnv.mpCmdQueue =
nullptr;
795 if (gpuEnv.mpContext) {
796 clReleaseContext(gpuEnv.mpContext);
797 gpuEnv.mpContext =
nullptr;
800 gpuInfo->mnIsUserCreated = 0;
801 delete[] gpuInfo->mpArryDevsID;
804 int OpenclDevice::BinaryGenerated(
const char *clFileName, FILE **fhandle) {
809 char fileName[256] = {0}, cl_name[128] = {0};
810 char deviceName[1024];
811 clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME,
sizeof(deviceName), deviceName,
813 CHECK_OPENCL(clStatus,
"clGetDeviceInfo");
814 const char *str = strstr(clFileName,
".cl");
815 memcpy(cl_name, clFileName, str - clFileName);
816 cl_name[str - clFileName] =
'\0';
817 sprintf(fileName,
"%s-%s.bin", cl_name, deviceName);
818 legalizeFileName(fileName);
819 fd = fopen(fileName,
"rb");
820 status = (fd !=
nullptr) ? 1 : 0;
826 int OpenclDevice::CachedOfKernerPrg(
const GPUEnv *gpuEnvCached,
const char *clFileName) {
828 for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
829 if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
830 if (gpuEnvCached->mpArryPrograms[i] !=
nullptr) {
838 int OpenclDevice::WriteBinaryToFile(
const char *fileName,
const char *birary,
size_t numBytes) {
839 FILE *output =
nullptr;
840 output = fopen(fileName,
"wb");
841 if (output ==
nullptr) {
845 fwrite(birary,
sizeof(
char), numBytes, output);
851 int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
const char *clFileName) {
857 clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
sizeof(numDevices), &numDevices,
nullptr);
858 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
860 std::vector<cl_device_id> mpArryDevsID(numDevices);
863 clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
sizeof(cl_device_id) * numDevices,
864 &mpArryDevsID[0],
nullptr);
865 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
868 std::vector<size_t> binarySizes(numDevices);
870 clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
sizeof(
size_t) * numDevices,
871 &binarySizes[0],
nullptr);
872 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
875 std::vector<char *> binaries(numDevices);
877 for (i = 0; i < numDevices; i++) {
878 if (binarySizes[i] != 0) {
879 binaries[i] =
new char[binarySizes[i]];
881 binaries[i] =
nullptr;
885 clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES,
sizeof(
char *) * numDevices,
886 &binaries[0],
nullptr);
887 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
890 for (i = 0; i < numDevices; i++) {
891 char fileName[256] = {0}, cl_name[128] = {0};
893 if (binarySizes[i] != 0) {
894 char deviceName[1024];
896 clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
sizeof(deviceName), deviceName,
nullptr);
897 CHECK_OPENCL(clStatus,
"clGetDeviceInfo");
899 const char *str = strstr(clFileName,
".cl");
900 memcpy(cl_name, clFileName, str - clFileName);
901 cl_name[str - clFileName] =
'\0';
902 sprintf(fileName,
"%s-%s.bin", cl_name, deviceName);
903 legalizeFileName(fileName);
904 if (!WriteBinaryToFile(fileName, binaries[i], binarySizes[i])) {
905 tprintf(
"[OD] write binary[%s] failed\n", fileName);
908 tprintf(
"[OD] write binary[%s] successfully\n", fileName);
913 for (i = 0; i < numDevices; i++) {
914 delete[] binaries[i];
920 int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo,
const char *buildOption) {
923 size_t source_size[1];
924 int binary_status, binaryExisted, idx;
927 const char *filename =
"kernel.cl";
929 if (CachedOfKernerPrg(gpuInfo, filename) == 1) {
933 idx = gpuInfo->mnFileCount;
937 source_size[0] = strlen(source);
939 binaryExisted = BinaryGenerated(filename, &fd);
940 if (binaryExisted == 1) {
941 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
sizeof(numDevices),
942 &numDevices,
nullptr);
943 CHECK_OPENCL(clStatus,
"clGetContextInfo");
945 std::vector<cl_device_id> mpArryDevsID(numDevices);
946 bool b_error = fseek(fd, 0, SEEK_END) < 0;
947 auto pos = std::ftell(fd);
948 b_error |= (pos <= 0);
950 b_error |= fseek(fd, 0, SEEK_SET) < 0;
956 std::vector<uint8_t> binary(length + 2);
958 memset(&binary[0], 0, length + 2);
959 b_error |= fread(&binary[0], 1, length, fd) != length;
964 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
965 sizeof(cl_device_id) * numDevices, &mpArryDevsID[0],
nullptr);
966 CHECK_OPENCL(clStatus,
"clGetContextInfo");
968 const uint8_t *c_binary = &binary[0];
969 gpuInfo->mpArryPrograms[idx] =
970 clCreateProgramWithBinary(gpuInfo->mpContext, numDevices, &mpArryDevsID[0], &length,
971 &c_binary, &binary_status, &clStatus);
972 CHECK_OPENCL(clStatus,
"clCreateProgramWithBinary");
976 gpuInfo->mpArryPrograms[idx] =
977 clCreateProgramWithSource(gpuInfo->mpContext, 1, &source, source_size, &clStatus);
978 CHECK_OPENCL(clStatus,
"clCreateProgramWithSource");
981 if (gpuInfo->mpArryPrograms[idx] == (cl_program)
nullptr) {
988 if (!gpuInfo->mnIsUserCreated) {
989 clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID, buildOption,
992 clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID), buildOption,
995 if (clStatus != CL_SUCCESS) {
996 tprintf(
"BuildProgram error!\n");
998 if (!gpuInfo->mnIsUserCreated) {
999 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1000 CL_PROGRAM_BUILD_LOG, 0,
nullptr, &length);
1002 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1003 CL_PROGRAM_BUILD_LOG, 0,
nullptr, &length);
1005 if (clStatus != CL_SUCCESS) {
1006 tprintf(
"opencl create build log fail\n");
1009 std::vector<char> buildLog(length);
1010 if (!gpuInfo->mnIsUserCreated) {
1011 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1012 CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1014 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1015 CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1017 if (clStatus != CL_SUCCESS) {
1018 tprintf(
"opencl program build info fail\n");
1022 fd1 = fopen(
"kernel-build.log",
"w+");
1023 if (fd1 !=
nullptr) {
1024 fwrite(&buildLog[0],
sizeof(
char), length, fd1);
1031 strcpy(gpuInfo->mArryKnelSrcFile[idx], filename);
1032 if (binaryExisted == 0) {
1033 GeneratBinFromKernelSource(gpuInfo->mpArryPrograms[idx], filename);
1036 gpuInfo->mnFileCount += 1;
1040 l_uint32 *OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata, l_int32 w, l_int32 h, l_int32 wpl,
1044 size_t globalThreads[2];
1045 size_t localThreads[2];
1051 gsize = (w + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1052 globalThreads[0] = gsize;
1053 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1054 globalThreads[1] = gsize;
1055 localThreads[0] = GROUPSIZE_X;
1056 localThreads[1] = GROUPSIZE_Y;
1058 SetKernelEnv(&rEnv);
1060 l_uint32 *pResult = (l_uint32 *)malloc(w * h *
sizeof(l_uint32));
1061 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"composeRGBPixel", &clStatus);
1062 CHECK_OPENCL(clStatus,
"clCreateKernel composeRGBPixel");
1065 valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w * h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1067 outputCl = allocateZeroCopyBuffer(rEnv, pResult, w * h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
1071 clStatus = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &valuesCl);
1072 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1073 clStatus = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(w), &w);
1074 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1075 clStatus = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(h), &h);
1076 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1077 clStatus = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1078 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1079 clStatus = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &outputCl);
1080 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1083 clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1084 localThreads, 0,
nullptr,
nullptr);
1085 CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel");
1088 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0,
1089 w * h *
sizeof(l_uint32), 0,
nullptr,
nullptr, &clStatus);
1090 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer outputCl");
1091 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0,
nullptr,
nullptr);
1094 clFinish(rEnv.mpkCmdQueue);
1100 static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
1101 size_t globalThreads[2];
1105 size_t localThreads[2];
1108 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1109 globalThreads[0] = gsize;
1110 globalThreads[1] = GROUPSIZE_HMORY;
1111 localThreads[0] = GROUPSIZE_HMORX;
1112 localThreads[1] = GROUPSIZE_HMORY;
1114 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor_5x5", &status);
1115 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor_5x5");
1117 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1118 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1119 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1120 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1122 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1123 localThreads, 0,
nullptr,
nullptr);
1126 pixtemp = pixsCLBuffer;
1127 pixsCLBuffer = pixdCLBuffer;
1128 pixdCLBuffer = pixtemp;
1131 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1132 globalThreads[0] = gsize;
1133 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1134 globalThreads[1] = gsize;
1135 localThreads[0] = GROUPSIZE_X;
1136 localThreads[1] = GROUPSIZE_Y;
1138 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoDilateVer_5x5", &status);
1139 CHECK_OPENCL(status,
"clCreateKernel morphoDilateVer_5x5");
1141 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1142 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1143 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1144 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1145 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1146 localThreads, 0,
nullptr,
nullptr);
1153 static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) {
1154 size_t globalThreads[2];
1158 l_uint32 fwmask, lwmask;
1159 size_t localThreads[2];
1161 lwmask = lmask32[31 - 2];
1162 fwmask = rmask32[31 - 2];
1165 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1166 globalThreads[0] = gsize;
1167 globalThreads[1] = GROUPSIZE_HMORY;
1168 localThreads[0] = GROUPSIZE_HMORX;
1169 localThreads[1] = GROUPSIZE_HMORY;
1171 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor_5x5", &status);
1172 CHECK_OPENCL(status,
"clCreateKernel morphoErodeHor_5x5");
1174 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1175 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1176 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1177 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1179 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1180 localThreads, 0,
nullptr,
nullptr);
1183 pixtemp = pixsCLBuffer;
1184 pixsCLBuffer = pixdCLBuffer;
1185 pixdCLBuffer = pixtemp;
1188 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1189 globalThreads[0] = gsize;
1190 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1191 globalThreads[1] = gsize;
1192 localThreads[0] = GROUPSIZE_X;
1193 localThreads[1] = GROUPSIZE_Y;
1195 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeVer_5x5", &status);
1196 CHECK_OPENCL(status,
"clCreateKernel morphoErodeVer_5x5");
1198 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1199 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1200 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1201 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1202 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(fwmask), &fwmask);
1203 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(lwmask), &lwmask);
1204 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1205 localThreads, 0,
nullptr,
nullptr);
1211 static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1212 l_int32 xp, yp, xn, yn;
1214 size_t globalThreads[2];
1218 size_t localThreads[2];
1221 OpenclDevice::SetKernelEnv(&rEnv);
1223 if (hsize == 5 && vsize == 5) {
1225 status = pixDilateCL_55(wpl, h);
1229 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1231 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1234 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1235 globalThreads[0] = gsize;
1236 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1237 globalThreads[1] = gsize;
1238 localThreads[0] = GROUPSIZE_X;
1239 localThreads[1] = GROUPSIZE_Y;
1241 if (xp > 31 || xn > 31) {
1243 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor", &status);
1244 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor");
1246 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1247 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1248 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1249 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(xn), &xn);
1250 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(wpl), &wpl);
1251 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(h), &h);
1252 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1253 localThreads, 0,
nullptr,
nullptr);
1255 if (yp > 0 || yn > 0) {
1256 pixtemp = pixsCLBuffer;
1257 pixsCLBuffer = pixdCLBuffer;
1258 pixdCLBuffer = pixtemp;
1260 }
else if (xp > 0 || xn > 0) {
1262 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor_32word", &status);
1263 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor_32word");
1264 isEven = (xp != xn);
1266 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1267 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1268 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1269 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1270 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1271 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isEven), &isEven);
1272 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1273 localThreads, 0,
nullptr,
nullptr);
1275 if (yp > 0 || yn > 0) {
1276 pixtemp = pixsCLBuffer;
1277 pixsCLBuffer = pixdCLBuffer;
1278 pixdCLBuffer = pixtemp;
1282 if (yp > 0 || yn > 0) {
1283 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoDilateVer", &status);
1284 CHECK_OPENCL(status,
"clCreateKernel morphoDilateVer");
1286 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1287 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1288 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(yp), &yp);
1289 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1290 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1291 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(yn), &yn);
1292 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1293 localThreads, 0,
nullptr,
nullptr);
1300 static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) {
1301 l_int32 xp, yp, xn, yn;
1303 size_t globalThreads[2];
1304 size_t localThreads[2];
1308 char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1309 l_uint32 rwmask, lwmask;
1312 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1314 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1316 OpenclDevice::SetKernelEnv(&rEnv);
1318 if (hsize == 5 && vsize == 5 && isAsymmetric) {
1320 status = pixErodeCL_55(wpl, h);
1324 lwmask = lmask32[31 - (xn & 31)];
1325 rwmask = rmask32[31 - (xp & 31)];
1328 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1329 globalThreads[0] = gsize;
1330 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1331 globalThreads[1] = gsize;
1332 localThreads[0] = GROUPSIZE_X;
1333 localThreads[1] = GROUPSIZE_Y;
1336 if (xp > 31 || xn > 31) {
1338 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor", &status);
1340 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1341 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1342 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1343 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(xn), &xn);
1344 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(wpl), &wpl);
1345 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(h), &h);
1346 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(isAsymmetric), &isAsymmetric);
1347 status = clSetKernelArg(rEnv.mpkKernel, 7,
sizeof(rwmask), &rwmask);
1348 status = clSetKernelArg(rEnv.mpkKernel, 8,
sizeof(lwmask), &lwmask);
1349 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1350 localThreads, 0,
nullptr,
nullptr);
1352 if (yp > 0 || yn > 0) {
1353 pixtemp = pixsCLBuffer;
1354 pixsCLBuffer = pixdCLBuffer;
1355 pixdCLBuffer = pixtemp;
1357 }
else if (xp > 0 || xn > 0) {
1358 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor_32word", &status);
1359 isEven = (xp != xn);
1361 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1362 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1363 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1364 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1365 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1366 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isAsymmetric), &isAsymmetric);
1367 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(rwmask), &rwmask);
1368 status = clSetKernelArg(rEnv.mpkKernel, 7,
sizeof(lwmask), &lwmask);
1369 status = clSetKernelArg(rEnv.mpkKernel, 8,
sizeof(isEven), &isEven);
1370 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1371 localThreads, 0,
nullptr,
nullptr);
1373 if (yp > 0 || yn > 0) {
1374 pixtemp = pixsCLBuffer;
1375 pixsCLBuffer = pixdCLBuffer;
1376 pixdCLBuffer = pixtemp;
1381 if (yp > 0 || yn > 0) {
1382 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeVer", &status);
1383 CHECK_OPENCL(status,
"clCreateKernel morphoErodeVer");
1385 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1386 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1387 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(yp), &yp);
1388 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1389 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1390 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isAsymmetric), &isAsymmetric);
1391 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(yn), &yn);
1392 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1393 localThreads, 0,
nullptr,
nullptr);
1400 static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1405 status = pixErodeCL(hsize, vsize, wpl, h);
1407 pixtemp = pixsCLBuffer;
1408 pixsCLBuffer = pixdCLBuffer;
1409 pixdCLBuffer = pixtemp;
1411 status = pixDilateCL(hsize, vsize, wpl, h);
1417 static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1422 status = pixDilateCL(hsize, vsize, wpl, h);
1424 pixtemp = pixsCLBuffer;
1425 pixsCLBuffer = pixdCLBuffer;
1426 pixdCLBuffer = pixtemp;
1428 status = pixErodeCL(hsize, vsize, wpl, h);
1434 static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2) {
1436 size_t globalThreads[2];
1438 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1440 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1441 globalThreads[0] = gsize;
1442 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1443 globalThreads[1] = gsize;
1445 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"pixSubtract_inplace", &status);
1446 CHECK_OPENCL(status,
"clCreateKernel pixSubtract_inplace");
1449 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &buffer1);
1450 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &buffer2);
1451 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1452 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1453 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1454 localThreads, 0,
nullptr,
nullptr);
1462 void OpenclDevice::pixGetLinesCL(Image pixd, Image pixs, Image *pix_vline, Image *pix_hline,
1463 Image *pixClosed,
bool getpixClosed, l_int32 close_hsize,
1464 l_int32 close_vsize, l_int32 open_hsize, l_int32 open_vsize,
1465 l_int32 line_hsize, l_int32 line_vsize) {
1469 wpl = pixGetWpl(pixs);
1470 h = pixGetHeight(pixs);
1473 clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1478 mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl * h, CL_MAP_READ,
true,
false);
1483 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0,
1484 sizeof(
int) * wpl * h, 0,
nullptr,
nullptr);
1487 pixtemp = pixsCLBuffer;
1488 pixsCLBuffer = pixdCLBuffer;
1489 pixdCLBuffer = pixtemp;
1491 clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1494 pixtemp = pixsCLBuffer;
1495 pixsCLBuffer = pixdCLBuffer;
1496 pixdCLBuffer = pixdCLIntermediate;
1497 pixdCLIntermediate = pixtemp;
1499 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1503 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0,
1504 sizeof(
int) * wpl * h, 0,
nullptr,
nullptr);
1506 pixtemp = pixsCLBuffer;
1507 pixsCLBuffer = pixdCLBuffer;
1508 pixdCLBuffer = pixtemp;
1512 clStatus = pixOpenCL(1, line_vsize, wpl, h);
1516 mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h, CL_MAP_READ,
true,
false);
1518 pixtemp = pixsCLBuffer;
1519 pixsCLBuffer = pixdCLIntermediate;
1520 pixdCLIntermediate = pixtemp;
1524 clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1528 mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h, CL_MAP_READ,
true,
true);
1539 int OpenclDevice::HistogramRectOCL(
void *imageData,
int bytes_per_pixel,
int bytes_per_line,
1543 int *histogramAllChannels) {
1547 SetKernelEnv(&histKern);
1548 KernelEnv histRedKern;
1549 SetKernelEnv(&histRedKern);
1555 cl_mem imageBuffer =
1556 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1557 width * height * bytes_per_pixel *
sizeof(
char), imageData, &clStatus);
1558 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1561 int block_size = 256;
1563 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(numCUs), &numCUs,
1565 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1567 int requestedOccupancy = 10;
1568 int numWorkGroups = numCUs * requestedOccupancy;
1569 int numThreads = block_size * numWorkGroups;
1570 size_t local_work_size[] = {
static_cast<size_t>(block_size)};
1571 size_t global_work_size[] = {
static_cast<size_t>(numThreads)};
1572 size_t red_global_work_size[] = {
1573 static_cast<size_t>(block_size *
kHistogramSize * bytes_per_pixel)};
1577 cl_mem histogramBuffer = clCreateBuffer(
1578 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1579 kHistogramSize * bytes_per_pixel *
sizeof(
int), histogramAllChannels, &clStatus);
1580 CHECK_OPENCL(clStatus,
"clCreateBuffer histogramBuffer");
1584 int tmpHistogramBins =
kHistogramSize * bytes_per_pixel * histRed;
1586 cl_mem tmpHistogramBuffer =
1587 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins *
sizeof(cl_uint),
1588 nullptr, &clStatus);
1589 CHECK_OPENCL(clStatus,
"clCreateBuffer tmpHistogramBuffer");
1592 int *zeroBuffer =
new int[1];
1594 cl_mem atomicSyncBuffer =
1595 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
sizeof(cl_int),
1596 zeroBuffer, &clStatus);
1597 CHECK_OPENCL(clStatus,
"clCreateBuffer atomicSyncBuffer");
1598 delete[] zeroBuffer;
1600 if (bytes_per_pixel == 1) {
1601 histKern.mpkKernel =
1602 clCreateKernel(histKern.mpkProgram,
"kernel_HistogramRectOneChannel", &clStatus);
1603 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectOneChannel");
1605 histRedKern.mpkKernel = clCreateKernel(histRedKern.mpkProgram,
1606 "kernel_HistogramRectOneChannelReduction", &clStatus);
1607 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectOneChannelReduction");
1609 histKern.mpkKernel =
1610 clCreateKernel(histKern.mpkProgram,
"kernel_HistogramRectAllChannels", &clStatus);
1611 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectAllChannels");
1613 histRedKern.mpkKernel = clCreateKernel(histRedKern.mpkProgram,
1614 "kernel_HistogramRectAllChannelsReduction", &clStatus);
1615 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectAllChannelsReduction");
1621 ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0,
1622 tmpHistogramBins *
sizeof(cl_uint), 0,
nullptr,
nullptr, &clStatus);
1623 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer tmpHistogramBuffer");
1625 memset(ptr, 0, tmpHistogramBins *
sizeof(cl_uint));
1626 clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0,
nullptr,
nullptr);
1629 clStatus = clSetKernelArg(histKern.mpkKernel, 0,
sizeof(cl_mem), &imageBuffer);
1630 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1631 cl_uint numPixels = width * height;
1632 clStatus = clSetKernelArg(histKern.mpkKernel, 1,
sizeof(cl_uint), &numPixels);
1633 CHECK_OPENCL(clStatus,
"clSetKernelArg numPixels");
1634 clStatus = clSetKernelArg(histKern.mpkKernel, 2,
sizeof(cl_mem), &tmpHistogramBuffer);
1635 CHECK_OPENCL(clStatus,
"clSetKernelArg tmpHistogramBuffer");
1638 int n = numThreads / bytes_per_pixel;
1639 clStatus = clSetKernelArg(histRedKern.mpkKernel, 0,
sizeof(cl_int), &n);
1640 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1641 clStatus = clSetKernelArg(histRedKern.mpkKernel, 1,
sizeof(cl_mem), &tmpHistogramBuffer);
1642 CHECK_OPENCL(clStatus,
"clSetKernelArg tmpHistogramBuffer");
1643 clStatus = clSetKernelArg(histRedKern.mpkKernel, 2,
sizeof(cl_mem), &histogramBuffer);
1644 CHECK_OPENCL(clStatus,
"clSetKernelArg histogramBuffer");
1647 clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1,
nullptr,
1648 global_work_size, local_work_size, 0,
nullptr,
nullptr);
1649 CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1650 clFinish(histKern.mpkCmdQueue);
1651 if (clStatus != 0) {
1655 clStatus = clEnqueueNDRangeKernel(histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1,
nullptr,
1656 red_global_work_size, local_work_size, 0,
nullptr,
nullptr);
1657 CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction");
1658 clFinish(histRedKern.mpkCmdQueue);
1659 if (clStatus != 0) {
1664 ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0,
1665 kHistogramSize * bytes_per_pixel *
sizeof(
int), 0,
nullptr,
nullptr,
1667 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer histogramBuffer");
1668 if (clStatus != 0) {
1671 clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0,
nullptr,
nullptr);
1673 clReleaseMemObject(histogramBuffer);
1674 clReleaseMemObject(imageBuffer);
1683 int OpenclDevice::ThresholdRectToPixOCL(
unsigned char *imageData,
int bytes_per_pixel,
1684 int bytes_per_line,
int *thresholds,
int *hi_values,
1685 Image *pix,
int height,
int width,
int top,
int left) {
1688 *pix = pixCreate(width, height, 1);
1689 uint32_t *pixData = pixGetData(*pix);
1690 int wpl = pixGetWpl(*pix);
1691 int pixSize = wpl * height *
sizeof(uint32_t);
1695 SetKernelEnv(&rEnv);
1698 int block_size = 256;
1700 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(numCUs), &numCUs,
1702 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1704 int requestedOccupancy = 10;
1705 int numWorkGroups = numCUs * requestedOccupancy;
1706 int numThreads = block_size * numWorkGroups;
1707 size_t local_work_size[] = {(size_t)block_size};
1708 size_t global_work_size[] = {(size_t)numThreads};
1715 cl_mem imageBuffer =
1716 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1717 width * height * bytes_per_pixel *
sizeof(
char), imageData, &clStatus);
1718 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1721 pixThBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize,
1722 pixData, &clStatus);
1723 CHECK_OPENCL(clStatus,
"clCreateBuffer pix");
1726 cl_mem thresholdsBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1727 bytes_per_pixel *
sizeof(
int), thresholds, &clStatus);
1728 CHECK_OPENCL(clStatus,
"clCreateBuffer thresholdBuffer");
1729 cl_mem hiValuesBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1730 bytes_per_pixel *
sizeof(
int), hi_values, &clStatus);
1731 CHECK_OPENCL(clStatus,
"clCreateBuffer hiValuesBuffer");
1734 if (bytes_per_pixel == 4) {
1735 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"kernel_ThresholdRectToPix", &clStatus);
1736 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_ThresholdRectToPix");
1739 clCreateKernel(rEnv.mpkProgram,
"kernel_ThresholdRectToPix_OneChan", &clStatus);
1740 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_ThresholdRectToPix_OneChan");
1744 clStatus = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &imageBuffer);
1745 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1746 clStatus = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(
int), &height);
1747 CHECK_OPENCL(clStatus,
"clSetKernelArg height");
1748 clStatus = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(
int), &width);
1749 CHECK_OPENCL(clStatus,
"clSetKernelArg width");
1750 clStatus = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(
int), &wpl);
1751 CHECK_OPENCL(clStatus,
"clSetKernelArg wpl");
1752 clStatus = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &thresholdsBuffer);
1753 CHECK_OPENCL(clStatus,
"clSetKernelArg thresholdsBuffer");
1754 clStatus = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(cl_mem), &hiValuesBuffer);
1755 CHECK_OPENCL(clStatus,
"clSetKernelArg hiValuesBuffer");
1756 clStatus = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(cl_mem), &pixThBuffer);
1757 CHECK_OPENCL(clStatus,
"clSetKernelArg pixThBuffer");
1760 clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1,
nullptr, global_work_size,
1761 local_work_size, 0,
nullptr,
nullptr);
1762 CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
1763 clFinish(rEnv.mpkCmdQueue);
1764 if (clStatus != 0) {
1765 tprintf(
"Setting return value to -1\n");
1769 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0,
1770 nullptr,
nullptr, &clStatus);
1771 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer histogramBuffer");
1772 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0,
nullptr,
nullptr);
1774 clReleaseMemObject(imageBuffer);
1775 clReleaseMemObject(thresholdsBuffer);
1776 clReleaseMemObject(hiValuesBuffer);
1785 struct TessScoreEvaluationInputData {
1789 unsigned char *imageData;
1793 static void populateTessScoreEvaluationInputData(TessScoreEvaluationInputData *input) {
1798 int numChannels = 4;
1799 input->height = height;
1800 input->width = width;
1801 input->numChannels = numChannels;
1802 unsigned char(*imageData4)[4] = (
unsigned char(*)[4])malloc(
1803 height * width * numChannels *
sizeof(
unsigned char));
1804 input->imageData = (
unsigned char *)&imageData4[0];
1807 unsigned char pixelWhite[4] = {0, 0, 0, 255};
1808 unsigned char pixelBlack[4] = {255, 255, 255, 255};
1809 for (
int p = 0; p < height * width; p++) {
1811 imageData4[p][0] = pixelWhite[0];
1812 imageData4[p][1] = pixelWhite[1];
1813 imageData4[p][2] = pixelWhite[2];
1814 imageData4[p][3] = pixelWhite[3];
1817 int maxLineWidth = 64;
1820 for (
int i = 0; i < numLines; i++) {
1821 int lineWidth = rand() % maxLineWidth;
1822 int vertLinePos = lineWidth + rand() % (width - 2 * lineWidth);
1824 for (
int row = vertLinePos - lineWidth / 2; row < vertLinePos + lineWidth / 2; row++) {
1825 for (
int col = 0; col < height; col++) {
1827 imageData4[row * width + col][0] = pixelBlack[0];
1828 imageData4[row * width + col][1] = pixelBlack[1];
1829 imageData4[row * width + col][2] = pixelBlack[2];
1830 imageData4[row * width + col][3] = pixelBlack[3];
1835 for (
int i = 0; i < numLines; i++) {
1836 int lineWidth = rand() % maxLineWidth;
1837 int horLinePos = lineWidth + rand() % (height - 2 * lineWidth);
1839 for (
int row = 0; row < width; row++) {
1840 for (
int col = horLinePos - lineWidth / 2; col < horLinePos + lineWidth / 2;
1845 imageData4[row * width + col][0] = pixelBlack[0];
1846 imageData4[row * width + col][1] = pixelBlack[1];
1847 imageData4[row * width + col][2] = pixelBlack[2];
1848 imageData4[row * width + col][3] = pixelBlack[3];
1853 float fractionBlack = 0.1;
1854 int numSpots = (height * width) * fractionBlack / (maxLineWidth * maxLineWidth / 2 / 2);
1855 for (
int i = 0; i < numSpots; i++) {
1856 int lineWidth = rand() % maxLineWidth;
1857 int col = lineWidth + rand() % (width - 2 * lineWidth);
1858 int row = lineWidth + rand() % (height - 2 * lineWidth);
1860 for (
int r = row - lineWidth / 2; r < row + lineWidth / 2; r++) {
1861 for (
int c = col - lineWidth / 2; c < col + lineWidth / 2; c++) {
1864 imageData4[r * width + c][0] = pixelBlack[0];
1865 imageData4[r * width + c][1] = pixelBlack[1];
1866 imageData4[r * width + c][2] = pixelBlack[2];
1867 imageData4[r * width + c][3] = pixelBlack[3];
1872 input->pix = pixCreate(input->width, input->height, 8 * input->numChannels);
1875 struct TessDeviceScore {
1885 static double composeRGBPixelMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
1886 ds_device_type type) {
1889 LARGE_INTEGER freq, time_funct_start, time_funct_end;
1890 QueryPerformanceFrequency(&freq);
1892 mach_timebase_info_data_t info = {0, 0};
1893 mach_timebase_info(&info);
1894 long long start, stop;
1896 timespec time_funct_start, time_funct_end;
1899 l_uint32 *tiffdata = (l_uint32 *)input.imageData;
1903 if (type == DS_DEVICE_OPENCL_DEVICE) {
1905 QueryPerformanceCounter(&time_funct_start);
1907 start = mach_absolute_time();
1909 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1912 OpenclDevice::gpuEnv = *env;
1913 int wpl = pixGetWpl(input.pix);
1914 OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl,
nullptr);
1916 QueryPerformanceCounter(&time_funct_end);
1917 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
1919 stop = mach_absolute_time();
1920 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
1922 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
1923 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
1924 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
1929 QueryPerformanceCounter(&time_funct_start);
1931 start = mach_absolute_time();
1933 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1935 Image pix = pixCreate(input.width, input.height, 32);
1936 l_uint32 *pixData = pixGetData(pix);
1939 for (i = 0; i < input.height; i++) {
1940 for (j = 0; j < input.width; j++) {
1941 l_uint32 tiffword = tiffdata[i * input.width + j];
1942 l_int32 rval = ((tiffword)&0xff);
1943 l_int32 gval = (((tiffword) >> 8) & 0xff);
1944 l_int32 bval = (((tiffword) >> 16) & 0xff);
1945 l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
1946 pixData[idx] = value;
1951 QueryPerformanceCounter(&time_funct_end);
1952 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
1954 stop = mach_absolute_time();
1955 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
1957 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
1958 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
1959 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
1967 static double histogramRectMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
1968 ds_device_type type) {
1971 LARGE_INTEGER freq, time_funct_start, time_funct_end;
1972 QueryPerformanceFrequency(&freq);
1974 mach_timebase_info_data_t info = {0, 0};
1975 mach_timebase_info(&info);
1976 long long start, stop;
1978 timespec time_funct_start, time_funct_end;
1984 int bytes_per_line = input.width * input.numChannels;
1985 int *histogramAllChannels =
new int[
kHistogramSize * input.numChannels];
1987 if (type == DS_DEVICE_OPENCL_DEVICE) {
1989 QueryPerformanceCounter(&time_funct_start);
1991 start = mach_absolute_time();
1993 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1996 OpenclDevice::gpuEnv = *env;
1997 int retVal = OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line,
1998 left, top, input.width, input.height,
2002 QueryPerformanceCounter(&time_funct_end);
2003 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
2005 stop = mach_absolute_time();
2007 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2012 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2013 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2014 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2019 QueryPerformanceCounter(&time_funct_start);
2021 start = mach_absolute_time();
2023 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2025 for (
int ch = 0; ch < input.numChannels; ++ch) {
2030 QueryPerformanceCounter(&time_funct_end);
2031 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
2033 stop = mach_absolute_time();
2034 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2036 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2037 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2038 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2044 delete[] histogramAllChannels;
2049 static void ThresholdRectToPix_Native(
const unsigned char *imagedata,
int bytes_per_pixel,
2050 int bytes_per_line,
const int *thresholds,
2051 const int *hi_values, Image *pix) {
2054 int width = pixGetWidth(*pix);
2055 int height = pixGetHeight(*pix);
2057 *pix = pixCreate(width, height, 1);
2058 uint32_t *pixdata = pixGetData(*pix);
2059 int wpl = pixGetWpl(*pix);
2060 const unsigned char *srcdata = imagedata + top * bytes_per_line + left * bytes_per_pixel;
2061 for (
int y = 0; y < height; ++y) {
2062 const uint8_t *linedata = srcdata;
2063 uint32_t *pixline = pixdata + y * wpl;
2064 for (
int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2065 bool white_result =
true;
2066 for (
int ch = 0; ch < bytes_per_pixel; ++ch) {
2067 if (hi_values[ch] >= 0 && (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2068 white_result =
false;
2073 CLEAR_DATA_BIT(pixline, x);
2075 SET_DATA_BIT(pixline, x);
2077 srcdata += bytes_per_line;
2081 static double thresholdRectToPixMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
2082 ds_device_type type) {
2085 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2086 QueryPerformanceFrequency(&freq);
2088 mach_timebase_info_data_t info = {0, 0};
2089 mach_timebase_info(&info);
2090 long long start, stop;
2092 timespec time_funct_start, time_funct_end;
2096 unsigned char pixelHi = (
unsigned char)255;
2097 int thresholds[4] = {pixelHi, pixelHi, pixelHi, pixelHi};
2102 int bytes_per_line = input.width * input.numChannels;
2105 if (type == DS_DEVICE_OPENCL_DEVICE) {
2107 QueryPerformanceCounter(&time_funct_start);
2109 start = mach_absolute_time();
2111 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2114 OpenclDevice::gpuEnv = *env;
2116 int retVal = OpenclDevice::ThresholdRectToPixOCL(
2117 input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix,
2118 input.height, input.width, top, left);
2121 QueryPerformanceCounter(&time_funct_end);
2122 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
2124 stop = mach_absolute_time();
2126 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2132 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2133 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2134 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2140 QueryPerformanceCounter(&time_funct_start);
2142 start = mach_absolute_time();
2144 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2146 int hi_values[4] = {};
2147 ThresholdRectToPix_Native(input.imageData, input.numChannels, bytes_per_line, thresholds,
2148 hi_values, &input.pix);
2151 QueryPerformanceCounter(&time_funct_end);
2152 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
2154 stop = mach_absolute_time();
2155 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2157 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2158 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2159 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2166 static double getLineMasksMorphMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
2167 ds_device_type type) {
2170 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2171 QueryPerformanceFrequency(&freq);
2173 mach_timebase_info_data_t info = {0, 0};
2174 mach_timebase_info(&info);
2175 long long start, stop;
2177 timespec time_funct_start, time_funct_end;
2181 int resolution = 300;
2182 int wpl = pixGetWpl(input.pix);
2187 int closing_brick = max_line_width / 3;
2190 if (type == DS_DEVICE_OPENCL_DEVICE) {
2192 QueryPerformanceCounter(&time_funct_start);
2194 start = mach_absolute_time();
2196 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2198 OpenclDevice::gpuEnv = *env;
2199 OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2200 Image pix_vline =
nullptr, pix_hline =
nullptr, pix_closed =
nullptr;
2201 OpenclDevice::pixGetLinesCL(
nullptr, input.pix, &pix_vline, &pix_hline, &pix_closed,
true,
2202 closing_brick, closing_brick, max_line_width, max_line_width,
2203 min_line_length, min_line_length);
2205 OpenclDevice::releaseMorphCLBuffers();
2208 QueryPerformanceCounter(&time_funct_end);
2209 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
2211 stop = mach_absolute_time();
2212 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2214 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2215 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2216 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2220 QueryPerformanceCounter(&time_funct_start);
2222 start = mach_absolute_time();
2224 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2228 Image src_pix = input.pix;
2229 Image pix_closed = pixCloseBrick(
nullptr, src_pix, closing_brick, closing_brick);
2230 Image pix_solid = pixOpenBrick(
nullptr, pix_closed, max_line_width, max_line_width);
2231 Image pix_hollow = pixSubtract(
nullptr, pix_closed, pix_solid);
2232 pix_solid.destroy();
2233 Image pix_vline = pixOpenBrick(
nullptr, pix_hollow, 1, min_line_length);
2234 Image pix_hline = pixOpenBrick(
nullptr, pix_hollow, min_line_length, 1);
2235 pix_hline.destroy();
2236 pix_vline.destroy();
2237 pix_hollow.destroy();
2240 QueryPerformanceCounter(&time_funct_end);
2241 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
2243 stop = mach_absolute_time();
2244 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2246 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2247 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2248 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2260 static ds_status serializeScore(ds_device *device, uint8_t **serializedScore,
2261 unsigned int *serializedScoreSize) {
2262 *serializedScoreSize =
sizeof(TessDeviceScore);
2263 *serializedScore =
new uint8_t[*serializedScoreSize];
2264 memcpy(*serializedScore, device->score, *serializedScoreSize);
2269 static ds_status deserializeScore(ds_device *device,
const uint8_t *serializedScore,
2270 unsigned int serializedScoreSize) {
2272 device->score =
new TessDeviceScore;
2273 memcpy(device->score, serializedScore, serializedScoreSize);
2277 static ds_status releaseScore(TessDeviceScore *score) {
2283 static ds_status evaluateScoreForDevice(ds_device *device,
void *inputData) {
2286 tprintf(
"\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName,
2287 device->type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2288 GPUEnv *env =
nullptr;
2289 if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2290 env = &OpenclDevice::gpuEnv;
2291 memset(env, 0,
sizeof(*env));
2293 populateGPUEnvFromDevice(env, device->oclDeviceID);
2294 env->mnFileCount = 0;
2295 env->mnKernelCount = 0UL;
2297 OpenclDevice::CompileKernelFile(env,
"");
2300 TessScoreEvaluationInputData *input =
static_cast<TessScoreEvaluationInputData *
>(inputData);
2303 double composeRGBPixelTime = composeRGBPixelMicroBench(env, *input, device->type);
2306 double histogramRectTime = histogramRectMicroBench(env, *input, device->type);
2309 double thresholdRectToPixTime = thresholdRectToPixMicroBench(env, *input, device->type);
2312 double getLineMasksMorphTime = getLineMasksMorphMicroBench(env, *input, device->type);
2316 float composeRGBPixelWeight = 1.2f;
2317 float histogramRectWeight = 2.4f;
2318 float thresholdRectToPixWeight = 4.5f;
2319 float getLineMasksMorphWeight = 5.0f;
2321 float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2322 histogramRectWeight * histogramRectTime +
2323 thresholdRectToPixWeight * thresholdRectToPixTime +
2324 getLineMasksMorphWeight * getLineMasksMorphTime;
2325 device->score =
new TessDeviceScore;
2326 device->score->time = weightedTime;
2328 tprintf(
"[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName,
2329 device->type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2330 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight);
2331 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"HistogramRect", histogramRectTime, histogramRectWeight);
2332 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"ThresholdRectToPix", thresholdRectToPixTime,
2333 thresholdRectToPixWeight);
2334 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"getLineMasksMorph", getLineMasksMorphTime,
2335 getLineMasksMorphWeight);
2336 tprintf(
"[DS]%25s: %f\n",
"Score", device->score->time);
2341 ds_device OpenclDevice::getDeviceSelection() {
2342 if (!deviceIsSelected) {
2344 if (1 == LoadOpencl()) {
2348 ds_profile *profile;
2349 status = initDSProfile(&profile,
"v0.1");
2351 const char *fileName =
"tesseract_opencl_profile_devices.dat";
2352 status = readProfileFromFile(profile, deserializeScore, fileName);
2353 if (status != DS_SUCCESS) {
2355 tprintf(
"[DS] Profile file not available (%s); performing profiling.\n", fileName);
2358 TessScoreEvaluationInputData input;
2359 populateTessScoreEvaluationInputData(&input);
2361 unsigned int numUpdates;
2363 profileDevices(profile, DS_EVALUATE_ALL, evaluateScoreForDevice, &input, &numUpdates);
2365 if (status == DS_SUCCESS) {
2366 status = writeProfileToFile(profile, serializeScore, fileName);
2367 if (status == DS_SUCCESS) {
2368 tprintf(
"[DS] Scores written to file (%s).\n", fileName);
2371 "[DS] Error saving scores to file (%s); scores not written to "
2377 "[DS] Unable to evaluate performance; scores not written to "
2381 tprintf(
"[DS] Profile read from file (%s).\n", fileName);
2386 float bestTime = FLT_MAX;
2387 int bestDeviceIdx = -1;
2388 for (
unsigned d = 0; d < profile->numDevices; d++) {
2389 ds_device device = profile->devices[d];
2390 if (device.score ==
nullptr)
2392 TessDeviceScore score = *device.score;
2394 float time = score.time;
2395 tprintf(
"[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type, device.oclDeviceName,
2397 if (time < bestTime) {
2402 if (bestDeviceIdx >= 0) {
2404 "[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2405 profile->devices[bestDeviceIdx].oclDeviceName,
2406 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2411 bool overridden =
false;
2412 char *overrideDeviceStr = getenv(
"TESSERACT_OPENCL_DEVICE");
2413 if (overrideDeviceStr !=
nullptr) {
2414 int overrideDeviceIdx = atoi(overrideDeviceStr);
2415 if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2417 "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, "
2419 overrideDeviceStr, overrideDeviceIdx);
2420 bestDeviceIdx = overrideDeviceIdx - 1;
2424 "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are "
2425 "valid devices).\n",
2426 overrideDeviceStr, profile->numDevices);
2432 "[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2433 profile->devices[bestDeviceIdx].oclDeviceName,
2434 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2436 selectedDevice = profile->devices[bestDeviceIdx];
2438 releaseDSProfile(profile, releaseScore);
2441 tprintf(
"[DS] OpenCL runtime not available.\n");
2442 selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2443 selectedDevice.oclDeviceName =
"(null)";
2444 selectedDevice.score =
nullptr;
2445 selectedDevice.oclDeviceID =
nullptr;
2446 selectedDevice.oclDriverVersion =
nullptr;
2448 deviceIsSelected =
true;
2450 return selectedDevice;
2453 bool OpenclDevice::selectedDeviceIsOpenCL() {
2454 ds_device device = getDeviceSelection();
2455 return (device.type == DS_DEVICE_OPENCL_DEVICE);
void tprintf(const char *format,...)
void HistogramRect(Image src_pix, int channel, int left, int top, int width, int height, int *histogram)
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)