tesseract  5.0.0
openclwrapper.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 
11 #ifdef USE_OPENCL
12 
13 # ifdef _WIN32
14 # include <io.h>
15 # else
16 # include <sys/types.h>
17 # include <unistd.h>
18 # endif
19 # include <cfloat>
20 # include <ctime> // for clock_gettime
21 
22 # include "oclkernels.h"
23 # include "openclwrapper.h"
24 
25 // for micro-benchmark
26 # include "otsuthr.h"
27 # include "thresholder.h"
28 
29 // platform preprocessor commands
30 # if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(__CYGWIN__) || \
31  defined(__MINGW32__)
32 # define ON_WINDOWS 1
33 # define ON_APPLE 0
34 # elif defined(__linux__)
35 # define ON_WINDOWS 0
36 # define ON_APPLE 0
37 # elif defined(__APPLE__)
38 # define ON_WINDOWS 0
39 # define ON_APPLE 1
40 # else
41 # define ON_WINDOWS 0
42 # define ON_APPLE 0
43 # endif
44 
45 # if ON_APPLE
46 # include <mach/mach_time.h>
47 # endif
48 
49 # include <cstdio>
50 # include <cstdlib>
51 # include <cstring> // for memset, strcpy, ...
52 # include <vector>
53 
54 # include "errcode.h" // for ASSERT_HOST
55 # include "image.h" // for Image
56 
57 namespace tesseract {
58 
59 GPUEnv OpenclDevice::gpuEnv;
60 
61 bool OpenclDevice::deviceIsSelected = false;
62 ds_device OpenclDevice::selectedDevice;
63 
64 int OpenclDevice::isInited = 0;
65 
66 static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
67 
68 static const l_uint32 lmask32[] = {
69  0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000, 0xfe000000, 0xff000000,
70  0xff800000, 0xffc00000, 0xffe00000, 0xfff00000, 0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000,
71  0xffff8000, 0xffffc000, 0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
72  0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc, 0xfffffffe, 0xffffffff};
73 
74 static const l_uint32 rmask32[] = {
75  0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
76  0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
77  0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
78  0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff};
79 
80 static cl_mem pixsCLBuffer, pixdCLBuffer,
81  pixdCLIntermediate; // Morph operations buffers
82 static cl_mem pixThBuffer; // output from thresholdtopix calculation
83 static cl_int clStatus;
84 static KernelEnv rEnv;
85 
86 # define DS_TAG_VERSION "<version>"
87 # define DS_TAG_VERSION_END "</version>"
88 # define DS_TAG_DEVICE "<device>"
89 # define DS_TAG_DEVICE_END "</device>"
90 # define DS_TAG_SCORE "<score>"
91 # define DS_TAG_SCORE_END "</score>"
92 # define DS_TAG_DEVICE_TYPE "<type>"
93 # define DS_TAG_DEVICE_TYPE_END "</type>"
94 # define DS_TAG_DEVICE_NAME "<name>"
95 # define DS_TAG_DEVICE_NAME_END "</name>"
96 # define DS_TAG_DEVICE_DRIVER_VERSION "<driver>"
97 # define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>"
98 
99 # define DS_DEVICE_NATIVE_CPU_STRING "native_cpu"
100 
101 # define DS_DEVICE_NAME_LENGTH 256
102 
103 enum ds_evaluation_type { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY };
104 
105 struct ds_profile {
106  std::vector<ds_device> devices;
107  unsigned int numDevices;
108  const char *version;
109 };
110 
111 enum ds_status {
112  DS_SUCCESS = 0,
113  DS_INVALID_PROFILE = 1000,
114  DS_MEMORY_ERROR,
115  DS_INVALID_PERF_EVALUATOR_TYPE,
116  DS_INVALID_PERF_EVALUATOR,
117  DS_PERF_EVALUATOR_ERROR,
118  DS_FILE_ERROR,
119  DS_UNKNOWN_DEVICE_TYPE,
120  DS_PROFILE_FILE_ERROR,
121  DS_SCORE_SERIALIZER_ERROR,
122  DS_SCORE_DESERIALIZER_ERROR
123 };
124 
125 // Pointer to a function that calculates the score of a device (ex:
126 // device->score) update the data size of score. The encoding and the format
127 // of the score data is implementation defined. The function should return
128 // DS_SUCCESS if there's no error to be reported.
129 typedef ds_status (*ds_perf_evaluator)(ds_device *device, void *data);
130 
131 // deallocate memory used by score
132 typedef ds_status (*ds_score_release)(TessDeviceScore *score);
133 
134 static ds_status releaseDSProfile(ds_profile *profile, ds_score_release sr) {
135  ds_status status = DS_SUCCESS;
136  if (profile != nullptr) {
137  if (sr != nullptr) {
138  unsigned int i;
139  for (i = 0; i < profile->numDevices; i++) {
140  free(profile->devices[i].oclDeviceName);
141  free(profile->devices[i].oclDriverVersion);
142  status = sr(profile->devices[i].score);
143  if (status != DS_SUCCESS)
144  break;
145  }
146  }
147  delete profile;
148  }
149  return status;
150 }
151 
152 static ds_status initDSProfile(ds_profile **p, const char *version) {
153  int numDevices;
154  cl_uint numPlatforms;
155  std::vector<cl_platform_id> platforms;
156  std::vector<cl_device_id> devices;
157  ds_status status = DS_SUCCESS;
158  unsigned int next;
159  unsigned int i;
160 
161  if (p == nullptr)
162  return DS_INVALID_PROFILE;
163 
164  ds_profile *profile = new ds_profile;
165 
166  memset(profile, 0, sizeof(ds_profile));
167 
168  clGetPlatformIDs(0, nullptr, &numPlatforms);
169 
170  if (numPlatforms > 0) {
171  platforms.reserve(numPlatforms);
172  clGetPlatformIDs(numPlatforms, &platforms[0], nullptr);
173  }
174 
175  numDevices = 0;
176  for (i = 0; i < numPlatforms; i++) {
177  cl_uint num;
178  clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, nullptr, &num);
179  numDevices += num;
180  }
181 
182  if (numDevices > 0) {
183  devices.reserve(numDevices);
184  }
185 
186  profile->numDevices = numDevices + 1; // +1 to numDevices to include the native CPU
187  profile->devices.reserve(profile->numDevices);
188  memset(&profile->devices[0], 0, profile->numDevices * sizeof(ds_device));
189 
190  next = 0;
191  for (i = 0; i < numPlatforms; i++) {
192  cl_uint num;
193  unsigned j;
194  clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &num);
195  for (j = 0; j < num; j++, next++) {
196  char buffer[DS_DEVICE_NAME_LENGTH];
197  size_t length;
198 
199  profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
200  profile->devices[next].oclDeviceID = devices[j];
201 
202  clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME, DS_DEVICE_NAME_LENGTH,
203  &buffer, nullptr);
204  length = strlen(buffer);
205  profile->devices[next].oclDeviceName = (char *)malloc(length + 1);
206  memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
207 
208  clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION, DS_DEVICE_NAME_LENGTH,
209  &buffer, nullptr);
210  length = strlen(buffer);
211  profile->devices[next].oclDriverVersion = (char *)malloc(length + 1);
212  memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
213  }
214  }
215  profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
216  profile->version = version;
217 
218  *p = profile;
219  return status;
220 }
221 
222 static ds_status profileDevices(ds_profile *profile, const ds_evaluation_type type,
223  ds_perf_evaluator evaluator, void *evaluatorData,
224  unsigned int *numUpdates) {
225  ds_status status = DS_SUCCESS;
226  unsigned int i;
227  unsigned int updates = 0;
228 
229  if (profile == nullptr) {
230  return DS_INVALID_PROFILE;
231  }
232  if (evaluator == nullptr) {
233  return DS_INVALID_PERF_EVALUATOR;
234  }
235 
236  for (i = 0; i < profile->numDevices; i++) {
237  ds_status evaluatorStatus;
238 
239  switch (type) {
240  case DS_EVALUATE_NEW_ONLY:
241  if (profile->devices[i].score != nullptr)
242  break;
243  // else fall through
244  case DS_EVALUATE_ALL:
245  evaluatorStatus = evaluator(&profile->devices[i], evaluatorData);
246  if (evaluatorStatus != DS_SUCCESS) {
247  status = evaluatorStatus;
248  return status;
249  }
250  updates++;
251  break;
252  default:
253  return DS_INVALID_PERF_EVALUATOR_TYPE;
254  break;
255  };
256  }
257  if (numUpdates)
258  *numUpdates = updates;
259  return status;
260 }
261 
262 static const char *findString(const char *contentStart, const char *contentEnd,
263  const char *string) {
264  size_t stringLength;
265  const char *currentPosition;
266  const char *found = nullptr;
267  stringLength = strlen(string);
268  currentPosition = contentStart;
269  for (currentPosition = contentStart; currentPosition < contentEnd; currentPosition++) {
270  if (*currentPosition == string[0]) {
271  if (currentPosition + stringLength < contentEnd) {
272  if (strncmp(currentPosition, string, stringLength) == 0) {
273  found = currentPosition;
274  break;
275  }
276  }
277  }
278  }
279  return found;
280 }
281 
282 static ds_status readProFile(const char *fileName, char **content, size_t *contentSize) {
283  *contentSize = 0;
284  *content = nullptr;
285  ds_status status = DS_SUCCESS;
286  FILE *input = fopen(fileName, "rb");
287  if (input == nullptr) {
288  status = DS_FILE_ERROR;
289  } else {
290  fseek(input, 0L, SEEK_END);
291  auto pos = std::ftell(input);
292  rewind(input);
293  if (pos > 0) {
294  size_t size = pos;
295  char *binary = new char[size];
296  if (fread(binary, sizeof(char), size, input) != size) {
297  status = DS_FILE_ERROR;
298  delete[] binary;
299  } else {
300  *contentSize = size;
301  *content = binary;
302  }
303  }
304  fclose(input);
305  }
306  return status;
307 }
308 
309 typedef ds_status (*ds_score_deserializer)(ds_device *device, const uint8_t *serializedScore,
310  unsigned int serializedScoreSize);
311 
312 static ds_status readProfileFromFile(ds_profile *profile, ds_score_deserializer deserializer,
313  const char *file) {
314  ds_status status = DS_SUCCESS;
315  char *contentStart;
316  size_t contentSize;
317 
318  if (profile == nullptr)
319  return DS_INVALID_PROFILE;
320 
321  status = readProFile(file, &contentStart, &contentSize);
322  if (status == DS_SUCCESS) {
323  const char *currentPosition;
324  const char *dataStart;
325  const char *dataEnd;
326 
327  const char *contentEnd = contentStart + contentSize;
328  currentPosition = contentStart;
329 
330  // parse the version string
331  dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
332  if (dataStart == nullptr) {
333  status = DS_PROFILE_FILE_ERROR;
334  goto cleanup;
335  }
336  dataStart += strlen(DS_TAG_VERSION);
337 
338  dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
339  if (dataEnd == nullptr) {
340  status = DS_PROFILE_FILE_ERROR;
341  goto cleanup;
342  }
343 
344  size_t versionStringLength = strlen(profile->version);
345  if (versionStringLength + dataStart != dataEnd ||
346  strncmp(profile->version, dataStart, versionStringLength) != 0) {
347  // version mismatch
348  status = DS_PROFILE_FILE_ERROR;
349  goto cleanup;
350  }
351  currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
352 
353  // parse the device information
354  while (1) {
355  unsigned int i;
356 
357  const char *deviceTypeStart;
358  const char *deviceTypeEnd;
359  ds_device_type deviceType;
360 
361  const char *deviceNameStart;
362  const char *deviceNameEnd;
363 
364  const char *deviceScoreStart;
365  const char *deviceScoreEnd;
366 
367  const char *deviceDriverStart;
368  const char *deviceDriverEnd;
369 
370  dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
371  if (dataStart == nullptr) {
372  // nothing useful remain, quit...
373  break;
374  }
375  dataStart += strlen(DS_TAG_DEVICE);
376  dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
377  if (dataEnd == nullptr) {
378  status = DS_PROFILE_FILE_ERROR;
379  goto cleanup;
380  }
381 
382  // parse the device type
383  deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
384  if (deviceTypeStart == nullptr) {
385  status = DS_PROFILE_FILE_ERROR;
386  goto cleanup;
387  }
388  deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
389  deviceTypeEnd = findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
390  if (deviceTypeEnd == nullptr) {
391  status = DS_PROFILE_FILE_ERROR;
392  goto cleanup;
393  }
394  memcpy(&deviceType, deviceTypeStart, sizeof(ds_device_type));
395 
396  // parse the device name
397  if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
398  deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
399  if (deviceNameStart == nullptr) {
400  status = DS_PROFILE_FILE_ERROR;
401  goto cleanup;
402  }
403  deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
404  deviceNameEnd = findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
405  if (deviceNameEnd == nullptr) {
406  status = DS_PROFILE_FILE_ERROR;
407  goto cleanup;
408  }
409 
410  deviceDriverStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
411  if (deviceDriverStart == nullptr) {
412  status = DS_PROFILE_FILE_ERROR;
413  goto cleanup;
414  }
415  deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
416  deviceDriverEnd =
417  findString(deviceDriverStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION_END);
418  if (deviceDriverEnd == nullptr) {
419  status = DS_PROFILE_FILE_ERROR;
420  goto cleanup;
421  }
422 
423  // check if this device is on the system
424  for (i = 0; i < profile->numDevices; i++) {
425  if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
426  size_t actualDeviceNameLength;
427  size_t driverVersionLength;
428 
429  actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
430  driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
431  if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
432  deviceDriverStart + driverVersionLength == deviceDriverEnd &&
433  strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
434  actualDeviceNameLength) == 0 &&
435  strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
436  driverVersionLength) == 0) {
437  deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
438  deviceScoreStart += strlen(DS_TAG_SCORE);
439  deviceScoreEnd = findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
440  status = deserializer(&profile->devices[i], (const unsigned char *)deviceScoreStart,
441  deviceScoreEnd - deviceScoreStart);
442  if (status != DS_SUCCESS) {
443  goto cleanup;
444  }
445  }
446  }
447  }
448  } else if (deviceType == DS_DEVICE_NATIVE_CPU) {
449  for (i = 0; i < profile->numDevices; i++) {
450  if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
451  deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
452  if (deviceScoreStart == nullptr) {
453  status = DS_PROFILE_FILE_ERROR;
454  goto cleanup;
455  }
456  deviceScoreStart += strlen(DS_TAG_SCORE);
457  deviceScoreEnd = findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
458  status = deserializer(&profile->devices[i], (const unsigned char *)deviceScoreStart,
459  deviceScoreEnd - deviceScoreStart);
460  if (status != DS_SUCCESS) {
461  goto cleanup;
462  }
463  }
464  }
465  }
466 
467  // skip over the current one to find the next device
468  currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
469  }
470  }
471 cleanup:
472  delete[] contentStart;
473  return status;
474 }
475 
476 typedef ds_status (*ds_score_serializer)(ds_device *device, uint8_t **serializedScore,
477  unsigned int *serializedScoreSize);
478 static ds_status writeProfileToFile(ds_profile *profile, ds_score_serializer serializer,
479  const char *file) {
480  ds_status status = DS_SUCCESS;
481 
482  if (profile == nullptr)
483  return DS_INVALID_PROFILE;
484 
485  FILE *profileFile = fopen(file, "wb");
486  if (profileFile == nullptr) {
487  status = DS_FILE_ERROR;
488  } else {
489  unsigned int i;
490 
491  // write version string
492  fwrite(DS_TAG_VERSION, sizeof(char), strlen(DS_TAG_VERSION), profileFile);
493  fwrite(profile->version, sizeof(char), strlen(profile->version), profileFile);
494  fwrite(DS_TAG_VERSION_END, sizeof(char), strlen(DS_TAG_VERSION_END), profileFile);
495  fwrite("\n", sizeof(char), 1, profileFile);
496 
497  for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
498  uint8_t *serializedScore;
499  unsigned int serializedScoreSize;
500 
501  fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile);
502 
503  fwrite(DS_TAG_DEVICE_TYPE, sizeof(char), strlen(DS_TAG_DEVICE_TYPE), profileFile);
504  fwrite(&profile->devices[i].type, sizeof(ds_device_type), 1, profileFile);
505  fwrite(DS_TAG_DEVICE_TYPE_END, sizeof(char), strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
506 
507  switch (profile->devices[i].type) {
508  case DS_DEVICE_NATIVE_CPU: {
509  // There's no need to emit a device name for the native CPU device.
510  /*
511 fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
512  profileFile);
513 fwrite(DS_DEVICE_NATIVE_CPU_STRING,sizeof(char),
514  strlen(DS_DEVICE_NATIVE_CPU_STRING), profileFile);
515 fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
516  strlen(DS_TAG_DEVICE_NAME_END), profileFile);
517 */
518  } break;
519  case DS_DEVICE_OPENCL_DEVICE: {
520  fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME), profileFile);
521  fwrite(profile->devices[i].oclDeviceName, sizeof(char),
522  strlen(profile->devices[i].oclDeviceName), profileFile);
523  fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char), strlen(DS_TAG_DEVICE_NAME_END), profileFile);
524 
525  fwrite(DS_TAG_DEVICE_DRIVER_VERSION, sizeof(char), strlen(DS_TAG_DEVICE_DRIVER_VERSION),
526  profileFile);
527  fwrite(profile->devices[i].oclDriverVersion, sizeof(char),
528  strlen(profile->devices[i].oclDriverVersion), profileFile);
529  fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END, sizeof(char),
530  strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
531  } break;
532  default:
533  status = DS_UNKNOWN_DEVICE_TYPE;
534  continue;
535  };
536 
537  fwrite(DS_TAG_SCORE, sizeof(char), strlen(DS_TAG_SCORE), profileFile);
538  status = serializer(&profile->devices[i], &serializedScore, &serializedScoreSize);
539  if (status == DS_SUCCESS && serializedScore != nullptr && serializedScoreSize > 0) {
540  fwrite(serializedScore, sizeof(char), serializedScoreSize, profileFile);
541  delete[] serializedScore;
542  }
543  fwrite(DS_TAG_SCORE_END, sizeof(char), strlen(DS_TAG_SCORE_END), profileFile);
544  fwrite(DS_TAG_DEVICE_END, sizeof(char), strlen(DS_TAG_DEVICE_END), profileFile);
545  fwrite("\n", sizeof(char), 1, profileFile);
546  }
547  fclose(profileFile);
548  }
549  return status;
550 }
551 
552 // substitute invalid characters in device name with _
553 static void legalizeFileName(char *fileName) {
554  // tprintf("fileName: %s\n", fileName);
555  const char *invalidChars = "/\?:*\"><| "; // space is valid but can cause headaches
556  // for each invalid char
557  for (unsigned i = 0; i < strlen(invalidChars); i++) {
558  char invalidStr[4];
559  invalidStr[0] = invalidChars[i];
560  invalidStr[1] = '\0';
561  // tprintf("eliminating %s\n", invalidStr);
562  // char *pos = strstr(fileName, invalidStr);
563  // initial ./ is valid for present directory
564  // if (*pos == '.') pos++;
565  // if (*pos == '/') pos++;
566  for (char *pos = strstr(fileName, invalidStr); pos != nullptr;
567  pos = strstr(pos + 1, invalidStr)) {
568  // tprintf("\tfound: %s, ", pos);
569  pos[0] = '_';
570  // tprintf("fileName: %s\n", fileName);
571  }
572  }
573 }
574 
575 static void populateGPUEnvFromDevice(GPUEnv *gpuInfo, cl_device_id device) {
576  // tprintf("[DS] populateGPUEnvFromDevice\n");
577  size_t size;
578  gpuInfo->mnIsUserCreated = 1;
579  // device
580  gpuInfo->mpDevID = device;
581  gpuInfo->mpArryDevsID = new cl_device_id[1];
582  gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
583  clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE, sizeof(cl_device_type),
584  &gpuInfo->mDevType, &size);
585  CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
586  // platform
587  clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM, sizeof(cl_platform_id),
588  &gpuInfo->mpPlatformID, &size);
589  CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
590  // context
591  cl_context_properties props[3];
592  props[0] = CL_CONTEXT_PLATFORM;
593  props[1] = (cl_context_properties)gpuInfo->mpPlatformID;
594  props[2] = 0;
595  gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, nullptr, nullptr, &clStatus);
596  CHECK_OPENCL(clStatus, "populateGPUEnv::createContext");
597  // queue
598  cl_command_queue_properties queueProperties = 0;
599  gpuInfo->mpCmdQueue =
600  clCreateCommandQueue(gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus);
601  CHECK_OPENCL(clStatus, "populateGPUEnv::createCommandQueue");
602 }
603 
604 int OpenclDevice::LoadOpencl() {
605 # ifdef WIN32
606  HINSTANCE HOpenclDll = nullptr;
607  void *OpenclDll = nullptr;
608  // fprintf(stderr, " LoadOpenclDllxx... \n");
609  OpenclDll = static_cast<HINSTANCE>(HOpenclDll);
610  OpenclDll = LoadLibrary("openCL.dll");
611  if (!static_cast<HINSTANCE>(OpenclDll)) {
612  fprintf(stderr, "[OD] Load opencl.dll failed!\n");
613  FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
614  return 0;
615  }
616  fprintf(stderr, "[OD] Load opencl.dll successful!\n");
617 # endif
618  return 1;
619 }
620 int OpenclDevice::SetKernelEnv(KernelEnv *envInfo) {
621  envInfo->mpkContext = gpuEnv.mpContext;
622  envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
623  envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
624 
625  return 1;
626 }
627 
628 static cl_mem allocateZeroCopyBuffer(const KernelEnv &rEnv, l_uint32 *hostbuffer, size_t nElements,
629  cl_mem_flags flags, cl_int *pStatus) {
630  cl_mem membuffer = clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags),
631  nElements * sizeof(l_uint32), hostbuffer, pStatus);
632 
633  return membuffer;
634 }
635 
636 static Image mapOutputCLBuffer(const KernelEnv &rEnv, cl_mem clbuffer, Image pixd, Image pixs,
637  int elements, cl_mem_flags flags, bool memcopy = false,
638  bool sync = true) {
639  if (!pixd) {
640  if (memcopy) {
641  if ((pixd = pixCreateTemplate(pixs)) == nullptr)
642  tprintf("pixd not made\n");
643  } else {
644  if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) ==
645  nullptr)
646  tprintf("pixd not made\n");
647  }
648  }
649  l_uint32 *pValues =
650  (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
651  elements * sizeof(l_uint32), 0, nullptr, nullptr, nullptr);
652 
653  if (memcopy) {
654  memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
655  } else {
656  pixSetData(pixd, pValues);
657  }
658 
659  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0, nullptr, nullptr);
660 
661  if (sync) {
662  clFinish(rEnv.mpkCmdQueue);
663  }
664 
665  return pixd;
666 }
667 
668 void OpenclDevice::releaseMorphCLBuffers() {
669  if (pixdCLIntermediate != nullptr)
670  clReleaseMemObject(pixdCLIntermediate);
671  if (pixsCLBuffer != nullptr)
672  clReleaseMemObject(pixsCLBuffer);
673  if (pixdCLBuffer != nullptr)
674  clReleaseMemObject(pixdCLBuffer);
675  if (pixThBuffer != nullptr)
676  clReleaseMemObject(pixThBuffer);
677  pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer = nullptr;
678 }
679 
680 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Image pixs) {
681  SetKernelEnv(&rEnv);
682 
683  if (pixThBuffer != nullptr) {
684  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
685 
686  // Get the output from ThresholdToPix operation
687  clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
688  sizeof(l_uint32) * wpl * h, 0, nullptr, nullptr);
689  } else {
690  // Get data from the source image
691  l_uint32 *srcdata = reinterpret_cast<l_uint32 *>(malloc(wpl * h * sizeof(l_uint32)));
692  memcpy(srcdata, pixGetData(pixs), wpl * h * sizeof(l_uint32));
693 
694  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h, CL_MEM_USE_HOST_PTR, &clStatus);
695  }
696 
697  pixdCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
698 
699  pixdCLIntermediate =
700  allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
701 
702  return (int)clStatus;
703 }
704 
705 int OpenclDevice::InitEnv() {
706 // tprintf("[OD] OpenclDevice::InitEnv()\n");
707 # ifdef SAL_WIN32
708  while (1) {
709  if (1 == LoadOpencl())
710  break;
711  }
712 # endif
713  // sets up environment, compiles programs
714 
715  InitOpenclRunEnv_DeviceSelection(0);
716  return 1;
717 }
718 
719 int OpenclDevice::ReleaseOpenclRunEnv() {
720  ReleaseOpenclEnv(&gpuEnv);
721 # ifdef SAL_WIN32
722  FreeOpenclDll();
723 # endif
724  return 1;
725 }
726 
727 inline int OpenclDevice::AddKernelConfig(int kCount, const char *kName) {
728  ASSERT_HOST(kCount > 0);
729  ASSERT_HOST(strlen(kName) < sizeof(gpuEnv.mArrykernelNames[kCount - 1]));
730  strcpy(gpuEnv.mArrykernelNames[kCount - 1], kName);
731  gpuEnv.mnKernelCount++;
732  return 0;
733 }
734 
735 int OpenclDevice::RegistOpenclKernel() {
736  if (!gpuEnv.mnIsUserCreated)
737  memset(&gpuEnv, 0, sizeof(gpuEnv));
738 
739  gpuEnv.mnFileCount = 0; // argc;
740  gpuEnv.mnKernelCount = 0UL;
741 
742  AddKernelConfig(1, "oclAverageSub1");
743  return 0;
744 }
745 
746 int OpenclDevice::InitOpenclRunEnv_DeviceSelection(int argc) {
747  if (!isInited) {
748  // after programs compiled, selects best device
749  ds_device bestDevice_DS = getDeviceSelection();
750  cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
751  // overwrite global static GPUEnv with new device
752  if (selectedDeviceIsOpenCL()) {
753  // tprintf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice()
754  // for selected device\n");
755  populateGPUEnvFromDevice(&gpuEnv, bestDevice);
756  gpuEnv.mnFileCount = 0; // argc;
757  gpuEnv.mnKernelCount = 0UL;
758  CompileKernelFile(&gpuEnv, "");
759  } else {
760  // tprintf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice()
761  // b/c native cpu selected\n");
762  }
763  isInited = 1;
764  }
765  return 0;
766 }
767 
768 OpenclDevice::OpenclDevice() {
769  // InitEnv();
770 }
771 
772 OpenclDevice::~OpenclDevice() {
773  // ReleaseOpenclRunEnv();
774 }
775 
776 int OpenclDevice::ReleaseOpenclEnv(GPUEnv *gpuInfo) {
777  int i = 0;
778  int clStatus = 0;
779 
780  if (!isInited) {
781  return 1;
782  }
783 
784  for (i = 0; i < gpuEnv.mnFileCount; i++) {
785  if (gpuEnv.mpArryPrograms[i]) {
786  clStatus = clReleaseProgram(gpuEnv.mpArryPrograms[i]);
787  CHECK_OPENCL(clStatus, "clReleaseProgram");
788  gpuEnv.mpArryPrograms[i] = nullptr;
789  }
790  }
791  if (gpuEnv.mpCmdQueue) {
792  clReleaseCommandQueue(gpuEnv.mpCmdQueue);
793  gpuEnv.mpCmdQueue = nullptr;
794  }
795  if (gpuEnv.mpContext) {
796  clReleaseContext(gpuEnv.mpContext);
797  gpuEnv.mpContext = nullptr;
798  }
799  isInited = 0;
800  gpuInfo->mnIsUserCreated = 0;
801  delete[] gpuInfo->mpArryDevsID;
802  return 1;
803 }
804 int OpenclDevice::BinaryGenerated(const char *clFileName, FILE **fhandle) {
805  unsigned int i = 0;
806  cl_int clStatus;
807  int status = 0;
808  FILE *fd = nullptr;
809  char fileName[256] = {0}, cl_name[128] = {0};
810  char deviceName[1024];
811  clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName,
812  nullptr);
813  CHECK_OPENCL(clStatus, "clGetDeviceInfo");
814  const char *str = strstr(clFileName, ".cl");
815  memcpy(cl_name, clFileName, str - clFileName);
816  cl_name[str - clFileName] = '\0';
817  sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
818  legalizeFileName(fileName);
819  fd = fopen(fileName, "rb");
820  status = (fd != nullptr) ? 1 : 0;
821  if (fd != nullptr) {
822  *fhandle = fd;
823  }
824  return status;
825 }
826 int OpenclDevice::CachedOfKernerPrg(const GPUEnv *gpuEnvCached, const char *clFileName) {
827  int i;
828  for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
829  if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
830  if (gpuEnvCached->mpArryPrograms[i] != nullptr) {
831  return 1;
832  }
833  }
834  }
835 
836  return 0;
837 }
838 int OpenclDevice::WriteBinaryToFile(const char *fileName, const char *birary, size_t numBytes) {
839  FILE *output = nullptr;
840  output = fopen(fileName, "wb");
841  if (output == nullptr) {
842  return 0;
843  }
844 
845  fwrite(birary, sizeof(char), numBytes, output);
846  fclose(output);
847 
848  return 1;
849 }
850 
851 int OpenclDevice::GeneratBinFromKernelSource(cl_program program, const char *clFileName) {
852  unsigned int i = 0;
853  cl_int clStatus;
854  cl_uint numDevices;
855 
856  clStatus =
857  clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(numDevices), &numDevices, nullptr);
858  CHECK_OPENCL(clStatus, "clGetProgramInfo");
859 
860  std::vector<cl_device_id> mpArryDevsID(numDevices);
861 
862  /* grab the handles to all of the devices in the program. */
863  clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES, sizeof(cl_device_id) * numDevices,
864  &mpArryDevsID[0], nullptr);
865  CHECK_OPENCL(clStatus, "clGetProgramInfo");
866 
867  /* figure out the sizes of each of the binaries. */
868  std::vector<size_t> binarySizes(numDevices);
869 
870  clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * numDevices,
871  &binarySizes[0], nullptr);
872  CHECK_OPENCL(clStatus, "clGetProgramInfo");
873 
874  /* copy over all of the generated binaries. */
875  std::vector<char *> binaries(numDevices);
876 
877  for (i = 0; i < numDevices; i++) {
878  if (binarySizes[i] != 0) {
879  binaries[i] = new char[binarySizes[i]];
880  } else {
881  binaries[i] = nullptr;
882  }
883  }
884 
885  clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(char *) * numDevices,
886  &binaries[0], nullptr);
887  CHECK_OPENCL(clStatus, "clGetProgramInfo");
888 
889  /* dump out each binary into its own separate file. */
890  for (i = 0; i < numDevices; i++) {
891  char fileName[256] = {0}, cl_name[128] = {0};
892 
893  if (binarySizes[i] != 0) {
894  char deviceName[1024];
895  clStatus =
896  clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr);
897  CHECK_OPENCL(clStatus, "clGetDeviceInfo");
898 
899  const char *str = strstr(clFileName, ".cl");
900  memcpy(cl_name, clFileName, str - clFileName);
901  cl_name[str - clFileName] = '\0';
902  sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
903  legalizeFileName(fileName);
904  if (!WriteBinaryToFile(fileName, binaries[i], binarySizes[i])) {
905  tprintf("[OD] write binary[%s] failed\n", fileName);
906  return 0;
907  } // else
908  tprintf("[OD] write binary[%s] successfully\n", fileName);
909  }
910  }
911 
912  // Release all resources and memory
913  for (i = 0; i < numDevices; i++) {
914  delete[] binaries[i];
915  }
916 
917  return 1;
918 }
919 
920 int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo, const char *buildOption) {
921  cl_int clStatus = 0;
922  const char *source;
923  size_t source_size[1];
924  int binary_status, binaryExisted, idx;
925  cl_uint numDevices;
926  FILE *fd, *fd1;
927  const char *filename = "kernel.cl";
928  // fprintf(stderr, "[OD] CompileKernelFile ... \n");
929  if (CachedOfKernerPrg(gpuInfo, filename) == 1) {
930  return 1;
931  }
932 
933  idx = gpuInfo->mnFileCount;
934 
935  source = kernel_src;
936 
937  source_size[0] = strlen(source);
938  binaryExisted = 0;
939  binaryExisted = BinaryGenerated(filename, &fd); // don't check for binary during microbenchmark
940  if (binaryExisted == 1) {
941  clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES, sizeof(numDevices),
942  &numDevices, nullptr);
943  CHECK_OPENCL(clStatus, "clGetContextInfo");
944 
945  std::vector<cl_device_id> mpArryDevsID(numDevices);
946  bool b_error = fseek(fd, 0, SEEK_END) < 0;
947  auto pos = std::ftell(fd);
948  b_error |= (pos <= 0);
949  size_t length = pos;
950  b_error |= fseek(fd, 0, SEEK_SET) < 0;
951  if (b_error) {
952  fclose(fd);
953  return 0;
954  }
955 
956  std::vector<uint8_t> binary(length + 2);
957 
958  memset(&binary[0], 0, length + 2);
959  b_error |= fread(&binary[0], 1, length, fd) != length;
960 
961  fclose(fd);
962  fd = nullptr;
963  // grab the handles to all of the devices in the context.
964  clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
965  sizeof(cl_device_id) * numDevices, &mpArryDevsID[0], nullptr);
966  CHECK_OPENCL(clStatus, "clGetContextInfo");
967  // fprintf(stderr, "[OD] Create kernel from binary\n");
968  const uint8_t *c_binary = &binary[0];
969  gpuInfo->mpArryPrograms[idx] =
970  clCreateProgramWithBinary(gpuInfo->mpContext, numDevices, &mpArryDevsID[0], &length,
971  &c_binary, &binary_status, &clStatus);
972  CHECK_OPENCL(clStatus, "clCreateProgramWithBinary");
973  } else {
974  // create a CL program using the kernel source
975  // fprintf(stderr, "[OD] Create kernel from source\n");
976  gpuInfo->mpArryPrograms[idx] =
977  clCreateProgramWithSource(gpuInfo->mpContext, 1, &source, source_size, &clStatus);
978  CHECK_OPENCL(clStatus, "clCreateProgramWithSource");
979  }
980 
981  if (gpuInfo->mpArryPrograms[idx] == (cl_program) nullptr) {
982  return 0;
983  }
984 
985  // char options[512];
986  // create a cl program executable for all the devices specified
987  // tprintf("[OD] BuildProgram.\n");
988  if (!gpuInfo->mnIsUserCreated) {
989  clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID, buildOption,
990  nullptr, nullptr);
991  } else {
992  clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID), buildOption,
993  nullptr, nullptr);
994  }
995  if (clStatus != CL_SUCCESS) {
996  tprintf("BuildProgram error!\n");
997  size_t length;
998  if (!gpuInfo->mnIsUserCreated) {
999  clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1000  CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1001  } else {
1002  clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1003  CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1004  }
1005  if (clStatus != CL_SUCCESS) {
1006  tprintf("opencl create build log fail\n");
1007  return 0;
1008  }
1009  std::vector<char> buildLog(length);
1010  if (!gpuInfo->mnIsUserCreated) {
1011  clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1012  CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1013  } else {
1014  clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1015  CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1016  }
1017  if (clStatus != CL_SUCCESS) {
1018  tprintf("opencl program build info fail\n");
1019  return 0;
1020  }
1021 
1022  fd1 = fopen("kernel-build.log", "w+");
1023  if (fd1 != nullptr) {
1024  fwrite(&buildLog[0], sizeof(char), length, fd1);
1025  fclose(fd1);
1026  }
1027 
1028  return 0;
1029  }
1030 
1031  strcpy(gpuInfo->mArryKnelSrcFile[idx], filename);
1032  if (binaryExisted == 0) {
1033  GeneratBinFromKernelSource(gpuInfo->mpArryPrograms[idx], filename);
1034  }
1035 
1036  gpuInfo->mnFileCount += 1;
1037  return 1;
1038 }
1039 
1040 l_uint32 *OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata, l_int32 w, l_int32 h, l_int32 wpl,
1041  l_uint32 *line) {
1042  cl_int clStatus;
1043  KernelEnv rEnv;
1044  size_t globalThreads[2];
1045  size_t localThreads[2];
1046  int gsize;
1047  cl_mem valuesCl;
1048  cl_mem outputCl;
1049 
1050  // global and local work dimensions for Horizontal pass
1051  gsize = (w + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1052  globalThreads[0] = gsize;
1053  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1054  globalThreads[1] = gsize;
1055  localThreads[0] = GROUPSIZE_X;
1056  localThreads[1] = GROUPSIZE_Y;
1057 
1058  SetKernelEnv(&rEnv);
1059 
1060  l_uint32 *pResult = (l_uint32 *)malloc(w * h * sizeof(l_uint32));
1061  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "composeRGBPixel", &clStatus);
1062  CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel");
1063 
1064  // Allocate input and output OCL buffers
1065  valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w * h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1066  &clStatus);
1067  outputCl = allocateZeroCopyBuffer(rEnv, pResult, w * h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
1068  &clStatus);
1069 
1070  // Kernel arguments
1071  clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl);
1072  CHECK_OPENCL(clStatus, "clSetKernelArg");
1073  clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w);
1074  CHECK_OPENCL(clStatus, "clSetKernelArg");
1075  clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h);
1076  CHECK_OPENCL(clStatus, "clSetKernelArg");
1077  clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1078  CHECK_OPENCL(clStatus, "clSetKernelArg");
1079  clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl);
1080  CHECK_OPENCL(clStatus, "clSetKernelArg");
1081 
1082  // Kernel enqueue
1083  clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1084  localThreads, 0, nullptr, nullptr);
1085  CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel");
1086 
1087  /* map results back from gpu */
1088  void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0,
1089  w * h * sizeof(l_uint32), 0, nullptr, nullptr, &clStatus);
1090  CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl");
1091  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, nullptr, nullptr);
1092 
1093  // Sync
1094  clFinish(rEnv.mpkCmdQueue);
1095  return pResult;
1096 }
1097 
1098 // Morphology Dilate operation for 5x5 structuring element. Invokes the relevant
1099 // OpenCL kernels
1100 static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
1101  size_t globalThreads[2];
1102  cl_mem pixtemp;
1103  cl_int status;
1104  int gsize;
1105  size_t localThreads[2];
1106 
1107  // Horizontal pass
1108  gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1109  globalThreads[0] = gsize;
1110  globalThreads[1] = GROUPSIZE_HMORY;
1111  localThreads[0] = GROUPSIZE_HMORX;
1112  localThreads[1] = GROUPSIZE_HMORY;
1113 
1114  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_5x5", &status);
1115  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_5x5");
1116 
1117  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1118  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1119  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1120  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1121 
1122  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1123  localThreads, 0, nullptr, nullptr);
1124 
1125  // Swap source and dest buffers
1126  pixtemp = pixsCLBuffer;
1127  pixsCLBuffer = pixdCLBuffer;
1128  pixdCLBuffer = pixtemp;
1129 
1130  // Vertical
1131  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1132  globalThreads[0] = gsize;
1133  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1134  globalThreads[1] = gsize;
1135  localThreads[0] = GROUPSIZE_X;
1136  localThreads[1] = GROUPSIZE_Y;
1137 
1138  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateVer_5x5", &status);
1139  CHECK_OPENCL(status, "clCreateKernel morphoDilateVer_5x5");
1140 
1141  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1142  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1143  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1144  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1145  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1146  localThreads, 0, nullptr, nullptr);
1147 
1148  return status;
1149 }
1150 
1151 // Morphology Erode operation for 5x5 structuring element. Invokes the relevant
1152 // OpenCL kernels
1153 static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) {
1154  size_t globalThreads[2];
1155  cl_mem pixtemp;
1156  cl_int status;
1157  int gsize;
1158  l_uint32 fwmask, lwmask;
1159  size_t localThreads[2];
1160 
1161  lwmask = lmask32[31 - 2];
1162  fwmask = rmask32[31 - 2];
1163 
1164  // Horizontal pass
1165  gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1166  globalThreads[0] = gsize;
1167  globalThreads[1] = GROUPSIZE_HMORY;
1168  localThreads[0] = GROUPSIZE_HMORX;
1169  localThreads[1] = GROUPSIZE_HMORY;
1170 
1171  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_5x5", &status);
1172  CHECK_OPENCL(status, "clCreateKernel morphoErodeHor_5x5");
1173 
1174  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1175  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1176  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1177  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1178 
1179  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1180  localThreads, 0, nullptr, nullptr);
1181 
1182  // Swap source and dest buffers
1183  pixtemp = pixsCLBuffer;
1184  pixsCLBuffer = pixdCLBuffer;
1185  pixdCLBuffer = pixtemp;
1186 
1187  // Vertical
1188  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1189  globalThreads[0] = gsize;
1190  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1191  globalThreads[1] = gsize;
1192  localThreads[0] = GROUPSIZE_X;
1193  localThreads[1] = GROUPSIZE_Y;
1194 
1195  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer_5x5", &status);
1196  CHECK_OPENCL(status, "clCreateKernel morphoErodeVer_5x5");
1197 
1198  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1199  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1200  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1201  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1202  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(fwmask), &fwmask);
1203  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(lwmask), &lwmask);
1204  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1205  localThreads, 0, nullptr, nullptr);
1206 
1207  return status;
1208 }
1209 
1210 // Morphology Dilate operation. Invokes the relevant OpenCL kernels
1211 static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1212  l_int32 xp, yp, xn, yn;
1213  SEL *sel;
1214  size_t globalThreads[2];
1215  cl_mem pixtemp;
1216  cl_int status = 0;
1217  int gsize;
1218  size_t localThreads[2];
1219  char isEven;
1220 
1221  OpenclDevice::SetKernelEnv(&rEnv);
1222 
1223  if (hsize == 5 && vsize == 5) {
1224  // Specific case for 5x5
1225  status = pixDilateCL_55(wpl, h);
1226  return status;
1227  }
1228 
1229  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1230 
1231  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1232  selDestroy(&sel);
1233  // global and local work dimensions for Horizontal pass
1234  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1235  globalThreads[0] = gsize;
1236  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1237  globalThreads[1] = gsize;
1238  localThreads[0] = GROUPSIZE_X;
1239  localThreads[1] = GROUPSIZE_Y;
1240 
1241  if (xp > 31 || xn > 31) {
1242  // Generic case.
1243  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateHor", &status);
1244  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor");
1245 
1246  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1247  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1248  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1249  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1250  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1251  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1252  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1253  localThreads, 0, nullptr, nullptr);
1254 
1255  if (yp > 0 || yn > 0) {
1256  pixtemp = pixsCLBuffer;
1257  pixsCLBuffer = pixdCLBuffer;
1258  pixdCLBuffer = pixtemp;
1259  }
1260  } else if (xp > 0 || xn > 0) {
1261  // Specific Horizontal pass kernel for half width < 32
1262  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_32word", &status);
1263  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_32word");
1264  isEven = (xp != xn);
1265 
1266  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1267  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1268  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1269  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1270  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1271  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isEven), &isEven);
1272  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1273  localThreads, 0, nullptr, nullptr);
1274 
1275  if (yp > 0 || yn > 0) {
1276  pixtemp = pixsCLBuffer;
1277  pixsCLBuffer = pixdCLBuffer;
1278  pixdCLBuffer = pixtemp;
1279  }
1280  }
1281 
1282  if (yp > 0 || yn > 0) {
1283  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateVer", &status);
1284  CHECK_OPENCL(status, "clCreateKernel morphoDilateVer");
1285 
1286  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1287  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1288  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1289  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1290  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1291  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(yn), &yn);
1292  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1293  localThreads, 0, nullptr, nullptr);
1294  }
1295 
1296  return status;
1297 }
1298 
1299 // Morphology Erode operation. Invokes the relevant OpenCL kernels
1300 static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) {
1301  l_int32 xp, yp, xn, yn;
1302  SEL *sel;
1303  size_t globalThreads[2];
1304  size_t localThreads[2];
1305  cl_mem pixtemp;
1306  cl_int status = 0;
1307  int gsize;
1308  char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1309  l_uint32 rwmask, lwmask;
1310  char isEven;
1311 
1312  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1313 
1314  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1315  selDestroy(&sel);
1316  OpenclDevice::SetKernelEnv(&rEnv);
1317 
1318  if (hsize == 5 && vsize == 5 && isAsymmetric) {
1319  // Specific kernel for 5x5
1320  status = pixErodeCL_55(wpl, h);
1321  return status;
1322  }
1323 
1324  lwmask = lmask32[31 - (xn & 31)];
1325  rwmask = rmask32[31 - (xp & 31)];
1326 
1327  // global and local work dimensions for Horizontal pass
1328  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1329  globalThreads[0] = gsize;
1330  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1331  globalThreads[1] = gsize;
1332  localThreads[0] = GROUPSIZE_X;
1333  localThreads[1] = GROUPSIZE_Y;
1334 
1335  // Horizontal Pass
1336  if (xp > 31 || xn > 31) {
1337  // Generic case.
1338  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor", &status);
1339 
1340  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1341  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1342  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1343  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1344  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1345  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1346  status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(isAsymmetric), &isAsymmetric);
1347  status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(rwmask), &rwmask);
1348  status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(lwmask), &lwmask);
1349  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1350  localThreads, 0, nullptr, nullptr);
1351 
1352  if (yp > 0 || yn > 0) {
1353  pixtemp = pixsCLBuffer;
1354  pixsCLBuffer = pixdCLBuffer;
1355  pixdCLBuffer = pixtemp;
1356  }
1357  } else if (xp > 0 || xn > 0) {
1358  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_32word", &status);
1359  isEven = (xp != xn);
1360 
1361  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1362  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1363  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1364  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1365  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1366  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1367  status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(rwmask), &rwmask);
1368  status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(lwmask), &lwmask);
1369  status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(isEven), &isEven);
1370  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1371  localThreads, 0, nullptr, nullptr);
1372 
1373  if (yp > 0 || yn > 0) {
1374  pixtemp = pixsCLBuffer;
1375  pixsCLBuffer = pixdCLBuffer;
1376  pixdCLBuffer = pixtemp;
1377  }
1378  }
1379 
1380  // Vertical Pass
1381  if (yp > 0 || yn > 0) {
1382  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer", &status);
1383  CHECK_OPENCL(status, "clCreateKernel morphoErodeVer");
1384 
1385  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1386  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1387  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1388  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1389  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1390  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1391  status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(yn), &yn);
1392  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1393  localThreads, 0, nullptr, nullptr);
1394  }
1395 
1396  return status;
1397 }
1398 
1399 // Morphology Open operation. Invokes the relevant OpenCL kernels
1400 static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1401  cl_int status;
1402  cl_mem pixtemp;
1403 
1404  // Erode followed by Dilate
1405  status = pixErodeCL(hsize, vsize, wpl, h);
1406 
1407  pixtemp = pixsCLBuffer;
1408  pixsCLBuffer = pixdCLBuffer;
1409  pixdCLBuffer = pixtemp;
1410 
1411  status = pixDilateCL(hsize, vsize, wpl, h);
1412 
1413  return status;
1414 }
1415 
1416 // Morphology Close operation. Invokes the relevant OpenCL kernels
1417 static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1418  cl_int status;
1419  cl_mem pixtemp;
1420 
1421  // Dilate followed by Erode
1422  status = pixDilateCL(hsize, vsize, wpl, h);
1423 
1424  pixtemp = pixsCLBuffer;
1425  pixsCLBuffer = pixdCLBuffer;
1426  pixdCLBuffer = pixtemp;
1427 
1428  status = pixErodeCL(hsize, vsize, wpl, h);
1429 
1430  return status;
1431 }
1432 
1433 // output = buffer1 & ~(buffer2)
1434 static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2) {
1435  cl_int status;
1436  size_t globalThreads[2];
1437  int gsize;
1438  size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1439 
1440  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1441  globalThreads[0] = gsize;
1442  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1443  globalThreads[1] = gsize;
1444 
1445  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "pixSubtract_inplace", &status);
1446  CHECK_OPENCL(status, "clCreateKernel pixSubtract_inplace");
1447 
1448  // Enqueue a kernel run call.
1449  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &buffer1);
1450  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &buffer2);
1451  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1452  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1453  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1454  localThreads, 0, nullptr, nullptr);
1455 
1456  return status;
1457 }
1458 
1459 // OpenCL implementation of Get Lines from pix function
1460 // Note: Assumes the source and dest opencl buffer are initialized. No check
1461 // done
1462 void OpenclDevice::pixGetLinesCL(Image pixd, Image pixs, Image *pix_vline, Image *pix_hline,
1463  Image *pixClosed, bool getpixClosed, l_int32 close_hsize,
1464  l_int32 close_vsize, l_int32 open_hsize, l_int32 open_vsize,
1465  l_int32 line_hsize, l_int32 line_vsize) {
1466  l_uint32 wpl, h;
1467  cl_mem pixtemp;
1468 
1469  wpl = pixGetWpl(pixs);
1470  h = pixGetHeight(pixs);
1471 
1472  // First step : Close Morph operation: Dilate followed by Erode
1473  clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1474 
1475  // Copy the Close output to CPU buffer
1476  if (getpixClosed) {
1477  *pixClosed =
1478  mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl * h, CL_MAP_READ, true, false);
1479  }
1480 
1481  // Store the output of close operation in an intermediate buffer
1482  // this will be later used for pixsubtract
1483  clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0,
1484  sizeof(int) * wpl * h, 0, nullptr, nullptr);
1485 
1486  // Second step: Open Operation - Erode followed by Dilate
1487  pixtemp = pixsCLBuffer;
1488  pixsCLBuffer = pixdCLBuffer;
1489  pixdCLBuffer = pixtemp;
1490 
1491  clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1492 
1493  // Third step: Subtract : (Close - Open)
1494  pixtemp = pixsCLBuffer;
1495  pixsCLBuffer = pixdCLBuffer;
1496  pixdCLBuffer = pixdCLIntermediate;
1497  pixdCLIntermediate = pixtemp;
1498 
1499  clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1500 
1501  // Store the output of Hollow operation in an intermediate buffer
1502  // this will be later used
1503  clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0,
1504  sizeof(int) * wpl * h, 0, nullptr, nullptr);
1505 
1506  pixtemp = pixsCLBuffer;
1507  pixsCLBuffer = pixdCLBuffer;
1508  pixdCLBuffer = pixtemp;
1509 
1510  // Fourth step: Get vertical line
1511  // pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
1512  clStatus = pixOpenCL(1, line_vsize, wpl, h);
1513 
1514  // Copy the vertical line output to CPU buffer
1515  *pix_vline =
1516  mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h, CL_MAP_READ, true, false);
1517 
1518  pixtemp = pixsCLBuffer;
1519  pixsCLBuffer = pixdCLIntermediate;
1520  pixdCLIntermediate = pixtemp;
1521 
1522  // Fifth step: Get horizontal line
1523  // pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
1524  clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1525 
1526  // Copy the horizontal line output to CPU buffer
1527  *pix_hline =
1528  mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h, CL_MAP_READ, true, true);
1529 
1530  return;
1531 }
1532 
1533 /*************************************************************************
1534  * HistogramRect
1535  * Otsu Thresholding Operations
1536  * histogramAllChannels is laid out as all channel 0, then all channel 1...
1537  * only supports 1 or 4 channels (bytes_per_pixel)
1538  ************************************************************************/
1539 int OpenclDevice::HistogramRectOCL(void *imageData, int bytes_per_pixel, int bytes_per_line,
1540  int left, // always 0
1541  int top, // always 0
1542  int width, int height, int kHistogramSize,
1543  int *histogramAllChannels) {
1544  cl_int clStatus;
1545  int retVal = 0;
1546  KernelEnv histKern;
1547  SetKernelEnv(&histKern);
1548  KernelEnv histRedKern;
1549  SetKernelEnv(&histRedKern);
1550  /* map imagedata to device as read only */
1551  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1552  // coherent which we don't need.
1553  // faster option would be to allocate initial image buffer
1554  // using a garlic bus memory type
1555  cl_mem imageBuffer =
1556  clCreateBuffer(histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1557  width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1558  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1559 
1560  /* setup work group size parameters */
1561  int block_size = 256;
1562  cl_uint numCUs;
1563  clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs,
1564  nullptr);
1565  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1566 
1567  int requestedOccupancy = 10;
1568  int numWorkGroups = numCUs * requestedOccupancy;
1569  int numThreads = block_size * numWorkGroups;
1570  size_t local_work_size[] = {static_cast<size_t>(block_size)};
1571  size_t global_work_size[] = {static_cast<size_t>(numThreads)};
1572  size_t red_global_work_size[] = {
1573  static_cast<size_t>(block_size * kHistogramSize * bytes_per_pixel)};
1574 
1575  /* map histogramAllChannels as write only */
1576 
1577  cl_mem histogramBuffer = clCreateBuffer(
1578  histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1579  kHistogramSize * bytes_per_pixel * sizeof(int), histogramAllChannels, &clStatus);
1580  CHECK_OPENCL(clStatus, "clCreateBuffer histogramBuffer");
1581 
1582  /* intermediate histogram buffer */
1583  int histRed = 256;
1584  int tmpHistogramBins = kHistogramSize * bytes_per_pixel * histRed;
1585 
1586  cl_mem tmpHistogramBuffer =
1587  clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins * sizeof(cl_uint),
1588  nullptr, &clStatus);
1589  CHECK_OPENCL(clStatus, "clCreateBuffer tmpHistogramBuffer");
1590 
1591  /* atomic sync buffer */
1592  int *zeroBuffer = new int[1];
1593  zeroBuffer[0] = 0;
1594  cl_mem atomicSyncBuffer =
1595  clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int),
1596  zeroBuffer, &clStatus);
1597  CHECK_OPENCL(clStatus, "clCreateBuffer atomicSyncBuffer");
1598  delete[] zeroBuffer;
1599  // Create kernel objects based on bytes_per_pixel
1600  if (bytes_per_pixel == 1) {
1601  histKern.mpkKernel =
1602  clCreateKernel(histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus);
1603  CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
1604 
1605  histRedKern.mpkKernel = clCreateKernel(histRedKern.mpkProgram,
1606  "kernel_HistogramRectOneChannelReduction", &clStatus);
1607  CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannelReduction");
1608  } else {
1609  histKern.mpkKernel =
1610  clCreateKernel(histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus);
1611  CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
1612 
1613  histRedKern.mpkKernel = clCreateKernel(histRedKern.mpkProgram,
1614  "kernel_HistogramRectAllChannelsReduction", &clStatus);
1615  CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
1616  }
1617 
1618  void *ptr;
1619 
1620  // Initialize tmpHistogramBuffer buffer
1621  ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0,
1622  tmpHistogramBins * sizeof(cl_uint), 0, nullptr, nullptr, &clStatus);
1623  CHECK_OPENCL(clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
1624 
1625  memset(ptr, 0, tmpHistogramBins * sizeof(cl_uint));
1626  clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, nullptr, nullptr);
1627 
1628  /* set kernel 1 arguments */
1629  clStatus = clSetKernelArg(histKern.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1630  CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1631  cl_uint numPixels = width * height;
1632  clStatus = clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels);
1633  CHECK_OPENCL(clStatus, "clSetKernelArg numPixels");
1634  clStatus = clSetKernelArg(histKern.mpkKernel, 2, sizeof(cl_mem), &tmpHistogramBuffer);
1635  CHECK_OPENCL(clStatus, "clSetKernelArg tmpHistogramBuffer");
1636 
1637  /* set kernel 2 arguments */
1638  int n = numThreads / bytes_per_pixel;
1639  clStatus = clSetKernelArg(histRedKern.mpkKernel, 0, sizeof(cl_int), &n);
1640  CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1641  clStatus = clSetKernelArg(histRedKern.mpkKernel, 1, sizeof(cl_mem), &tmpHistogramBuffer);
1642  CHECK_OPENCL(clStatus, "clSetKernelArg tmpHistogramBuffer");
1643  clStatus = clSetKernelArg(histRedKern.mpkKernel, 2, sizeof(cl_mem), &histogramBuffer);
1644  CHECK_OPENCL(clStatus, "clSetKernelArg histogramBuffer");
1645 
1646  /* launch histogram */
1647  clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1, nullptr,
1648  global_work_size, local_work_size, 0, nullptr, nullptr);
1649  CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1650  clFinish(histKern.mpkCmdQueue);
1651  if (clStatus != 0) {
1652  retVal = -1;
1653  }
1654  /* launch histogram */
1655  clStatus = clEnqueueNDRangeKernel(histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, nullptr,
1656  red_global_work_size, local_work_size, 0, nullptr, nullptr);
1657  CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction");
1658  clFinish(histRedKern.mpkCmdQueue);
1659  if (clStatus != 0) {
1660  retVal = -1;
1661  }
1662 
1663  /* map results back from gpu */
1664  ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0,
1665  kHistogramSize * bytes_per_pixel * sizeof(int), 0, nullptr, nullptr,
1666  &clStatus);
1667  CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
1668  if (clStatus != 0) {
1669  retVal = -1;
1670  }
1671  clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, nullptr, nullptr);
1672 
1673  clReleaseMemObject(histogramBuffer);
1674  clReleaseMemObject(imageBuffer);
1675  return retVal;
1676 }
1677 
1678 /*************************************************************************
1679  * Threshold the rectangle, taking everything except the image buffer pointer
1680  * from the class, using thresholds/hi_values to the output IMAGE.
1681  * only supports 1 or 4 channels
1682  ************************************************************************/
1683 int OpenclDevice::ThresholdRectToPixOCL(unsigned char *imageData, int bytes_per_pixel,
1684  int bytes_per_line, int *thresholds, int *hi_values,
1685  Image *pix, int height, int width, int top, int left) {
1686  int retVal = 0;
1687  /* create pix result buffer */
1688  *pix = pixCreate(width, height, 1);
1689  uint32_t *pixData = pixGetData(*pix);
1690  int wpl = pixGetWpl(*pix);
1691  int pixSize = wpl * height * sizeof(uint32_t); // number of pixels
1692 
1693  cl_int clStatus;
1694  KernelEnv rEnv;
1695  SetKernelEnv(&rEnv);
1696 
1697  /* setup work group size parameters */
1698  int block_size = 256;
1699  cl_uint numCUs = 6;
1700  clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs,
1701  nullptr);
1702  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1703 
1704  int requestedOccupancy = 10;
1705  int numWorkGroups = numCUs * requestedOccupancy;
1706  int numThreads = block_size * numWorkGroups;
1707  size_t local_work_size[] = {(size_t)block_size};
1708  size_t global_work_size[] = {(size_t)numThreads};
1709 
1710  /* map imagedata to device as read only */
1711  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1712  // coherent which we don't need.
1713  // faster option would be to allocate initial image buffer
1714  // using a garlic bus memory type
1715  cl_mem imageBuffer =
1716  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1717  width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1718  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1719 
1720  /* map pix as write only */
1721  pixThBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize,
1722  pixData, &clStatus);
1723  CHECK_OPENCL(clStatus, "clCreateBuffer pix");
1724 
1725  /* map thresholds and hi_values */
1726  cl_mem thresholdsBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1727  bytes_per_pixel * sizeof(int), thresholds, &clStatus);
1728  CHECK_OPENCL(clStatus, "clCreateBuffer thresholdBuffer");
1729  cl_mem hiValuesBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1730  bytes_per_pixel * sizeof(int), hi_values, &clStatus);
1731  CHECK_OPENCL(clStatus, "clCreateBuffer hiValuesBuffer");
1732 
1733  /* compile kernel */
1734  if (bytes_per_pixel == 4) {
1735  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus);
1736  CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix");
1737  } else {
1738  rEnv.mpkKernel =
1739  clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus);
1740  CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
1741  }
1742 
1743  /* set kernel arguments */
1744  clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1745  CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1746  clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(int), &height);
1747  CHECK_OPENCL(clStatus, "clSetKernelArg height");
1748  clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(int), &width);
1749  CHECK_OPENCL(clStatus, "clSetKernelArg width");
1750  clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(int), &wpl);
1751  CHECK_OPENCL(clStatus, "clSetKernelArg wpl");
1752  clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &thresholdsBuffer);
1753  CHECK_OPENCL(clStatus, "clSetKernelArg thresholdsBuffer");
1754  clStatus = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(cl_mem), &hiValuesBuffer);
1755  CHECK_OPENCL(clStatus, "clSetKernelArg hiValuesBuffer");
1756  clStatus = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(cl_mem), &pixThBuffer);
1757  CHECK_OPENCL(clStatus, "clSetKernelArg pixThBuffer");
1758 
1759  /* launch kernel & wait */
1760  clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1, nullptr, global_work_size,
1761  local_work_size, 0, nullptr, nullptr);
1762  CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
1763  clFinish(rEnv.mpkCmdQueue);
1764  if (clStatus != 0) {
1765  tprintf("Setting return value to -1\n");
1766  retVal = -1;
1767  }
1768  /* map results back from gpu */
1769  void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0,
1770  nullptr, nullptr, &clStatus);
1771  CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
1772  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, nullptr, nullptr);
1773 
1774  clReleaseMemObject(imageBuffer);
1775  clReleaseMemObject(thresholdsBuffer);
1776  clReleaseMemObject(hiValuesBuffer);
1777 
1778  return retVal;
1779 }
1780 
1781 /******************************************************************************
1782  * Data Types for Device Selection
1783  *****************************************************************************/
1784 
1785 struct TessScoreEvaluationInputData {
1786  int height;
1787  int width;
1788  int numChannels;
1789  unsigned char *imageData;
1790  Image pix;
1791 };
1792 
1793 static void populateTessScoreEvaluationInputData(TessScoreEvaluationInputData *input) {
1794  srand(1);
1795  // 8.5x11 inches @ 300dpi rounded to clean multiples
1796  int height = 3328; // %256
1797  int width = 2560; // %512
1798  int numChannels = 4;
1799  input->height = height;
1800  input->width = width;
1801  input->numChannels = numChannels;
1802  unsigned char(*imageData4)[4] = (unsigned char(*)[4])malloc(
1803  height * width * numChannels * sizeof(unsigned char)); // new unsigned char[4][height*width];
1804  input->imageData = (unsigned char *)&imageData4[0];
1805 
1806  // zero out image
1807  unsigned char pixelWhite[4] = {0, 0, 0, 255};
1808  unsigned char pixelBlack[4] = {255, 255, 255, 255};
1809  for (int p = 0; p < height * width; p++) {
1810  // unsigned char tmp[4] = imageData4[0];
1811  imageData4[p][0] = pixelWhite[0];
1812  imageData4[p][1] = pixelWhite[1];
1813  imageData4[p][2] = pixelWhite[2];
1814  imageData4[p][3] = pixelWhite[3];
1815  }
1816  // random lines to be eliminated
1817  int maxLineWidth = 64; // pixels wide
1818  int numLines = 10;
1819  // vertical lines
1820  for (int i = 0; i < numLines; i++) {
1821  int lineWidth = rand() % maxLineWidth;
1822  int vertLinePos = lineWidth + rand() % (width - 2 * lineWidth);
1823  // tprintf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
1824  for (int row = vertLinePos - lineWidth / 2; row < vertLinePos + lineWidth / 2; row++) {
1825  for (int col = 0; col < height; col++) {
1826  // imageData4[row*width+col] = pixelBlack;
1827  imageData4[row * width + col][0] = pixelBlack[0];
1828  imageData4[row * width + col][1] = pixelBlack[1];
1829  imageData4[row * width + col][2] = pixelBlack[2];
1830  imageData4[row * width + col][3] = pixelBlack[3];
1831  }
1832  }
1833  }
1834  // horizontal lines
1835  for (int i = 0; i < numLines; i++) {
1836  int lineWidth = rand() % maxLineWidth;
1837  int horLinePos = lineWidth + rand() % (height - 2 * lineWidth);
1838  // tprintf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
1839  for (int row = 0; row < width; row++) {
1840  for (int col = horLinePos - lineWidth / 2; col < horLinePos + lineWidth / 2;
1841  col++) { // for (int row = vertLinePos-lineWidth/2; row <
1842  // vertLinePos+lineWidth/2; row++) {
1843  // tprintf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
1844  // imageData4[row*width+col] = pixelBlack;
1845  imageData4[row * width + col][0] = pixelBlack[0];
1846  imageData4[row * width + col][1] = pixelBlack[1];
1847  imageData4[row * width + col][2] = pixelBlack[2];
1848  imageData4[row * width + col][3] = pixelBlack[3];
1849  }
1850  }
1851  }
1852  // spots (noise, squares)
1853  float fractionBlack = 0.1; // how much of the image should be blackened
1854  int numSpots = (height * width) * fractionBlack / (maxLineWidth * maxLineWidth / 2 / 2);
1855  for (int i = 0; i < numSpots; i++) {
1856  int lineWidth = rand() % maxLineWidth;
1857  int col = lineWidth + rand() % (width - 2 * lineWidth);
1858  int row = lineWidth + rand() % (height - 2 * lineWidth);
1859  // tprintf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
1860  for (int r = row - lineWidth / 2; r < row + lineWidth / 2; r++) {
1861  for (int c = col - lineWidth / 2; c < col + lineWidth / 2; c++) {
1862  // tprintf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
1863  // imageData4[row*width+col] = pixelBlack;
1864  imageData4[r * width + c][0] = pixelBlack[0];
1865  imageData4[r * width + c][1] = pixelBlack[1];
1866  imageData4[r * width + c][2] = pixelBlack[2];
1867  imageData4[r * width + c][3] = pixelBlack[3];
1868  }
1869  }
1870  }
1871 
1872  input->pix = pixCreate(input->width, input->height, 8 * input->numChannels);
1873 }
1874 
1875 struct TessDeviceScore {
1876  float time; // small time means faster device
1877  bool clError; // were there any opencl errors
1878  bool valid; // was the correct response generated
1879 };
1880 
1881 /******************************************************************************
1882  * Micro Benchmarks for Device Selection
1883  *****************************************************************************/
1884 
1885 static double composeRGBPixelMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
1886  ds_device_type type) {
1887  double time = 0;
1888 # if ON_WINDOWS
1889  LARGE_INTEGER freq, time_funct_start, time_funct_end;
1890  QueryPerformanceFrequency(&freq);
1891 # elif ON_APPLE
1892  mach_timebase_info_data_t info = {0, 0};
1893  mach_timebase_info(&info);
1894  long long start, stop;
1895 # else
1896  timespec time_funct_start, time_funct_end;
1897 # endif
1898  // input data
1899  l_uint32 *tiffdata = (l_uint32 *)input.imageData; // same size and random data; data doesn't
1900  // change workload
1901 
1902  // function call
1903  if (type == DS_DEVICE_OPENCL_DEVICE) {
1904 # if ON_WINDOWS
1905  QueryPerformanceCounter(&time_funct_start);
1906 # elif ON_APPLE
1907  start = mach_absolute_time();
1908 # else
1909  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1910 # endif
1911 
1912  OpenclDevice::gpuEnv = *env;
1913  int wpl = pixGetWpl(input.pix);
1914  OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl, nullptr);
1915 # if ON_WINDOWS
1916  QueryPerformanceCounter(&time_funct_end);
1917  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
1918 # elif ON_APPLE
1919  stop = mach_absolute_time();
1920  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
1921 # else
1922  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
1923  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
1924  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
1925 # endif
1926 
1927  } else {
1928 # if ON_WINDOWS
1929  QueryPerformanceCounter(&time_funct_start);
1930 # elif ON_APPLE
1931  start = mach_absolute_time();
1932 # else
1933  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1934 # endif
1935  Image pix = pixCreate(input.width, input.height, 32);
1936  l_uint32 *pixData = pixGetData(pix);
1937  int i, j;
1938  int idx = 0;
1939  for (i = 0; i < input.height; i++) {
1940  for (j = 0; j < input.width; j++) {
1941  l_uint32 tiffword = tiffdata[i * input.width + j];
1942  l_int32 rval = ((tiffword)&0xff);
1943  l_int32 gval = (((tiffword) >> 8) & 0xff);
1944  l_int32 bval = (((tiffword) >> 16) & 0xff);
1945  l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
1946  pixData[idx] = value;
1947  idx++;
1948  }
1949  }
1950 # if ON_WINDOWS
1951  QueryPerformanceCounter(&time_funct_end);
1952  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
1953 # elif ON_APPLE
1954  stop = mach_absolute_time();
1955  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
1956 # else
1957  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
1958  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
1959  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
1960 # endif
1961  pix.destroy();
1962  }
1963 
1964  return time;
1965 }
1966 
1967 static double histogramRectMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
1968  ds_device_type type) {
1969  double time;
1970 # if ON_WINDOWS
1971  LARGE_INTEGER freq, time_funct_start, time_funct_end;
1972  QueryPerformanceFrequency(&freq);
1973 # elif ON_APPLE
1974  mach_timebase_info_data_t info = {0, 0};
1975  mach_timebase_info(&info);
1976  long long start, stop;
1977 # else
1978  timespec time_funct_start, time_funct_end;
1979 # endif
1980 
1981  const int left = 0;
1982  const int top = 0;
1983  int kHistogramSize = 256;
1984  int bytes_per_line = input.width * input.numChannels;
1985  int *histogramAllChannels = new int[kHistogramSize * input.numChannels];
1986  // function call
1987  if (type == DS_DEVICE_OPENCL_DEVICE) {
1988 # if ON_WINDOWS
1989  QueryPerformanceCounter(&time_funct_start);
1990 # elif ON_APPLE
1991  start = mach_absolute_time();
1992 # else
1993  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1994 # endif
1995 
1996  OpenclDevice::gpuEnv = *env;
1997  int retVal = OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line,
1998  left, top, input.width, input.height,
1999  kHistogramSize, histogramAllChannels);
2000 
2001 # if ON_WINDOWS
2002  QueryPerformanceCounter(&time_funct_end);
2003  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2004 # elif ON_APPLE
2005  stop = mach_absolute_time();
2006  if (retVal == 0) {
2007  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2008  } else {
2009  time = FLT_MAX;
2010  }
2011 # else
2012  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2013  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2014  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2015 # endif
2016  } else {
2017  int *histogram = new int[kHistogramSize];
2018 # if ON_WINDOWS
2019  QueryPerformanceCounter(&time_funct_start);
2020 # elif ON_APPLE
2021  start = mach_absolute_time();
2022 # else
2023  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2024 # endif
2025  for (int ch = 0; ch < input.numChannels; ++ch) {
2026  tesseract::HistogramRect(input.pix, input.numChannels, left, top, input.width, input.height,
2027  histogram);
2028  }
2029 # if ON_WINDOWS
2030  QueryPerformanceCounter(&time_funct_end);
2031  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2032 # elif ON_APPLE
2033  stop = mach_absolute_time();
2034  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2035 # else
2036  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2037  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2038  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2039 # endif
2040  delete[] histogram;
2041  }
2042 
2043  // cleanup
2044  delete[] histogramAllChannels;
2045  return time;
2046 }
2047 
2048 // Reproducing the ThresholdRectToPix native version
2049 static void ThresholdRectToPix_Native(const unsigned char *imagedata, int bytes_per_pixel,
2050  int bytes_per_line, const int *thresholds,
2051  const int *hi_values, Image *pix) {
2052  int top = 0;
2053  int left = 0;
2054  int width = pixGetWidth(*pix);
2055  int height = pixGetHeight(*pix);
2056 
2057  *pix = pixCreate(width, height, 1);
2058  uint32_t *pixdata = pixGetData(*pix);
2059  int wpl = pixGetWpl(*pix);
2060  const unsigned char *srcdata = imagedata + top * bytes_per_line + left * bytes_per_pixel;
2061  for (int y = 0; y < height; ++y) {
2062  const uint8_t *linedata = srcdata;
2063  uint32_t *pixline = pixdata + y * wpl;
2064  for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2065  bool white_result = true;
2066  for (int ch = 0; ch < bytes_per_pixel; ++ch) {
2067  if (hi_values[ch] >= 0 && (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2068  white_result = false;
2069  break;
2070  }
2071  }
2072  if (white_result)
2073  CLEAR_DATA_BIT(pixline, x);
2074  else
2075  SET_DATA_BIT(pixline, x);
2076  }
2077  srcdata += bytes_per_line;
2078  }
2079 }
2080 
2081 static double thresholdRectToPixMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
2082  ds_device_type type) {
2083  double time;
2084 # if ON_WINDOWS
2085  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2086  QueryPerformanceFrequency(&freq);
2087 # elif ON_APPLE
2088  mach_timebase_info_data_t info = {0, 0};
2089  mach_timebase_info(&info);
2090  long long start, stop;
2091 # else
2092  timespec time_funct_start, time_funct_end;
2093 # endif
2094 
2095  // input data
2096  unsigned char pixelHi = (unsigned char)255;
2097  int thresholds[4] = {pixelHi, pixelHi, pixelHi, pixelHi};
2098 
2099  // Pix* pix = pixCreate(width, height, 1);
2100  int top = 0;
2101  int left = 0;
2102  int bytes_per_line = input.width * input.numChannels;
2103 
2104  // function call
2105  if (type == DS_DEVICE_OPENCL_DEVICE) {
2106 # if ON_WINDOWS
2107  QueryPerformanceCounter(&time_funct_start);
2108 # elif ON_APPLE
2109  start = mach_absolute_time();
2110 # else
2111  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2112 # endif
2113 
2114  OpenclDevice::gpuEnv = *env;
2115  int hi_values[4];
2116  int retVal = OpenclDevice::ThresholdRectToPixOCL(
2117  input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix,
2118  input.height, input.width, top, left);
2119 
2120 # if ON_WINDOWS
2121  QueryPerformanceCounter(&time_funct_end);
2122  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2123 # elif ON_APPLE
2124  stop = mach_absolute_time();
2125  if (retVal == 0) {
2126  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2127  } else {
2128  time = FLT_MAX;
2129  }
2130 
2131 # else
2132  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2133  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2134  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2135 # endif
2136  } else {
2137  tesseract::ImageThresholder thresholder;
2138  thresholder.SetImage(input.pix);
2139 # if ON_WINDOWS
2140  QueryPerformanceCounter(&time_funct_start);
2141 # elif ON_APPLE
2142  start = mach_absolute_time();
2143 # else
2144  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2145 # endif
2146  int hi_values[4] = {};
2147  ThresholdRectToPix_Native(input.imageData, input.numChannels, bytes_per_line, thresholds,
2148  hi_values, &input.pix);
2149 
2150 # if ON_WINDOWS
2151  QueryPerformanceCounter(&time_funct_end);
2152  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2153 # elif ON_APPLE
2154  stop = mach_absolute_time();
2155  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2156 # else
2157  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2158  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2159  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2160 # endif
2161  }
2162 
2163  return time;
2164 }
2165 
2166 static double getLineMasksMorphMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
2167  ds_device_type type) {
2168  double time = 0;
2169 # if ON_WINDOWS
2170  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2171  QueryPerformanceFrequency(&freq);
2172 # elif ON_APPLE
2173  mach_timebase_info_data_t info = {0, 0};
2174  mach_timebase_info(&info);
2175  long long start, stop;
2176 # else
2177  timespec time_funct_start, time_funct_end;
2178 # endif
2179 
2180  // input data
2181  int resolution = 300;
2182  int wpl = pixGetWpl(input.pix);
2183  int kThinLineFraction = 20; // tess constant
2184  int kMinLineLengthFraction = 4; // tess constant
2185  int max_line_width = resolution / kThinLineFraction;
2186  int min_line_length = resolution / kMinLineLengthFraction;
2187  int closing_brick = max_line_width / 3;
2188 
2189  // function call
2190  if (type == DS_DEVICE_OPENCL_DEVICE) {
2191 # if ON_WINDOWS
2192  QueryPerformanceCounter(&time_funct_start);
2193 # elif ON_APPLE
2194  start = mach_absolute_time();
2195 # else
2196  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2197 # endif
2198  OpenclDevice::gpuEnv = *env;
2199  OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2200  Image pix_vline = nullptr, pix_hline = nullptr, pix_closed = nullptr;
2201  OpenclDevice::pixGetLinesCL(nullptr, input.pix, &pix_vline, &pix_hline, &pix_closed, true,
2202  closing_brick, closing_brick, max_line_width, max_line_width,
2203  min_line_length, min_line_length);
2204 
2205  OpenclDevice::releaseMorphCLBuffers();
2206 
2207 # if ON_WINDOWS
2208  QueryPerformanceCounter(&time_funct_end);
2209  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2210 # elif ON_APPLE
2211  stop = mach_absolute_time();
2212  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2213 # else
2214  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2215  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2216  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2217 # endif
2218  } else {
2219 # if ON_WINDOWS
2220  QueryPerformanceCounter(&time_funct_start);
2221 # elif ON_APPLE
2222  start = mach_absolute_time();
2223 # else
2224  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2225 # endif
2226 
2227  // native serial code
2228  Image src_pix = input.pix;
2229  Image pix_closed = pixCloseBrick(nullptr, src_pix, closing_brick, closing_brick);
2230  Image pix_solid = pixOpenBrick(nullptr, pix_closed, max_line_width, max_line_width);
2231  Image pix_hollow = pixSubtract(nullptr, pix_closed, pix_solid);
2232  pix_solid.destroy();
2233  Image pix_vline = pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
2234  Image pix_hline = pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
2235  pix_hline.destroy();
2236  pix_vline.destroy();
2237  pix_hollow.destroy();
2238 
2239 # if ON_WINDOWS
2240  QueryPerformanceCounter(&time_funct_end);
2241  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2242 # elif ON_APPLE
2243  stop = mach_absolute_time();
2244  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2245 # else
2246  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2247  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2248  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2249 # endif
2250  }
2251 
2252  return time;
2253 }
2254 
2255 /******************************************************************************
2256  * Device Selection
2257  *****************************************************************************/
2258 
2259 // encode score object as byte string
2260 static ds_status serializeScore(ds_device *device, uint8_t **serializedScore,
2261  unsigned int *serializedScoreSize) {
2262  *serializedScoreSize = sizeof(TessDeviceScore);
2263  *serializedScore = new uint8_t[*serializedScoreSize];
2264  memcpy(*serializedScore, device->score, *serializedScoreSize);
2265  return DS_SUCCESS;
2266 }
2267 
2268 // parses byte string and stores in score object
2269 static ds_status deserializeScore(ds_device *device, const uint8_t *serializedScore,
2270  unsigned int serializedScoreSize) {
2271  // check that serializedScoreSize == sizeof(TessDeviceScore);
2272  device->score = new TessDeviceScore;
2273  memcpy(device->score, serializedScore, serializedScoreSize);
2274  return DS_SUCCESS;
2275 }
2276 
2277 static ds_status releaseScore(TessDeviceScore *score) {
2278  delete score;
2279  return DS_SUCCESS;
2280 }
2281 
2282 // evaluate devices
2283 static ds_status evaluateScoreForDevice(ds_device *device, void *inputData) {
2284  // overwrite statuc gpuEnv w/ current device
2285  // so native opencl calls can be used; they use static gpuEnv
2286  tprintf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName,
2287  device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2288  GPUEnv *env = nullptr;
2289  if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2290  env = &OpenclDevice::gpuEnv;
2291  memset(env, 0, sizeof(*env));
2292  // tprintf("[DS] populating tmp GPUEnv from device\n");
2293  populateGPUEnvFromDevice(env, device->oclDeviceID);
2294  env->mnFileCount = 0; // argc;
2295  env->mnKernelCount = 0UL;
2296  // tprintf("[DS] compiling kernels for tmp GPUEnv\n");
2297  OpenclDevice::CompileKernelFile(env, "");
2298  }
2299 
2300  TessScoreEvaluationInputData *input = static_cast<TessScoreEvaluationInputData *>(inputData);
2301 
2302  // pixReadTiff
2303  double composeRGBPixelTime = composeRGBPixelMicroBench(env, *input, device->type);
2304 
2305  // HistogramRect
2306  double histogramRectTime = histogramRectMicroBench(env, *input, device->type);
2307 
2308  // ThresholdRectToPix
2309  double thresholdRectToPixTime = thresholdRectToPixMicroBench(env, *input, device->type);
2310 
2311  // getLineMasks
2312  double getLineMasksMorphTime = getLineMasksMorphMicroBench(env, *input, device->type);
2313 
2314  // weigh times (% of cpu time)
2315  // these weights should be the % execution time that the native cpu code took
2316  float composeRGBPixelWeight = 1.2f;
2317  float histogramRectWeight = 2.4f;
2318  float thresholdRectToPixWeight = 4.5f;
2319  float getLineMasksMorphWeight = 5.0f;
2320 
2321  float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2322  histogramRectWeight * histogramRectTime +
2323  thresholdRectToPixWeight * thresholdRectToPixTime +
2324  getLineMasksMorphWeight * getLineMasksMorphTime;
2325  device->score = new TessDeviceScore;
2326  device->score->time = weightedTime;
2327 
2328  tprintf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName,
2329  device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2330  tprintf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight);
2331  tprintf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight);
2332  tprintf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime,
2333  thresholdRectToPixWeight);
2334  tprintf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime,
2335  getLineMasksMorphWeight);
2336  tprintf("[DS]%25s: %f\n", "Score", device->score->time);
2337  return DS_SUCCESS;
2338 }
2339 
2340 // initial call to select device
2341 ds_device OpenclDevice::getDeviceSelection() {
2342  if (!deviceIsSelected) {
2343  // check if opencl is available at runtime
2344  if (1 == LoadOpencl()) {
2345  // opencl is available
2346  // setup devices
2347  ds_status status;
2348  ds_profile *profile;
2349  status = initDSProfile(&profile, "v0.1");
2350  // try reading scores from file
2351  const char *fileName = "tesseract_opencl_profile_devices.dat";
2352  status = readProfileFromFile(profile, deserializeScore, fileName);
2353  if (status != DS_SUCCESS) {
2354  // need to run evaluation
2355  tprintf("[DS] Profile file not available (%s); performing profiling.\n", fileName);
2356 
2357  // create input data
2358  TessScoreEvaluationInputData input;
2359  populateTessScoreEvaluationInputData(&input);
2360  // perform evaluations
2361  unsigned int numUpdates;
2362  status =
2363  profileDevices(profile, DS_EVALUATE_ALL, evaluateScoreForDevice, &input, &numUpdates);
2364  // write scores to file
2365  if (status == DS_SUCCESS) {
2366  status = writeProfileToFile(profile, serializeScore, fileName);
2367  if (status == DS_SUCCESS) {
2368  tprintf("[DS] Scores written to file (%s).\n", fileName);
2369  } else {
2370  tprintf(
2371  "[DS] Error saving scores to file (%s); scores not written to "
2372  "file.\n",
2373  fileName);
2374  }
2375  } else {
2376  tprintf(
2377  "[DS] Unable to evaluate performance; scores not written to "
2378  "file.\n");
2379  }
2380  } else {
2381  tprintf("[DS] Profile read from file (%s).\n", fileName);
2382  }
2383 
2384  // we now have device scores either from file or evaluation
2385  // select fastest using custom Tesseract selection algorithm
2386  float bestTime = FLT_MAX; // begin search with worst possible time
2387  int bestDeviceIdx = -1;
2388  for (unsigned d = 0; d < profile->numDevices; d++) {
2389  ds_device device = profile->devices[d];
2390  if (device.score == nullptr)
2391  continue;
2392  TessDeviceScore score = *device.score;
2393 
2394  float time = score.time;
2395  tprintf("[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type, device.oclDeviceName,
2396  time);
2397  if (time < bestTime) {
2398  bestTime = time;
2399  bestDeviceIdx = d;
2400  }
2401  }
2402  if (bestDeviceIdx >= 0) {
2403  tprintf(
2404  "[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2405  profile->devices[bestDeviceIdx].oclDeviceName,
2406  profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2407  }
2408  // cleanup
2409  // TODO: call destructor for profile object?
2410 
2411  bool overridden = false;
2412  char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
2413  if (overrideDeviceStr != nullptr) {
2414  int overrideDeviceIdx = atoi(overrideDeviceStr);
2415  if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2416  tprintf(
2417  "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, "
2418  "%i)\n",
2419  overrideDeviceStr, overrideDeviceIdx);
2420  bestDeviceIdx = overrideDeviceIdx - 1;
2421  overridden = true;
2422  } else {
2423  tprintf(
2424  "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are "
2425  "valid devices).\n",
2426  overrideDeviceStr, profile->numDevices);
2427  }
2428  }
2429 
2430  if (overridden) {
2431  tprintf(
2432  "[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2433  profile->devices[bestDeviceIdx].oclDeviceName,
2434  profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2435  }
2436  selectedDevice = profile->devices[bestDeviceIdx];
2437  // cleanup
2438  releaseDSProfile(profile, releaseScore);
2439  } else {
2440  // opencl isn't available at runtime, select native cpu device
2441  tprintf("[DS] OpenCL runtime not available.\n");
2442  selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2443  selectedDevice.oclDeviceName = "(null)";
2444  selectedDevice.score = nullptr;
2445  selectedDevice.oclDeviceID = nullptr;
2446  selectedDevice.oclDriverVersion = nullptr;
2447  }
2448  deviceIsSelected = true;
2449  }
2450  return selectedDevice;
2451 }
2452 
2453 bool OpenclDevice::selectedDeviceIsOpenCL() {
2454  ds_device device = getDeviceSelection();
2455  return (device.type == DS_DEVICE_OPENCL_DEVICE);
2456 }
2457 
2458 } // namespace
2459 
2460 #endif
#define ASSERT_HOST(x)
Definition: errcode.h:59
const int kHistogramSize
Definition: otsuthr.h:30
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void HistogramRect(Image src_pix, int channel, int left, int top, int width, int height, int *histogram)
Definition: otsuthr.cpp:146
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
Definition: linefind.cpp:41
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
Definition: linefind.cpp:39
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:70