lhez committed
Commit ae0c7b8 · 1 Parent(s): 1a9d2d3

opencl: ref count `ggml_backend_opencl_context` and refactor profiling (llama/14254)

* Move profiling info into `ggml_backend_opencl_context`
* Add `enqueue_ndrange_kernel` to launch kernel

Files changed (1)
  1. ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
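
The added hunks for the individual op functions are collapsed in this view, but the removed per-op `#ifdef GGML_OPENCL_PROFILING` blocks below, together with the `populateProfilingInfo` signature added by this commit, suggest the shape of the new `enqueue_ndrange_kernel` helper. The following is only a sketch of that pattern, not the committed code; the exact signature and its use of the context's `queue` member are assumptions based on the surrounding diff.

```cpp
// Sketch only: a ggml_backend_opencl_context member that centralizes the
// enqueue + optional-profiling pattern previously repeated in every op.
// The signature is assumed, not taken from the commit.
void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim,
                            size_t * global_work_size,
                            size_t * local_work_size,
                            const ggml_tensor * tensor) {
#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL,
                                    global_work_size, local_work_size, 0, NULL, &evt));

    profiling_info.emplace_back();
    populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim,
                          global_work_size, local_work_size, tensor);
#else
    GGML_UNUSED(tensor);
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL,
                                    global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
```

With such a helper, a call site like `ggml_cl_get_rows` would shrink from the removed `#ifdef` block to a single `backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);`-style call, which is consistent with the +240/-535 line count of this change.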
ggml/src/ggml-opencl/ggml-opencl.cpp CHANGED
@@ -231,6 +231,71 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
231
  return { type, major, minor, patch };
232
  }
233
234
  struct ggml_backend_opencl_context;
235
 
236
  // backend device context
@@ -254,6 +319,8 @@ struct ggml_backend_opencl_device_context {
254
 
255
  // backend context
256
  struct ggml_backend_opencl_context {
257
  cl_device_id device;
258
  std::string device_name;
259
 
@@ -369,6 +436,108 @@ struct ggml_backend_opencl_context {
369
  cl_kernel kernel_timestep_embedding;
370
  cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
371
372
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
373
  // Transpose kernels
374
  cl_program program_transpose;
@@ -395,46 +564,19 @@ struct ggml_backend_opencl_context {
395
  cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
396
  cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
397
  #endif // GGML_OPENCL_USE_ADRENO_KERNELS
398
- };
399
 
400
- // All registered devices with a default device in the front.
401
- static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
402
-
403
- // Profiling
404
  #ifdef GGML_OPENCL_PROFILING
405
- struct ProfilingInfo {
406
- std::string op_name;
407
- std::string kernel_name;
408
-
409
- cl_kernel kernel;
410
- cl_event evt;
411
-
412
- cl_ulong cmd_queued;
413
- cl_ulong cmd_submit;
414
- cl_ulong cmd_start;
415
- cl_ulong cmd_end;
416
- cl_ulong overhead_start;
417
- cl_ulong overhead_end;
418
- // For the times below, see spec for clGetEventProfilingInfo
419
- // The time kernel spent in cmd queue - SUBMIT - QUEUED
420
- cl_ulong cmd_queued_duration_ns;
421
- // The time kernel spent for submission - START - SUBMIT
422
- cl_ulong cmd_submit_duration_ns;
423
- // Kernel execution time in nanoseconds - END - START
424
- cl_ulong cmd_duration_ns;
425
- // The time for the kernel to complete - COMPLETE - END
426
- cl_ulong cmd_complete_duration_ns;
427
- // Total time to finish the kernel - COMPELTE - QUEUED
428
- cl_ulong cmd_total_duration_ns;
429
- // Global and local work sizes.
430
- size_t global_size[3];
431
- size_t local_size[3];
432
- // Op output size.
433
- size_t output_size[4];
434
  };
435
 
436
- std::vector<ProfilingInfo> g_profiling_info;
437
- #endif
438
 
439
  inline std::string read_file(const std::string &path) {
440
  std::ifstream ifs(path);
@@ -1669,6 +1811,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
1669
  backend_ctx->device = dev_ctx->device;
1670
  backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
1671
 
1672
  if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
1673
  strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
1674
  strstr(dev_ctx->device_version.c_str(), "Adreno")) {
@@ -1841,93 +1989,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
1841
  return dev_ctx->backend_ctx;
1842
  }
1843
 
1844
- static void ggml_cl2_free(void) {
1845
- #ifdef GGML_OPENCL_PROFILING
1846
- FILE * fperf = fopen("cl_profiling.csv", "w");
1847
- if (!fperf) {
1848
- GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
1849
- return;
1850
- }
1851
 
1852
- // Populate profiling info
1853
- for (ProfilingInfo & info : g_profiling_info) {
1854
- cl_ulong cmd_queued;
1855
- cl_ulong cmd_submit;
1856
- cl_ulong cmd_start;
1857
- cl_ulong cmd_end;
1858
- cl_ulong cmd_complete;
1859
-
1860
- CL_CHECK(clWaitForEvents(1, &info.evt));
1861
- CL_CHECK(clGetEventProfilingInfo(
1862
- info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
1863
- CL_CHECK(clGetEventProfilingInfo(
1864
- info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
1865
- CL_CHECK(clGetEventProfilingInfo(
1866
- info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
1867
- CL_CHECK(clGetEventProfilingInfo(
1868
- info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
1869
- CL_CHECK(clGetEventProfilingInfo(
1870
- info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
1871
- CL_CHECK(clReleaseEvent(info.evt));
1872
-
1873
- char kernel_name[512];
1874
- CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
1875
- sizeof(kernel_name), kernel_name, NULL));
1876
- info.kernel_name = kernel_name;
1877
-
1878
- info.cmd_queued = cmd_queued;
1879
- info.cmd_submit = cmd_submit;
1880
- info.cmd_start = cmd_start;
1881
- info.cmd_end = cmd_end;
1882
-
1883
- info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
1884
- info.cmd_submit_duration_ns = cmd_start - cmd_submit;
1885
- info.cmd_duration_ns = cmd_end - cmd_start;
1886
- info.cmd_complete_duration_ns = cmd_complete - cmd_end;
1887
- info.cmd_total_duration_ns = cmd_complete - cmd_queued;
1888
- }
1889
-
1890
- // Dump a csv
1891
- float total_kernel_time = 0;
1892
- fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
1893
- for (const ProfilingInfo & info : g_profiling_info) {
1894
- total_kernel_time += info.cmd_duration_ns/1.e6f;
1895
- fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
1896
- info.op_name.c_str(), info.kernel_name.c_str(),
1897
- info.cmd_queued_duration_ns/1.e6f,
1898
- info.cmd_submit_duration_ns/1.e6f,
1899
- info.cmd_duration_ns/1.e6f,
1900
- info.cmd_complete_duration_ns/1.e6f,
1901
- info.cmd_total_duration_ns/1.e6f,
1902
- info.global_size[0], info.global_size[1], info.global_size[2],
1903
- info.local_size[0], info.local_size[1], info.local_size[2],
1904
- info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
1905
- }
1906
- fclose(fperf);
1907
-
1908
- GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
1909
-
1910
- // Dump a simple chrome trace
1911
- FILE* ftrace = fopen("cl_trace.json", "w");
1912
- if (!ftrace) {
1913
- GGML_LOG_ERROR("Failed to open cl_trace.json\n");
1914
- return;
1915
  }
1916
 
1917
- fprintf(ftrace, "[\n");
1918
- for (const ProfilingInfo & info : g_profiling_info) {
1919
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
1920
- info.kernel_name.c_str(), info.cmd_queued/1000);
1921
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
1922
- info.kernel_name.c_str(), info.cmd_submit/1000);
1923
-
1924
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
1925
- info.kernel_name.c_str(), info.cmd_start/1000);
1926
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
1927
- info.kernel_name.c_str(), info.cmd_end/1000);
1928
  }
1929
- fclose(ftrace);
1930
- #endif
1931
  }
1932
 
1933
  //------------------------------------------------------------------------------
@@ -2011,9 +2088,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
2011
  }
2012
 
2013
  static void ggml_backend_opencl_free(ggml_backend_t backend) {
2014
- ggml_cl2_free();
2015
-
2016
- GGML_UNUSED(backend);
2017
  }
2018
 
2019
  static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
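
The hunks above drop the global `ggml_cl2_free(void)` and empty the old body of `ggml_backend_opencl_free`; together with the new `ref_count` field and the `write_profiling_info()` member shown later on this page, the commit title implies a release path roughly like the sketch below. This is an assumption about the pattern, not the committed code: the real function name, its exact cleanup steps, and where the count is incremented are not visible in this view.

```cpp
// Sketch only: reference-counted teardown implied by the commit title.
// ggml_cl2_init() presumably bumps ref_count whenever a backend reuses the
// cached context; the backend free then releases it when the count hits zero.
static void ggml_cl2_free(ggml_backend_t backend) {
    ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;

    if (--ctx->ref_count > 0) {
        return; // another ggml_backend still holds this context
    }
#ifdef GGML_OPENCL_PROFILING
    ctx->write_profiling_info(); // dump cl_profiling.csv / cl_trace.json once
#endif
    // ... release kernels, programs and the command queue here (omitted) ...
}
```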
@@ -2899,6 +2974,8 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
2899
 
2900
  static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
2901
  ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
2902
 
2903
  ggml_backend_t backend = new ggml_backend {
2904
  /* .guid = */ ggml_backend_opencl_guid(),
@@ -3159,31 +3236,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
3159
  #define dump_tensor(tensor)
3160
  #endif
3161
 
3162
- //------------------------------------------------------------------------------
3163
- // Profiling utility
3164
- //------------------------------------------------------------------------------
3165
- #ifdef GGML_OPENCL_PROFILING
3166
- static void populateProfilingInfo(
3167
- ProfilingInfo& info, cl_event evt, cl_kernel kernel,
3168
- size_t global_size[3], size_t local_size[3],
3169
- const ggml_tensor * tensor) {
3170
- info.op_name = tensor->name;
3171
- info.kernel = kernel;
3172
- info.evt = evt;
3173
-
3174
- info.local_size[0] = local_size[0];
3175
- info.local_size[1] = local_size[1];
3176
- info.local_size[2] = local_size[2];
3177
- info.global_size[0] = global_size[0];
3178
- info.global_size[1] = global_size[1];
3179
- info.global_size[2] = global_size[2];
3180
- info.output_size[0] = tensor->ne[0];
3181
- info.output_size[1] = tensor->ne[1];
3182
- info.output_size[2] = tensor->ne[2];
3183
- info.output_size[3] = tensor->ne[3];
3184
- }
3185
- #endif
3186
-
3187
  //------------------------------------------------------------------------------
3188
  // Ops
3189
  //------------------------------------------------------------------------------
@@ -3227,7 +3279,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
3227
  const cl_ulong nb2 = dst ? dst->nb[2] : 0;
3228
 
3229
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3230
- cl_command_queue queue = backend_ctx->queue;
3231
 
3232
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3233
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3271,15 +3322,7 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
3271
  size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
3272
  size_t local_work_size[] = {1, 1, 1};
3273
 
3274
- #ifdef GGML_OPENCL_PROFILING
3275
- cl_event evt;
3276
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3277
-
3278
- g_profiling_info.emplace_back();
3279
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3280
- #else
3281
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3282
- #endif
3283
  }
3284
 
3285
  static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3321,7 +3364,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
3321
  const cl_ulong nb3 = dst ? dst->nb[3] : 0;
3322
 
3323
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3324
- cl_command_queue queue = backend_ctx->queue;
3325
 
3326
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3327
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3396,29 +3438,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
3396
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3397
  }
3398
 
3399
- #ifdef GGML_OPENCL_PROFILING
3400
- cl_event evt;
3401
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3402
-
3403
- g_profiling_info.emplace_back();
3404
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3405
- #else
3406
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3407
- #endif
3408
  } else {
3409
  unsigned int nth = MIN(64, ne0);
3410
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3411
  size_t local_work_size[] = {nth, 1, 1};
3412
 
3413
- #ifdef GGML_OPENCL_PROFILING
3414
- cl_event evt;
3415
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3416
-
3417
- g_profiling_info.emplace_back();
3418
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3419
- #else
3420
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3421
- #endif
3422
  }
3423
  }
3424
 
@@ -3461,7 +3487,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3461
  const cl_ulong nb3 = dst ? dst->nb[3] : 0;
3462
 
3463
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3464
- cl_command_queue queue = backend_ctx->queue;
3465
 
3466
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3467
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3536,29 +3561,13 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3536
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3537
  }
3538
 
3539
- #ifdef GGML_OPENCL_PROFILING
3540
- cl_event evt;
3541
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3542
-
3543
- g_profiling_info.emplace_back();
3544
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3545
- #else
3546
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3547
- #endif
3548
  } else {
3549
  unsigned int nth = MIN(64, ne0);
3550
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3551
  size_t local_work_size[] = {nth, 1, 1};
3552
 
3553
- #ifdef GGML_OPENCL_PROFILING
3554
- cl_event evt;
3555
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3556
-
3557
- g_profiling_info.emplace_back();
3558
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3559
- #else
3560
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3561
- #endif
3562
  }
3563
  }
3564
 
@@ -3598,7 +3607,6 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
3598
  const cl_ulong nb3 = dst->nb[3];
3599
 
3600
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3601
- cl_command_queue queue = backend_ctx->queue;
3602
 
3603
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3604
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3661,29 +3669,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
3661
  size_t global_work_size[] = {(size_t)n, 1, 1};
3662
  size_t local_work_size[] = {64, 1, 1};
3663
 
3664
- #ifdef GGML_OPENCL_PROFILING
3665
- cl_event evt;
3666
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3667
-
3668
- g_profiling_info.emplace_back();
3669
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3670
- #else
3671
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3672
- #endif
3673
  } else {
3674
  unsigned int nth = MIN(64, ne0);
3675
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3676
  size_t local_work_size[] = {nth, 1, 1};
3677
 
3678
- #ifdef GGML_OPENCL_PROFILING
3679
- cl_event evt;
3680
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3681
-
3682
- g_profiling_info.emplace_back();
3683
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3684
- #else
3685
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3686
- #endif
3687
  }
3688
  }
3689
 
@@ -3723,7 +3715,6 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
3723
  const cl_ulong nb3 = dst->nb[3];
3724
 
3725
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3726
- cl_command_queue queue = backend_ctx->queue;
3727
 
3728
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3729
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3786,29 +3777,13 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
3786
  size_t global_work_size[] = {(size_t)n, 1, 1};
3787
  size_t local_work_size[] = {64, 1, 1};
3788
 
3789
- #ifdef GGML_OPENCL_PROFILING
3790
- cl_event evt;
3791
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3792
-
3793
- g_profiling_info.emplace_back();
3794
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3795
- #else
3796
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3797
- #endif
3798
  } else {
3799
  unsigned int nth = MIN(64, ne0);
3800
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3801
  size_t local_work_size[] = {nth, 1, 1};
3802
 
3803
- #ifdef GGML_OPENCL_PROFILING
3804
- cl_event evt;
3805
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3806
-
3807
- g_profiling_info.emplace_back();
3808
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3809
- #else
3810
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3811
- #endif
3812
  }
3813
  }
3814
 
@@ -3821,7 +3796,6 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
3821
  UNUSED(src1);
3822
 
3823
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3824
- cl_command_queue queue = backend_ctx->queue;
3825
 
3826
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3827
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3848,15 +3822,7 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
3848
  size_t global_work_size[] = {(size_t)n, 1, 1};
3849
  size_t local_work_size[] = {64, 1, 1};
3850
 
3851
- #ifdef GGML_OPENCL_PROFILING
3852
- cl_event evt;
3853
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
3854
-
3855
- g_profiling_info.emplace_back();
3856
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3857
- #else
3858
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
3859
- #endif
3860
  }
3861
 
3862
  static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3868,7 +3834,6 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
3868
  UNUSED(src1);
3869
 
3870
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3871
- cl_command_queue queue = backend_ctx->queue;
3872
 
3873
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3874
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3895,15 +3860,7 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
3895
  size_t global_work_size[] = {(size_t)n, 1, 1};
3896
  size_t local_work_size[] = {64, 1, 1};
3897
 
3898
- #ifdef GGML_OPENCL_PROFILING
3899
- cl_event evt;
3900
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
3901
-
3902
- g_profiling_info.emplace_back();
3903
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3904
- #else
3905
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
3906
- #endif
3907
  }
3908
 
3909
  static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3915,7 +3872,6 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
3915
  UNUSED(src1);
3916
 
3917
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3918
- cl_command_queue queue = backend_ctx->queue;
3919
 
3920
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3921
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3947,15 +3903,7 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
3947
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3948
  }
3949
 
3950
- #ifdef GGML_OPENCL_PROFILING
3951
- cl_event evt;
3952
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3953
-
3954
- g_profiling_info.emplace_back();
3955
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3956
- #else
3957
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3958
- #endif
3959
  }
3960
 
3961
  static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3967,7 +3915,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
3967
  UNUSED(src1);
3968
 
3969
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3970
- cl_command_queue queue = backend_ctx->queue;
3971
 
3972
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3973
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3992,15 +3939,7 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
3992
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3993
  }
3994
 
3995
- #ifdef GGML_OPENCL_PROFILING
3996
- cl_event evt;
3997
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3998
-
3999
- g_profiling_info.emplace_back();
4000
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
4001
- #else
4002
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4003
- #endif
4004
  }
4005
 
4006
  static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4012,7 +3951,6 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
4012
  UNUSED(src1);
4013
 
4014
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4015
- cl_command_queue queue = backend_ctx->queue;
4016
 
4017
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4018
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4044,15 +3982,7 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
4044
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4045
  }
4046
 
4047
- #ifdef GGML_OPENCL_PROFILING
4048
- cl_event evt;
4049
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4050
-
4051
- g_profiling_info.emplace_back();
4052
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
4053
- #else
4054
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4055
- #endif
4056
  }
4057
 
4058
  static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4064,7 +3994,6 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
4064
  UNUSED(src1);
4065
 
4066
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4067
- cl_command_queue queue = backend_ctx->queue;
4068
 
4069
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4070
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4096,15 +4025,7 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
4096
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4097
  }
4098
 
4099
- #ifdef GGML_OPENCL_PROFILING
4100
- cl_event evt;
4101
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4102
-
4103
- g_profiling_info.emplace_back();
4104
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
4105
- #else
4106
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4107
- #endif
4108
  }
4109
 
4110
  static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4116,7 +4037,6 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
4116
  UNUSED(src1);
4117
 
4118
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4119
- cl_command_queue queue = backend_ctx->queue;
4120
 
4121
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4122
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4157,15 +4077,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
4157
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
4158
  size_t local_work_size[] = {(size_t)nth, 1, 1};
4159
 
4160
- #ifdef GGML_OPENCL_PROFILING
4161
- cl_event evt;
4162
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4163
-
4164
- g_profiling_info.emplace_back();
4165
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4166
- #else
4167
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4168
- #endif
4169
  }
4170
 
4171
  static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4177,7 +4089,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
4177
  UNUSED(src1);
4178
 
4179
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4180
- cl_command_queue queue = backend_ctx->queue;
4181
 
4182
  //ggml_backend_opencl_device_context * dev_ctx =
4183
  // (ggml_backend_opencl_device_context *)backend->device->context;
@@ -4241,15 +4152,7 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
4241
  // This is local memory - the size depends on subgroup size.
4242
  CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
4243
 
4244
- #ifdef GGML_OPENCL_PROFILING
4245
- cl_event evt;
4246
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4247
-
4248
- g_profiling_info.emplace_back();
4249
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4250
- #else
4251
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4252
- #endif
4253
  }
4254
 
4255
  static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4261,7 +4164,6 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
4261
  UNUSED(src1);
4262
 
4263
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4264
- cl_command_queue queue = backend_ctx->queue;
4265
 
4266
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4267
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4300,15 +4202,7 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
4300
  size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
4301
  size_t local_work_size[] = {(size_t)sgs, 1, 1};
4302
 
4303
- #ifdef GGML_OPENCL_PROFILING
4304
- cl_event evt;
4305
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4306
-
4307
- g_profiling_info.emplace_back();
4308
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4309
- #else
4310
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4311
- #endif
4312
  }
4313
 
4314
  static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4320,7 +4214,6 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
4320
  UNUSED(src1);
4321
 
4322
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4323
- cl_command_queue queue = backend_ctx->queue;
4324
 
4325
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4326
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4397,16 +4290,7 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
4397
  }
4398
  if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
4399
 
4400
-
4401
- #ifdef GGML_OPENCL_PROFILING
4402
- cl_event evt;
4403
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4404
-
4405
- g_profiling_info.emplace_back();
4406
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
4407
- #else
4408
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4409
- #endif
4410
  }
4411
 
4412
  static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
@@ -4419,7 +4303,6 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con
4419
  UNUSED(src1_shape_def);
4420
 
4421
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4422
- cl_command_queue queue = backend_ctx->queue;
4423
 
4424
  if (backend_ctx->kernel_repeat == nullptr) {
4425
  GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
@@ -4467,15 +4350,7 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con
4467
 
4468
  size_t global_work_size[] = { gws0, gws1, gws2 };
4469
 
4470
- #ifdef GGML_OPENCL_PROFILING
4471
- cl_event evt;
4472
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, &evt));
4473
-
4474
- g_profiling_info.emplace_back();
4475
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, (size_t[3]){0,0,0}, dst);
4476
- #else
4477
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
4478
- #endif
4479
  }
4480
 
4481
  static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -4488,7 +4363,6 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
4488
  GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
4489
 
4490
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4491
- cl_command_queue queue = backend_ctx->queue;
4492
 
4493
  if (backend_ctx->kernel_pad == nullptr) {
4494
  GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
@@ -4533,15 +4407,7 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
4533
  local_work_size_ptr = nullptr;
4534
  }
4535
 
4536
- #ifdef GGML_OPENCL_PROFILING
4537
- cl_event evt;
4538
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4539
-
4540
- g_profiling_info.emplace_back();
4541
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
4542
- #else
4543
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4544
- #endif
4545
  }
4546
 
4547
  static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -4553,7 +4419,6 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
4553
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
4554
 
4555
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4556
- cl_command_queue queue = backend_ctx->queue;
4557
 
4558
  const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
4559
  cl_kernel kernel = nullptr;
@@ -4644,17 +4509,7 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
4644
  local_work_size_ptr = nullptr;
4645
  }
4646
 
4647
- #ifdef GGML_OPENCL_PROFILING
4648
- cl_event evt;
4649
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4650
-
4651
- g_profiling_info.emplace_back();
4652
- size_t profiling_gws[3] = {global_work_size[0], 1, 1};
4653
- size_t profiling_lws[3] = {local_work_size_ptr ? local_work_size[0] : 0, 1, 1};
4654
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
4655
- #else
4656
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4657
- #endif
4658
  }
4659
 
4660
  static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4732,7 +4587,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
4732
  global_work_size[1] = d_ne1;
4733
  global_work_size[2] = d_ne2;
4734
 
4735
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
4736
  }
4737
  }
4738
  } else {
@@ -4782,7 +4637,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
4782
  d_ne2 > 0 ? (size_t)d_ne2 : 1,
4783
  d_ne3 > 0 ? (size_t)d_ne3 : 1 };
4784
 
4785
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size_nc, NULL, 0, NULL, NULL));
4786
  }
4787
  }
4788
 
@@ -4795,7 +4650,6 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
4795
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
4796
 
4797
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4798
- cl_command_queue queue = backend_ctx->queue;
4799
 
4800
  if (backend_ctx->kernel_timestep_embedding == nullptr) {
4801
  GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
@@ -4828,17 +4682,7 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
4828
 
4829
  size_t global_work_size[] = {gws0, gws1, 1};
4830
 
4831
- #ifdef GGML_OPENCL_PROFILING
4832
- cl_event evt;
4833
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // Pass 2 for 2D problem
4834
-
4835
- g_profiling_info.emplace_back();
4836
- size_t profiling_gws[3] = {global_work_size[0], global_work_size[1], 1};
4837
- size_t profiling_lws[3] = {0,0,0}; // Reflects NULL LWS
4838
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
4839
- #else
4840
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL)); // Pass 2 for 2D problem
4841
- #endif
4842
  }
4843
 
4844
  static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4853,7 +4697,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
4853
  const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
4854
 
4855
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4856
- cl_command_queue queue = backend_ctx->queue;
4857
 
4858
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4859
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -5058,15 +4901,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5058
  static_cast<size_t>(padded_height_B)
5059
  };
5060
 
5061
- #ifdef GGML_OPENCL_PROFILING
5062
- cl_event evt;
5063
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt));
5064
-
5065
- g_profiling_info.emplace_back();
5066
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst);
5067
- #else
5068
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL));
5069
- #endif
5070
  } else {
5071
  // no need to transpose B in other cases
5072
  // create an image for B from sub_buffer
@@ -5188,16 +5023,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5188
 
5189
  // enqueue kernel with profiling
5190
  // <--------------------------------------------> //
5191
- #ifdef GGML_OPENCL_PROFILING
5192
- cl_event evt;
5193
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5194
-
5195
- g_profiling_info.emplace_back();
5196
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5197
- // enqueue kernel without profiling
5198
- #else
5199
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5200
- #endif
5201
  // <--------------------------------------------> //
5202
 
5203
  // deallocate sub buffers and images
@@ -5277,15 +5103,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5277
  global_work_size[2] = (size_t)ne12*ne13;
5278
  }
5279
 
5280
- #ifdef GGML_OPENCL_PROFILING
5281
- cl_event evt;
5282
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5283
-
5284
- g_profiling_info.emplace_back();
5285
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5286
- #else
5287
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5288
- #endif
5289
  return;
5290
  }
5291
  #else // GGML_OPENCL_SOA_Q
@@ -5515,15 +5333,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5515
  size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
5516
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
5517
 
5518
- #ifdef GGML_OPENCL_PROFILING
5519
- cl_event evt;
5520
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5521
-
5522
- g_profiling_info.emplace_back();
5523
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5524
- #else
5525
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5526
- #endif
5527
  } else if (src0t == GGML_TYPE_Q4_K) {
5528
  GGML_ASSERT(false && "not implemented");
5529
  } else if (src0t == GGML_TYPE_Q3_K) {
@@ -5534,30 +5344,14 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5534
  size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
5535
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
5536
 
5537
- #ifdef GGML_OPENCL_PROFILING
5538
- cl_event evt;
5539
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5540
-
5541
- g_profiling_info.emplace_back();
5542
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5543
- #else
5544
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5545
- #endif
5546
  } else {
5547
  int64_t ny = (ne11 + nrows - 1)/nrows;
5548
 
5549
  size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
5550
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
5551
 
5552
- #ifdef GGML_OPENCL_PROFILING
5553
- cl_event evt;
5554
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5555
-
5556
- g_profiling_info.emplace_back();
5557
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5558
- #else
5559
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5560
- #endif
5561
  }
5562
  }
5563
 
@@ -5574,7 +5368,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
5574
  GGML_ASSERT(src2->extra);
5575
 
5576
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5577
- cl_command_queue queue = backend_ctx->queue;
5578
 
5579
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5580
  ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
@@ -5680,15 +5473,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
5680
  size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
5681
  size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
5682
 
5683
- #ifdef GGML_OPENCL_PROFILING
5684
- cl_event evt;
5685
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5686
-
5687
- g_profiling_info.emplace_back();
5688
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5689
- #else
5690
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5691
- #endif
5692
  }
5693
 
5694
  static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5701,7 +5486,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
5701
  GGML_ASSERT(ggml_is_contiguous(src0));
5702
 
5703
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5704
- cl_command_queue queue = backend_ctx->queue;
5705
 
5706
  float scale;
5707
  memcpy(&scale, dst->op_params, sizeof(scale));
@@ -5730,15 +5514,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
5730
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
5731
  }
5732
 
5733
- #ifdef GGML_OPENCL_PROFILING
5734
- cl_event evt;
5735
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
5736
-
5737
- g_profiling_info.emplace_back();
5738
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
5739
- #else
5740
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
5741
- #endif
5742
  }
5743
 
5744
  static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5775,7 +5551,6 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
5775
  const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
5776
 
5777
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5778
- cl_command_queue queue = backend_ctx->queue;
5779
 
5780
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5781
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -5840,15 +5615,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
5840
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
5841
  size_t local_work_size[] = {(size_t)nth, 1, 1};
5842
 
5843
- #ifdef GGML_OPENCL_PROFILING
5844
- cl_event evt;
5845
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5846
-
5847
- g_profiling_info.emplace_back();
5848
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1);
5849
- #else
5850
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5851
- #endif
5852
  }
5853
 
5854
  static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5871,7 +5638,6 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
5871
  const int ne02 = src0 ? src0->ne[2] : 0;
5872
 
5873
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5874
- cl_command_queue queue = backend_ctx->queue;
5875
 
5876
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5877
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5895,15 +5661,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
5895
  size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
5896
  size_t local_work_size[] = {64, 1, 1};
5897
 
5898
- #ifdef GGML_OPENCL_PROFILING
5899
- cl_event evt;
5900
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5901
-
5902
- g_profiling_info.emplace_back();
5903
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5904
- #else
5905
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5906
- #endif
5907
  } else {
5908
  kernel = backend_ctx->kernel_diag_mask_inf;
5909
 
@@ -5923,15 +5681,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
5923
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
5924
  }
5925
 
5926
- #ifdef GGML_OPENCL_PROFILING
5927
- cl_event evt;
5928
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
5929
-
5930
- g_profiling_info.emplace_back();
5931
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
5932
- #else
5933
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
5934
- #endif
5935
  }
5936
  }
5937
 
@@ -5951,7 +5701,6 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
5951
  }
5952
 
5953
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5954
- cl_command_queue queue = backend_ctx->queue;
5955
 
5956
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5957
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -6031,15 +5780,7 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
6031
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
6032
  size_t local_work_size[] = {(size_t)nth, 1, 1};
6033
 
6034
- #ifdef GGML_OPENCL_PROFILING
6035
- cl_event evt;
6036
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6037
-
6038
- g_profiling_info.emplace_back();
6039
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6040
- #else
6041
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6042
- #endif
6043
  }
6044
 
6045
  static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6051,7 +5792,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
6051
  GGML_ASSERT(dst->extra);
6052
 
6053
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6054
- cl_command_queue queue = backend_ctx->queue;
6055
 
6056
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6057
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -6217,15 +5957,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
6217
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
6218
  size_t local_work_size[] = {(size_t)nth, 1, 1};
6219
 
6220
- #ifdef GGML_OPENCL_PROFILING
6221
- cl_event evt;
6222
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6223
-
6224
- g_profiling_info.emplace_back();
6225
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6226
- #else
6227
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6228
- #endif
6229
  }
6230
 
6231
  static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6240,7 +5972,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
6240
  GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
6241
 
6242
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6243
- cl_command_queue queue = backend_ctx->queue;
6244
 
6245
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
6246
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -6309,15 +6040,7 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
6309
  size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
6310
  size_t local_work_size[] = {256, 1, 1};
6311
 
6312
- #ifdef GGML_OPENCL_PROFILING
6313
- cl_event evt;
6314
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6315
-
6316
- g_profiling_info.emplace_back();
6317
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6318
- #else
6319
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6320
- #endif
6321
  }
6322
 
6323
  static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6332,7 +6055,6 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
6332
  GGML_ASSERT(ggml_is_contiguous(src0));
6333
 
6334
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6335
- cl_command_queue queue = backend_ctx->queue;
6336
 
6337
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6338
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -6364,15 +6086,7 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
6364
  size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
6365
  size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
6366
 
6367
- #ifdef GGML_OPENCL_PROFILING
6368
- cl_event evt;
6369
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6370
-
6371
- g_profiling_info.emplace_back();
6372
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6373
- #else
6374
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6375
- #endif
6376
  }
6377
 
6378
  static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6386,7 +6100,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
6386
  GGML_ASSERT(ggml_is_contiguous(src0));
6387
 
6388
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6389
- cl_command_queue queue = backend_ctx->queue;
6390
 
6391
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6392
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -6427,15 +6140,7 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
6427
  size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
6428
  size_t local_work_size[] = {(size_t)64, 1, 1};
6429
 
6430
- #ifdef GGML_OPENCL_PROFILING
6431
- cl_event evt;
6432
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6433
-
6434
- g_profiling_info.emplace_back();
6435
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6436
- #else
6437
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6438
- #endif
6439
  }
6440
 
6441
  //------------------------------------------------------------------------------
 
231
  return { type, major, minor, patch };
232
  }
233
 
234
+ // Profiling
235
+ struct ProfilingInfo {
236
+ std::string op_name;
237
+ std::string kernel_name;
238
+
239
+ cl_kernel kernel;
240
+ cl_event evt;
241
+
242
+ cl_ulong cmd_queued;
243
+ cl_ulong cmd_submit;
244
+ cl_ulong cmd_start;
245
+ cl_ulong cmd_end;
246
+ cl_ulong overhead_start;
247
+ cl_ulong overhead_end;
248
+ // For the times below, see spec for clGetEventProfilingInfo
249
+ // The time kernel spent in cmd queue - SUBMIT - QUEUED
250
+ cl_ulong cmd_queued_duration_ns;
251
+ // The time kernel spent for submission - START - SUBMIT
252
+ cl_ulong cmd_submit_duration_ns;
253
+ // Kernel execution time in nanoseconds - END - START
254
+ cl_ulong cmd_duration_ns;
255
+ // The time for the kernel to complete - COMPLETE - END
256
+ cl_ulong cmd_complete_duration_ns;
257
+ // Total time to finish the kernel - COMPELTE - QUEUED
258
+ cl_ulong cmd_total_duration_ns;
259
+ // Global and local work sizes.
260
+ size_t global_size[3];
261
+ size_t local_size[3];
262
+ // Op output size.
263
+ size_t output_size[4];
264
+ };
265
+
266
+ static void populateProfilingInfo(
267
+ ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
268
+ size_t global_size[3], size_t local_size[3],
269
+ const ggml_tensor * tensor) {
270
+ info.op_name = tensor->name;
271
+ info.kernel = kernel;
272
+ info.evt = evt;
273
+
274
+ // 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose
275
+ info.local_size[0] = 0;
276
+ info.local_size[1] = 0;
277
+ info.local_size[2] = 0;
278
+
279
+ info.global_size[0] = 0;
280
+ info.global_size[1] = 0;
281
+ info.global_size[2] = 0;
282
+
283
+ if (local_size) {
284
+ for (cl_uint i = 0; i < work_dim; ++i) {
285
+ info.local_size[i] = local_size[i];
286
+ }
287
+ }
288
+
289
+ for (cl_uint i = 0; i < work_dim; ++i) {
290
+ info.global_size[i] = global_size[i];
291
+ }
292
+
293
+ info.output_size[0] = tensor->ne[0];
294
+ info.output_size[1] = tensor->ne[1];
295
+ info.output_size[2] = tensor->ne[2];
296
+ info.output_size[3] = tensor->ne[3];
297
+ }
298
+
299
  struct ggml_backend_opencl_context;
300
 
301
  // backend device context
 
319
 
320
  // backend context
321
  struct ggml_backend_opencl_context {
322
+ int ref_count;
323
+
324
  cl_device_id device;
325
  std::string device_name;
326
 
 
436
  cl_kernel kernel_timestep_embedding;
437
  cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
438
 
439
+ std::vector<ProfilingInfo> profiling_info;
440
+
441
+ void write_profiling_info() {
442
+ FILE * fperf = fopen("cl_profiling.csv", "w");
443
+ if (!fperf) {
444
+ GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
445
+ return;
446
+ }
447
+
448
+ // Populate profiling info
449
+ for (ProfilingInfo & info : profiling_info) {
450
+ cl_ulong cmd_queued;
451
+ cl_ulong cmd_submit;
452
+ cl_ulong cmd_start;
453
+ cl_ulong cmd_end;
454
+ cl_ulong cmd_complete;
455
+
456
+ CL_CHECK(clWaitForEvents(1, &info.evt));
457
+ CL_CHECK(clGetEventProfilingInfo(
458
+ info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
459
+ CL_CHECK(clGetEventProfilingInfo(
460
+ info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
461
+ CL_CHECK(clGetEventProfilingInfo(
462
+ info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
463
+ CL_CHECK(clGetEventProfilingInfo(
464
+ info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
465
+ CL_CHECK(clGetEventProfilingInfo(
466
+ info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
467
+ CL_CHECK(clReleaseEvent(info.evt));
468
+
469
+ char kernel_name[512];
470
+ CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
471
+ sizeof(kernel_name), kernel_name, NULL));
472
+ info.kernel_name = kernel_name;
473
+
474
+ info.cmd_queued = cmd_queued;
475
+ info.cmd_submit = cmd_submit;
476
+ info.cmd_start = cmd_start;
477
+ info.cmd_end = cmd_end;
478
+
479
+ info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
480
+ info.cmd_submit_duration_ns = cmd_start - cmd_submit;
481
+ info.cmd_duration_ns = cmd_end - cmd_start;
482
+ info.cmd_complete_duration_ns = cmd_complete - cmd_end;
483
+ info.cmd_total_duration_ns = cmd_complete - cmd_queued;
484
+ }
485
+
486
+ // Dump a csv
487
+ float total_kernel_time = 0;
488
+ fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
489
+ for (const ProfilingInfo & info : profiling_info) {
490
+ total_kernel_time += info.cmd_duration_ns/1.e6f;
491
+ fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
492
+ info.op_name.c_str(), info.kernel_name.c_str(),
493
+ info.cmd_queued_duration_ns/1.e6f,
494
+ info.cmd_submit_duration_ns/1.e6f,
495
+ info.cmd_duration_ns/1.e6f,
496
+ info.cmd_complete_duration_ns/1.e6f,
497
+ info.cmd_total_duration_ns/1.e6f,
498
+ info.global_size[0], info.global_size[1], info.global_size[2],
499
+ info.local_size[0], info.local_size[1], info.local_size[2],
500
+ info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
501
+ }
502
+ fclose(fperf);
503
+
504
+ GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
505
+
506
+ // Dump a simple chrome trace
507
+ FILE* ftrace = fopen("cl_trace.json", "w");
508
+ if (!ftrace) {
509
+ GGML_LOG_ERROR("Failed to open cl_trace.json\n");
510
+ return;
511
+ }
512
+
513
+ fprintf(ftrace, "[\n");
514
+ for (const ProfilingInfo & info : profiling_info) {
515
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
516
+ info.kernel_name.c_str(), info.cmd_queued/1000);
517
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
518
+ info.kernel_name.c_str(), info.cmd_submit/1000);
519
+
520
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
521
+ info.kernel_name.c_str(), info.cmd_start/1000);
522
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
523
+ info.kernel_name.c_str(), info.cmd_end/1000);
524
+ }
525
+ fclose(ftrace);
526
+ }
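Editor's note: these timestamps are only valid when the command queue was created with profiling enabled, which the backend does under `GGML_OPENCL_PROFILING`. A minimal sketch of that prerequisite (not the backend's exact initialization code):

```cpp
// Create a profiling-enabled queue; without CL_QUEUE_PROFILING_ENABLE,
// clGetEventProfilingInfo returns CL_PROFILING_INFO_NOT_AVAILABLE.
cl_int err = CL_SUCCESS;
cl_queue_properties props[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 };
cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, props, &err);
```

The resulting `cl_profiling.csv` opens in any spreadsheet tool, and `cl_trace.json` follows the Chrome trace event format, so it should load in chrome://tracing or Perfetto.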
527
+
528
+ void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
529
+ #ifdef GGML_OPENCL_PROFILING
530
+ cl_event evt;
531
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));
532
+
533
+ profiling_info.emplace_back();
534
+ populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
535
+ #else
536
+ GGML_UNUSED(tensor);
537
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
538
+ #endif
539
+ }
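Editor's note: every launch site below now goes through this helper instead of the per-op `#ifdef GGML_OPENCL_PROFILING` blocks it replaces; `work_dim` also covers the 2-D launches in the Adreno matmul path. A representative call, assuming `kernel` and the tensor sizes are set up as in the ops further down:

```cpp
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
size_t local_work_size[]  = {(size_t)nth, 1, 1};
// Profiling builds record a cl_event plus a ProfilingInfo entry; otherwise
// this is a plain clEnqueueNDRangeKernel on the context's queue.
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
```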
540
+
541
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
542
  // Transpose kernels
543
  cl_program program_transpose;
 
564
  cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
565
  cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
566
  #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
567
 
568
+ void free() {
569
+ ref_count--;
570
+ if (ref_count == 0) {
 
571
  #ifdef GGML_OPENCL_PROFILING
572
+ write_profiling_info();
573
+ #endif
574
+ }
575
+ }
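Editor's note: a sketch of the intended reference-counting lifecycle, using the entry points that appear later in this diff (in practice they are reached through the generic ggml-backend API rather than called directly):

```cpp
ggml_backend_t b0 = ggml_backend_opencl_device_init(dev, /*params=*/nullptr); // ref_count == 1
ggml_backend_t b1 = ggml_backend_opencl_device_init(dev, /*params=*/nullptr); // ref_count == 2
ggml_backend_opencl_free(b0); // ref_count == 1
ggml_backend_opencl_free(b1); // ref_count == 0 -> profiling data is flushed (under GGML_OPENCL_PROFILING)
```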
576
  };
577
 
578
+ // All registered devices with a default device in the front.
579
+ static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
580
 
581
  inline std::string read_file(const std::string &path) {
582
  std::ifstream ifs(path);
 
1811
  backend_ctx->device = dev_ctx->device;
1812
  backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
1813
 
1814
+ // ref_count gets increased in ggml_backend_opencl_device_init.
1815
+ // This function is also used to retrieve the backend context, so we don't want
1816
+ // to increase ref_count for each call. We only want to increase ref_count
1817
+ // when the associated device is initialized.
1818
+ backend_ctx->ref_count = 0;
1819
+
1820
  if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
1821
  strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
1822
  strstr(dev_ctx->device_version.c_str(), "Adreno")) {
 
1989
  return dev_ctx->backend_ctx;
1990
  }
1991
 
1992
+ static void ggml_cl2_free(ggml_backend_t backend) {
1993
+ ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
1994
+ ctx->free();
 
 
 
 
1995
 
1996
+ // The CL context is shared by all backends; release it only if all backends have been released
1997
+ bool should_release_opencl = true;
1998
+ for (auto device : g_ggml_backend_opencl_devices) {
1999
+ ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
2000
+ if (ctx_dev->backend_ctx->ref_count > 0) {
2001
+ should_release_opencl = false;
2002
+ }
 
2003
  }
2004
 
2005
+ if (should_release_opencl) {
2006
+ CL_CHECK(clReleaseContext(ctx->context));
 
 
 
 
 
 
 
 
 
2007
  }
 
 
2008
  }
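Editor's note: the resulting teardown path, as a sketch:

```cpp
// ggml_backend_opencl_free(backend)
//   -> ggml_cl2_free(backend)
//        -> ctx->free();                        // drop this backend's reference
//        -> if no device's backend_ctx still has ref_count > 0:
//               clReleaseContext(ctx->context); // the shared cl_context is released exactly once
```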
2009
 
2010
  //------------------------------------------------------------------------------
 
2088
  }
2089
 
2090
  static void ggml_backend_opencl_free(ggml_backend_t backend) {
2091
+ ggml_cl2_free(backend);
 
 
2092
  }
2093
 
2094
  static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
 
2974
 
2975
  static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
2976
  ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
2977
+ // Getting a new reference to the backend; increase ref_count
2978
+ backend_ctx->ref_count++;
2979
 
2980
  ggml_backend_t backend = new ggml_backend {
2981
  /* .guid = */ ggml_backend_opencl_guid(),
 
3236
  #define dump_tensor(tensor)
3237
  #endif
3238
 
 
3239
  //------------------------------------------------------------------------------
3240
  // Ops
3241
  //------------------------------------------------------------------------------
 
3279
  const cl_ulong nb2 = dst ? dst->nb[2] : 0;
3280
 
3281
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
3282
 
3283
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3284
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 
3322
  size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
3323
  size_t local_work_size[] = {1, 1, 1};
3324
 
3325
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
3326
  }
3327
 
3328
  static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
3364
  const cl_ulong nb3 = dst ? dst->nb[3] : 0;
3365
 
3366
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
3367
 
3368
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3369
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 
3438
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3439
  }
3440
 
3441
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 
 
 
 
 
 
 
 
3442
  } else {
3443
  unsigned int nth = MIN(64, ne0);
3444
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3445
  size_t local_work_size[] = {nth, 1, 1};
3446
 
3447
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
3448
  }
3449
  }
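Editor's note: two launch forms recur throughout the ops below; when the local size is left to the driver (a null pointer), `populateProfilingInfo` records zeros for `local_size`. A sketch:

```cpp
// Explicit work-group size:
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
// Let the driver pick the work-group size (local_work_size_ptr == nullptr):
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
```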
3450
 
 
3487
  const cl_ulong nb3 = dst ? dst->nb[3] : 0;
3488
 
3489
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
3490
 
3491
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3492
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 
3561
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3562
  }
3563
 
3564
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 
 
 
 
 
 
 
 
3565
  } else {
3566
  unsigned int nth = MIN(64, ne0);
3567
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3568
  size_t local_work_size[] = {nth, 1, 1};
3569
 
3570
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
3571
  }
3572
  }
3573
 
 
3607
  const cl_ulong nb3 = dst->nb[3];
3608
 
3609
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
3610
 
3611
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3612
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 
3669
  size_t global_work_size[] = {(size_t)n, 1, 1};
3670
  size_t local_work_size[] = {64, 1, 1};
3671
 
3672
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
3673
  } else {
3674
  unsigned int nth = MIN(64, ne0);
3675
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3676
  size_t local_work_size[] = {nth, 1, 1};
3677
 
3678
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
3679
  }
3680
  }
3681
 
 
3715
  const cl_ulong nb3 = dst->nb[3];
3716
 
3717
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
3718
 
3719
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3720
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 
3777
  size_t global_work_size[] = {(size_t)n, 1, 1};
3778
  size_t local_work_size[] = {64, 1, 1};
3779
 
3780
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
3781
  } else {
3782
  unsigned int nth = MIN(64, ne0);
3783
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3784
  size_t local_work_size[] = {nth, 1, 1};
3785
 
3786
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
3787
  }
3788
  }
3789
 
 
3796
  UNUSED(src1);
3797
 
3798
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
3799
 
3800
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3801
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
3822
  size_t global_work_size[] = {(size_t)n, 1, 1};
3823
  size_t local_work_size[] = {64, 1, 1};
3824
 
3825
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
3826
  }
3827
 
3828
  static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
3834
  UNUSED(src1);
3835
 
3836
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
3837
 
3838
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3839
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
3860
  size_t global_work_size[] = {(size_t)n, 1, 1};
3861
  size_t local_work_size[] = {64, 1, 1};
3862
 
3863
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
3864
  }
3865
 
3866
  static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
3872
  UNUSED(src1);
3873
 
3874
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
3875
 
3876
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3877
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
3903
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3904
  }
3905
 
3906
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 
 
 
 
 
 
 
 
3907
  }
3908
 
3909
  static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
3915
  UNUSED(src1);
3916
 
3917
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
3918
 
3919
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3920
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
3939
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3940
  }
3941
 
3942
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 
 
 
 
 
 
 
 
3943
  }
3944
 
3945
  static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
3951
  UNUSED(src1);
3952
 
3953
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
3954
 
3955
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3956
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
3982
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3983
  }
3984
 
3985
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 
 
 
 
 
 
 
 
3986
  }
3987
 
3988
  static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
3994
  UNUSED(src1);
3995
 
3996
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
3997
 
3998
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3999
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
4025
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4026
  }
4027
 
4028
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 
 
 
 
 
 
 
 
4029
  }
4030
 
4031
  static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
4037
  UNUSED(src1);
4038
 
4039
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
4040
 
4041
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4042
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
4077
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
4078
  size_t local_work_size[] = {(size_t)nth, 1, 1};
4079
 
4080
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
4081
  }
4082
 
4083
  static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
4089
  UNUSED(src1);
4090
 
4091
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
4092
 
4093
  //ggml_backend_opencl_device_context * dev_ctx =
4094
  // (ggml_backend_opencl_device_context *)backend->device->context;
 
4152
  // This is local memory - the size depends on subgroup size.
4153
  CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
4154
 
4155
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
4156
  }
4157
 
4158
  static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
4164
  UNUSED(src1);
4165
 
4166
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
4167
 
4168
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4169
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
4202
  size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
4203
  size_t local_work_size[] = {(size_t)sgs, 1, 1};
4204
 
4205
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
4206
  }
4207
 
4208
  static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
4214
  UNUSED(src1);
4215
 
4216
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
4217
 
4218
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4219
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
4290
  }
4291
  if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
4292
 
4293
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 
 
 
 
 
 
 
 
 
4294
  }
4295
 
4296
  static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
 
4303
  UNUSED(src1_shape_def);
4304
 
4305
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
4306
 
4307
  if (backend_ctx->kernel_repeat == nullptr) {
4308
  GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
 
4350
 
4351
  size_t global_work_size[] = { gws0, gws1, gws2 };
4352
 
4353
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
 
 
 
 
 
 
 
 
4354
  }
4355
 
4356
  static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
 
4363
  GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
4364
 
4365
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
4366
 
4367
  if (backend_ctx->kernel_pad == nullptr) {
4368
  GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
 
4407
  local_work_size_ptr = nullptr;
4408
  }
4409
 
4410
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 
 
 
 
 
 
 
 
4411
  }
4412
 
4413
  static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
 
4419
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
4420
 
4421
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
4422
 
4423
  const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
4424
  cl_kernel kernel = nullptr;
 
4509
  local_work_size_ptr = nullptr;
4510
  }
4511
 
4512
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 
 
 
 
 
 
 
 
 
 
4513
  }
4514
 
4515
  static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
4587
  global_work_size[1] = d_ne1;
4588
  global_work_size[2] = d_ne2;
4589
 
4590
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
4591
  }
4592
  }
4593
  } else {
 
4637
  d_ne2 > 0 ? (size_t)d_ne2 : 1,
4638
  d_ne3 > 0 ? (size_t)d_ne3 : 1 };
4639
 
4640
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
4641
  }
4642
  }
4643
 
 
4650
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
4651
 
4652
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
4653
 
4654
  if (backend_ctx->kernel_timestep_embedding == nullptr) {
4655
  GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
 
4682
 
4683
  size_t global_work_size[] = {gws0, gws1, 1};
4684
 
4685
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
 
 
 
 
 
 
 
 
 
 
4686
  }
4687
 
4688
  static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
4697
  const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
4698
 
4699
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
4700
 
4701
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4702
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 
4901
  static_cast<size_t>(padded_height_B)
4902
  };
4903
 
4904
+ backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
 
 
 
 
 
 
 
 
4905
  } else {
4906
  // no need to transpose B in other cases
4907
  // create an image for B from sub_buffer
 
5023
 
5024
  // enqueue kernel with profiling
5025
  // <--------------------------------------------> //
5026
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
 
5027
  // <--------------------------------------------> //
5028
 
5029
  // deallocate sub buffers and images
 
5103
  global_work_size[2] = (size_t)ne12*ne13;
5104
  }
5105
 
5106
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
5107
  return;
5108
  }
5109
  #else // GGML_OPENCL_SOA_Q
 
5333
  size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
5334
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
5335
 
5336
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
5337
  } else if (src0t == GGML_TYPE_Q4_K) {
5338
  GGML_ASSERT(false && "not implemented");
5339
  } else if (src0t == GGML_TYPE_Q3_K) {
 
5344
  size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
5345
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
5346
 
5347
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
5348
  } else {
5349
  int64_t ny = (ne11 + nrows - 1)/nrows;
5350
 
5351
  size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
5352
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
5353
 
5354
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
5355
  }
5356
  }
5357
 
 
5368
  GGML_ASSERT(src2->extra);
5369
 
5370
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
5371
 
5372
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5373
  ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
 
5473
  size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
5474
  size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
5475
 
5476
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
5477
  }
5478
 
5479
  static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
5486
  GGML_ASSERT(ggml_is_contiguous(src0));
5487
 
5488
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
5489
 
5490
  float scale;
5491
  memcpy(&scale, dst->op_params, sizeof(scale));
 
5514
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
5515
  }
5516
 
5517
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 
 
 
 
 
 
 
 
5518
  }
5519
 
5520
  static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
5551
  const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
5552
 
5553
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
5554
 
5555
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5556
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 
5615
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
5616
  size_t local_work_size[] = {(size_t)nth, 1, 1};
5617
 
5618
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
 
 
 
 
 
 
 
 
5619
  }
5620
 
5621
  static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
5638
  const int ne02 = src0 ? src0->ne[2] : 0;
5639
 
5640
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
5641
 
5642
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5643
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
5661
  size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
5662
  size_t local_work_size[] = {64, 1, 1};
5663
 
5664
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
5665
  } else {
5666
  kernel = backend_ctx->kernel_diag_mask_inf;
5667
 
 
5681
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
5682
  }
5683
 
5684
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 
 
 
 
 
 
 
 
5685
  }
5686
  }
5687
 
 
5701
  }
5702
 
5703
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
5704
 
5705
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5706
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
5780
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
5781
  size_t local_work_size[] = {(size_t)nth, 1, 1};
5782
 
5783
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
5784
  }
5785
 
5786
  static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
5792
  GGML_ASSERT(dst->extra);
5793
 
5794
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
5795
 
5796
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5797
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 
5957
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
5958
  size_t local_work_size[] = {(size_t)nth, 1, 1};
5959
 
5960
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
5961
  }
5962
 
5963
  static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
5972
  GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
5973
 
5974
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
5975
 
5976
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5977
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
6040
  size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
6041
  size_t local_work_size[] = {256, 1, 1};
6042
 
6043
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
6044
  }
6045
 
6046
  static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
6055
  GGML_ASSERT(ggml_is_contiguous(src0));
6056
 
6057
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
6058
 
6059
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6060
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
6086
  size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
6087
  size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
6088
 
6089
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
6090
  }
6091
 
6092
  static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
6100
  GGML_ASSERT(ggml_is_contiguous(src0));
6101
 
6102
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
6103
 
6104
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6105
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
6140
  size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
6141
  size_t local_work_size[] = {(size_t)64, 1, 1};
6142
 
6143
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 
 
 
 
 
 
 
 
6144
  }
6145
 
6146
  //------------------------------------------------------------------------------