lhez commited on
Commit
1ab0f23
·
1 Parent(s): 4473109

opencl: add new ops - `argsort`, `div`, `sub`, `addrows`, `sigmoid`, `group_norm` (llama/13787)

Browse files

* opencl: add `argsort`

* opencl: add `div`

* opencl: add `add_rows`

* opencl: add `sub`

* opencl: add `sigmoid`, both `f16` and `f32`

* opencl: add `group_norm`

ggml/src/ggml-opencl/CMakeLists.txt CHANGED
@@ -55,14 +55,17 @@ endfunction()
55
 
56
  set(GGML_OPENCL_KERNELS
57
  add
 
58
  clamp
59
  cpy
60
  cvt
61
  diag_mask_inf
 
62
  gelu
63
  gemv_noshuffle_general
64
  gemv_noshuffle
65
  get_rows
 
66
  im2col_f32
67
  im2col_f16
68
  mul_mat_Ab_Bi_8x4
@@ -83,11 +86,14 @@ set(GGML_OPENCL_KERNELS
83
  rms_norm
84
  rope
85
  scale
 
86
  silu
87
  softmax_4_f32
88
  softmax_4_f16
89
  softmax_f32
90
  softmax_f16
 
 
91
  transpose
92
  )
93
 
 
55
 
56
  set(GGML_OPENCL_KERNELS
57
  add
58
+ argsort
59
  clamp
60
  cpy
61
  cvt
62
  diag_mask_inf
63
+ div
64
  gelu
65
  gemv_noshuffle_general
66
  gemv_noshuffle
67
  get_rows
68
+ group_norm
69
  im2col_f32
70
  im2col_f16
71
  mul_mat_Ab_Bi_8x4
 
86
  rms_norm
87
  rope
88
  scale
89
+ sigmoid
90
  silu
91
  softmax_4_f32
92
  softmax_4_f16
93
  softmax_f32
94
  softmax_f16
95
+ sub
96
+ sum_rows
97
  transpose
98
  )
99
 
ggml/src/ggml-opencl/ggml-opencl.cpp CHANGED
@@ -299,27 +299,37 @@ struct ggml_backend_opencl_context {
299
  cl_program program_mul_mv_f16_f32;
300
  cl_program program_mul_mv_f32_f32;
301
  cl_program program_mul;
 
 
302
  cl_program program_norm;
303
  cl_program program_relu;
304
  cl_program program_rms_norm;
 
305
  cl_program program_rope;
306
  cl_program program_scale;
307
  cl_program program_silu;
 
308
  cl_program program_softmax_f32;
309
  cl_program program_softmax_f16;
310
  cl_program program_softmax_4_f32;
311
  cl_program program_softmax_4_f16;
 
 
312
 
313
  cl_kernel kernel_add, kernel_add_row;
314
  cl_kernel kernel_mul, kernel_mul_row;
 
 
315
  cl_kernel kernel_scale;
316
  cl_kernel kernel_silu, kernel_silu_4;
317
  cl_kernel kernel_gelu, kernel_gelu_4;
318
  cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
319
  cl_kernel kernel_relu;
 
320
  cl_kernel kernel_clamp;
321
  cl_kernel kernel_norm;
322
  cl_kernel kernel_rms_norm;
 
323
  cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
324
  cl_kernel kernel_soft_max, kernel_soft_max_4;
325
  cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
@@ -339,6 +349,8 @@ struct ggml_backend_opencl_context {
339
  cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
340
  cl_kernel kernel_mul_mv_q6_K_f32;
341
  cl_kernel kernel_im2col_f32, kernel_im2col_f16;
 
 
342
 
343
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
344
  // Transpose kernels
@@ -986,6 +998,105 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
986
  GGML_LOG_CONT(".");
987
  }
988
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
989
  // Adreno kernels
990
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
991
  // transpose
@@ -1856,6 +1967,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
1856
  case GGML_OP_ADD:
1857
  case GGML_OP_SCALE:
1858
  case GGML_OP_MUL:
 
 
1859
  return op->src[0]->type == GGML_TYPE_F32;
1860
  case GGML_OP_UNARY:
1861
  switch (ggml_get_unary_op(op)) {
@@ -1863,7 +1976,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
1863
  case GGML_UNARY_OP_SILU:
1864
  case GGML_UNARY_OP_RELU:
1865
  case GGML_UNARY_OP_GELU_QUICK:
1866
- return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
 
 
1867
  default:
1868
  return false;
1869
  }
@@ -1873,6 +1988,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
1873
  case GGML_OP_NORM:
1874
  case GGML_OP_RMS_NORM:
1875
  return true;
 
 
1876
  case GGML_OP_MUL_MAT:
1877
  if (op->src[0]->type == GGML_TYPE_F16) {
1878
  return true;
@@ -1912,6 +2029,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
1912
  }
1913
  case GGML_OP_IM2COL:
1914
  return true;
 
 
 
 
1915
  default:
1916
  return false;
1917
  }
@@ -3238,6 +3359,256 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3238
  }
3239
  }
3240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3241
  static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3242
  GGML_ASSERT(src0);
3243
  GGML_ASSERT(src0->extra);
@@ -3429,6 +3800,58 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
3429
  #endif
3430
  }
3431
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3432
  static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3433
  GGML_ASSERT(src0);
3434
  GGML_ASSERT(src0->extra);
@@ -3626,6 +4049,65 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
3626
  #endif
3627
  }
3628
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3629
  static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3630
  GGML_ASSERT(src0);
3631
  GGML_ASSERT(src0->extra);
@@ -4975,6 +5457,124 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
4975
  #endif
4976
  }
4977
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4978
  //------------------------------------------------------------------------------
4979
  // Op offloading
4980
  //------------------------------------------------------------------------------
@@ -5023,6 +5623,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
5023
  }
5024
  func = ggml_cl_mul;
5025
  break;
 
 
 
 
 
 
 
 
 
 
 
 
5026
  case GGML_OP_UNARY:
5027
  switch (ggml_get_unary_op(tensor)) {
5028
  case GGML_UNARY_OP_GELU:
@@ -5049,6 +5661,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
5049
  }
5050
  func = ggml_cl_relu;
5051
  break;
 
 
 
 
 
 
5052
  default:
5053
  return false;
5054
  } break;
@@ -5070,6 +5688,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
5070
  }
5071
  func = ggml_cl_rms_norm;
5072
  break;
 
 
 
 
 
 
5073
  case GGML_OP_MUL_MAT:
5074
  if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
5075
  return false;
@@ -5115,6 +5739,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
5115
  }
5116
  func = ggml_cl_im2col;
5117
  break;
 
 
 
 
 
 
 
 
 
 
 
 
5118
  default:
5119
  return false;
5120
  }
 
299
  cl_program program_mul_mv_f16_f32;
300
  cl_program program_mul_mv_f32_f32;
301
  cl_program program_mul;
302
+ cl_program program_div;
303
+ cl_program program_sub;
304
  cl_program program_norm;
305
  cl_program program_relu;
306
  cl_program program_rms_norm;
307
+ cl_program program_group_norm;
308
  cl_program program_rope;
309
  cl_program program_scale;
310
  cl_program program_silu;
311
+ cl_program program_sigmoid;
312
  cl_program program_softmax_f32;
313
  cl_program program_softmax_f16;
314
  cl_program program_softmax_4_f32;
315
  cl_program program_softmax_4_f16;
316
+ cl_program program_argsort_f32_i32;
317
+ cl_program program_sum_rows_f32;
318
 
319
  cl_kernel kernel_add, kernel_add_row;
320
  cl_kernel kernel_mul, kernel_mul_row;
321
+ cl_kernel kernel_div, kernel_div_row;
322
+ cl_kernel kernel_sub, kernel_sub_row;
323
  cl_kernel kernel_scale;
324
  cl_kernel kernel_silu, kernel_silu_4;
325
  cl_kernel kernel_gelu, kernel_gelu_4;
326
  cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
327
  cl_kernel kernel_relu;
328
+ cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
329
  cl_kernel kernel_clamp;
330
  cl_kernel kernel_norm;
331
  cl_kernel kernel_rms_norm;
332
+ cl_kernel kernel_group_norm;
333
  cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
334
  cl_kernel kernel_soft_max, kernel_soft_max_4;
335
  cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
 
349
  cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
350
  cl_kernel kernel_mul_mv_q6_K_f32;
351
  cl_kernel kernel_im2col_f32, kernel_im2col_f16;
352
+ cl_kernel kernel_argsort_f32_i32;
353
+ cl_kernel kernel_sum_rows_f32;
354
 
355
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
356
  // Transpose kernels
 
998
  GGML_LOG_CONT(".");
999
  }
1000
 
1001
+ // argsort
1002
+ {
1003
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1004
+ const std::string kernel_src {
1005
+ #include "argsort.cl.h"
1006
+ };
1007
+ #else
1008
+ const std::string kernel_src = read_file("argsort.cl");
1009
+ #endif
1010
+ backend_ctx->program_argsort_f32_i32 =
1011
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1012
+
1013
+ CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
1014
+ GGML_LOG_CONT(".");
1015
+ }
1016
+
1017
+ // div
1018
+ {
1019
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1020
+ const std::string kernel_src {
1021
+ #include "div.cl.h"
1022
+ };
1023
+ #else
1024
+ const std::string kernel_src = read_file("div.cl");
1025
+ #endif
1026
+ backend_ctx->program_div =
1027
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1028
+
1029
+ CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
1030
+ CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
1031
+ GGML_LOG_CONT(".");
1032
+ }
1033
+
1034
+ // sub
1035
+ {
1036
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1037
+ const std::string kernel_src {
1038
+ #include "sub.cl.h"
1039
+ };
1040
+ #else
1041
+ const std::string kernel_src = read_file("sub.cl");
1042
+ #endif
1043
+ backend_ctx->program_sub =
1044
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1045
+
1046
+ CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
1047
+ CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
1048
+ GGML_LOG_CONT(".");
1049
+ }
1050
+
1051
+ // sum_rows
1052
+ {
1053
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1054
+ const std::string kernel_src {
1055
+ #include "sum_rows.cl.h"
1056
+ };
1057
+ #else
1058
+ const std::string kernel_src = read_file("sum_rows.cl");
1059
+ #endif
1060
+ backend_ctx->program_sum_rows_f32 =
1061
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1062
+
1063
+ CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
1064
+ GGML_LOG_CONT(".");
1065
+ }
1066
+
1067
+ // sigmoid
1068
+ {
1069
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1070
+ const std::string kernel_src {
1071
+ #include "sigmoid.cl.h"
1072
+ };
1073
+ #else
1074
+ const std::string kernel_src = read_file("sigmoid.cl");
1075
+ #endif
1076
+ backend_ctx->program_sigmoid =
1077
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1078
+
1079
+ CL_CHECK((backend_ctx->kernel_sigmoid_f32 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f32", &err), err));
1080
+ CL_CHECK((backend_ctx->kernel_sigmoid_f16 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f16", &err), err));
1081
+ GGML_LOG_CONT(".");
1082
+ }
1083
+
1084
+ // group_norm
1085
+ {
1086
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1087
+ const std::string kernel_src {
1088
+ #include "group_norm.cl.h"
1089
+ };
1090
+ #else
1091
+ const std::string kernel_src = read_file("group_norm.cl");
1092
+ #endif
1093
+ backend_ctx->program_group_norm =
1094
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1095
+
1096
+ CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
1097
+ GGML_LOG_CONT(".");
1098
+ }
1099
+
1100
  // Adreno kernels
1101
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
1102
  // transpose
 
1967
  case GGML_OP_ADD:
1968
  case GGML_OP_SCALE:
1969
  case GGML_OP_MUL:
1970
+ case GGML_OP_DIV:
1971
+ case GGML_OP_SUB:
1972
  return op->src[0]->type == GGML_TYPE_F32;
1973
  case GGML_OP_UNARY:
1974
  switch (ggml_get_unary_op(op)) {
 
1976
  case GGML_UNARY_OP_SILU:
1977
  case GGML_UNARY_OP_RELU:
1978
  case GGML_UNARY_OP_GELU_QUICK:
1979
+ return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
1980
+ case GGML_UNARY_OP_SIGMOID:
1981
+ return ggml_is_contiguous(op->src[0]);
1982
  default:
1983
  return false;
1984
  }
 
1988
  case GGML_OP_NORM:
1989
  case GGML_OP_RMS_NORM:
1990
  return true;
1991
+ case GGML_OP_GROUP_NORM:
1992
+ return ggml_is_contiguous(op->src[0]);
1993
  case GGML_OP_MUL_MAT:
1994
  if (op->src[0]->type == GGML_TYPE_F16) {
1995
  return true;
 
2029
  }
2030
  case GGML_OP_IM2COL:
2031
  return true;
2032
+ case GGML_OP_ARGSORT:
2033
+ return op->src[0]->type == GGML_TYPE_F32;
2034
+ case GGML_OP_SUM_ROWS:
2035
+ return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
2036
  default:
2037
  return false;
2038
  }
 
3359
  }
3360
  }
3361
 
3362
+ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3363
+ GGML_ASSERT(src0);
3364
+ GGML_ASSERT(src0->extra);
3365
+ GGML_ASSERT(src1);
3366
+ GGML_ASSERT(src1->extra);
3367
+ GGML_ASSERT(dst);
3368
+ GGML_ASSERT(dst->extra);
3369
+
3370
+ const int ne00 = src0->ne[0];
3371
+ const int ne01 = src0->ne[1];
3372
+ const int ne02 = src0->ne[2];
3373
+ const int ne03 = src0->ne[3];
3374
+
3375
+ const cl_ulong nb00 = src0->nb[0];
3376
+ const cl_ulong nb01 = src0->nb[1];
3377
+ const cl_ulong nb02 = src0->nb[2];
3378
+ const cl_ulong nb03 = src0->nb[3];
3379
+
3380
+ const int ne10 = src1->ne[0];
3381
+ const int ne11 = src1->ne[1];
3382
+ const int ne12 = src1->ne[2];
3383
+ const int ne13 = src1->ne[3];
3384
+
3385
+ const cl_ulong nb10 = src1->nb[0];
3386
+ const cl_ulong nb11 = src1->nb[1];
3387
+ const cl_ulong nb12 = src1->nb[2];
3388
+ const cl_ulong nb13 = src1->nb[3];
3389
+
3390
+ const int ne0 = dst->ne[0];
3391
+
3392
+ const cl_ulong nb0 = dst->nb[0];
3393
+ const cl_ulong nb1 = dst->nb[1];
3394
+ const cl_ulong nb2 = dst->nb[2];
3395
+ const cl_ulong nb3 = dst->nb[3];
3396
+
3397
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3398
+ cl_command_queue queue = backend_ctx->queue;
3399
+
3400
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3401
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
3402
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
3403
+
3404
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
3405
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
3406
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
3407
+
3408
+ bool bcast_row = false;
3409
+ cl_kernel kernel;
3410
+
3411
+ if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
3412
+ GGML_ASSERT(ggml_is_contiguous(src0));
3413
+
3414
+ // src1 is a row
3415
+ GGML_ASSERT(ne11 == 1);
3416
+
3417
+ bcast_row = true;
3418
+ int ne = ne00 / 4;
3419
+ kernel = backend_ctx->kernel_div_row;
3420
+
3421
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3422
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3423
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3424
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3425
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3426
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3427
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
3428
+ } else {
3429
+ kernel = backend_ctx->kernel_div;
3430
+
3431
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3432
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3433
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3434
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3435
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3436
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3437
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
3438
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
3439
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
3440
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
3441
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
3442
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
3443
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
3444
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
3445
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
3446
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
3447
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
3448
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
3449
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
3450
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
3451
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
3452
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
3453
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
3454
+ }
3455
+
3456
+ if (bcast_row) {
3457
+ int n = ggml_nelements(dst)/4;
3458
+ size_t global_work_size[] = {(size_t)n, 1, 1};
3459
+ size_t local_work_size[] = {64, 1, 1};
3460
+
3461
+ #ifdef GGML_OPENCL_PROFILING
3462
+ cl_event evt;
3463
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3464
+
3465
+ g_profiling_info.emplace_back();
3466
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3467
+ #else
3468
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3469
+ #endif
3470
+ } else {
3471
+ unsigned int nth = MIN(64, ne0);
3472
+ size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3473
+ size_t local_work_size[] = {nth, 1, 1};
3474
+
3475
+ #ifdef GGML_OPENCL_PROFILING
3476
+ cl_event evt;
3477
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3478
+
3479
+ g_profiling_info.emplace_back();
3480
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3481
+ #else
3482
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3483
+ #endif
3484
+ }
3485
+ }
3486
+
3487
+ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3488
+ GGML_ASSERT(src0);
3489
+ GGML_ASSERT(src0->extra);
3490
+ GGML_ASSERT(src1);
3491
+ GGML_ASSERT(src1->extra);
3492
+ GGML_ASSERT(dst);
3493
+ GGML_ASSERT(dst->extra);
3494
+
3495
+ const int ne00 = src0->ne[0];
3496
+ const int ne01 = src0->ne[1];
3497
+ const int ne02 = src0->ne[2];
3498
+ const int ne03 = src0->ne[3];
3499
+
3500
+ const cl_ulong nb00 = src0->nb[0];
3501
+ const cl_ulong nb01 = src0->nb[1];
3502
+ const cl_ulong nb02 = src0->nb[2];
3503
+ const cl_ulong nb03 = src0->nb[3];
3504
+
3505
+ const int ne10 = src1->ne[0];
3506
+ const int ne11 = src1->ne[1];
3507
+ const int ne12 = src1->ne[2];
3508
+ const int ne13 = src1->ne[3];
3509
+
3510
+ const cl_ulong nb10 = src1->nb[0];
3511
+ const cl_ulong nb11 = src1->nb[1];
3512
+ const cl_ulong nb12 = src1->nb[2];
3513
+ const cl_ulong nb13 = src1->nb[3];
3514
+
3515
+ const int ne0 = dst->ne[0];
3516
+
3517
+ const cl_ulong nb0 = dst->nb[0];
3518
+ const cl_ulong nb1 = dst->nb[1];
3519
+ const cl_ulong nb2 = dst->nb[2];
3520
+ const cl_ulong nb3 = dst->nb[3];
3521
+
3522
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3523
+ cl_command_queue queue = backend_ctx->queue;
3524
+
3525
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3526
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
3527
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
3528
+
3529
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
3530
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
3531
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
3532
+
3533
+ bool bcast_row = false;
3534
+ cl_kernel kernel;
3535
+
3536
+ if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
3537
+ GGML_ASSERT(ggml_is_contiguous(src0));
3538
+
3539
+ // src1 is a row
3540
+ GGML_ASSERT(ne11 == 1);
3541
+
3542
+ bcast_row = true;
3543
+ int ne = ne00 / 4;
3544
+ kernel = backend_ctx->kernel_sub_row;
3545
+
3546
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3547
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3548
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3549
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3550
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3551
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3552
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
3553
+ } else {
3554
+ kernel = backend_ctx->kernel_sub;
3555
+
3556
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3557
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3558
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3559
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3560
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3561
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3562
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
3563
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
3564
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
3565
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
3566
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
3567
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
3568
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
3569
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
3570
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
3571
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
3572
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
3573
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
3574
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
3575
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
3576
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
3577
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
3578
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
3579
+ }
3580
+
3581
+ if (bcast_row) {
3582
+ int n = ggml_nelements(dst)/4;
3583
+ size_t global_work_size[] = {(size_t)n, 1, 1};
3584
+ size_t local_work_size[] = {64, 1, 1};
3585
+
3586
+ #ifdef GGML_OPENCL_PROFILING
3587
+ cl_event evt;
3588
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3589
+
3590
+ g_profiling_info.emplace_back();
3591
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3592
+ #else
3593
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3594
+ #endif
3595
+ } else {
3596
+ unsigned int nth = MIN(64, ne0);
3597
+ size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3598
+ size_t local_work_size[] = {nth, 1, 1};
3599
+
3600
+ #ifdef GGML_OPENCL_PROFILING
3601
+ cl_event evt;
3602
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3603
+
3604
+ g_profiling_info.emplace_back();
3605
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3606
+ #else
3607
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3608
+ #endif
3609
+ }
3610
+ }
3611
+
3612
  static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3613
  GGML_ASSERT(src0);
3614
  GGML_ASSERT(src0->extra);
 
3800
  #endif
3801
  }
3802
 
3803
+ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3804
+ GGML_ASSERT(src0);
3805
+ GGML_ASSERT(src0->extra);
3806
+ GGML_ASSERT(dst);
3807
+ GGML_ASSERT(dst->extra);
3808
+
3809
+ UNUSED(src1);
3810
+
3811
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3812
+ cl_command_queue queue = backend_ctx->queue;
3813
+
3814
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3815
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
3816
+
3817
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
3818
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
3819
+
3820
+ cl_kernel kernel;
3821
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3822
+ kernel = backend_ctx->kernel_sigmoid_f32;
3823
+ } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
3824
+ kernel = backend_ctx->kernel_sigmoid_f16;
3825
+ } else {
3826
+ GGML_ASSERT(false && "Unsupported data types for sigmoid (input and output must be both f32 or f16)");
3827
+ }
3828
+
3829
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3830
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3831
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
3832
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
3833
+
3834
+ const int64_t n = ggml_nelements(dst);
3835
+
3836
+ size_t global_work_size[] = {(size_t)n, 1, 1};
3837
+ size_t local_work_size[] = {64, 1, 1};
3838
+
3839
+ size_t * local_work_size_ptr = local_work_size;
3840
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
3841
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3842
+ }
3843
+
3844
+ #ifdef GGML_OPENCL_PROFILING
3845
+ cl_event evt;
3846
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3847
+
3848
+ g_profiling_info.emplace_back();
3849
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3850
+ #else
3851
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3852
+ #endif
3853
+ }
3854
+
3855
  static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3856
  GGML_ASSERT(src0);
3857
  GGML_ASSERT(src0->extra);
 
4049
  #endif
4050
  }
4051
 
4052
+ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4053
+ GGML_ASSERT(src0);
4054
+ GGML_ASSERT(src0->extra);
4055
+ GGML_ASSERT(dst);
4056
+ GGML_ASSERT(dst->extra);
4057
+
4058
+ UNUSED(src1);
4059
+
4060
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4061
+ cl_command_queue queue = backend_ctx->queue;
4062
+
4063
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4064
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
4065
+
4066
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
4067
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
4068
+
4069
+ int32_t n_groups = ((const int32_t *) dst->op_params)[0];
4070
+ int32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + n_groups - 1) / n_groups);
4071
+ float eps = ((const float *) dst->op_params)[1];
4072
+
4073
+ const int ne00 = src0->ne[0];
4074
+ const int ne01 = src0->ne[1];
4075
+ const int ne02 = src0->ne[2];
4076
+ const int ne = ne00*ne01*ne02;
4077
+
4078
+ cl_kernel kernel = backend_ctx->kernel_group_norm;
4079
+
4080
+ size_t sgs = 64;
4081
+ if (backend_ctx->gpu_family == ADRENO) {
4082
+ sgs = 64;
4083
+ } else if (backend_ctx->gpu_family == INTEL) {
4084
+ sgs = 32;
4085
+ } else {
4086
+ GGML_ASSERT(false && "Unsupported GPU");
4087
+ }
4088
+
4089
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
4090
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
4091
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
4092
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
4093
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne));
4094
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &group_size));
4095
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
4096
+
4097
+ size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
4098
+ size_t local_work_size[] = {(size_t)sgs, 1, 1};
4099
+
4100
+ #ifdef GGML_OPENCL_PROFILING
4101
+ cl_event evt;
4102
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4103
+
4104
+ g_profiling_info.emplace_back();
4105
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4106
+ #else
4107
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4108
+ #endif
4109
+ }
4110
+
4111
  static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4112
  GGML_ASSERT(src0);
4113
  GGML_ASSERT(src0->extra);
 
5457
  #endif
5458
  }
5459
 
5460
// Dispatch GGML_OP_ARGSORT on the OpenCL backend: sort each row of src0 (f32)
// and write the resulting permutation indices into dst (i32).
// A bitonic-sort kernel is used, so the row length is padded up to the next
// power of two; one workgroup of ne00_padded work-items handles one row.
static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_UNUSED(src1); // argsort is unary; src1 is unused

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
    cl_command_queue queue = backend_ctx->queue;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    // Byte offsets into the device buffers; views add view_offs on top of
    // the tensor's base allocation offset.
    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int ne00 = src0->ne[0];
    const int nrows = ggml_nrows(src0);

    // Bitonic sort requires a power-of-two problem size; round ne00 up.
    int ne00_padded = 1;
    while (ne00_padded < ne00) {
        ne00_padded *= 2;
    }

    // op_params[0] holds a ggml_sort_order value (ASC/DESC).
    int order = (enum ggml_sort_order) dst->op_params[0];

    cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00_padded));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &order));
    // Arg 7: local-memory scratch holding the index permutation of one row.
    CL_CHECK(clSetKernelArg(kernel, 7, ne00_padded*sizeof(int), NULL));

    // One workgroup per row, ne00_padded work-items per workgroup.
    // NOTE(review): ne00_padded is used directly as the workgroup size and
    // sizes the local-memory scratch; rows longer than the device's max
    // workgroup size (or local memory) would fail the enqueue — presumably
    // the rows reaching this op are small enough. TODO confirm.
    size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
    size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};

#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));

    g_profiling_info.emplace_back();
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
5514
+
5515
+ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5516
+ GGML_ASSERT(src0);
5517
+ GGML_ASSERT(src0->extra);
5518
+ GGML_ASSERT(dst);
5519
+ GGML_ASSERT(dst->extra);
5520
+ GGML_UNUSED(src1);
5521
+
5522
+ GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
5523
+ GGML_ASSERT(ggml_is_contiguous(src0));
5524
+
5525
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5526
+ cl_command_queue queue = backend_ctx->queue;
5527
+
5528
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5529
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
5530
+
5531
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
5532
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
5533
+
5534
+ const int ne00 = src0->ne[0];
5535
+ const int ne01 = src0->ne[1];
5536
+ const int ne02 = src0->ne[2];
5537
+ const int ne03 = src0->ne[3];
5538
+
5539
+ const cl_ulong nb01 = src0->nb[1];
5540
+ const cl_ulong nb02 = src0->nb[2];
5541
+ const cl_ulong nb03 = src0->nb[3];
5542
+
5543
+ const cl_ulong nb1 = dst->nb[1];
5544
+ const cl_ulong nb2 = dst->nb[2];
5545
+ const cl_ulong nb3 = dst->nb[3];
5546
+
5547
+ cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;
5548
+
5549
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
5550
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
5551
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
5552
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
5553
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
5554
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
5555
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
5556
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
5557
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
5558
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
5559
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
5560
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1));
5561
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
5562
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
5563
+
5564
+ size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
5565
+ size_t local_work_size[] = {(size_t)64, 1, 1};
5566
+
5567
+ #ifdef GGML_OPENCL_PROFILING
5568
+ cl_event evt;
5569
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5570
+
5571
+ g_profiling_info.emplace_back();
5572
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5573
+ #else
5574
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5575
+ #endif
5576
+ }
5577
+
5578
  //------------------------------------------------------------------------------
5579
  // Op offloading
5580
  //------------------------------------------------------------------------------
 
5623
  }
5624
  func = ggml_cl_mul;
5625
  break;
5626
+ case GGML_OP_DIV:
5627
+ if (!any_on_device) {
5628
+ return false;
5629
+ }
5630
+ func = ggml_cl_div;
5631
+ break;
5632
+ case GGML_OP_SUB:
5633
+ if (!any_on_device) {
5634
+ return false;
5635
+ }
5636
+ func = ggml_cl_sub;
5637
+ break;
5638
  case GGML_OP_UNARY:
5639
  switch (ggml_get_unary_op(tensor)) {
5640
  case GGML_UNARY_OP_GELU:
 
5661
  }
5662
  func = ggml_cl_relu;
5663
  break;
5664
+ case GGML_UNARY_OP_SIGMOID:
5665
+ if (!any_on_device) {
5666
+ return false;
5667
+ }
5668
+ func = ggml_cl_sigmoid;
5669
+ break;
5670
  default:
5671
  return false;
5672
  } break;
 
5688
  }
5689
  func = ggml_cl_rms_norm;
5690
  break;
5691
+ case GGML_OP_GROUP_NORM:
5692
+ if (!any_on_device) {
5693
+ return false;
5694
+ }
5695
+ func = ggml_cl_group_norm;
5696
+ break;
5697
  case GGML_OP_MUL_MAT:
5698
  if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
5699
  return false;
 
5739
  }
5740
  func = ggml_cl_im2col;
5741
  break;
5742
+ case GGML_OP_ARGSORT:
5743
+ if (!any_on_device) {
5744
+ return false;
5745
+ }
5746
+ func = ggml_cl_argsort;
5747
+ break;
5748
+ case GGML_OP_SUM_ROWS:
5749
+ if (!any_on_device) {
5750
+ return false;
5751
+ }
5752
+ func = ggml_cl_sum_rows;
5753
+ break;
5754
  default:
5755
  return false;
5756
  }
ggml/src/ggml-opencl/kernels/argsort.cl ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; }

// Must stay in sync with enum ggml_sort_order in ggml.h.
enum ggml_sort_order {
    GGML_SORT_ORDER_ASC,
    GGML_SORT_ORDER_DESC,
};

// Bitonic argsort of one f32 row per workgroup.
// ne00     — actual row length
// ne00_pad — ne00 rounded up to a power of two; also the workgroup size
// dst_row  — local scratch of ne00_pad ints holding the index permutation
// Indices >= ne00 act as padding sentinels that always sort to the end.
kernel void kernel_argsort_f32_i32(
    global float * src0,
    ulong offset0,
    global int * dst,
    ulong offsetd,
    const int ne00,
    const int ne00_pad,
    const int order,
    local int * dst_row
) {
    // bitonic sort
    int col = get_local_id(0);
    int row = get_group_id(1);

    // NOTE(review): never taken — the host launches exactly ne00_pad
    // work-items per row. If it could be taken, returning before the
    // barriers below would be undefined behavior.
    if (col >= ne00_pad) {
        return;
    }

    // Rebase the buffers by their byte offsets. The arithmetic is done on
    // char pointers; the results are cast back to the parameters' own
    // element types (float for src0, int for dst) — assigning a
    // (global char *) to a (global float *)/(global int *) variable is an
    // incompatible pointer conversion in OpenCL C.
    src0 = (global float *)((global char *)src0 + offset0);
    dst  = (global int   *)((global char *)dst  + offsetd);

    global float * x_row = src0 + row * ne00;

    // initialize indices
    dst_row[col] = col;

    barrier(CLK_LOCAL_MEM_FENCE);

    for (int k = 2; k <= ne00_pad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            int ixj = col ^ j;
            if (ixj > col) {
                if ((col & k) == 0) {
                    // ascending subsequence: padding (>= ne00) sinks to the end
                    if (dst_row[col] >= ne00 ||
                        (dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
                    ) {
                        SWAP(dst_row[col], dst_row[ixj], int);
                    }
                } else {
                    // descending subsequence
                    if (dst_row[ixj] >= ne00 ||
                        (dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
                    ) {
                        SWAP(dst_row[col], dst_row[ixj], int);
                    }
                }
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
    }

    // copy the result to dst without the padding
    if (col < ne00) {
        dst[row * ne00 + col] = dst_row[col];
    }
}
ggml/src/ggml-opencl/kernels/div.cl ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

//------------------------------------------------------------------------------
// div
//------------------------------------------------------------------------------
// General f32 elementwise division dst = src0 / src1 with numpy-style
// broadcasting of src1 over src0's dimensions (via the modulo on each index).
// Buffers are passed as raw byte pointers and addressed with byte strides
// (nb*), so non-contiguous layouts are supported. One workgroup handles one
// row (i01, i02, i03); work-items stride across the ne0 elements of the row.
kernel void kernel_div(
    global char * src0,
    ulong offset0,
    global char * src1,
    ulong offset1,
    global char * dst,
    ulong offsetd,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne10,
    int ne11,
    int ne12,
    int ne13,
    ulong nb10,
    ulong nb11,
    ulong nb12,
    ulong nb13,
    int ne0,
    ulong nb0,
    ulong nb1,
    ulong nb2,
    ulong nb3
) {
    // rebase by byte offsets (char pointers, so plain addition suffices)
    src0 = src0 + offset0;
    src1 = src1 + offset1;
    dst = dst + offsetd;

    int i03 = get_group_id(2);
    int i02 = get_group_id(1);
    int i01 = get_group_id(0);

    // broadcast src1: wrap each src0 index into src1's (possibly smaller) extent
    int i13 = i03 % ne13;
    int i12 = i02 % ne12;
    int i11 = i01 % ne11;

    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10; // broadcast along the innermost dimension
        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) / *((global float *)(src1_ptr + i10*nb10));
    }
}

// assumption: src1 is a row
// broadcast src1 into src0
// Fast path: both tensors contiguous f32, src1 a single row of ne/4 float4s;
// processes four elements per work-item.
kernel void kernel_div_row(
    global float4 * src0,
    ulong offset0,
    global float4 * src1,
    ulong offset1,
    global float4 * dst,
    ulong offsetd,
    int ne  // number of float4 elements in the src1 row
) {
    src0 = (global float4*)((global char*)src0 + offset0);
    src1 = (global float4*)((global char*)src1 + offset1);
    dst = (global float4*)((global char*)dst + offsetd);

    // This performs better than using %.
    uint gid = get_global_id(0);
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] / src1[idx1];
}
ggml/src/ggml-opencl/kernels/group_norm.cl ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

// Group normalization over f32 data: the ne elements are split into
// contiguous groups of group_size; each group is normalized to zero mean and
// unit variance (with eps for numerical stability). One workgroup handles
// one group.
//
// Workgroup must be a subgroup: the reductions below use
// sub_group_reduce_add as a *workgroup-wide* reduction, which is only
// correct when the whole workgroup is a single subgroup (hence the required
// subgroup sizes above).
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_32
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_group_norm(
    global float * src0,
    ulong offset0,
    global float * dst,
    ulong offsetd,
    int ne,
    int group_size,
    float eps
) {
    src0 = (global float *)((global char *)src0 + offset0);
    dst = (global float *)((global char *)dst + offsetd);

    // [start, end) is this group's element range before thread staggering
    int start = get_group_id(0) * group_size;
    int end = start + group_size;

    // each work-item starts at its own lane and strides by the workgroup size
    start += get_local_id(0);

    // clamp the last (possibly partial) group to the tensor's end
    if (end >= ne) {
        end = ne;
    }

    float tmp = 0.0f;

    // pass 1: sum of the group's elements
    for (int j = start; j < end; j += get_local_size(0)) {
        tmp += src0[j];
    }

    tmp = sub_group_reduce_add(tmp);

    // NOTE(review): mean/variance divide by the full group_size even when the
    // last group was clamped to fewer elements — presumably ne is always a
    // multiple of group_size for the ops that reach this kernel. TODO confirm.
    const float mean = tmp / group_size;
    tmp = 0.0f;

    // pass 2: store centered values and accumulate the sum of squares
    for (int j = start; j < end; j += get_local_size(0)) {
        float xi = src0[j] - mean;
        dst[j] = xi;
        tmp += xi * xi;
    }

    tmp = sub_group_reduce_add(tmp);

    // pass 3: rescale the centered values by the inverse standard deviation
    const float variance = tmp / group_size;
    const float scale = 1.0f/sqrt(variance + eps);
    for (int j = start; j < end; j += get_local_size(0)) {
        dst[j] *= scale;
    }
}
ggml/src/ggml-opencl/kernels/sigmoid.cl ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

//------------------------------------------------------------------------------
// sigmoid
//------------------------------------------------------------------------------

// Elementwise logistic sigmoid over f32 data: dst[i] = 1 / (1 + e^-src0[i]).
// One work-item computes one element; offsets are byte offsets into the buffers.
kernel void kernel_sigmoid_f32(
    global float * src0,
    ulong offset0,
    global float * dst,
    ulong offsetd
) {
    src0 = (global float*)((global char*)src0 + offset0);
    dst  = (global float*)((global char*)dst  + offsetd);

    const size_t i = get_global_id(0);
    const float x = src0[i];

    dst[i] = 1.0f / (1.0f + exp(-x));
}

// Same as kernel_sigmoid_f32 but for f16 data; the arithmetic is carried out
// in float and the result is narrowed back to half on store.
kernel void kernel_sigmoid_f16(
    global half * src0,
    ulong offset0,
    global half * dst,
    ulong offsetd
) {
    src0 = (global half*)((global char*)src0 + offset0);
    dst  = (global half*)((global char*)dst  + offsetd);

    const size_t i = get_global_id(0);
    const half x = src0[i];

    dst[i] = 1.0f / (1.0f + exp(-x));
}
ggml/src/ggml-opencl/kernels/sub.cl ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

//------------------------------------------------------------------------------
// sub
//------------------------------------------------------------------------------
// General f32 elementwise subtraction dst = src0 - src1 with numpy-style
// broadcasting of src1 over src0's dimensions (via the modulo on each index).
// Buffers are passed as raw byte pointers and addressed with byte strides
// (nb*), so non-contiguous layouts are supported. One workgroup handles one
// row (i01, i02, i03); work-items stride across the ne0 elements of the row.
kernel void kernel_sub(
    global char * src0,
    ulong offset0,
    global char * src1,
    ulong offset1,
    global char * dst,
    ulong offsetd,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne10,
    int ne11,
    int ne12,
    int ne13,
    ulong nb10,
    ulong nb11,
    ulong nb12,
    ulong nb13,
    int ne0,
    ulong nb0,
    ulong nb1,
    ulong nb2,
    ulong nb3
) {
    // rebase by byte offsets (char pointers, so plain addition suffices)
    src0 = src0 + offset0;
    src1 = src1 + offset1;
    dst = dst + offsetd;

    int i03 = get_group_id(2);
    int i02 = get_group_id(1);
    int i01 = get_group_id(0);

    // broadcast src1: wrap each src0 index into src1's (possibly smaller) extent
    int i13 = i03 % ne13;
    int i12 = i02 % ne12;
    int i11 = i01 % ne11;

    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10; // broadcast along the innermost dimension
        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) - *((global float *)(src1_ptr + i10*nb10));
    }
}

// assumption: src1 is a row
// broadcast src1 into src0
// Fast path: both tensors contiguous f32, src1 a single row of ne/4 float4s;
// processes four elements per work-item.
kernel void kernel_sub_row(
    global float4 * src0,
    ulong offset0,
    global float4 * src1,
    ulong offset1,
    global float4 * dst,
    ulong offsetd,
    int ne  // number of float4 elements in the src1 row
) {
    src0 = (global float4*)((global char*)src0 + offset0);
    src1 = (global float4*)((global char*)src1 + offset1);
    dst = (global float4*)((global char*)dst + offsetd);

    // This performs better than using %.
    uint gid = get_global_id(0);
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] - src1[idx1];
}
ggml/src/ggml-opencl/kernels/sum_rows.cl ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
// Row-wise sum of an f32 tensor: dst[i1,i2,i3] = sum over i0 of
// src0[i0,i1,i2,i3]. One work-item handles one row; byte strides (nb*)
// allow non-contiguous outer dimensions.
kernel void kernel_sum_rows_f32(
    global float * src0,
    ulong offset0,
    global float * dst,
    ulong offsetd,
    int ne00,
    int ne01,
    int ne02,
    int ne03,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    ulong nb1,
    ulong nb2,
    ulong nb3
) {
    src0 = (global float *)((global char *)src0 + offset0);
    dst = (global float *)((global char *)dst + offsetd);

    int i3 = get_global_id(2);
    int i2 = get_global_id(1);
    int i1 = get_global_id(0);

    // guard against launch padding: the host may round the global size up
    // to a multiple of the workgroup size
    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
        return;
    }

    global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
    global float * dst_row = (global float *) ((global char *) dst  + i1*nb1  + i2*nb2  + i3*nb3);

    // sequential accumulation over the ne00 elements of this row
    float row_sum = 0;

    for (int i0 = 0; i0 < ne00; i0++) {
        row_sum += src_row[i0];
    }

    dst_row[0] = row_sum;
}