lhez committed
Commit · 1ab0f23
Parent(s): 4473109
opencl: add new ops - `argsort`, `div`, `sub`, `addrows`, `sigmoid`, `group_norm` (llama/13787)
* opencl: add `argsort`
* opencl: add `div`
* opencl: add `add_rows`
* opencl: add `sub`
* opencl: add `sigmoid`, both `f16` and `f32`
* opencl: add `group_norm`
- ggml/src/ggml-opencl/CMakeLists.txt +6 -0
- ggml/src/ggml-opencl/ggml-opencl.cpp +637 -1
- ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- ggml/src/ggml-opencl/kernels/div.cl +72 -0
- ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
ggml/src/ggml-opencl/CMakeLists.txt
CHANGED

@@ -55,14 +55,17 @@ endfunction()
 
 set(GGML_OPENCL_KERNELS
     add
+    argsort
     clamp
     cpy
     cvt
     diag_mask_inf
+    div
     gelu
     gemv_noshuffle_general
     gemv_noshuffle
     get_rows
+    group_norm
     im2col_f32
     im2col_f16
     mul_mat_Ab_Bi_8x4
@@ -83,11 +86,14 @@ set(GGML_OPENCL_KERNELS
     rms_norm
     rope
     scale
+    sigmoid
     silu
     softmax_4_f32
     softmax_4_f16
     softmax_f32
     softmax_f16
+    sub
+    sum_rows
     transpose
 )
 
ggml/src/ggml-opencl/ggml-opencl.cpp
CHANGED

@@ -299,27 +299,37 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_f16_f32;
     cl_program program_mul_mv_f32_f32;
     cl_program program_mul;
+    cl_program program_div;
+    cl_program program_sub;
     cl_program program_norm;
     cl_program program_relu;
     cl_program program_rms_norm;
+    cl_program program_group_norm;
     cl_program program_rope;
     cl_program program_scale;
     cl_program program_silu;
+    cl_program program_sigmoid;
     cl_program program_softmax_f32;
     cl_program program_softmax_f16;
     cl_program program_softmax_4_f32;
     cl_program program_softmax_4_f16;
+    cl_program program_argsort_f32_i32;
+    cl_program program_sum_rows_f32;
 
     cl_kernel kernel_add, kernel_add_row;
     cl_kernel kernel_mul, kernel_mul_row;
+    cl_kernel kernel_div, kernel_div_row;
+    cl_kernel kernel_sub, kernel_sub_row;
     cl_kernel kernel_scale;
     cl_kernel kernel_silu, kernel_silu_4;
     cl_kernel kernel_gelu, kernel_gelu_4;
     cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
     cl_kernel kernel_relu;
+    cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
     cl_kernel kernel_clamp;
     cl_kernel kernel_norm;
     cl_kernel kernel_rms_norm;
+    cl_kernel kernel_group_norm;
     cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
     cl_kernel kernel_soft_max, kernel_soft_max_4;
     cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
@@ -339,6 +349,8 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
+    cl_kernel kernel_argsort_f32_i32;
+    cl_kernel kernel_sum_rows_f32;
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // Transpose kernels
@@ -986,6 +998,105 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // argsort
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "argsort.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("argsort.cl");
+#endif
+        backend_ctx->program_argsort_f32_i32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // div
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "div.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("div.cl");
+#endif
+        backend_ctx->program_div =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
+        CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // sub
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "sub.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("sub.cl");
+#endif
+        backend_ctx->program_sub =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
+        CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // sum_rows
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "sum_rows.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("sum_rows.cl");
+#endif
+        backend_ctx->program_sum_rows_f32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // sigmoid
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "sigmoid.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("sigmoid.cl");
+#endif
+        backend_ctx->program_sigmoid =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_sigmoid_f32 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_sigmoid_f16 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f16", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // group_norm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "group_norm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("group_norm.cl");
+#endif
+        backend_ctx->program_group_norm =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // Adreno kernels
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // transpose
@@ -1856,6 +1967,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_ADD:
         case GGML_OP_SCALE:
         case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_SUB:
             return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -1863,7 +1976,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_GELU_QUICK:
-                    return ggml_is_contiguous(op->src[0]);
+                    return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
+                case GGML_UNARY_OP_SIGMOID:
+                    return ggml_is_contiguous(op->src[0]);
                 default:
                     return false;
             }
@@ -1873,6 +1988,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
             return true;
+        case GGML_OP_GROUP_NORM:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_MUL_MAT:
             if (op->src[0]->type == GGML_TYPE_F16) {
                 return true;
@@ -1912,6 +2029,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             }
         case GGML_OP_IM2COL:
             return true;
+        case GGML_OP_ARGSORT:
+            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_SUM_ROWS:
+            return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
         default:
             return false;
     }
@@ -3238,6 +3359,256 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
     }
 }
 
+static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
+
+    const int ne0 = dst->ne[0];
+
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    bool bcast_row = false;
+    cl_kernel kernel;
+
+    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+
+        // src1 is a row
+        GGML_ASSERT(ne11 == 1);
+
+        bcast_row = true;
+        int ne = ne00 / 4;
+        kernel = backend_ctx->kernel_div_row;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
+    } else {
+        kernel = backend_ctx->kernel_div;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
+        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
+    }
+
+    if (bcast_row) {
+        int n = ggml_nelements(dst)/4;
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else {
+        unsigned int nth = MIN(64, ne0);
+        size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    }
+}
+
+static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
+
+    const int ne0 = dst->ne[0];
+
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    bool bcast_row = false;
+    cl_kernel kernel;
+
+    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+
+        // src1 is a row
+        GGML_ASSERT(ne11 == 1);
+
+        bcast_row = true;
+        int ne = ne00 / 4;
+        kernel = backend_ctx->kernel_sub_row;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
+    } else {
+        kernel = backend_ctx->kernel_sub;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
+        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
+    }
+
+    if (bcast_row) {
+        int n = ggml_nelements(dst)/4;
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else {
+        unsigned int nth = MIN(64, ne0);
+        size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    }
+}
+
 static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
@@ -3429,6 +3800,58 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
 #endif
 }
 
+static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        kernel = backend_ctx->kernel_sigmoid_f32;
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        kernel = backend_ctx->kernel_sigmoid_f16;
+    } else {
+        GGML_ASSERT(false && "Unsupported data types for sigmoid (input and output must be both f32 or f16)");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+    const int64_t n = ggml_nelements(dst);
+
+    size_t global_work_size[] = {(size_t)n, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
+#endif
+}
+
 static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
@@ -3626,6 +4049,65 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
 #endif
 }
 
+static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    int32_t n_groups = ((const int32_t *) dst->op_params)[0];
+    int32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + n_groups - 1) / n_groups);
+    float eps = ((const float *) dst->op_params)[1];
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne = ne00*ne01*ne02;
+
+    cl_kernel kernel = backend_ctx->kernel_group_norm;
+
+    size_t sgs = 64;
+    if (backend_ctx->gpu_family == ADRENO) {
+        sgs = 64;
+    } else if (backend_ctx->gpu_family == INTEL) {
+        sgs = 32;
+    } else {
+        GGML_ASSERT(false && "Unsupported GPU");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &group_size));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
+
+    size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
+    size_t local_work_size[] = {(size_t)sgs, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
 static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
@@ -4975,6 +5457,124 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
 #endif
 }
 
+static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_UNUSED(src1);
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int nrows = ggml_nrows(src0);
+
+    int ne00_padded = 1;
+    while (ne00_padded < ne00) {
+        ne00_padded *= 2;
+    }
+
+    int order = (enum ggml_sort_order) dst->op_params[0];
+
+    cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00_padded));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &order));
+    CL_CHECK(clSetKernelArg(kernel, 7, ne00_padded*sizeof(int), NULL));
+
+    size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
+    size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_UNUSED(src1);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
+
+    size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[] = {(size_t)64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
 //------------------------------------------------------------------------------
 // Op offloading
 //------------------------------------------------------------------------------
@@ -5023,6 +5623,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_mul;
             break;
+        case GGML_OP_DIV:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_div;
+            break;
+        case GGML_OP_SUB:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_sub;
+            break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_GELU:
@@ -5049,6 +5661,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
                     }
                     func = ggml_cl_relu;
                    break;
+                case GGML_UNARY_OP_SIGMOID:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cl_sigmoid;
+                    break;
                 default:
                     return false;
             } break;
@@ -5070,6 +5688,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
            func = ggml_cl_rms_norm;
             break;
+        case GGML_OP_GROUP_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_group_norm;
+            break;
         case GGML_OP_MUL_MAT:
             if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                 return false;
@@ -5115,6 +5739,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_im2col;
             break;
+        case GGML_OP_ARGSORT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_argsort;
+            break;
+        case GGML_OP_SUM_ROWS:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_sum_rows;
+            break;
         default:
             return false;
     }
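Note: `ggml_cl_div` and `ggml_cl_sub` follow the same dispatch pattern as the existing `ggml_cl_add`/`ggml_cl_mul`: the vectorized `*_row` kernel is taken only when `src1` is a single contiguous row and both row widths are multiples of 4, so the `float4` loads stay in bounds. A standalone restatement of that predicate (a sketch with a hypothetical helper name, not part of the patch):

```cpp
#include <cstdint>

// Restatement of the fast-path predicate used by ggml_cl_div/ggml_cl_sub
// above (hypothetical helper, not part of the patch): the float4 "*_row"
// kernel is valid only when src1 is one contiguous row and both row widths
// divide evenly into float4.
static bool use_row_kernel(int64_t src1_nelements, bool src1_is_contiguous,
                           int64_t ne00, int64_t ne10) {
    return src1_nelements == ne10   // all of src1 lives in one row
        && src1_is_contiguous       // safe to reinterpret as float4
        && ne00 % 4 == 0            // src0/dst rows split into float4
        && ne10 % 4 == 0;           // src1 row splits into float4
}
```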
ggml/src/ggml-opencl/kernels/argsort.cl
ADDED

@@ -0,0 +1,86 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; }
+
+enum ggml_sort_order {
+    GGML_SORT_ORDER_ASC,
+    GGML_SORT_ORDER_DESC,
+};
+
+kernel void kernel_argsort_f32_i32(
+        global float * src0,
+        ulong offset0,
+        global int * dst,
+        ulong offsetd,
+        const int ne00,
+        const int ne00_pad,
+        const int order,
+        local int * dst_row
+) {
+    // bitonic sort
+    int col = get_local_id(0);
+    int row = get_group_id(1);
+
+    if (col >= ne00_pad) {
+        return;
+    }
+
+    src0 = (global float *)((global char *)src0 + offset0);
+    dst = (global int *)((global char *)dst + offsetd);
+
+    global float * x_row = src0 + row * ne00;
+
+    // initialize indices
+    dst_row[col] = col;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int k = 2; k <= ne00_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= ne00 ||
+                        (dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
+                        SWAP(dst_row[col], dst_row[ixj], int);
+                    }
+                } else {
+                    if (dst_row[ixj] >= ne00 ||
+                        (dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
+                        SWAP(dst_row[col], dst_row[ixj], int);
+                    }
+                }
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ne00) {
+        dst[row * ne00 + col] = dst_row[col];
+    }
+}
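The kernel above is a bitonic argsort over one row per work-group: the index range is padded to the next power of two, padded slots always compare as "sorts after everything" so they sink to the end, and each work-item owns one column of the compare-exchange network. A sequential C++ reference model of the same network (a sketch, not part of the patch) that can be used to check the comparator logic:

```cpp
#include <utility>
#include <vector>

// Sequential reference model of what kernel_argsort_f32_i32 computes for one
// row: pad the index range to the next power of two, treat padded indices as
// larger than everything, run the bitonic compare-exchange network.
std::vector<int> argsort_row_ref(const std::vector<float> & x, bool ascending) {
    const int n = (int) x.size();
    int n_pad = 1;
    while (n_pad < n) n_pad *= 2;                // same padding as the host code

    std::vector<int> idx(n_pad);
    for (int i = 0; i < n_pad; ++i) idx[i] = i;  // idx[i] >= n marks padding

    // true if the element at position a must come after the one at position b
    auto out_of_order = [&](int a, int b) {
        if (idx[a] >= n) return true;            // padding sinks to the end
        if (idx[b] >= n) return false;
        return ascending ? x[idx[a]] > x[idx[b]] : x[idx[a]] < x[idx[b]];
    };

    // In the kernel every work-item runs one `col` in parallel; the pairs
    // (col, col ^ j) are disjoint per step, so a sequential loop is equivalent.
    for (int k = 2; k <= n_pad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            for (int col = 0; col < n_pad; ++col) {
                const int ixj = col ^ j;
                if (ixj > col) {
                    const bool up = (col & k) == 0;   // direction of this pair
                    if (up ? out_of_order(col, ixj) : out_of_order(ixj, col)) {
                        std::swap(idx[col], idx[ixj]);
                    }
                }
            }
        }
    }

    idx.resize(n);                               // drop the padded indices
    return idx;
}
```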
ggml/src/ggml-opencl/kernels/div.cl
ADDED

@@ -0,0 +1,72 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+//------------------------------------------------------------------------------
+// div
+//------------------------------------------------------------------------------
+kernel void kernel_div(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global char * dst,
+        ulong offsetd,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst = dst + offsetd;
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int i13 = i03 % ne13;
+    int i12 = i02 % ne12;
+    int i11 = i01 % ne11;
+
+    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
+
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+        const int i10 = i0 % ne10;
+        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) / *((global float *)(src1_ptr + i10*nb10));
+    }
+}
+
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_div_row(
+        global float4 * src0,
+        ulong offset0,
+        global float4 * src1,
+        ulong offset1,
+        global float4 * dst,
+        ulong offsetd,
+        int ne
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    src1 = (global float4*)((global char*)src1 + offset1);
+    dst = (global float4*)((global char*)dst + offsetd);
+
+    // This performs better than using %.
+    uint gid = get_global_id(0);
+    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+    dst[gid] = src0[gid] / src1[idx1];
+}
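The index computation in `kernel_div_row` (and in `kernel_sub_row` further below) avoids the `%` operator: for unsigned operands, `gid - (gid/ne)*ne` is exactly `gid % ne`, and per the in-kernel comment the div/mul/sub sequence performs better on the targeted GPUs. A quick host-side check of the identity (a sketch, not part of the patch):

```cpp
#include <cassert>

// Verify the identity kernel_div_row relies on:
// for unsigned gid and ne > 0, gid - (gid/ne)*ne == gid % ne.
int main() {
    for (unsigned ne = 1; ne <= 64; ++ne) {
        for (unsigned gid = 0; gid < 4096; ++gid) {
            assert(gid - (gid / ne) * ne == gid % ne);
        }
    }
    return 0;
}
```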
ggml/src/ggml-opencl/kernels/group_norm.cl
ADDED

@@ -0,0 +1,72 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+// Workgroup must be a subgroup
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_32
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_group_norm(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne,
+        int group_size,
+        float eps
+) {
+    src0 = (global float *)((global char *)src0 + offset0);
+    dst = (global float *)((global char *)dst + offsetd);
+
+    int start = get_group_id(0) * group_size;
+    int end = start + group_size;
+
+    start += get_local_id(0);
+
+    if (end >= ne) {
+        end = ne;
+    }
+
+    float tmp = 0.0f;
+
+    for (int j = start; j < end; j += get_local_size(0)) {
+        tmp += src0[j];
+    }
+
+    tmp = sub_group_reduce_add(tmp);
+
+    const float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += get_local_size(0)) {
+        float xi = src0[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = sub_group_reduce_add(tmp);
+
+    const float variance = tmp / group_size;
+    const float scale = 1.0f/sqrt(variance + eps);
+    for (int j = start; j < end; j += get_local_size(0)) {
+        dst[j] *= scale;
+    }
+}
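For reference, `kernel_group_norm` computes plain group normalization without affine weights; the two `sub_group_reduce_add` passes correspond to the two sums below, with $|g|$ = `group_size`:

$$
\mu_g = \frac{1}{|g|} \sum_{i \in g} x_i, \qquad
\sigma_g^2 = \frac{1}{|g|} \sum_{i \in g} (x_i - \mu_g)^2, \qquad
y_i = \frac{x_i - \mu_g}{\sqrt{\sigma_g^2 + \varepsilon}} \quad \text{for } i \in g.
$$

The first reduction accumulates the group mean; the second accumulates the squared residuals that the kernel has already written to `dst`, which are then rescaled in place.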
ggml/src/ggml-opencl/kernels/sigmoid.cl
ADDED

@@ -0,0 +1,29 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+//------------------------------------------------------------------------------
+// sigmoid
+//------------------------------------------------------------------------------
+
+kernel void kernel_sigmoid_f32(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
+}
+
+kernel void kernel_sigmoid_f16(
+        global half * src0,
+        ulong offset0,
+        global half * dst,
+        ulong offsetd
+) {
+    src0 = (global half*)((global char*)src0 + offset0);
+    dst = (global half*)((global char*)dst + offsetd);
+
+    dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
+}
ggml/src/ggml-opencl/kernels/sub.cl
ADDED

@@ -0,0 +1,72 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+//------------------------------------------------------------------------------
+// sub
+//------------------------------------------------------------------------------
+kernel void kernel_sub(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global char * dst,
+        ulong offsetd,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst = dst + offsetd;
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int i13 = i03 % ne13;
+    int i12 = i02 % ne12;
+    int i11 = i01 % ne11;
+
+    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
+
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+        const int i10 = i0 % ne10;
+        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) - *((global float *)(src1_ptr + i10*nb10));
+    }
+}
+
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_sub_row(
+        global float4 * src0,
+        ulong offset0,
+        global float4 * src1,
+        ulong offset1,
+        global float4 * dst,
+        ulong offsetd,
+        int ne
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    src1 = (global float4*)((global char*)src1 + offset1);
+    dst = (global float4*)((global char*)dst + offsetd);
+
+    // This performs better than using %.
+    uint gid = get_global_id(0);
+    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+    dst[gid] = src0[gid] - src1[idx1];
+}
ggml/src/ggml-opencl/kernels/sum_rows.cl
ADDED

@@ -0,0 +1,39 @@
+
+kernel void kernel_sum_rows_f32(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = (global float *)((global char *)src0 + offset0);
+    dst = (global float *)((global char *)dst + offsetd);
+
+    int i3 = get_global_id(2);
+    int i2 = get_global_id(1);
+    int i1 = get_global_id(0);
+
+    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
+        return;
+    }
+
+    global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
+    global float * dst_row = (global float *) ((global char *) dst + i1*nb1 + i2*nb2 + i3*nb3);
+
+    float row_sum = 0;
+
+    for (int i0 = 0; i0 < ne00; i0++) {
+        row_sum += src_row[i0];
+    }
+
+    dst_row[0] = row_sum;
+}
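Taken together, the new ops are reachable through the regular ggml graph API. A hypothetical smoke-test sketch (not part of the patch; backend selection, buffer allocation, and graph execution via ggml-backend are omitted, and the `eps` argument of `ggml_group_norm` is assumed from the `op_params[1]` read in the host code above):

```cpp
#include "ggml.h"

// Sketch: build a graph that touches every op added by this commit.
int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 8, 16);
    struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 8, 16);

    struct ggml_tensor * t_div  = ggml_div(ctx, a, b);       // kernel_div / kernel_div_row
    struct ggml_tensor * t_sub  = ggml_sub(ctx, a, b);       // kernel_sub / kernel_sub_row
    struct ggml_tensor * t_sig  = ggml_sigmoid(ctx, a);      // kernel_sigmoid_f32
    struct ggml_tensor * t_sum  = ggml_sum_rows(ctx, t_div); // kernel_sum_rows_f32
    struct ggml_tensor * t_sort = ggml_argsort(ctx, t_sub, GGML_SORT_ORDER_ASC); // kernel_argsort_f32_i32
    struct ggml_tensor * t_gn   = ggml_group_norm(ctx, a, 4, 1e-6f);             // kernel_group_norm

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, t_sig);
    ggml_build_forward_expand(gf, t_sum);
    ggml_build_forward_expand(gf, t_sort);
    ggml_build_forward_expand(gf, t_gn);

    // ... dispatch gf to the OpenCL backend via the ggml-backend API ...

    ggml_free(ctx);
    return 0;
}
```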