lhez committed · Commit ae0c7b8 · Parent: 1a9d2d3

opencl: ref count `ggml_backend_opencl_context` and refactor profiling (llama/14254)

* Move profiling info into `ggml_backend_opencl_context`
* Add `enqueue_ndrange_kernel` to launch kernel

ggml/src/ggml-opencl/ggml-opencl.cpp (+240 −535)

@@ -231,6 +231,71 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
     return { type, major, minor, patch };
 }
 
+// Profiling
+struct ProfilingInfo {
+    std::string op_name;
+    std::string kernel_name;
+
+    cl_kernel kernel;
+    cl_event evt;
+
+    cl_ulong cmd_queued;
+    cl_ulong cmd_submit;
+    cl_ulong cmd_start;
+    cl_ulong cmd_end;
+    cl_ulong overhead_start;
+    cl_ulong overhead_end;
+    // For the times below, see spec for clGetEventProfilingInfo
+    // The time kernel spent in cmd queue - SUBMIT - QUEUED
+    cl_ulong cmd_queued_duration_ns;
+    // The time kernel spent for submission - START - SUBMIT
+    cl_ulong cmd_submit_duration_ns;
+    // Kernel execution time in nanoseconds - END - START
+    cl_ulong cmd_duration_ns;
+    // The time for the kernel to complete - COMPLETE - END
+    cl_ulong cmd_complete_duration_ns;
+    // Total time to finish the kernel - COMPLETE - QUEUED
+    cl_ulong cmd_total_duration_ns;
+    // Global and local work sizes.
+    size_t global_size[3];
+    size_t local_size[3];
+    // Op output size.
+    size_t output_size[4];
+};
+
 struct ggml_backend_opencl_context;
 
 // backend device context
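The four duration fields are plain differences of the five event timestamps (QUEUED, SUBMIT, START, END, COMPLETE) that clGetEventProfilingInfo reports. A minimal sketch of filling one entry, following the same calls the old dump code (removed further down) makes:

    // Sketch: query one event's timeline and derive the durations.
    // `info` is a ProfilingInfo; `evt` came back from clEnqueueNDRangeKernel.
    cl_ulong queued, submit, start, end, complete;
    CL_CHECK(clWaitForEvents(1, &evt));
    CL_CHECK(clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_QUEUED,   sizeof(cl_ulong), &queued,   NULL));
    CL_CHECK(clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_SUBMIT,   sizeof(cl_ulong), &submit,   NULL));
    CL_CHECK(clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START,    sizeof(cl_ulong), &start,    NULL));
    CL_CHECK(clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,      sizeof(cl_ulong), &end,      NULL));
    CL_CHECK(clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &complete, NULL));

    info.cmd_queued_duration_ns   = submit   - queued;   // time sitting in the command queue
    info.cmd_submit_duration_ns   = start    - submit;   // submission latency
    info.cmd_duration_ns          = end      - start;    // actual kernel execution
    info.cmd_complete_duration_ns = complete - end;      // completion latency
    info.cmd_total_duration_ns    = complete - queued;   // end to end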
@@ -254,6 +319,8 @@ struct ggml_backend_opencl_device_context {
 
 // backend context
 struct ggml_backend_opencl_context {
     cl_device_id device;
     std::string device_name;
 
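The two lines this hunk adds at the top of the struct are blank in this capture. Given the commit title, they presumably introduce the reference count on the shared context. A hypothetical sketch of the pattern only; the field and helper names below are assumptions, not the commit's actual code:

    // Hypothetical: one context shared by several backends, torn down by the last user.
    struct example_ctx {
        int ref_count = 0;
    };

    static example_ctx * example_acquire(example_ctx * ctx) {
        ctx->ref_count++;          // e.g. taken when a backend is created
        return ctx;
    }

    static void example_release(example_ctx * ctx) {
        if (--ctx->ref_count == 0) {
            // last user gone: flush profiling output, release OpenCL objects
            delete ctx;
        }
    }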
@@ -369,6 +436,108 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_timestep_embedding;
     cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // Transpose kernels
     cl_program program_transpose;
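The 102 lines this hunk adds inside the struct did not survive extraction. Per the commit message, this is where the per-context profiling state and the new `enqueue_ndrange_kernel` member land, replacing the `#ifdef GGML_OPENCL_PROFILING` launch boilerplate that every op below deletes. A sketch of what such a member plausibly looks like, reconstructed from the removed call sites; the member name `profiling_info` and the exact signature are assumptions:

    // Sketch only -- built from the removed per-op code, not the commit itself.
    void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim,
                                size_t * global_work_size,
                                size_t * local_work_size,
                                const ggml_tensor * tensor) {
    #ifdef GGML_OPENCL_PROFILING
        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL,
                                        global_work_size, local_work_size, 0, NULL, &evt));

        profiling_info.emplace_back();   // assumed per-context vector of ProfilingInfo
        populateProfilingInfo(profiling_info.back(), evt, kernel,
                              global_work_size, local_work_size, tensor);
    #else
        GGML_UNUSED(tensor);
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL,
                                        global_work_size, local_work_size, 0, NULL, NULL));
    #endif
    }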
@@ -395,46 +564,19 @@ struct ggml_backend_opencl_context {
     cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
     cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
-};
 
-// Profiling
 #ifdef GGML_OPENCL_PROFILING
-struct ProfilingInfo {
-    std::string op_name;
-    std::string kernel_name;
-
-    cl_kernel kernel;
-    cl_event evt;
-
-    cl_ulong cmd_queued;
-    cl_ulong cmd_submit;
-    cl_ulong cmd_start;
-    cl_ulong cmd_end;
-    cl_ulong overhead_start;
-    cl_ulong overhead_end;
-    // For the times below, see spec for clGetEventProfilingInfo
-    // The time kernel spent in cmd queue - SUBMIT - QUEUED
-    cl_ulong cmd_queued_duration_ns;
-    // The time kernel spent for submission - START - SUBMIT
-    cl_ulong cmd_submit_duration_ns;
-    // Kernel execution time in nanoseconds - END - START
-    cl_ulong cmd_duration_ns;
-    // The time for the kernel to complete - COMPLETE - END
-    cl_ulong cmd_complete_duration_ns;
-    // Total time to finish the kernel - COMPLETE - QUEUED
-    cl_ulong cmd_total_duration_ns;
-    // Global and local work sizes.
-    size_t global_size[3];
-    size_t local_size[3];
-    // Op output size.
-    size_t output_size[4];
 };
 
 inline std::string read_file(const std::string &path) {
     std::ifstream ifs(path);
@@ -1669,6 +1811,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     backend_ctx->device = dev_ctx->device;
     backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
 
     if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
         strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
         strstr(dev_ctx->device_version.c_str(), "Adreno")) {
@@ -1841,93 +1989,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     return dev_ctx->backend_ctx;
 }
 
-static void ggml_cl2_free(void) {
-#ifdef GGML_OPENCL_PROFILING
-    FILE * fperf = fopen("cl_profiling.csv", "w");
-    if (!fperf) {
-        GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
-        return;
-    }
 
-    // Populate profiling info
-    for (ProfilingInfo & info : g_profiling_info) {
-        cl_ulong cmd_queued;
-        cl_ulong cmd_submit;
-        cl_ulong cmd_start;
-        cl_ulong cmd_end;
-        cl_ulong cmd_complete;
-
-        CL_CHECK(clWaitForEvents(1, &info.evt));
-        CL_CHECK(clGetEventProfilingInfo(
-            info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
-        CL_CHECK(clGetEventProfilingInfo(
-            info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
-        CL_CHECK(clGetEventProfilingInfo(
-            info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
-        CL_CHECK(clGetEventProfilingInfo(
-            info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
-        CL_CHECK(clGetEventProfilingInfo(
-            info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
-        CL_CHECK(clReleaseEvent(info.evt));
-
-        char kernel_name[512];
-        CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
-            sizeof(kernel_name), kernel_name, NULL));
-        info.kernel_name = kernel_name;
-
-        info.cmd_queued = cmd_queued;
-        info.cmd_submit = cmd_submit;
-        info.cmd_start  = cmd_start;
-        info.cmd_end    = cmd_end;
-
-        info.cmd_queued_duration_ns   = cmd_submit   - cmd_queued;
-        info.cmd_submit_duration_ns   = cmd_start    - cmd_submit;
-        info.cmd_duration_ns          = cmd_end      - cmd_start;
-        info.cmd_complete_duration_ns = cmd_complete - cmd_end;
-        info.cmd_total_duration_ns    = cmd_complete - cmd_queued;
-    }
-
-    // Dump a csv
-    float total_kernel_time = 0;
-    fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
-    for (const ProfilingInfo & info : g_profiling_info) {
-        total_kernel_time += info.cmd_duration_ns/1.e6f;
-        fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
-            info.op_name.c_str(), info.kernel_name.c_str(),
-            info.cmd_queued_duration_ns/1.e6f,
-            info.cmd_submit_duration_ns/1.e6f,
-            info.cmd_duration_ns/1.e6f,
-            info.cmd_complete_duration_ns/1.e6f,
-            info.cmd_total_duration_ns/1.e6f,
-            info.global_size[0], info.global_size[1], info.global_size[2],
-            info.local_size[0], info.local_size[1], info.local_size[2],
-            info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
-    }
-    fclose(fperf);
-
-    GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
-
-    // Dump a simple chrome trace
-    FILE* ftrace = fopen("cl_trace.json", "w");
-    if (!ftrace) {
-        GGML_LOG_ERROR("Failed to open cl_trace.json\n");
-        return;
     }
 
-
-    for (const ProfilingInfo & info : g_profiling_info) {
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
-            info.kernel_name.c_str(), info.cmd_queued/1000);
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
-            info.kernel_name.c_str(), info.cmd_submit/1000);
-
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
-            info.kernel_name.c_str(), info.cmd_start/1000);
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
-            info.kernel_name.c_str(), info.cmd_end/1000);
     }
-    fclose(ftrace);
-#endif
 }
 
 //------------------------------------------------------------------------------
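Each profiled kernel thus contributed two begin/end pairs to cl_trace.json: one on a "Host" row (QUEUED to SUBMIT) and one on a "Device" row (START to END), in Chrome's trace-event format; the /1000 converts the nanosecond OpenCL timestamps into the microseconds the format expects. Illustrative output for one kernel (name and timestamps invented for the example):

    {"name": "kernel_add", "cat": "OpenCL", "ph": "B", "ts": 1042, "pid": "", "tid": "Device"},
    {"name": "kernel_add", "cat": "OpenCL", "ph": "E", "ts": 1573, "pid": "", "tid": "Device"},

Such a file can be opened in Chrome's chrome://tracing viewer (or any Trace Event Format consumer) to see the kernels laid out on a timeline.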
@@ -2011,9 +2088,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
 }
 
 static void ggml_backend_opencl_free(ggml_backend_t backend) {
-    ggml_cl2_free();
-
-    GGML_UNUSED(backend);
 }
 
 static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
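The replacement line is blank in this capture; since `ggml_cl2_free` now takes an argument, the body presumably becomes a single call that drops this backend's reference, along the lines of:

    // Assumed shape -- the actual '+' line is not visible in this capture.
    static void ggml_backend_opencl_free(ggml_backend_t backend) {
        ggml_cl2_free(backend);  // profiling output is flushed when the last reference dies
    }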
@@ -2899,6 +2974,8 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
 
 static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
     ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
 
     ggml_backend_t backend = new ggml_backend {
         /* .guid = */ ggml_backend_opencl_guid(),
@@ -3159,31 +3236,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
 #define dump_tensor(tensor)
 #endif
 
-//------------------------------------------------------------------------------
-// Profiling utility
-//------------------------------------------------------------------------------
-#ifdef GGML_OPENCL_PROFILING
-static void populateProfilingInfo(
-        ProfilingInfo& info, cl_event evt, cl_kernel kernel,
-        size_t global_size[3], size_t local_size[3],
-        const ggml_tensor * tensor) {
-    info.op_name = tensor->name;
-    info.kernel = kernel;
-    info.evt = evt;
-
-    info.local_size[0] = local_size[0];
-    info.local_size[1] = local_size[1];
-    info.local_size[2] = local_size[2];
-    info.global_size[0] = global_size[0];
-    info.global_size[1] = global_size[1];
-    info.global_size[2] = global_size[2];
-    info.output_size[0] = tensor->ne[0];
-    info.output_size[1] = tensor->ne[1];
-    info.output_size[2] = tensor->ne[2];
-    info.output_size[3] = tensor->ne[3];
-}
-#endif
-
 //------------------------------------------------------------------------------
 // Ops
 //------------------------------------------------------------------------------
@@ -3227,7 +3279,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
     const cl_ulong nb2 = dst ? dst->nb[2] : 0;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3271,15 +3322,7 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
     size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
     size_t local_work_size[] = {1, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
 }
 
 static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
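This same nine-line launch block is deleted at every call site below. The `+` side of these hunks is blank in this capture; per the commit message, each block is presumably collapsed into a single call to the new context helper, in the spirit of:

    // Assumed replacement shape -- the added lines are not visible in this capture.
    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);

The remaining hunks repeat this pattern once per op; only the work-group sizes and the profiled tensor differ between them.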
@@ -3321,7 +3364,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
     const cl_ulong nb3 = dst ? dst->nb[3] : 0;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3396,29 +3438,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
             local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
         }
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
     } else {
         unsigned int nth = MIN(64, ne0);
        size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
         size_t local_work_size[] = {nth, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
     }
 }
 
@@ -3461,7 +3487,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
     const cl_ulong nb3 = dst ? dst->nb[3] : 0;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3536,29 +3561,13 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
             local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
         }
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
     } else {
         unsigned int nth = MIN(64, ne0);
         size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
         size_t local_work_size[] = {nth, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
     }
 }
 
@@ -3598,7 +3607,6 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
     const cl_ulong nb3 = dst->nb[3];
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3661,29 +3669,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
         size_t global_work_size[] = {(size_t)n, 1, 1};
         size_t local_work_size[] = {64, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
     } else {
         unsigned int nth = MIN(64, ne0);
         size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
         size_t local_work_size[] = {nth, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
     }
 }
 
@@ -3723,7 +3715,6 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
     const cl_ulong nb3 = dst->nb[3];
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3786,29 +3777,13 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
         size_t global_work_size[] = {(size_t)n, 1, 1};
         size_t local_work_size[] = {64, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
     } else {
         unsigned int nth = MIN(64, ne0);
         size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
         size_t local_work_size[] = {nth, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
     }
 }
 
@@ -3821,7 +3796,6 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3848,15 +3822,7 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
-#endif
 }
 
 static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3868,7 +3834,6 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3895,15 +3860,7 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
-#endif
 }
 
@@ -3915,7 +3872,6 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3947,15 +3903,7 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
 }
 
 static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3967,7 +3915,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3992,15 +3939,7 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
 }
 
@@ -4012,7 +3951,6 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4044,15 +3982,7 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
 }
 
 static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4064,7 +3994,6 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4096,15 +4025,7 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
 }
 
@@ -4116,7 +4037,6 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4157,15 +4077,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
 }
 
 static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4177,7 +4089,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     //ggml_backend_opencl_device_context * dev_ctx =
     //    (ggml_backend_opencl_device_context *)backend->device->context;
@@ -4241,15 +4152,7 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
     // This is local memory - the size depends on subgroup size.
     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
 }
 
@@ -4261,7 +4164,6 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4300,15 +4202,7 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
     size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
     size_t local_work_size[] = {(size_t)sgs, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
 }
 
 static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4320,7 +4214,6 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4397,16 +4290,7 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
     }
     if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
 
-
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
 }
 
@@ -4419,7 +4303,6 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con
     UNUSED(src1_shape_def);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     if (backend_ctx->kernel_repeat == nullptr) {
         GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
@@ -4467,15 +4350,7 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con
 
     size_t global_work_size[] = { gws0, gws1, gws2 };
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, (size_t[3]){0,0,0}, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
-#endif
 }
 
 static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -4488,7 +4363,6 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     if (backend_ctx->kernel_pad == nullptr) {
         GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
@@ -4533,15 +4407,7 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
         local_work_size_ptr = nullptr;
     }
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
 }
 
@@ -4553,7 +4419,6 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
     cl_kernel kernel = nullptr;
@@ -4644,17 +4509,7 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
         local_work_size_ptr = nullptr;
     }
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    size_t profiling_gws[3] = {global_work_size[0], 1, 1};
-    size_t profiling_lws[3] = {local_work_size_ptr ? local_work_size[0] : 0, 1, 1};
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
 }
 
 static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4732,7 +4587,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
                 global_work_size[1] = d_ne1;
                 global_work_size[2] = d_ne2;
 
-
             }
         }
     } else {
@@ -4782,7 +4637,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
             d_ne2 > 0 ? (size_t)d_ne2 : 1,
             d_ne3 > 0 ? (size_t)d_ne3 : 1 };
 
-
     }
 }
 
@@ -4795,7 +4650,6 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     if (backend_ctx->kernel_timestep_embedding == nullptr) {
         GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
@@ -4828,17 +4682,7 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
 
     size_t global_work_size[] = {gws0, gws1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // Pass 2 for 2D problem
-
-    g_profiling_info.emplace_back();
-    size_t profiling_gws[3] = {global_work_size[0], global_work_size[1], 1};
-    size_t profiling_lws[3] = {0,0,0}; // Reflects NULL LWS
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL)); // Pass 2 for 2D problem
-#endif
 }
 
@@ -4853,7 +4697,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -5058,15 +4901,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             static_cast<size_t>(padded_height_B)
         };
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL));
-#endif
     } else {
         // no need to transpose B in other cases
         // create an image for B from sub_buffer
@@ -5188,16 +5023,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
     // enqueue kernel with profiling
    // <--------------------------------------------> //
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-    // enqueue kernel without profiling
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
     // <--------------------------------------------> //
 
     // deallocate sub buffers and images
@@ -5277,15 +5103,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             global_work_size[2] = (size_t)ne12*ne13;
         }
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
         return;
     }
 #else // GGML_OPENCL_SOA_Q
@@ -5515,15 +5333,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
         size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
         size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
     } else if (src0t == GGML_TYPE_Q4_K) {
         GGML_ASSERT(false && "not implemented");
     } else if (src0t == GGML_TYPE_Q3_K) {
@@ -5534,30 +5344,14 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
         size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
         size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
     } else {
         int64_t ny = (ne11 + nrows - 1)/nrows;
 
         size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
         size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
     }
 }
 
@@ -5574,7 +5368,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
     GGML_ASSERT(src2->extra);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
     ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
@@ -5680,15 +5473,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
         size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
         size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
 }
 
 static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5701,7 +5486,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
     GGML_ASSERT(ggml_is_contiguous(src0));
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     float scale;
     memcpy(&scale, dst->op_params, sizeof(scale));
@@ -5730,15 +5514,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
 }
 
@@ -5775,7 +5551,6 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
     const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -5840,15 +5615,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
 }
 
 static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5871,7 +5638,6 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
     const int ne02 = src0 ? src0->ne[2] : 0;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5895,15 +5661,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
         size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
         size_t local_work_size[] = {64, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
     } else {
         kernel = backend_ctx->kernel_diag_mask_inf;
 
@@ -5923,15 +5681,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
             local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
         }
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
     }
 }
 
@@ -5951,7 +5701,6 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
     }
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -6031,15 +5780,7 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
 }
 
 static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6051,7 +5792,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
     GGML_ASSERT(dst->extra);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -6217,15 +5957,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
 }
 
@@ -6240,7 +5972,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
| 6240 |
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
| 6241 |
|
| 6242 |
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
| 6243 |
-
cl_command_queue queue = backend_ctx->queue;
|
| 6244 |
|
| 6245 |
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
| 6246 |
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -6309,15 +6040,7 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
| 6309 |
size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
|
| 6310 |
size_t local_work_size[] = {256, 1, 1};
|
| 6311 |
|
| 6312 |
-
|
| 6313 |
-
cl_event evt;
|
| 6314 |
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
| 6315 |
-
|
| 6316 |
-
g_profiling_info.emplace_back();
|
| 6317 |
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
| 6318 |
-
#else
|
| 6319 |
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
| 6320 |
-
#endif
|
| 6321 |
}
|
| 6322 |
|
| 6323 |
static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -6332,7 +6055,6 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
| 6332 |
GGML_ASSERT(ggml_is_contiguous(src0));
|
| 6333 |
|
| 6334 |
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
| 6335 |
-
cl_command_queue queue = backend_ctx->queue;
|
| 6336 |
|
| 6337 |
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
| 6338 |
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -6364,15 +6086,7 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
| 6364 |
size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
|
| 6365 |
size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
|
| 6366 |
|
| 6367 |
-
|
| 6368 |
-
cl_event evt;
|
| 6369 |
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
| 6370 |
-
|
| 6371 |
-
g_profiling_info.emplace_back();
|
| 6372 |
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
| 6373 |
-
#else
|
| 6374 |
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
| 6375 |
-
#endif
|
| 6376 |
}
|
| 6377 |
|
| 6378 |
static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -6386,7 +6100,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
| 6386 |
GGML_ASSERT(ggml_is_contiguous(src0));
|
| 6387 |
|
| 6388 |
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
| 6389 |
-
cl_command_queue queue = backend_ctx->queue;
|
| 6390 |
|
| 6391 |
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
| 6392 |
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -6427,15 +6140,7 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
| 6427 |
size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
|
| 6428 |
size_t local_work_size[] = {(size_t)64, 1, 1};
|
| 6429 |
|
| 6430 |
-
|
| 6431 |
-
cl_event evt;
|
| 6432 |
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
| 6433 |
-
|
| 6434 |
-
g_profiling_info.emplace_back();
|
| 6435 |
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
| 6436 |
-
#else
|
| 6437 |
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
| 6438 |
-
#endif
|
| 6439 |
}
|
| 6440 |
|
| 6441 |
//------------------------------------------------------------------------------
|
|
|
|
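The hunks above all remove the same repeated boilerplate: every op carried its own `#ifdef GGML_OPENCL_PROFILING` launch block, and each hunk replaces it with one helper call. In sketch form (op bodies omitted; `queue`, `kernel`, `dst`, and the work sizes are as in the surrounding code):

```cpp
// Before: repeated at every launch site.
#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
    g_profiling_info.emplace_back();
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif

// After: one line per op; the #ifdef now lives inside the helper.
    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
```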
@@ … +231 @@
     return { type, major, minor, patch };
 }
 
+// Profiling
+struct ProfilingInfo {
+    std::string op_name;
+    std::string kernel_name;
+
+    cl_kernel kernel;
+    cl_event evt;
+
+    cl_ulong cmd_queued;
+    cl_ulong cmd_submit;
+    cl_ulong cmd_start;
+    cl_ulong cmd_end;
+    cl_ulong overhead_start;
+    cl_ulong overhead_end;
+    // For the times below, see spec for clGetEventProfilingInfo
+    // The time kernel spent in cmd queue - SUBMIT - QUEUED
+    cl_ulong cmd_queued_duration_ns;
+    // The time kernel spent for submission - START - SUBMIT
+    cl_ulong cmd_submit_duration_ns;
+    // Kernel execution time in nanoseconds - END - START
+    cl_ulong cmd_duration_ns;
+    // The time for the kernel to complete - COMPLETE - END
+    cl_ulong cmd_complete_duration_ns;
+    // Total time to finish the kernel - COMPLETE - QUEUED
+    cl_ulong cmd_total_duration_ns;
+    // Global and local work sizes.
+    size_t global_size[3];
+    size_t local_size[3];
+    // Op output size.
+    size_t output_size[4];
+};
+
+static void populateProfilingInfo(
+        ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
+        size_t global_size[3], size_t local_size[3],
+        const ggml_tensor * tensor) {
+    info.op_name = tensor->name;
+    info.kernel = kernel;
+    info.evt = evt;
+
+    // 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose
+    info.local_size[0] = 0;
+    info.local_size[1] = 0;
+    info.local_size[2] = 0;
+
+    info.global_size[0] = 0;
+    info.global_size[1] = 0;
+    info.global_size[2] = 0;
+
+    if (local_size) {
+        for (cl_uint i = 0; i < work_dim; ++i) {
+            info.local_size[i] = local_size[i];
+        }
+    }
+
+    for (cl_uint i = 0; i < work_dim; ++i) {
+        info.global_size[i] = global_size[i];
+    }
+
+    info.output_size[0] = tensor->ne[0];
+    info.output_size[1] = tensor->ne[1];
+    info.output_size[2] = tensor->ne[2];
+    info.output_size[3] = tensor->ne[3];
+}
+
 struct ggml_backend_opencl_context;
 
 // backend device context
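`populateProfilingInfo` only captures the event and the launch geometry; the timestamp arithmetic happens later, once the event has completed. A minimal standalone sketch of that arithmetic (a hypothetical helper, not part of this patch; it assumes the command queue was created with `CL_QUEUE_PROFILING_ENABLE`):

```cpp
#include <CL/cl.h>

// Compute the same durations ProfilingInfo stores, for one event.
// The timestamps are only valid after the command has completed.
static void kernel_durations_ns(cl_event evt, cl_ulong out[3]) {
    cl_ulong t_queued = 0, t_submit = 0, t_start = 0, t_end = 0;
    clWaitForEvents(1, &evt);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_QUEUED, sizeof(t_queued), &t_queued, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(t_submit), &t_submit, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START,  sizeof(t_start),  &t_start,  NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,    sizeof(t_end),    &t_end,    NULL);
    out[0] = t_submit - t_queued; // cmd_queued_duration_ns: time spent in the queue
    out[1] = t_start  - t_submit; // cmd_submit_duration_ns: submission latency
    out[2] = t_end    - t_start;  // cmd_duration_ns: kernel execution time
}
```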
@@ … +319 @@
 
 // backend context
 struct ggml_backend_opencl_context {
+    int ref_count;
+
     cl_device_id device;
     std::string device_name;
 
@@ … +436 @@
     cl_kernel kernel_timestep_embedding;
     cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
 
+    std::vector<ProfilingInfo> profiling_info;
+
+    void write_profiling_info() {
+        FILE * fperf = fopen("cl_profiling.csv", "w");
+        if (!fperf) {
+            GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
+            return;
+        }
+
+        // Populate profiling info
+        for (ProfilingInfo & info : profiling_info) {
+            cl_ulong cmd_queued;
+            cl_ulong cmd_submit;
+            cl_ulong cmd_start;
+            cl_ulong cmd_end;
+            cl_ulong cmd_complete;
+
+            CL_CHECK(clWaitForEvents(1, &info.evt));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
+            CL_CHECK(clReleaseEvent(info.evt));
+
+            char kernel_name[512];
+            CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
+                sizeof(kernel_name), kernel_name, NULL));
+            info.kernel_name = kernel_name;
+
+            info.cmd_queued = cmd_queued;
+            info.cmd_submit = cmd_submit;
+            info.cmd_start = cmd_start;
+            info.cmd_end = cmd_end;
+
+            info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
+            info.cmd_submit_duration_ns = cmd_start - cmd_submit;
+            info.cmd_duration_ns = cmd_end - cmd_start;
+            info.cmd_complete_duration_ns = cmd_complete - cmd_end;
+            info.cmd_total_duration_ns = cmd_complete - cmd_queued;
+        }
+
+        // Dump a csv
+        float total_kernel_time = 0;
+        fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
+        for (const ProfilingInfo & info : profiling_info) {
+            total_kernel_time += info.cmd_duration_ns/1.e6f;
+            fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
+                info.op_name.c_str(), info.kernel_name.c_str(),
+                info.cmd_queued_duration_ns/1.e6f,
+                info.cmd_submit_duration_ns/1.e6f,
+                info.cmd_duration_ns/1.e6f,
+                info.cmd_complete_duration_ns/1.e6f,
+                info.cmd_total_duration_ns/1.e6f,
+                info.global_size[0], info.global_size[1], info.global_size[2],
+                info.local_size[0], info.local_size[1], info.local_size[2],
+                info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
+        }
+        fclose(fperf);
+
+        GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
+
+        // Dump a simple chrome trace
+        FILE* ftrace = fopen("cl_trace.json", "w");
+        if (!ftrace) {
+            GGML_LOG_ERROR("Failed to open cl_trace.json\n");
+            return;
+        }
+
+        fprintf(ftrace, "[\n");
+        for (const ProfilingInfo & info : profiling_info) {
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+                info.kernel_name.c_str(), info.cmd_queued/1000);
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+                info.kernel_name.c_str(), info.cmd_submit/1000);
+
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
+                info.kernel_name.c_str(), info.cmd_start/1000);
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
+                info.kernel_name.c_str(), info.cmd_end/1000);
+        }
+        fclose(ftrace);
+    }
+
+    void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        profiling_info.emplace_back();
+        populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
+#else
+        GGML_UNUSED(tensor);
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    }
+
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // Transpose kernels
     cl_program program_transpose;
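With the queue, the event, the `profiling_info` vector, and the `#ifdef` all owned by the context, a call site needs no conditional compilation at all. An illustrative use (a sketch only; it assumes `backend_ctx`, `kernel`, and `dst` are in scope, as in every op below, and the work sizes are example values):

```cpp
// Illustrative launch through the new member helper. When
// GGML_OPENCL_PROFILING is defined it records a cl_event and appends a
// ProfilingInfo entry; otherwise it enqueues without an event.
size_t global_work_size[] = {1024, 4, 1}; // example geometry only
size_t local_work_size[]  = {64, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
```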
@@ … +564 @@
     cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
     cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
+    void free() {
+        ref_count--;
+        if (ref_count == 0) {
 #ifdef GGML_OPENCL_PROFILING
+            write_profiling_info();
+#endif
+        }
+    }
 };
 
+// All registered devices with a default device in the front.
+static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
 
 inline std::string read_file(const std::string &path) {
     std::ifstream ifs(path);
@@ … +1811 @@
     backend_ctx->device = dev_ctx->device;
     backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
 
+    // ref_count gets increased in ggml_backend_opencl_device_init
+    // This function is also used to retrieve backend context, so we don't want
+    // to increase ref_count for each call. We only want to increase ref_count
+    // when the associated device is initialized
+    backend_ctx->ref_count = 0;
+
     if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
         strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
         strstr(dev_ctx->device_version.c_str(), "Adreno")) {
@@ … +1989 @@
     return dev_ctx->backend_ctx;
 }
 
+static void ggml_cl2_free(ggml_backend_t backend) {
+    ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
+    ctx->free();
 
+    // The CL context is shared by all backends, release it if all backends have been released
+    bool should_release_opencl = true;
+    for (auto device : g_ggml_backend_opencl_devices) {
+        ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
+        if (ctx_dev->backend_ctx->ref_count > 0) {
+            should_release_opencl = false;
+        }
     }
 
+    if (should_release_opencl) {
+        CL_CHECK(clReleaseContext(ctx->context));
     }
 }
 
 //------------------------------------------------------------------------------
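Worth noting: the OpenCL runtime keeps its own reference count on a `cl_context` (`clRetainContext`/`clReleaseContext`); the backend-level `ref_count` added here tracks backend lifetimes so that the single shared context is released exactly once, after the last backend is freed. The runtime-side counterpart, for reference (an illustrative fragment, assuming `ctx->context` is in scope; the patch itself only issues the final release):

```cpp
clRetainContext(ctx->context);  // +1 runtime-side reference
clReleaseContext(ctx->context); // -1; the context is destroyed at zero
```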
@@ … +2088 @@
 }
 
 static void ggml_backend_opencl_free(ggml_backend_t backend) {
+    ggml_cl2_free(backend);
 }
 
 static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {

@@ … +2974 @@
 
 static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
     ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
+    // Getting a new reference to the backend, increase ref_count
+    backend_ctx->ref_count++;
 
     ggml_backend_t backend = new ggml_backend {
         /* .guid = */ ggml_backend_opencl_guid(),
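Taken together with `free()` above, this gives each backend context a simple acquire/release protocol: `ggml_backend_opencl_device_init` acquires, `ggml_cl2_free` releases, and the last release flushes profiling output and allows the shared context to be torn down. A standalone model of that lifecycle (simplified types, not the backend's actual structs):

```cpp
#include <cassert>

// Model: N backends acquire one shared context; the last release frees it.
struct shared_ctx {
    int  ref_count = 0;
    bool released  = false;
};

static void acquire(shared_ctx & c) { c.ref_count++; } // device_init
static void release(shared_ctx & c) {                  // ggml_cl2_free
    if (--c.ref_count == 0) {
        c.released = true; // stands in for profiling flush + clReleaseContext
    }
}

int main() {
    shared_ctx c;
    acquire(c); acquire(c); // two backends initialized on the device
    release(c);
    assert(!c.released);    // one reference still live
    release(c);
    assert(c.released);     // last release tears the shared state down
    return 0;
}
```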
@@ … +3236 @@
 #define dump_tensor(tensor)
 #endif
 
 //------------------------------------------------------------------------------
 // Ops
 //------------------------------------------------------------------------------

@@ … +3279 @@
     const cl_ulong nb2 = dst ? dst->nb[2] : 0;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;

@@ … +3322 @@
     size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
     size_t local_work_size[] = {1, 1, 1};
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ … +3364 @@
     const cl_ulong nb3 = dst ? dst->nb[3] : 0;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;

@@ … +3438 @@
             local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
         }
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
     } else {
         unsigned int nth = MIN(64, ne0);
         size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
         size_t local_work_size[] = {nth, 1, 1};
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     }
 }
@@ … +3487 @@
     const cl_ulong nb3 = dst ? dst->nb[3] : 0;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;

@@ … +3561 @@
             local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
         }
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
     } else {
         unsigned int nth = MIN(64, ne0);
         size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
         size_t local_work_size[] = {nth, 1, 1};
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     }
 }

@@ … +3607 @@
     const cl_ulong nb3 = dst->nb[3];
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;

@@ … +3669 @@
         size_t global_work_size[] = {(size_t)n, 1, 1};
         size_t local_work_size[] = {64, 1, 1};
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     } else {
         unsigned int nth = MIN(64, ne0);
         size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
         size_t local_work_size[] = {nth, 1, 1};
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     }
 }

@@ … +3715 @@
     const cl_ulong nb3 = dst->nb[3];
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;

@@ … +3777 @@
         size_t global_work_size[] = {(size_t)n, 1, 1};
         size_t local_work_size[] = {64, 1, 1};
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     } else {
         unsigned int nth = MIN(64, ne0);
         size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
         size_t local_work_size[] = {nth, 1, 1};
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     }
 }
@@ … +3796 @@
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ … +3822 @@
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ … +3834 @@
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ … +3860 @@
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ … +3872 @@
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ … +3903 @@
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }
 
 static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ … +3915 @@
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ … +3939 @@
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }
 
 static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ … +3951 @@
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ … +3982 @@
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }
 
 static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ … +3994 @@
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ … +4025 @@
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }
 
 static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ … +4037 @@
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ … +4077 @@
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ … +4089 @@
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     //ggml_backend_opencl_device_context * dev_ctx =
     //    (ggml_backend_opencl_device_context *)backend->device->context;

@@ … +4152 @@
     // This is local memory - the size depends on subgroup size.
     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ … +4164 @@
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ … +4202 @@
     size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
     size_t local_work_size[] = {(size_t)sgs, 1, 1};
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ … +4214 @@
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ … +4290 @@
     }
     if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }
 
 static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {

@@ … +4303 @@
     UNUSED(src1_shape_def);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     if (backend_ctx->kernel_repeat == nullptr) {
         GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);

@@ … +4350 @@
 
     size_t global_work_size[] = { gws0, gws1, gws2 };
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
 }
 
 static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {

@@ … +4363 @@
     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     if (backend_ctx->kernel_pad == nullptr) {
         GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);

@@ … +4407 @@
         local_work_size_ptr = nullptr;
     }
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }
 
 static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {

@@ … +4419 @@
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
     cl_kernel kernel = nullptr;

@@ … +4509 @@
         local_work_size_ptr = nullptr;
     }
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }
 
 static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ … +4587 @@
                 global_work_size[1] = d_ne1;
                 global_work_size[2] = d_ne2;
 
+                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
             }
         }
     } else {

@@ … +4637 @@
             d_ne2 > 0 ? (size_t)d_ne2 : 1,
             d_ne3 > 0 ? (size_t)d_ne3 : 1 };
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
     }
 }

@@ … +4650 @@
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     if (backend_ctx->kernel_timestep_embedding == nullptr) {
         GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);

@@ … +4682 @@
 
     size_t global_work_size[] = {gws0, gws1, 1};
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
 }
 
 static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ … +4697 @@
     const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;

@@ … +4901 @@
             static_cast<size_t>(padded_height_B)
         };
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
     } else {
         // no need to transpose B in other cases
         // create an image for B from sub_buffer

@@ … +5023 @@
 
         // enqueue kernel with profiling
         // <--------------------------------------------> //
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
         // <--------------------------------------------> //
 
         // deallocate sub buffers and images

@@ … +5103 @@
             global_work_size[2] = (size_t)ne12*ne13;
         }
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
         return;
     }
 #else // GGML_OPENCL_SOA_Q
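The Adreno transpose path above is the one call site that launches with `work_dim == 2`; this is why `populateProfilingInfo` zero-fills all three size slots before copying only `work_dim` of them, so a 2D launch records its third dimension as 0 ("not specified"). Illustrative shape of such a call (example values only; the real sizes come from the padded matrix dimensions):

```cpp
// 2D launch: only indices 0 and 1 are meaningful; the profiler keeps
// global_size[2] == local_size[2] == 0 for this entry.
size_t global_size_t[] = {512, 128}; // e.g. padded width x padded height
size_t local_size_t[]  = {32, 8};
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
```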
@@ … +5333 @@
         size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
         size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     } else if (src0t == GGML_TYPE_Q4_K) {
         GGML_ASSERT(false && "not implemented");
     } else if (src0t == GGML_TYPE_Q3_K) {

@@ … +5344 @@
         size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
         size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     } else {
         int64_t ny = (ne11 + nrows - 1)/nrows;
 
         size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
         size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     }
 }

@@ … +5368 @@
     GGML_ASSERT(src2->extra);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
     ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;

@@ … +5473 @@
     size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
     size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ … +5486 @@
     GGML_ASSERT(ggml_is_contiguous(src0));
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     float scale;
     memcpy(&scale, dst->op_params, sizeof(scale));

@@ … +5514 @@
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }
 
 static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ … +5551 @@
     const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;

@@ … +5615 @@
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
 }
 
 static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
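Note that this op hands `src1`, not `dst`, to the profiling helper. For CPY the kernel's output buffer is `src1`'s (with `dst` acting as a view of it), so this presumably keeps the profiling entry named and sized after the tensor that is actually written.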
@@ … +5638 @@
     const int ne02 = src0 ? src0->ne[2] : 0;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ … +5661 @@
         size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
         size_t local_work_size[] = {64, 1, 1};
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     } else {
         kernel = backend_ctx->kernel_diag_mask_inf;
 

@@ … +5681 @@
             local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
         }
 
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
     }
 }

@@ … +5701 @@
     }
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ … +5780 @@
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ … +5792 @@
     GGML_ASSERT(dst->extra);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;

@@ … +5957 @@
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ … +5972 @@
     GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ … +6040 @@
     size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
     size_t local_work_size[] = {256, 1, 1};
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ … +6055 @@
     GGML_ASSERT(ggml_is_contiguous(src0));
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ … +6086 @@
     size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
     size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ … +6100 @@
     GGML_ASSERT(ggml_is_contiguous(src0));
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ … +6140 @@
     size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)64, 1, 1};
 
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 //------------------------------------------------------------------------------