Spaces:
Running
Running
Henry Linjamäki
committed on
Commit
·
57028a7
1
Parent(s):
9e034be
opencl: use OpenCL C standard supported by the device (llama/12221)
Browse files
This patch nudges the llama.cpp a bit to be supported on PoCL which
doesn't support OpenCL C CL2.0. The issue is solved by querying the
device for the supported OpenCL C versions and using the highest one
available.
- ggml/CMakeLists.txt +2 -0
- ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- ggml/src/ggml-opencl/ggml-opencl.cpp +133 -43
ggml/CMakeLists.txt
CHANGED
|
@@ -200,6 +200,8 @@ option(GGML_OPENCL "ggml: use OpenCL"
|
|
| 200 |
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
|
| 201 |
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
|
| 202 |
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
|
|
|
|
|
|
|
| 203 |
|
| 204 |
# toolchain for vulkan-shaders-gen
|
| 205 |
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
|
|
|
|
| 200 |
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
|
| 201 |
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
|
| 202 |
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
|
| 203 |
+
set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
|
| 204 |
+
"gmml: OpenCL API version to target")
|
| 205 |
|
| 206 |
# toolchain for vulkan-shaders-gen
|
| 207 |
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
|
ggml/src/ggml-opencl/CMakeLists.txt
CHANGED
|
@@ -15,6 +15,7 @@ if (GGML_OPENCL_PROFILING)
|
|
| 15 |
endif ()
|
| 16 |
|
| 17 |
add_compile_definitions(GGML_OPENCL_SOA_Q)
|
|
|
|
| 18 |
|
| 19 |
if (GGML_OPENCL_USE_ADRENO_KERNELS)
|
| 20 |
message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
|
|
|
|
| 15 |
endif ()
|
| 16 |
|
| 17 |
add_compile_definitions(GGML_OPENCL_SOA_Q)
|
| 18 |
+
add_compile_definitions(GGML_OPENCL_TARGET_VERSION=${GGML_OPENCL_TARGET_VERSION})
|
| 19 |
|
| 20 |
if (GGML_OPENCL_USE_ADRENO_KERNELS)
|
| 21 |
message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
|
ggml/src/ggml-opencl/ggml-opencl.cpp
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
#define CL_TARGET_OPENCL_VERSION
|
| 2 |
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
| 3 |
|
| 4 |
// suppress warnings in CL headers for GCC and Clang
|
|
@@ -25,6 +25,8 @@
|
|
| 25 |
#include <vector>
|
| 26 |
#include <string>
|
| 27 |
#include <cmath>
|
|
|
|
|
|
|
| 28 |
|
| 29 |
#undef MIN
|
| 30 |
#undef MAX
|
|
@@ -62,6 +64,97 @@ enum ADRENO_GPU_GEN {
|
|
| 62 |
X1E,
|
| 63 |
};
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
|
| 66 |
if (strstr(device_name, "730") ||
|
| 67 |
strstr(device_name, "740") ||
|
|
@@ -470,16 +563,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
| 470 |
// A local ref of cl_device_id for convenience
|
| 471 |
cl_device_id device = backend_ctx->device;
|
| 472 |
|
|
|
|
|
|
|
| 473 |
// Check device OpenCL version, OpenCL 2.0 or above is required
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
|
| 477 |
-
clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
|
| 478 |
-
device_ver_buffer[device_ver_str_size] = '\0';
|
| 479 |
-
GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
|
| 480 |
-
|
| 481 |
-
if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
|
| 482 |
-
strstr(device_ver_buffer, "OpenCL 3") == NULL) {
|
| 483 |
GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
|
| 484 |
return backend_ctx;
|
| 485 |
}
|
|
@@ -516,8 +604,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
| 516 |
|
| 517 |
// If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
|
| 518 |
// optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
|
| 519 |
-
if (strstr(
|
| 520 |
-
strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
|
| 521 |
strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
|
| 522 |
GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
|
| 523 |
"(note that subgroups is an optional feature in OpenCL 3.0)\n");
|
|
@@ -581,9 +668,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
| 581 |
const std::string kernel_src = read_file("ggml-opencl.cl");
|
| 582 |
#endif
|
| 583 |
|
| 584 |
-
|
| 585 |
-
"
|
| 586 |
-
|
|
|
|
|
|
|
|
|
|
| 587 |
backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
|
| 588 |
|
| 589 |
// Non matmul kernels.
|
|
@@ -693,10 +783,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
| 693 |
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
|
| 694 |
|
| 695 |
// Gemv general
|
| 696 |
-
std::string CL_gemv_compile_opts =
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
if (has_vector_subgroup_broadcast) {
|
| 701 |
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
| 702 |
}
|
|
@@ -713,12 +803,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
| 713 |
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
|
| 714 |
|
| 715 |
// Gemv 2048, 16384
|
| 716 |
-
CL_gemv_compile_opts =
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
|
| 721 |
-
|
| 722 |
if (has_vector_subgroup_broadcast) {
|
| 723 |
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
| 724 |
}
|
|
@@ -735,12 +825,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
| 735 |
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
|
| 736 |
|
| 737 |
// Gemv 2048, 16384
|
| 738 |
-
CL_gemv_compile_opts =
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
if (has_vector_subgroup_broadcast) {
|
| 745 |
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
| 746 |
}
|
|
@@ -750,12 +840,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
| 750 |
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
|
| 751 |
|
| 752 |
// Gemv 5504, 44032
|
| 753 |
-
CL_gemv_compile_opts =
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
| 759 |
if (has_vector_subgroup_broadcast) {
|
| 760 |
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
| 761 |
}
|
|
@@ -765,12 +855,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
| 765 |
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
|
| 766 |
|
| 767 |
// Gemv 16000, 128000
|
| 768 |
-
CL_gemv_compile_opts =
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
if (has_vector_subgroup_broadcast) {
|
| 775 |
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
| 776 |
}
|
|
|
|
| 1 |
+
#define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION
|
| 2 |
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
| 3 |
|
| 4 |
// suppress warnings in CL headers for GCC and Clang
|
|
|
|
| 25 |
#include <vector>
|
| 26 |
#include <string>
|
| 27 |
#include <cmath>
|
| 28 |
+
#include <memory>
|
| 29 |
+
#include <charconv>
|
| 30 |
|
| 31 |
#undef MIN
|
| 32 |
#undef MAX
|
|
|
|
| 64 |
X1E,
|
| 65 |
};
|
| 66 |
|
| 67 |
+
struct ggml_cl_version {
|
| 68 |
+
cl_uint major = 0;
|
| 69 |
+
cl_uint minor = 0;
|
| 70 |
+
};
|
| 71 |
+
|
| 72 |
+
// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
|
| 73 |
+
static ggml_cl_version parse_cl_version(std::string_view str) {
|
| 74 |
+
size_t major_str_begin = 0;
|
| 75 |
+
size_t major_str_end = str.find(".", major_str_begin);
|
| 76 |
+
if (major_str_end == std::string::npos) {
|
| 77 |
+
return {};
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
size_t minor_str_begin = major_str_end + 1;
|
| 81 |
+
size_t minor_str_end = str.find(" ", minor_str_begin);
|
| 82 |
+
if (minor_str_end == std::string::npos) {
|
| 83 |
+
return {};
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
cl_uint version_major;
|
| 87 |
+
if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) {
|
| 88 |
+
return {};
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
cl_uint version_minor;
|
| 92 |
+
if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) {
|
| 93 |
+
return {};
|
| 94 |
+
}
|
| 95 |
+
return { version_major, version_minor };
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
// Returns OpenCL platform's version. On an error returns ggml_cl_version with all zeroes.
|
| 99 |
+
static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) {
|
| 100 |
+
size_t param_size;
|
| 101 |
+
CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &param_size));
|
| 102 |
+
std::unique_ptr<char[]> param_storage(new char[param_size]);
|
| 103 |
+
CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr));
|
| 104 |
+
|
| 105 |
+
auto param_value = std::string_view(param_storage.get(), param_size);
|
| 106 |
+
const std::string version_prefix = "OpenCL "; // Suffix: "XX.YY <platform-specific-info>"
|
| 107 |
+
if (param_value.find(version_prefix) != 0) {
|
| 108 |
+
return {};
|
| 109 |
+
}
|
| 110 |
+
param_value.remove_prefix(version_prefix.length());
|
| 111 |
+
return parse_cl_version(param_value);
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
// Return a version to use in OpenCL C compilation. On an error returns ggml_cl_version with all zeroes.
|
| 115 |
+
static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) {
|
| 116 |
+
size_t param_size;
|
| 117 |
+
|
| 118 |
+
#if CL_TARGET_OPENCL_VERSION >= 300
|
| 119 |
+
if (platform_version.major >= 3) {
|
| 120 |
+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, &param_size));
|
| 121 |
+
if (!param_size) {
|
| 122 |
+
return {};
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size]);
|
| 126 |
+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr));
|
| 127 |
+
unsigned versions_count = param_size / sizeof(cl_name_version);
|
| 128 |
+
|
| 129 |
+
cl_version version_max = 0;
|
| 130 |
+
for (unsigned i = 0; i < versions_count; i++) {
|
| 131 |
+
version_max = std::max<cl_version>(versions[i].version, version_max);
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) };
|
| 135 |
+
}
|
| 136 |
+
#else
|
| 137 |
+
GGML_UNUSED(platform_version);
|
| 138 |
+
#endif // CL_TARGET_OPENCL_VERSION >= 300
|
| 139 |
+
|
| 140 |
+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, &param_size));
|
| 141 |
+
if (!param_size) {
|
| 142 |
+
return {};
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
std::unique_ptr<char[]> param_storage(new char[param_size]);
|
| 146 |
+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr));
|
| 147 |
+
auto param_value = std::string_view(param_storage.get(), param_size);
|
| 148 |
+
|
| 149 |
+
const std::string version_prefix = "OpenCL C "; // Suffix: "XX.YY <platform-specific-info>"
|
| 150 |
+
if (param_value.find(version_prefix) != 0) {
|
| 151 |
+
return {};
|
| 152 |
+
}
|
| 153 |
+
param_value.remove_prefix(version_prefix.length());
|
| 154 |
+
|
| 155 |
+
return parse_cl_version(param_value);
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
|
| 159 |
if (strstr(device_name, "730") ||
|
| 160 |
strstr(device_name, "740") ||
|
|
|
|
| 563 |
// A local ref of cl_device_id for convenience
|
| 564 |
cl_device_id device = backend_ctx->device;
|
| 565 |
|
| 566 |
+
ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
|
| 567 |
+
|
| 568 |
// Check device OpenCL version, OpenCL 2.0 or above is required
|
| 569 |
+
ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
|
| 570 |
+
if (opencl_c_version.major < 2) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 571 |
GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
|
| 572 |
return backend_ctx;
|
| 573 |
}
|
|
|
|
| 604 |
|
| 605 |
// If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
|
| 606 |
// optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
|
| 607 |
+
if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
|
|
|
|
| 608 |
strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
|
| 609 |
GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
|
| 610 |
"(note that subgroups is an optional feature in OpenCL 3.0)\n");
|
|
|
|
| 668 |
const std::string kernel_src = read_file("ggml-opencl.cl");
|
| 669 |
#endif
|
| 670 |
|
| 671 |
+
auto opencl_c_std =
|
| 672 |
+
std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
|
| 673 |
+
|
| 674 |
+
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
|
| 675 |
+
" -cl-mad-enable -cl-unsafe-math-optimizations"
|
| 676 |
+
" -cl-finite-math-only -cl-fast-relaxed-math";
|
| 677 |
backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
|
| 678 |
|
| 679 |
// Non matmul kernels.
|
|
|
|
| 783 |
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
|
| 784 |
|
| 785 |
// Gemv general
|
| 786 |
+
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
| 787 |
+
" -cl-mad-enable "
|
| 788 |
+
" -DSIMDGROUP_WIDTH=" +
|
| 789 |
+
std::to_string(backend_ctx->adreno_wave_size);
|
| 790 |
if (has_vector_subgroup_broadcast) {
|
| 791 |
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
| 792 |
}
|
|
|
|
| 803 |
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
|
| 804 |
|
| 805 |
// Gemv 2048, 16384
|
| 806 |
+
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
| 807 |
+
" -cl-mad-enable "
|
| 808 |
+
" -DLINE_STRIDE_A=2048 "
|
| 809 |
+
" -DBLOCK_STRIDE_A=16384 "
|
| 810 |
+
" -DSIMDGROUP_WIDTH=" +
|
| 811 |
+
std::to_string(backend_ctx->adreno_wave_size);
|
| 812 |
if (has_vector_subgroup_broadcast) {
|
| 813 |
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
| 814 |
}
|
|
|
|
| 825 |
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
|
| 826 |
|
| 827 |
// Gemv 2048, 16384
|
| 828 |
+
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
| 829 |
+
" -cl-mad-enable "
|
| 830 |
+
" -DLINE_STRIDE_A=2048 "
|
| 831 |
+
" -DBLOCK_STRIDE_A=16384 "
|
| 832 |
+
" -DSIMDGROUP_WIDTH=" +
|
| 833 |
+
std::to_string(backend_ctx->adreno_wave_size);
|
| 834 |
if (has_vector_subgroup_broadcast) {
|
| 835 |
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
| 836 |
}
|
|
|
|
| 840 |
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
|
| 841 |
|
| 842 |
// Gemv 5504, 44032
|
| 843 |
+
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
| 844 |
+
" -cl-mad-enable "
|
| 845 |
+
" -DLINE_STRIDE_A=5504 "
|
| 846 |
+
" -DBLOCK_STRIDE_A=44032 "
|
| 847 |
+
" -DSIMDGROUP_WIDTH=" +
|
| 848 |
+
std::to_string(backend_ctx->adreno_wave_size);
|
| 849 |
if (has_vector_subgroup_broadcast) {
|
| 850 |
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
| 851 |
}
|
|
|
|
| 855 |
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
|
| 856 |
|
| 857 |
// Gemv 16000, 128000
|
| 858 |
+
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
| 859 |
+
" -cl-mad-enable "
|
| 860 |
+
" -DLINE_STRIDE_A=16000 "
|
| 861 |
+
" -DBLOCK_STRIDE_A=128000 "
|
| 862 |
+
" -DSIMDGROUP_WIDTH=" +
|
| 863 |
+
std::to_string(backend_ctx->adreno_wave_size);
|
| 864 |
if (has_vector_subgroup_broadcast) {
|
| 865 |
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
| 866 |
}
|