Akarshan Biswas committed
Commit 1d6d451 · 1 Parent(s): 6e89d8c

SYCL: Move CPY kernels to a separate file and add a few missing kernels (llama/12133)

* SYCL: refactor and move cpy kernels to a separate file

* Add a few missing cpy kernels

* refactor and add debug logs

ggml/src/ggml-sycl/backend.hpp CHANGED
@@ -29,6 +29,7 @@
  #include "wkv6.hpp"
  #include "outprod.hpp"
  #include "element_wise.hpp"
+ #include "cpy.hpp"
  #include "gla.hpp"
 
  #endif // GGML_SYCL_BACKEND_HPP
ggml/src/ggml-sycl/common.hpp CHANGED
@@ -34,6 +34,7 @@
  #pragma clang diagnostic ignored "-Wnested-anon-types"
  #include "ggml-common.h"
  #pragma clang diagnostic pop
+ #include "ggml-impl.h"
 
  void* ggml_sycl_host_malloc(size_t size);
  void ggml_sycl_host_free(void* ptr);
ggml/src/ggml-sycl/cpy.cpp ADDED
@@ -0,0 +1,701 @@
+ #include "cpy.hpp"
+
+ #include <float.h>
+
+ #include "dequantize.hpp"
+
+ static __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
+     if (x <= val[0]) {
+         return 0;
+     }
+     if (x >= val[n - 1]) {
+         return n - 1;
+     }
+     int ml = 0, mu = n - 1;
+     while (mu - ml > 1) {
+         int mav = (ml + mu) / 2;
+         if (x < val[mav]) {
+             mu = mav;
+         } else {
+             ml = mav;
+         }
+     }
+     return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu;
+ }
+
+ static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
+     const float * xi = (const float *) cxi;
+     float * dsti = (float *) cdsti;
+
+     *dsti = *xi;
+ }
+
+ static void cpy_1_f32_f16(const char * cxi, char * cdsti) {
+     const float * xi = (const float *) cxi;
+     sycl::half * dsti = (sycl::half *) cdsti;
+
+     *dsti = sycl::vec<float, 1>(*xi).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+ }
+
+ static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+     const sycl::half * xi = (const sycl::half *) cxi;
+     sycl::half * dsti = (sycl::half *) cdsti;
+
+     *dsti = *xi;
+ }
+
+ static void cpy_1_f16_f32(const char * cxi, char * cdsti) {
+     const sycl::half * xi = (const sycl::half *) cxi;
+     float * dsti = (float *) cdsti;
+
+     *dsti = *xi;
+ }
+
+ static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
+     const int16_t * xi = (const int16_t *) cxi;
+     int16_t * dsti = (int16_t *) cdsti;
+
+     *dsti = *xi;
+ }
+
+ static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
+     const int32_t * xi = (const int32_t *) cxi;
+     int32_t * dsti = (int32_t *) cdsti;
+
+     *dsti = *xi;
+ }
+
+ template <cpy_kernel_t cpy_1>
+ static void cpy_f32_f16(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
+                         const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
+                         const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
+                         const sycl::nd_item<3> & item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
+
+     if (i >= ne) {
+         return;
+     }
+
+     // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
+     // then combine those indices with the corresponding byte offsets to get the total offsets
+     const int i03 = i / (ne00 * ne01 * ne02);
+     const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
+     const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
+     const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00;
+     const int x_offset = i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03;
+
+     const int i13 = i / (ne10 * ne11 * ne12);
+     const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11);
+     const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10;
+     const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
+     const int dst_offset = i10 * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;
+
+     cpy_1(cx + x_offset, cdst + dst_offset);
+ }
+
+ static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+     const float * xi = (const float *) cxi;
+     block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+     float amax = 0.0f; // absolute max
+
+     for (int j = 0; j < QK8_0; j++) {
+         const float v = xi[j];
+         amax = sycl::fmax(amax, sycl::fabs((float) v));
+     }
+
+     const float d = amax / ((1 << 7) - 1);
+     const float id = d ? 1.0f / d : 0.0f;
+
+     dsti->d = d;
+
+     for (int j = 0; j < QK8_0; ++j) {
+         const float x0 = xi[j] * id;
+
+         dsti->qs[j] = sycl::round((float) x0);
+     }
+ }
+
+ static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
+     float * cdstf = (float *) (cdsti);
+
+     for (int j = 0; j < QK8_0; j += 2) {
+         dfloat2 dq;
+         dequantize_q8_0(cxi, 0, j, dq);
+         *(cdstf + j) = dq.x();
+         *(cdstf + j + 1) = dq.y();
+     }
+ }
+
+ static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+     const float * xi = (const float *) cxi;
+     block_q4_0 * dsti = (block_q4_0 *) cdsti;
+
+     float amax = 0.0f;
+     float vmax = 0.0f;
+
+     for (int j = 0; j < QK4_0; ++j) {
+         const float v = xi[j];
+         if (amax < sycl::fabs((float) v)) {
+             amax = sycl::fabs((float) v);
+             vmax = v;
+         }
+     }
+
+     const float d = vmax / -8;
+     const float id = d ? 1.0f / d : 0.0f;
+
+     dsti->d = d;
+
+     for (int j = 0; j < QK4_0 / 2; ++j) {
+         const float x0 = xi[0 + j] * id;
+         const float x1 = xi[QK4_0 / 2 + j] * id;
+
+         const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f));
+         const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f));
+
+         dsti->qs[j] = xi0;
+         dsti->qs[j] |= xi1 << 4;
+     }
+ }
+
+ static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+     const float * xi = (const float *) cxi;
+     block_q4_1 * dsti = (block_q4_1 *) cdsti;
+
+     float vmin = FLT_MAX;
+     float vmax = -FLT_MAX;
+
+     for (int j = 0; j < QK4_1; ++j) {
+         const float v = xi[j];
+
+         if (v < vmin) {
+             vmin = v;
+         }
+         if (v > vmax) {
+             vmax = v;
+         }
+     }
+
+     const float d = (vmax - vmin) / ((1 << 4) - 1);
+     const float id = d ? 1.0f / d : 0.0f;
+
+     dsti->dm.x() = d;
+     dsti->dm.y() = vmin;
+
+     for (int j = 0; j < QK4_1 / 2; ++j) {
+         const float x0 = (xi[0 + j] - vmin) * id;
+         const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id;
+
+         const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f));
+         const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f));
+
+         dsti->qs[j] = xi0;
+         dsti->qs[j] |= xi1 << 4;
+     }
+ }
+
+ static void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
+     const float * xi = (const float *) cxi;
+     block_q5_0 * dsti = (block_q5_0 *) cdsti;
+
+     float amax = 0.0f;
+     float vmax = 0.0f;
+
+     for (int j = 0; j < QK5_0; ++j) {
+         const float v = xi[j];
+         if (amax < sycl::fabs((float) v)) {
+             amax = sycl::fabs((float) v);
+             vmax = v;
+         }
+     }
+
+     const float d = vmax / -16;
+     const float id = d ? 1.0f / d : 0.0f;
+
+     dsti->d = d;
+
+     uint32_t qh = 0;
+     for (int j = 0; j < QK5_0 / 2; ++j) {
+         const float x0 = xi[0 + j] * id;
+         const float x1 = xi[QK5_0 / 2 + j] * id;
+
+         const uint8_t xi0 = dpct::min(31, (int8_t) (x0 + 16.5f));
+         const uint8_t xi1 = dpct::min(31, (int8_t) (x1 + 16.5f));
+
+         dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
+         qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+         qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0 / 2);
+     }
+     memcpy(dsti->qh, &qh, sizeof(qh));
+ }
+
+ static void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
+     const float * xi = (const float *) cxi;
+     block_q5_1 * dsti = (block_q5_1 *) cdsti;
+
+     float min = xi[0];
+     float max = xi[0];
+
+     for (int j = 1; j < QK5_1; ++j) {
+         const float v = xi[j];
+         min = v < min ? v : min;
+         max = v > max ? v : max;
+     }
+
+     const float d = (max - min) / 31;
+     const float id = d ? 1.0f / d : 0.0f;
+
+     dsti->dm.x() = d;
+     dsti->dm.y() = min;
+
+     uint32_t qh = 0;
+     for (int j = 0; j < QK5_1 / 2; ++j) {
+         const float x0 = (xi[0 + j] - min) * id;
+         const float x1 = (xi[QK5_1 / 2 + j] - min) * id;
+
+         const uint8_t xi0 = (uint8_t) (x0 + 0.5f);
+         const uint8_t xi1 = (uint8_t) (x1 + 0.5f);
+
+         dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
+         qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+         qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1 / 2);
+     }
+     memcpy(dsti->qh, &qh, sizeof(qh));
+ }
+
+ static void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
+     const float * xi = (const float *) cxi;
+     block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
+
+     float amax = 0.0f;
+     float vmax = 0.0f;
+
+     for (int j = 0; j < QK4_NL; ++j) {
+         const float v = xi[j];
+         if (amax < sycl::fabs((float) v)) {
+             amax = sycl::fabs((float) v);
+             vmax = v;
+         }
+     }
+
+     float d = vmax / kvalues_iq4nl[0];
+     const float id = d ? 1.0f / d : 0.0f;
+
+     float sumqx = 0, sumq2 = 0;
+     for (int j = 0; j < QK4_NL / 2; ++j) {
+         const float x0 = xi[0 + j] * id;
+         const float x1 = xi[QK4_NL / 2 + j] * id;
+         const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
+         const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
+         dsti->qs[j] = xi0 | (xi1 << 4);
+         const float v0 = kvalues_iq4nl[xi0];
+         const float v1 = kvalues_iq4nl[xi1];
+         const float w0 = xi[0 + j] * xi[0 + j];
+         const float w1 = xi[QK4_NL / 2 + j] * xi[QK4_NL / 2 + j];
+         sumqx += w0 * v0 * xi[j] + w1 * v1 * xi[QK4_NL / 2 + j];
+         sumq2 += w0 * v0 * v0 + w1 * v1 * v1;
+     }
+
+     dsti->d = sumq2 > 0 ? sumqx / sumq2 : d;
+ }
+
+ template <dequantize_kernel_t dequant, int qk> static void cpy_blck_q_f32(const char * cxi, char * cdsti) {
+     float * cdstf = (float *) (cdsti);
+
+     for (int j = 0; j < qk / 2; j++) {
+         dfloat2 dq;
+         dequant(cxi, 0, j, dq);
+         *(cdstf + j) = dq.x();
+         *(cdstf + j + qk / 2) = dq.y();
+     }
+ }
+
+ template <cpy_kernel_t cpy_blck, int qk>
+ static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
+                       const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
+                       const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
+                       const sycl::nd_item<3> & item_ct1) {
+     const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk;
+
+     if (i >= ne) {
+         return;
+     }
+
+     const int i03 = i / (ne00 * ne01 * ne02);
+     const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
+     const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
+     const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00;
+     const int x_offset = i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03;
+
+     const int i13 = i / (ne10 * ne11 * ne12);
+     const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11);
+     const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10;
+     const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
+     const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;
+
+     cpy_blck(cx + x_offset, cdst + dst_offset);
+ }
+
+ template <cpy_kernel_t cpy_blck, int qk>
+ static void cpy_q_f32(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
+                       const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
+                       const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
+                       const sycl::nd_item<3> & item_ct1) {
+     const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk;
+
+     if (i >= ne) {
+         return;
+     }
+
+     const int i03 = i / (ne00 * ne01 * ne02);
+     const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
+     const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
+     const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00;
+     const int x_offset = (i00 / qk) * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03;
+
+     const int i13 = i / (ne10 * ne11 * ne12);
+     const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11);
+     const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10;
+     const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
+     const int dst_offset = i10 * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;
+
+     cpy_blck(cx + x_offset, cdst + dst_offset);
+ }
+
+ static void ggml_cpy_f16_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                   const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                   const int nb12, const int nb13, queue_ptr stream) {
+     const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+     {
+         dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+         stream->parallel_for(
+             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                               sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+             [=](sycl::nd_item<3> item_ct1) {
+                 cpy_f32_f16<cpy_1_f16_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
+                                            nb10, nb11, nb12, nb13, item_ct1);
+             });
+     }
+ }
+
+ static void ggml_cpy_f32_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                   const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                   const int nb12, const int nb13, queue_ptr stream) {
+     const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+     {
+         dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+         stream->parallel_for(
+             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                               sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+             [=](sycl::nd_item<3> item_ct1) {
+                 cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
+                                            nb10, nb11, nb12, nb13, item_ct1);
+             });
+     }
+ }
+
+ static void ggml_cpy_f32_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                   const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                   const int nb12, const int nb13, queue_ptr stream) {
+     const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+     {
+         dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+         stream->parallel_for(
+             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                               sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+             [=](sycl::nd_item<3> item_ct1) {
+                 cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
+                                            nb10, nb11, nb12, nb13, item_ct1);
+             });
+     }
+ }
+
+ static void ggml_cpy_f32_q8_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                    const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                    const int nb12, const int nb13, queue_ptr stream) {
+     GGML_ASSERT(ne % QK8_0 == 0);
+     const int num_blocks = ne / QK8_0;
+     stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                          [=](sycl::nd_item<3> item_ct1) {
+                              cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                                                  ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+                          });
+ }
+
+ static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                    const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                    const int nb12, const int nb13, queue_ptr stream) {
+     const int num_blocks = ne;
+     stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                          [=](sycl::nd_item<3> item_ct1) {
+                              cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                                                  ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+                          });
+ }
+
+ static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                    const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                    const int nb12, const int nb13, queue_ptr stream) {
+     GGML_ASSERT(ne % QK4_0 == 0);
+     const int num_blocks = ne / QK4_0;
+     stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                          [=](sycl::nd_item<3> item_ct1) {
+                              cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                                                  ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+                          });
+ }
+
+ static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                    const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                    const int nb12, const int nb13, queue_ptr stream) {
+     const int num_blocks = ne;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+             cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                                                      nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                                                      item_ct1);
+         });
+ }
+
+ static void ggml_cpy_f32_q4_1_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                    const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                    const int nb12, const int nb13, queue_ptr stream) {
+     GGML_ASSERT(ne % QK4_1 == 0);
+     const int num_blocks = ne / QK4_1;
+     stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                          [=](sycl::nd_item<3> item_ct1) {
+                              cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                                                  ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+                          });
+ }
+
+ static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                    const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                    const int nb12, const int nb13, queue_ptr stream) {
+     const int num_blocks = ne;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+             cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                                                      nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                                                      item_ct1);
+         });
+ }
+
+ static void ggml_cpy_f32_q5_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                    const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                    const int nb12, const int nb13, queue_ptr stream) {
+     GGML_ASSERT(ne % QK5_0 == 0);
+     const int num_blocks = ne / QK5_0;
+     stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                          [=](sycl::nd_item<3> item_ct1) {
+                              cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                                                  ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+                          });
+ }
+
+ static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                    const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                    const int nb12, const int nb13, queue_ptr stream) {
+     const int num_blocks = ne;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+             cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                                                      nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                                                      item_ct1);
+         });
+ }
+
+ static void ggml_cpy_f32_q5_1_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                    const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                    const int nb12, const int nb13, queue_ptr stream) {
+     GGML_ASSERT(ne % QK5_1 == 0);
+     const int num_blocks = ne / QK5_1;
+     stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                          [=](sycl::nd_item<3> item_ct1) {
+                              cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                                                  ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+                          });
+ }
+
+ static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                    const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                    const int nb12, const int nb13, queue_ptr stream) {
+     const int num_blocks = ne;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+             cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                                                      nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                                                      item_ct1);
+         });
+ }
+
+ static void ggml_cpy_f32_iq4_nl_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                      const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                      const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                      const int nb12, const int nb13, queue_ptr stream) {
+     GGML_ASSERT(ne % QK4_NL == 0);
+     const int num_blocks = ne / QK4_NL;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+             cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
+                                                    ne12, nb10, nb11, nb12, nb13, item_ct1);
+         });
+ }
+
+ static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                   const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                   const int nb12, const int nb13, queue_ptr stream) {
+     const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+     {
+         dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+         stream->parallel_for(
+             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                               sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+             [=](sycl::nd_item<3> item_ct1) {
+                 cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
+                                            nb10, nb11, nb12, nb13, item_ct1);
+             });
+     }
+ }
+
+ static void ggml_cpy_i16_i16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                   const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                   const int nb12, const int nb13, queue_ptr stream) {
+     const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+     {
+         // dpct::has_capability_or_fail(stream->get_device(),
+         // {sycl::aspect::fp16});
+
+         stream->parallel_for(
+             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                               sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+             [=](sycl::nd_item<3> item_ct1) {
+                 cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
+                                            nb10, nb11, nb12, nb13, item_ct1);
+             });
+     }
+ }
+
+ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                   const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                   const int nb12, const int nb13, queue_ptr stream) {
+     const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+     {
+         // dpct::has_capability_or_fail(stream->get_device(),
+         // {sycl::aspect::fp16});
+
+         stream->parallel_for(
+             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                               sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+             [=](sycl::nd_item<3> item_ct1) {
+                 cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
+                                            nb10, nb11, nb12, nb13, item_ct1);
+             });
+     }
+ }
+
+ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
+     const int64_t ne = ggml_nelements(src0);
+     GGML_ASSERT(ne == ggml_nelements(src1));
+
+     GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
+     GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
+
+     GGML_TENSOR_BINARY_OP_LOCALS01;
+
+     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+     queue_ptr main_stream = ctx.stream();
+
+     char * src0_ddc = (char *) src0->data;
+     char * src1_ddc = (char *) src1->data;
+     GGML_SYCL_DEBUG("[SYCL] %s: Tensor supplied: %s to %s\n", __func__, ggml_type_name(src0->type),
+                     ggml_type_name(src1->type));
+
+     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+         ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                               nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+         ggml_cpy_f32_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                               nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+         ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                                nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+         ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                                nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+         ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                                nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+         ggml_cpy_f16_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                               nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+         ggml_cpy_f16_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                               nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
+         ggml_cpy_i16_i16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                               nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+         ggml_cpy_i32_i32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                               nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
+         ggml_cpy_q4_0_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                                nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
+         ggml_cpy_q4_1_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                                nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
+         ggml_cpy_q8_0_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                                nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
+         ggml_cpy_f32_q5_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                                nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
+         ggml_cpy_q5_0_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                                nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
+         ggml_cpy_f32_q5_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                                nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
+         ggml_cpy_q5_1_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                                nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
+         ggml_cpy_f32_iq4_nl_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
+                                  nb10, nb11, nb12, nb13, main_stream);
+     } else {
+         GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type),
+                        ggml_type_name(src1->type));
+         GGML_ABORT("fatal error");
+     }
+ } catch (const sycl::exception & exc) {
+     std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+     std::exit(1);
+ }
+
+ void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+     // TODO: why do we pass dst as src1 here?
+     GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
+     ggml_sycl_cpy(ctx, dst->src[0], dst);
+     GGML_SYCL_DEBUG("[SYCL] call %s done\n", __func__);
+ }
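
Most of the new file is the quantizing copy kernels. As a reading aid, the arithmetic behind the Q8_0 pair (`cpy_blck_f32_q8_0` / `cpy_blck_q8_0_f32`) is a per-block absmax scale: d = amax / 127, qs[j] = round(x[j] / d), and dequantization multiplies back. Below is a minimal host-side sketch of that round trip in plain C++ (no SYCL); `block_q8_0_sketch`, `quantize_q8_0`, and `dequantize_q8_0` are hypothetical stand-ins for ggml's fp16-scaled `block_q8_0` and its helpers, not the library's API:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    constexpr int QK8_0 = 32; // block size, matching ggml's QK8_0

    struct block_q8_0_sketch {    // stand-in: ggml stores d as fp16
        float  d;                 // per-block scale
        int8_t qs[QK8_0];         // quantized values
    };

    // mirrors cpy_blck_f32_q8_0: d = amax / 127, qs[j] = round(x[j] / d)
    static void quantize_q8_0(const float * x, block_q8_0_sketch & b) {
        float amax = 0.0f;
        for (int j = 0; j < QK8_0; ++j) {
            amax = std::max(amax, std::fabs(x[j]));
        }
        const float d  = amax / ((1 << 7) - 1); // 127 = max int8 magnitude
        const float id = d ? 1.0f / d : 0.0f;
        b.d = d;
        for (int j = 0; j < QK8_0; ++j) {
            b.qs[j] = (int8_t) std::round(x[j] * id);
        }
    }

    // mirrors the dequantize path used by cpy_blck_q8_0_f32
    static void dequantize_q8_0(const block_q8_0_sketch & b, float * y) {
        for (int j = 0; j < QK8_0; ++j) {
            y[j] = b.d * b.qs[j];
        }
    }

    int main() {
        float x[QK8_0], y[QK8_0];
        for (int j = 0; j < QK8_0; ++j) {
            x[j] = 0.1f * (j - 16);
        }
        block_q8_0_sketch b;
        quantize_q8_0(x, b);
        dequantize_q8_0(b, y);
        printf("x[5]=%f  y[5]=%f\n", x[5], y[5]); // round-trip error <= d/2
        return 0;
    }

The worst-case per-value reconstruction error is d/2, which is why the absolute maximum (rather than the signed maximum) sets the scale.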
ggml/src/ggml-sycl/cpy.hpp ADDED
@@ -0,0 +1,11 @@
+ #ifndef GGML_SYCL_CPY_HPP
+ #define GGML_SYCL_CPY_HPP
+
+ #include "common.hpp"
+
+ typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
+
+ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1);
+ void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+ #endif // GGML_SYCL_CPY_HPP
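
The header deliberately exposes only the two entry points; every type combination is resolved inside cpy.cpp by instantiating a generic kernel with a per-element converter as a non-type template parameter of type `cpy_kernel_t`, so each instantiation can inline its copy routine. A stripped-down sketch of that dispatch pattern in plain C++ (no SYCL; `copy_all` is a hypothetical stand-in for `cpy_f32_f16<cpy_1>`, not part of the backend):

    #include <cstdio>

    // same shape as cpy.hpp's cpy_kernel_t: raw bytes in, raw bytes out
    typedef void (*cpy_kernel_t)(const char * cx, char * cdst);

    static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
        *(float *) cdsti = *(const float *) cxi;
    }

    // generic driver: the converter is a compile-time template argument
    template <cpy_kernel_t cpy_1>
    static void copy_all(const char * src, char * dst, int n, int elem_size) {
        for (int i = 0; i < n; ++i) {
            cpy_1(src + i * elem_size, dst + i * elem_size);
        }
    }

    int main() {
        float a[4] = { 1.f, 2.f, 3.f, 4.f }, b[4] = {};
        copy_all<cpy_1_f32_f32>((const char *) a, (char *) b, 4, sizeof(float));
        printf("%f %f\n", b[0], b[3]); // 1.000000 4.000000
        return 0;
    }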
ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -1285,8 +1285,6 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(q
  // struct ggml_sycl_pool_vmm : public ggml_sycl_pool
 
  /// kernels
-
- typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
  typedef void (*ggml_sycl_op_mul_mat_t)(
      ggml_backend_sycl_context & ctx,
      const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
@@ -1468,193 +1466,6 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
      }
  }
 
- static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-     const float * xi = (const float *) cxi;
-     float * dsti = (float *) cdsti;
-
-     *dsti = *xi;
- }
-
- static void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-     const float * xi = (const float *) cxi;
-     sycl::half *dsti = (sycl::half *)cdsti;
-
-     *dsti = sycl::vec<float, 1>(*xi)
-                 .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
- }
-
- static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
-     const sycl::half *xi = (const sycl::half *)cxi;
-     sycl::half *dsti = (sycl::half *)cdsti;
-
-     *dsti = *xi;
- }
-
- static void cpy_1_f16_f32(const char * cxi, char * cdsti) {
-     const sycl::half *xi = (const sycl::half *)cxi;
-     float * dsti = (float *) cdsti;
-
-     *dsti = *xi;
- }
-
- static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
-     const int16_t *xi = (const int16_t *)cxi;
-     int16_t *dsti = (int16_t *)cdsti;
-
-     *dsti = *xi;
- }
-
- static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
-     const int32_t *xi = (const int32_t *)cxi;
-     int32_t *dsti = (int32_t *)cdsti;
-
-     *dsti = *xi;
- }
-
- template <cpy_kernel_t cpy_1>
- static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
-                         const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-                         const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
-                         const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
-     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                   item_ct1.get_local_id(2);
-
-     if (i >= ne) {
-         return;
-     }
-
-     // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
-     // then combine those indices with the corresponding byte offsets to get the total offsets
-     const int i03 = i/(ne00 * ne01 * ne02);
-     const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
-     const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
-     const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-     const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
-
-     const int i13 = i/(ne10 * ne11 * ne12);
-     const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-     const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-     const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-     const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
-
-     cpy_1(cx + x_offset, cdst + dst_offset);
- }
-
- static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
-     const float * xi = (const float *) cxi;
-     block_q8_0 * dsti = (block_q8_0 *) cdsti;
-
-     float amax = 0.0f; // absolute max
-
-     for (int j = 0; j < QK8_0; j++) {
-         const float v = xi[j];
-         amax = sycl::fmax(amax, sycl::fabs((float)v));
-     }
-
-     const float d = amax / ((1 << 7) - 1);
-     const float id = d ? 1.0f/d : 0.0f;
-
-     dsti->d = d;
-
-     for (int j = 0; j < QK8_0; ++j) {
-         const float x0 = xi[j]*id;
-
-         dsti->qs[j] = sycl::round((float)x0);
-     }
- }
-
- static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
-     const float * xi = (const float *) cxi;
-     block_q4_0 * dsti = (block_q4_0 *) cdsti;
-
-     float amax = 0.0f;
-     float vmax = 0.0f;
-
-     for (int j = 0; j < QK4_0; ++j) {
-         const float v = xi[j];
-         if (amax < sycl::fabs((float)v)) {
-             amax = sycl::fabs((float)v);
-             vmax = v;
-         }
-     }
-
-     const float d = vmax / -8;
-     const float id = d ? 1.0f/d : 0.0f;
-
-     dsti->d = d;
-
-     for (int j = 0; j < QK4_0/2; ++j) {
-         const float x0 = xi[0 + j]*id;
-         const float x1 = xi[QK4_0/2 + j]*id;
-
-         const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 8.5f));
-         const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 8.5f));
-
-         dsti->qs[j] = xi0;
-         dsti->qs[j] |= xi1 << 4;
-     }
- }
-
- static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
-     const float * xi = (const float *) cxi;
-     block_q4_1 * dsti = (block_q4_1 *) cdsti;
-
-     float vmin = FLT_MAX;
-     float vmax = -FLT_MAX;
-
-     for (int j = 0; j < QK4_1; ++j) {
-         const float v = xi[j];
-
-         if (v < vmin) vmin = v;
-         if (v > vmax) vmax = v;
-     }
-
-     const float d = (vmax - vmin) / ((1 << 4) - 1);
-     const float id = d ? 1.0f/d : 0.0f;
-
-     dsti->dm.x() = d;
-     dsti->dm.y() = vmin;
-
-     for (int j = 0; j < QK4_1/2; ++j) {
-         const float x0 = (xi[0 + j] - vmin)*id;
-         const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
-
-         const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 0.5f));
-         const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 0.5f));
-
-         dsti->qs[j] = xi0;
-         dsti->qs[j] |= xi1 << 4;
-     }
- }
-
- template <cpy_kernel_t cpy_blck, int qk>
- static void cpy_f32_q(const char * cx, char * cdst, const int ne,
-                       const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-                       const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
-                       const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
-     const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2)) *
-                   qk;
-
-     if (i >= ne) {
-         return;
-     }
-
-     const int i03 = i/(ne00 * ne01 * ne02);
-     const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
-     const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
-     const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-     const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
-
-     const int i13 = i/(ne10 * ne11 * ne12);
-     const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-     const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-     const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-     const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
-
-     cpy_blck(cx + x_offset, cdst + dst_offset);
- }
-
  static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
                             const sycl::nd_item<3> &item_ct1) {
      const int row = item_ct1.get_group(1);
@@ -1903,231 +1714,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
      }
  }
 
- static void
- ggml_cpy_f16_f32_sycl(const char *cx, char *cdst, const int ne, const int ne00,
-                       const int ne01, const int ne02, const int nb00,
-                       const int nb01, const int nb02, const int nb03,
-                       const int ne10, const int ne11, const int ne12,
-                       const int nb10, const int nb11, const int nb12,
-                       const int nb13, queue_ptr stream) {
-
-     const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
-     {
-         dpct::has_capability_or_fail(stream->get_device(),
-                                      {sycl::aspect::fp16});
-
-         stream->parallel_for(
-             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                   sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                               sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-             [=](sycl::nd_item<3> item_ct1) {
-                 cpy_f32_f16<cpy_1_f16_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00,
-                                            nb01, nb02, nb03, ne10, ne11, ne12,
-                                            nb10, nb11, nb12, nb13, item_ct1);
-             });
-     }
- }
 
- static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
-                                   const int ne00, const int ne01,
-                                   const int ne02, const int nb00,
-                                   const int nb01, const int nb02,
-                                   const int nb03, const int ne10,
-                                   const int ne11, const int ne12,
-                                   const int nb10, const int nb11,
-                                   const int nb12, const int nb13,
-                                   queue_ptr stream) {
-
-     const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
-     {
-         dpct::has_capability_or_fail(stream->get_device(),
-                                      {sycl::aspect::fp16});
-
-         stream->parallel_for(
-             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                   sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                               sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-             [=](sycl::nd_item<3> item_ct1) {
-                 cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                            nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                            item_ct1);
-             });
-     }
- }
-
- static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
-                                   const int ne00, const int ne01,
-                                   const int ne02, const int nb00,
-                                   const int nb01, const int nb02,
-                                   const int nb03, const int ne10,
-                                   const int ne11, const int ne12,
-                                   const int nb10, const int nb11,
-                                   const int nb12, const int nb13,
-                                   queue_ptr stream) {
-
-     const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
-     {
-         dpct::has_capability_or_fail(stream->get_device(),
-                                      {sycl::aspect::fp16});
-
-         stream->parallel_for(
-             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                   sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                               sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-             [=](sycl::nd_item<3> item_ct1) {
-                 cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                            nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                            item_ct1);
-             });
-     }
- }
-
- static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
-                                    const int ne00, const int ne01,
-                                    const int ne02, const int nb00,
-                                    const int nb01, const int nb02,
-                                    const int nb03, const int ne10,
-                                    const int ne11, const int ne12,
-                                    const int nb10, const int nb11,
-                                    const int nb12, const int nb13,
-                                    queue_ptr stream) {
-
-     GGML_ASSERT(ne % QK8_0 == 0);
-     const int num_blocks = ne / QK8_0;
-     stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
-                                            sycl::range<3>(1, 1, 1)),
-                          [=](sycl::nd_item<3> item_ct1) {
-                              cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(
-                                  cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                  nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                  item_ct1);
-                          });
- }
-
- static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
-                                    const int ne00, const int ne01,
-                                    const int ne02, const int nb00,
-                                    const int nb01, const int nb02,
-                                    const int nb03, const int ne10,
-                                    const int ne11, const int ne12,
-                                    const int nb10, const int nb11,
-                                    const int nb12, const int nb13,
-                                    queue_ptr stream) {
-
-     GGML_ASSERT(ne % QK4_0 == 0);
-     const int num_blocks = ne / QK4_0;
-     stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
-                                            sycl::range<3>(1, 1, 1)),
-                          [=](sycl::nd_item<3> item_ct1) {
-                              cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(
-                                  cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                  nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                  item_ct1);
-                          });
- }
-
- static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
-                                    const int ne00, const int ne01,
-                                    const int ne02, const int nb00,
-                                    const int nb01, const int nb02,
-                                    const int nb03, const int ne10,
-                                    const int ne11, const int ne12,
-                                    const int nb10, const int nb11,
-                                    const int nb12, const int nb13,
-                                    queue_ptr stream) {
-
-     GGML_ASSERT(ne % QK4_1 == 0);
-     const int num_blocks = ne / QK4_1;
-     stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
-                                            sycl::range<3>(1, 1, 1)),
-                          [=](sycl::nd_item<3> item_ct1) {
-                              cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(
-                                  cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                  nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                  item_ct1);
-                          });
- }
-
- static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
-                                   const int ne00, const int ne01,
-                                   const int ne02, const int nb00,
-                                   const int nb01, const int nb02,
-                                   const int nb03, const int ne10,
-                                   const int ne11, const int ne12,
-                                   const int nb10, const int nb11,
-                                   const int nb12, const int nb13,
-                                   queue_ptr stream) {
-
-     const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
-     {
-         dpct::has_capability_or_fail(stream->get_device(),
-                                      {sycl::aspect::fp16});
-
-         stream->parallel_for(
-             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                   sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                               sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-             [=](sycl::nd_item<3> item_ct1) {
-                 cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                            nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                            item_ct1);
-             });
-     }
- }
-
- static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
-                                   const int ne00, const int ne01,
-                                   const int ne02, const int nb00,
-                                   const int nb01, const int nb02,
-                                   const int nb03, const int ne10,
-                                   const int ne11, const int ne12,
-                                   const int nb10, const int nb11,
-                                   const int nb12, const int nb13,
-                                   queue_ptr stream) {
-
-     const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
-     {
-         // dpct::has_capability_or_fail(stream->get_device(),
-         //                              {sycl::aspect::fp16});
-
-         stream->parallel_for(
-             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                   sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                               sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-             [=](sycl::nd_item<3> item_ct1) {
-                 cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                            nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                            item_ct1);
-             });
-     }
- }
-
- static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
-                                   const int ne00, const int ne01,
-                                   const int ne02, const int nb00,
-                                   const int nb01, const int nb02,
-                                   const int nb03, const int ne10,
-                                   const int ne11, const int ne12,
-                                   const int nb10, const int nb11,
-                                   const int nb12, const int nb13,
-                                   queue_ptr stream) {
-
-     const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
-     {
-         // dpct::has_capability_or_fail(stream->get_device(),
-         //                              {sycl::aspect::fp16});
-
-         stream->parallel_for(
-             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                   sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                               sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-             [=](sycl::nd_item<3> item_ct1) {
-                 cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                            nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                            item_ct1);
-             });
-     }
- }
 
  static void scale_f32_sycl(const float *x, float *dst, const float scale,
                             const int k, queue_ptr stream) {
@@ -3645,58 +3232,6 @@ static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
      ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_clamp);
  }
 
- static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                           ggml_tensor *dst) try {
-     const int64_t ne = ggml_nelements(src0);
-     GGML_ASSERT(ne == ggml_nelements(src1));
-
-     GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
-     GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
-
-     GGML_TENSOR_BINARY_OP_LOCALS01;
-
-     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
-     queue_ptr main_stream = ctx.stream();
-
-     char * src0_ddc = (char *) src0->data;
-     char * src1_ddc = (char *) src1->data;
-
-     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-         ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-         ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-         ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-         ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-         ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-         ggml_cpy_f16_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-         ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-     } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
-         ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-     } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
-         ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-     } else {
-         GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__,
-                        ggml_type_name(src0->type), ggml_type_name(src1->type));
-         GGML_ABORT("fatal error");
-     }
-     GGML_UNUSED(dst);
- }
- catch (sycl::exception const &exc) {
-     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-               << ", line:" << __LINE__ << std::endl;
-     std::exit(1);
- }
-
- static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-     // TODO: why do we pass dst as src1 here?
-     ggml_sycl_cpy(ctx, dst->src[0], dst, nullptr);
- }
-
  static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
      ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_diag_mask_inf);
  }
@@ -3893,7 +3428,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
              ggml_sycl_clamp(ctx, dst);
              break;
          case GGML_OP_CPY:
-             ggml_sycl_cpy(ctx, dst->src[0], dst->src[1], dst);
+             ggml_sycl_cpy(ctx, dst->src[0], dst->src[1]);
              break;
          case GGML_OP_CONT:
              ggml_sycl_dup(ctx, dst);
@@ -4407,6 +3942,30 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                  if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
                      return true;
                  }
+                 if (src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_F32) {
+                     return true;
+                 }
+                 if (src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_F32) {
+                     return true;
+                 }
+                 if (src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_F32) {
+                     return true;
+                 }
+                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
+                     return true;
+                 }
+                 if (src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_F32) {
+                     return true;
+                 }
+                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
+                     return true;
+                 }
+                 if (src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_F32) {
+                     return true;
+                 }
+                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
+                     return true;
+                 }
                  return false;
              } break;
  case GGML_OP_CONCAT:
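
All of the copy kernels in this change share one piece of arithmetic: each work-item turns its flat element index into four coordinates and dots them with per-dimension byte strides, which is what lets `ggml_sycl_cpy` handle non-contiguous tensors. A standalone sketch of that decomposition in plain C++ (`byte_offset` is a hypothetical helper mirroring the index math in `cpy_f32_f16` and `cpy_f32_q`, not a ggml function):

    #include <cstdio>

    // Decompose flat element index i into coordinates (i00, i01, i02, i03)
    // given the extents of the three inner dimensions, then combine them
    // with byte strides nb00..nb03 -- the per-work-item arithmetic in the
    // SYCL kernels above.
    static int byte_offset(int i, int ne00, int ne01, int ne02,
                           int nb00, int nb01, int nb02, int nb03) {
        const int i03 = i / (ne00 * ne01 * ne02);
        const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
        const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
        const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00;
        return i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03;
    }

    int main() {
        // hypothetical contiguous 4x3x2x1 f32 tensor: strides 4,16,48,96 bytes;
        // flat index 7 is element (i00=3, i01=1, i02=0, i03=0) -> 3*4 + 1*16 = 28
        printf("%d\n", byte_offset(7, 4, 3, 2, 4, 16, 48, 96)); // prints 28
        return 0;
    }

Because source and destination offsets are computed independently from the same flat index, the same kernel body serves contiguous-to-strided, strided-to-contiguous, and strided-to-strided copies; the quantized variants only differ in dividing one side's innermost index by the block size `qk`.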