Spaces:
Running
Running
Neo Zhang Jianyu commited on
support/fix OPs GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M (llama/6521)
Browse files- ggml-sycl.cpp +876 -196
ggml-sycl.cpp
CHANGED
|
@@ -3038,6 +3038,10 @@ typedef float dfloat; // dequantize float
|
|
| 3038 |
typedef sycl::float2 dfloat2;
|
| 3039 |
#endif //GGML_SYCL_F16
|
| 3040 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3041 |
bool ggml_sycl_loaded(void);
|
| 3042 |
void * ggml_sycl_host_malloc(size_t size);
|
| 3043 |
void ggml_sycl_host_free(void * ptr);
|
|
@@ -4473,6 +4477,32 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
|
| 4473 |
|
| 4474 |
}
|
| 4475 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4476 |
template<typename dst_t>
|
| 4477 |
static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
| 4478 |
const sycl::nd_item<3> &item_ct1,
|
|
@@ -4505,26 +4535,26 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
| 4505 |
|
| 4506 |
}
|
| 4507 |
|
| 4508 |
-
template<typename dst_t>
|
| 4509 |
-
static void
|
| 4510 |
-
|
| 4511 |
-
|
| 4512 |
-
|
| 4513 |
-
const uint8_t *kmask_iq2xs) {
|
| 4514 |
|
| 4515 |
const int i = item_ct1.get_group(2);
|
| 4516 |
-
const block_iq3_s * x = (const block_iq3_s
|
| 4517 |
|
| 4518 |
const int tid = item_ct1.get_local_id(2);
|
| 4519 |
#if QK_K == 256
|
| 4520 |
const int il = tid/8; // 0...3
|
| 4521 |
const int ib = tid%8; // 0...7
|
| 4522 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 4523 |
-
const uint8_t
|
| 4524 |
-
const uint8_t
|
| 4525 |
-
const uint8_t
|
| 4526 |
const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
|
| 4527 |
const uint8_t signs = x[i].signs[4*ib + il];
|
|
|
|
| 4528 |
for (int j = 0; j < 4; ++j) {
|
| 4529 |
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
| 4530 |
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
|
@@ -4535,12 +4565,12 @@ static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restr
|
|
| 4535 |
|
| 4536 |
}
|
| 4537 |
|
| 4538 |
-
template<typename dst_t>
|
| 4539 |
-
static void
|
| 4540 |
-
|
| 4541 |
-
|
| 4542 |
-
|
| 4543 |
-
|
| 4544 |
const int i = item_ct1.get_group(2);
|
| 4545 |
const block_iq1_s * x = (const block_iq1_s *) vx;
|
| 4546 |
|
|
@@ -4549,14 +4579,49 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr
|
|
| 4549 |
const int il = tid/8; // 0...3
|
| 4550 |
const int ib = tid%8; // 0...7
|
| 4551 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 4552 |
-
const
|
| 4553 |
-
const
|
| 4554 |
-
const
|
| 4555 |
-
|
| 4556 |
-
|
| 4557 |
-
|
| 4558 |
-
|
| 4559 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4560 |
}
|
| 4561 |
#else
|
| 4562 |
assert(false);
|
|
@@ -4564,6 +4629,51 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr
|
|
| 4564 |
|
| 4565 |
}
|
| 4566 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4567 |
/*
|
| 4568 |
DPCT1110:4: The total declared local variable size in device function
|
| 4569 |
dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
|
|
@@ -7370,6 +7480,58 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
|
| 7370 |
#endif
|
| 7371 |
}
|
| 7372 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7373 |
static __dpct_inline__ float
|
| 7374 |
vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
| 7375 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
|
@@ -7412,10 +7574,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
| 7412 |
|
| 7413 |
static __dpct_inline__ float
|
| 7414 |
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
| 7415 |
-
|
| 7416 |
-
|
| 7417 |
-
#if DPCT_COMPATIBILITY_TEMP >= \
|
| 7418 |
-
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 7419 |
#if QK_K == 256
|
| 7420 |
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
| 7421 |
|
|
@@ -7427,9 +7587,11 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
|
| 7427 |
const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
|
| 7428 |
const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
|
| 7429 |
uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
|
| 7430 |
-
((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201,
|
|
|
|
| 7431 |
uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
|
| 7432 |
-
((bq2->signs[4*ib32+l] >>
|
|
|
|
| 7433 |
const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
|
| 7434 |
grid1[0] ^ signs0, signs0, std::minus<>());
|
| 7435 |
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
|
|
@@ -7438,45 +7600,142 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
|
| 7438 |
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
|
| 7439 |
q8 += 8;
|
| 7440 |
}
|
| 7441 |
-
const float d =
|
|
|
|
|
|
|
|
|
|
| 7442 |
return d * sumi;
|
| 7443 |
#else
|
| 7444 |
assert(false);
|
| 7445 |
-
return 0.f;
|
| 7446 |
-
#endif
|
| 7447 |
-
#else
|
| 7448 |
-
assert(false);
|
| 7449 |
-
return 0.f;
|
| 7450 |
#endif
|
| 7451 |
}
|
| 7452 |
|
| 7453 |
static __dpct_inline__ float
|
| 7454 |
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
| 7455 |
-
|
| 7456 |
-
|
| 7457 |
#if QK_K == 256
|
| 7458 |
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
| 7459 |
|
| 7460 |
const int ib32 = iqs;
|
| 7461 |
-
const uint8_t * qs = bq1->qs + 4*ib32;
|
| 7462 |
-
const int8_t * q8 = bq8_1[ib32].qs;
|
| 7463 |
int sumi = 0;
|
|
|
|
| 7464 |
for (int l = 0; l < 4; ++l) {
|
| 7465 |
-
const
|
| 7466 |
-
|
| 7467 |
-
|
| 7468 |
-
|
| 7469 |
-
|
| 7470 |
-
|
| 7471 |
-
|
| 7472 |
-
|
| 7473 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7474 |
}
|
| 7475 |
-
|
| 7476 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7477 |
#else
|
| 7478 |
assert(false);
|
| 7479 |
-
return 0.f;
|
| 7480 |
#endif
|
| 7481 |
}
|
| 7482 |
|
|
@@ -8061,8 +8320,199 @@ template <bool need_check> static void
|
|
| 8061 |
|
| 8062 |
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
|
| 8063 |
static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
|
| 8064 |
-
const sycl::nd_item<3> &item_ct1
|
| 8065 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8066 |
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8067 |
item_ct1.get_local_id(1);
|
| 8068 |
|
|
@@ -8090,7 +8540,7 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
|
|
| 8090 |
(item_ct1.get_local_id(2) %
|
| 8091 |
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8092 |
|
| 8093 |
-
tmp +=
|
| 8094 |
}
|
| 8095 |
|
| 8096 |
// sum up partial sums and write back result
|
|
@@ -8106,10 +8556,11 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
|
|
| 8106 |
}
|
| 8107 |
|
| 8108 |
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8109 |
-
static void
|
| 8110 |
-
|
| 8111 |
-
|
| 8112 |
-
|
|
|
|
| 8113 |
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8114 |
item_ct1.get_local_id(1);
|
| 8115 |
|
|
@@ -8137,7 +8588,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void
|
|
| 8137 |
(item_ct1.get_local_id(2) %
|
| 8138 |
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8139 |
|
| 8140 |
-
tmp +=
|
| 8141 |
}
|
| 8142 |
|
| 8143 |
// sum up partial sums and write back result
|
|
@@ -8153,9 +8604,11 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void
|
|
| 8153 |
}
|
| 8154 |
|
| 8155 |
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8156 |
-
static void
|
| 8157 |
-
|
| 8158 |
-
|
|
|
|
|
|
|
| 8159 |
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8160 |
item_ct1.get_local_id(1);
|
| 8161 |
|
|
@@ -8183,7 +8636,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void *
|
|
| 8183 |
(item_ct1.get_local_id(2) %
|
| 8184 |
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8185 |
|
| 8186 |
-
tmp +=
|
| 8187 |
}
|
| 8188 |
|
| 8189 |
// sum up partial sums and write back result
|
|
@@ -8199,9 +8652,11 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void *
|
|
| 8199 |
}
|
| 8200 |
|
| 8201 |
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8202 |
-
static void
|
| 8203 |
-
|
| 8204 |
-
|
|
|
|
|
|
|
| 8205 |
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8206 |
item_ct1.get_local_id(1);
|
| 8207 |
|
|
@@ -8229,7 +8684,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void
|
|
| 8229 |
(item_ct1.get_local_id(2) %
|
| 8230 |
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8231 |
|
| 8232 |
-
tmp +=
|
| 8233 |
}
|
| 8234 |
|
| 8235 |
// sum up partial sums and write back result
|
|
@@ -8245,9 +8700,11 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void
|
|
| 8245 |
}
|
| 8246 |
|
| 8247 |
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8248 |
-
static void
|
| 8249 |
-
|
| 8250 |
-
|
|
|
|
|
|
|
| 8251 |
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8252 |
item_ct1.get_local_id(1);
|
| 8253 |
|
|
@@ -8275,7 +8732,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void *
|
|
| 8275 |
(item_ct1.get_local_id(2) %
|
| 8276 |
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8277 |
|
| 8278 |
-
tmp +=
|
| 8279 |
}
|
| 8280 |
|
| 8281 |
// sum up partial sums and write back result
|
|
@@ -8290,10 +8747,13 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void *
|
|
| 8290 |
}
|
| 8291 |
}
|
| 8292 |
|
|
|
|
| 8293 |
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8294 |
-
static void
|
| 8295 |
-
|
| 8296 |
-
|
|
|
|
|
|
|
| 8297 |
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8298 |
item_ct1.get_local_id(1);
|
| 8299 |
|
|
@@ -8321,7 +8781,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void *
|
|
| 8321 |
(item_ct1.get_local_id(2) %
|
| 8322 |
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8323 |
|
| 8324 |
-
tmp +=
|
| 8325 |
}
|
| 8326 |
|
| 8327 |
// sum up partial sums and write back result
|
|
@@ -8336,6 +8796,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void *
|
|
| 8336 |
}
|
| 8337 |
}
|
| 8338 |
|
|
|
|
| 8339 |
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
| 8340 |
static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
|
| 8341 |
const sycl::nd_item<3> &item_ct1) {
|
|
@@ -8897,64 +9358,71 @@ static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
|
|
| 8897 |
}
|
| 8898 |
}
|
| 8899 |
|
|
|
|
| 8900 |
template<typename T>
|
| 8901 |
-
static inline void
|
| 8902 |
T tmp = a;
|
| 8903 |
a = b;
|
| 8904 |
b = tmp;
|
| 8905 |
}
|
| 8906 |
|
| 8907 |
-
template<ggml_sort_order order>
|
| 8908 |
-
static void
|
| 8909 |
-
|
|
|
|
| 8910 |
// bitonic sort
|
| 8911 |
int col = item_ct1.get_local_id(2);
|
| 8912 |
int row = item_ct1.get_group(1);
|
| 8913 |
|
| 8914 |
-
if (col >=
|
|
|
|
|
|
|
| 8915 |
|
| 8916 |
const float * x_row = x + row * ncols;
|
| 8917 |
-
|
| 8918 |
|
| 8919 |
// initialize indices
|
| 8920 |
-
|
| 8921 |
-
|
| 8922 |
-
|
| 8923 |
-
/*
|
| 8924 |
-
DPCT1065:58: Consider replacing sycl::nd_item::barrier() with
|
| 8925 |
-
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
|
| 8926 |
-
performance if there is no access to global memory.
|
| 8927 |
-
*/
|
| 8928 |
-
item_ct1.barrier();
|
| 8929 |
|
| 8930 |
-
for (int k = 2; k <=
|
| 8931 |
for (int j = k / 2; j > 0; j /= 2) {
|
| 8932 |
int ixj = col ^ j;
|
| 8933 |
if (ixj > col) {
|
| 8934 |
if ((col & k) == 0) {
|
| 8935 |
-
if (
|
| 8936 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8937 |
}
|
| 8938 |
} else {
|
| 8939 |
-
if (
|
| 8940 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8941 |
}
|
| 8942 |
}
|
| 8943 |
}
|
| 8944 |
/*
|
| 8945 |
-
DPCT1118:
|
| 8946 |
in converged control flow. You may need to adjust the code.
|
| 8947 |
*/
|
| 8948 |
-
|
| 8949 |
-
DPCT1065:59: Consider replacing sycl::nd_item::barrier() with
|
| 8950 |
-
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
| 8951 |
-
better performance if there is no access to global memory.
|
| 8952 |
-
*/
|
| 8953 |
-
item_ct1.barrier();
|
| 8954 |
}
|
| 8955 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8956 |
}
|
| 8957 |
|
|
|
|
| 8958 |
static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past,
|
| 8959 |
const sycl::nd_item<3> &item_ct1) {
|
| 8960 |
const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
|
@@ -9933,28 +10401,64 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
|
| 9933 |
#endif
|
| 9934 |
}
|
| 9935 |
|
| 9936 |
-
|
| 9937 |
template <typename dst_t>
|
| 9938 |
-
static void
|
| 9939 |
dpct::queue_ptr stream) {
|
| 9940 |
const int nb = k / QK_K;
|
| 9941 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9942 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9943 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 9944 |
{sycl::aspect::fp16});
|
| 9945 |
|
| 9946 |
stream->submit([&](sycl::handler &cgh) {
|
| 9947 |
-
|
| 9948 |
-
|
| 9949 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9950 |
|
|
|
|
| 9951 |
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 9952 |
sycl::range<3>(1, 1, 32),
|
| 9953 |
sycl::range<3>(1, 1, 32)),
|
| 9954 |
[=](sycl::nd_item<3> item_ct1) {
|
| 9955 |
dequantize_block_iq2_xxs(
|
| 9956 |
-
vx, y, item_ct1,
|
| 9957 |
-
|
| 9958 |
});
|
| 9959 |
});
|
| 9960 |
}
|
|
@@ -9965,105 +10469,130 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
|
|
| 9965 |
dpct::queue_ptr stream) {
|
| 9966 |
const int nb = k / QK_K;
|
| 9967 |
{
|
| 9968 |
-
|
| 9969 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 9970 |
{sycl::aspect::fp16});
|
| 9971 |
|
| 9972 |
stream->submit([&](sycl::handler &cgh) {
|
| 9973 |
-
auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
|
| 9974 |
-
auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
|
| 9975 |
-
auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
|
| 9976 |
-
|
| 9977 |
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 9978 |
sycl::range<3>(1, 1, 32),
|
| 9979 |
sycl::range<3>(1, 1, 32)),
|
| 9980 |
[=](sycl::nd_item<3> item_ct1) {
|
| 9981 |
dequantize_block_iq2_xs(
|
| 9982 |
-
vx, y, item_ct1,
|
| 9983 |
-
|
| 9984 |
});
|
| 9985 |
});
|
| 9986 |
}
|
| 9987 |
}
|
| 9988 |
|
| 9989 |
template <typename dst_t>
|
| 9990 |
-
static void
|
| 9991 |
-
|
| 9992 |
const int nb = k / QK_K;
|
| 9993 |
{
|
| 9994 |
-
|
| 9995 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 9996 |
{sycl::aspect::fp16});
|
| 9997 |
|
| 9998 |
stream->submit([&](sycl::handler &cgh) {
|
| 9999 |
-
auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
|
| 10000 |
-
auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
|
| 10001 |
-
auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
|
| 10002 |
-
|
| 10003 |
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10004 |
sycl::range<3>(1, 1, 32),
|
| 10005 |
sycl::range<3>(1, 1, 32)),
|
| 10006 |
[=](sycl::nd_item<3> item_ct1) {
|
| 10007 |
-
|
| 10008 |
-
vx, y, item_ct1, iq3xxs_grid_ptr_ct1,
|
| 10009 |
-
ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
|
| 10010 |
});
|
| 10011 |
});
|
| 10012 |
}
|
| 10013 |
}
|
| 10014 |
|
|
|
|
| 10015 |
template <typename dst_t>
|
| 10016 |
-
static void
|
| 10017 |
dpct::queue_ptr stream) {
|
| 10018 |
const int nb = k / QK_K;
|
| 10019 |
{
|
| 10020 |
-
|
| 10021 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 10022 |
{sycl::aspect::fp16});
|
| 10023 |
|
| 10024 |
stream->submit([&](sycl::handler &cgh) {
|
| 10025 |
-
auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
|
| 10026 |
-
auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
|
| 10027 |
-
auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
|
| 10028 |
-
|
| 10029 |
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10030 |
sycl::range<3>(1, 1, 32),
|
| 10031 |
sycl::range<3>(1, 1, 32)),
|
| 10032 |
[=](sycl::nd_item<3> item_ct1) {
|
| 10033 |
-
|
| 10034 |
-
vx, y, item_ct1,
|
| 10035 |
-
|
| 10036 |
});
|
| 10037 |
});
|
| 10038 |
}
|
| 10039 |
}
|
| 10040 |
|
| 10041 |
template <typename dst_t>
|
| 10042 |
-
static void
|
| 10043 |
dpct::queue_ptr stream) {
|
| 10044 |
const int nb = k / QK_K;
|
| 10045 |
{
|
| 10046 |
-
|
| 10047 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 10048 |
{sycl::aspect::fp16});
|
| 10049 |
|
| 10050 |
stream->submit([&](sycl::handler &cgh) {
|
| 10051 |
-
auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
|
| 10052 |
-
auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
|
| 10053 |
-
auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
|
| 10054 |
-
|
| 10055 |
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10056 |
sycl::range<3>(1, 1, 32),
|
| 10057 |
sycl::range<3>(1, 1, 32)),
|
| 10058 |
[=](sycl::nd_item<3> item_ct1) {
|
| 10059 |
-
|
| 10060 |
-
vx, y, item_ct1,
|
| 10061 |
-
ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
|
| 10062 |
});
|
| 10063 |
});
|
| 10064 |
}
|
| 10065 |
}
|
| 10066 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10067 |
template <typename src_t, typename dst_t>
|
| 10068 |
static void convert_unary_sycl(const void *__restrict__ vx,
|
| 10069 |
dst_t *__restrict__ y, const int k,
|
|
@@ -10108,16 +10637,24 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try {
|
|
| 10108 |
return dequantize_row_q5_K_sycl;
|
| 10109 |
case GGML_TYPE_Q6_K:
|
| 10110 |
return dequantize_row_q6_K_sycl;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10111 |
case GGML_TYPE_IQ2_XXS:
|
| 10112 |
return dequantize_row_iq2_xxs_sycl;
|
| 10113 |
case GGML_TYPE_IQ2_XS:
|
| 10114 |
return dequantize_row_iq2_xs_sycl;
|
|
|
|
|
|
|
| 10115 |
case GGML_TYPE_IQ3_XXS:
|
| 10116 |
return dequantize_row_iq3_xxs_sycl;
|
| 10117 |
case GGML_TYPE_IQ3_S:
|
| 10118 |
return dequantize_row_iq3_s_sycl;
|
| 10119 |
-
case
|
| 10120 |
-
return
|
|
|
|
|
|
|
| 10121 |
case GGML_TYPE_F32:
|
| 10122 |
return convert_unary_sycl<float>;
|
| 10123 |
default:
|
|
@@ -10152,16 +10689,24 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
|
|
| 10152 |
return dequantize_row_q5_K_sycl;
|
| 10153 |
case GGML_TYPE_Q6_K:
|
| 10154 |
return dequantize_row_q6_K_sycl;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10155 |
case GGML_TYPE_IQ2_XXS:
|
| 10156 |
return dequantize_row_iq2_xxs_sycl;
|
| 10157 |
case GGML_TYPE_IQ2_XS:
|
| 10158 |
return dequantize_row_iq2_xs_sycl;
|
|
|
|
|
|
|
| 10159 |
case GGML_TYPE_IQ3_XXS:
|
| 10160 |
return dequantize_row_iq3_xxs_sycl;
|
| 10161 |
case GGML_TYPE_IQ3_S:
|
| 10162 |
return dequantize_row_iq3_s_sycl;
|
| 10163 |
-
case
|
| 10164 |
-
return
|
|
|
|
|
|
|
| 10165 |
case GGML_TYPE_F16:
|
| 10166 |
return convert_unary_sycl<sycl::half>;
|
| 10167 |
default:
|
|
@@ -10624,19 +11169,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
|
|
| 10624 |
const sycl::range<3> block_nums(1, 1, block_num_y);
|
| 10625 |
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
| 10626 |
{
|
| 10627 |
-
|
| 10628 |
stream->submit([&](sycl::handler &cgh) {
|
| 10629 |
-
auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
|
| 10630 |
-
auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
|
| 10631 |
-
auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
|
| 10632 |
-
|
| 10633 |
cgh.parallel_for(
|
| 10634 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 10635 |
[=](sycl::nd_item<3> item_ct1)
|
| 10636 |
[[intel::reqd_sub_group_size(32)]] {
|
| 10637 |
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS, block_iq2_xxs, 1>(
|
| 10638 |
-
vx, vy, dst, ncols, nrows, item_ct1
|
| 10639 |
-
iq2xxs_grid_ptr_ct1, ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
|
| 10640 |
});
|
| 10641 |
});
|
| 10642 |
}
|
|
@@ -10661,8 +11200,32 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
|
|
| 10661 |
[=](sycl::nd_item<3> item_ct1)
|
| 10662 |
[[intel::reqd_sub_group_size(32)]] {
|
| 10663 |
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS, block_iq2_xs, 1>(
|
| 10664 |
-
vx, vy, dst, ncols, nrows, item_ct1
|
| 10665 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10666 |
});
|
| 10667 |
});
|
| 10668 |
}
|
|
@@ -10687,8 +11250,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
|
|
| 10687 |
[=](sycl::nd_item<3> item_ct1)
|
| 10688 |
[[intel::reqd_sub_group_size(32)]] {
|
| 10689 |
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS, block_iq3_xxs, 1>(
|
| 10690 |
-
vx, vy, dst, ncols, nrows, item_ct1
|
| 10691 |
-
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
|
| 10692 |
});
|
| 10693 |
});
|
| 10694 |
}
|
|
@@ -10706,15 +11268,13 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
|
|
| 10706 |
|
| 10707 |
stream->submit([&](sycl::handler &cgh) {
|
| 10708 |
auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
|
| 10709 |
-
auto ksigns64_ptr_ct1 = &ksigns64[0];
|
| 10710 |
|
| 10711 |
cgh.parallel_for(
|
| 10712 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 10713 |
[=](sycl::nd_item<3> item_ct1)
|
| 10714 |
[[intel::reqd_sub_group_size(32)]] {
|
| 10715 |
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
|
| 10716 |
-
vx, vy, dst, ncols, nrows, item_ct1
|
| 10717 |
-
iq3s_grid_ptr_ct1, ksigns64_ptr_ct1);
|
| 10718 |
});
|
| 10719 |
});
|
| 10720 |
}
|
|
@@ -10739,8 +11299,72 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
|
|
| 10739 |
[=](sycl::nd_item<3> item_ct1)
|
| 10740 |
[[intel::reqd_sub_group_size(32)]] {
|
| 10741 |
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
|
| 10742 |
-
vx, vy, dst, ncols, nrows, item_ct1
|
| 10743 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10744 |
});
|
| 10745 |
});
|
| 10746 |
}
|
|
@@ -12364,36 +12988,54 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
|
|
| 12364 |
});
|
| 12365 |
}
|
| 12366 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12367 |
static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
| 12368 |
const int nrows, ggml_sort_order order,
|
| 12369 |
dpct::queue_ptr stream) {
|
| 12370 |
// bitonic sort requires ncols to be power of 2
|
| 12371 |
-
|
| 12372 |
|
| 12373 |
-
const sycl::range<3> block_dims(1, 1,
|
| 12374 |
const sycl::range<3> block_nums(1, nrows, 1);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12375 |
if (order == GGML_SORT_ORDER_ASC) {
|
| 12376 |
-
|
| 12377 |
-
|
| 12378 |
-
|
| 12379 |
-
|
| 12380 |
-
|
| 12381 |
-
|
| 12382 |
-
|
| 12383 |
-
|
| 12384 |
-
|
| 12385 |
-
|
|
|
|
|
|
|
|
|
|
| 12386 |
} else if (order == GGML_SORT_ORDER_DESC) {
|
| 12387 |
-
|
| 12388 |
-
|
| 12389 |
-
|
| 12390 |
-
|
| 12391 |
-
|
| 12392 |
-
|
| 12393 |
-
|
| 12394 |
-
|
| 12395 |
-
|
| 12396 |
-
|
|
|
|
|
|
|
|
|
|
| 12397 |
} else {
|
| 12398 |
GGML_ASSERT(false);
|
| 12399 |
}
|
|
@@ -13521,8 +14163,12 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
|
|
| 13521 |
case GGML_TYPE_Q5_K:
|
| 13522 |
case GGML_TYPE_IQ2_XXS:
|
| 13523 |
case GGML_TYPE_IQ2_XS:
|
|
|
|
| 13524 |
case GGML_TYPE_IQ1_S:
|
|
|
|
| 13525 |
case GGML_TYPE_IQ3_XXS:
|
|
|
|
|
|
|
| 13526 |
return max_compute_capability >= VER_GEN9 ? 128 : 64;
|
| 13527 |
case GGML_TYPE_IQ3_S:
|
| 13528 |
return max_compute_capability >= VER_GEN9 ? 128 : 64;
|
|
@@ -13541,11 +14187,20 @@ inline void ggml_sycl_op_mul_mat_vec_q(
|
|
| 13541 |
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
| 13542 |
const dpct::queue_ptr &stream) {
|
| 13543 |
|
| 13544 |
-
|
|
|
|
| 13545 |
|
| 13546 |
const int64_t ne00 = src0->ne[0];
|
| 13547 |
const int64_t row_diff = row_high - row_low;
|
| 13548 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13549 |
switch (src0->type) {
|
| 13550 |
case GGML_TYPE_Q4_0:
|
| 13551 |
mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
@@ -13577,20 +14232,32 @@ inline void ggml_sycl_op_mul_mat_vec_q(
|
|
| 13577 |
case GGML_TYPE_Q6_K:
|
| 13578 |
mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 13579 |
break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13580 |
case GGML_TYPE_IQ2_XXS:
|
| 13581 |
mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 13582 |
break;
|
| 13583 |
case GGML_TYPE_IQ2_XS:
|
| 13584 |
mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 13585 |
break;
|
|
|
|
|
|
|
|
|
|
| 13586 |
case GGML_TYPE_IQ3_XXS:
|
| 13587 |
mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 13588 |
break;
|
| 13589 |
case GGML_TYPE_IQ3_S:
|
| 13590 |
mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 13591 |
break;
|
| 13592 |
-
case
|
| 13593 |
-
|
|
|
|
|
|
|
|
|
|
| 13594 |
break;
|
| 13595 |
default:
|
| 13596 |
GGML_ASSERT(false);
|
|
@@ -13672,6 +14339,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
|
|
| 13672 |
convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
| 13673 |
break;
|
| 13674 |
default:
|
|
|
|
| 13675 |
GGML_ASSERT(false);
|
| 13676 |
break;
|
| 13677 |
}
|
|
@@ -14526,8 +15194,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
|
| 14526 |
src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10;
|
| 14527 |
}
|
| 14528 |
// do the computation
|
| 14529 |
-
op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
| 14530 |
-
dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream);
|
| 14531 |
/*
|
| 14532 |
DPCT1010:93: SYCL uses exceptions to report errors and does not
|
| 14533 |
use the error codes. The call was replaced with 0. You need to
|
|
@@ -15108,7 +15776,14 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
| 15108 |
#ifdef GGML_SYCL_FORCE_DMMV
|
| 15109 |
const bool use_mul_mat_vec_q = false;
|
| 15110 |
#else
|
| 15111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15112 |
#endif // GGML_SYCL_FORCE_DMMV
|
| 15113 |
|
| 15114 |
if (use_mul_mat_vec_q) {
|
|
@@ -16968,9 +17643,14 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
|
| 16968 |
return false;
|
| 16969 |
}
|
| 16970 |
ggml_type a_type = a->type;
|
| 16971 |
-
if (a_type == GGML_TYPE_IQ4_NL
|
| 16972 |
-
a_type ==
|
| 16973 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16974 |
}
|
| 16975 |
return true;
|
| 16976 |
} break;
|
|
|
|
| 3038 |
typedef sycl::float2 dfloat2;
|
| 3039 |
#endif //GGML_SYCL_F16
|
| 3040 |
|
| 3041 |
+
#define MMVQ_MAX_BATCH_SIZE 8
|
| 3042 |
+
|
| 3043 |
+
static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
| 3044 |
+
|
| 3045 |
bool ggml_sycl_loaded(void);
|
| 3046 |
void * ggml_sycl_host_malloc(size_t size);
|
| 3047 |
void ggml_sycl_host_free(void * ptr);
|
|
|
|
| 4477 |
|
| 4478 |
}
|
| 4479 |
|
| 4480 |
+
template <typename dst_t>
|
| 4481 |
+
__dpct_inline__ static void
|
| 4482 |
+
dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
| 4483 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 4484 |
+
|
| 4485 |
+
const int i = item_ct1.get_group(2);
|
| 4486 |
+
const block_iq2_s * x = (const block_iq2_s *) vx;
|
| 4487 |
+
|
| 4488 |
+
const int tid = item_ct1.get_local_id(2);
|
| 4489 |
+
#if QK_K == 256
|
| 4490 |
+
const int il = tid/8; // 0...3
|
| 4491 |
+
const int ib = tid%8; // 0...7
|
| 4492 |
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 4493 |
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
|
| 4494 |
+
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
| 4495 |
+
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
| 4496 |
+
#pragma unroll
|
| 4497 |
+
for (int j = 0; j < 8; ++j)
|
| 4498 |
+
y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
| 4499 |
+
#else
|
| 4500 |
+
assert(false);
|
| 4501 |
+
|
| 4502 |
+
#endif
|
| 4503 |
+
|
| 4504 |
+
}
|
| 4505 |
+
|
| 4506 |
template<typename dst_t>
|
| 4507 |
static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
| 4508 |
const sycl::nd_item<3> &item_ct1,
|
|
|
|
| 4535 |
|
| 4536 |
}
|
| 4537 |
|
| 4538 |
+
template <typename dst_t>
|
| 4539 |
+
__dpct_inline__ static void
|
| 4540 |
+
dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
| 4541 |
+
const sycl::nd_item<3> &item_ct1,
|
| 4542 |
+
const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) {
|
|
|
|
| 4543 |
|
| 4544 |
const int i = item_ct1.get_group(2);
|
| 4545 |
+
const block_iq3_s * x = (const block_iq3_s *) vx;
|
| 4546 |
|
| 4547 |
const int tid = item_ct1.get_local_id(2);
|
| 4548 |
#if QK_K == 256
|
| 4549 |
const int il = tid/8; // 0...3
|
| 4550 |
const int ib = tid%8; // 0...7
|
| 4551 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 4552 |
+
const uint8_t * qs = x[i].qs + 8*ib;
|
| 4553 |
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
|
| 4554 |
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
|
| 4555 |
const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
|
| 4556 |
const uint8_t signs = x[i].signs[4*ib + il];
|
| 4557 |
+
#pragma unroll
|
| 4558 |
for (int j = 0; j < 4; ++j) {
|
| 4559 |
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
| 4560 |
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
|
|
|
| 4565 |
|
| 4566 |
}
|
| 4567 |
|
| 4568 |
+
template <typename dst_t>
|
| 4569 |
+
__dpct_inline__ static void
|
| 4570 |
+
dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
| 4571 |
+
const sycl::nd_item<3> &item_ct1,
|
| 4572 |
+
const uint32_t *iq1s_grid_gpu) {
|
| 4573 |
+
|
| 4574 |
const int i = item_ct1.get_group(2);
|
| 4575 |
const block_iq1_s * x = (const block_iq1_s *) vx;
|
| 4576 |
|
|
|
|
| 4579 |
const int il = tid/8; // 0...3
|
| 4580 |
const int ib = tid%8; // 0...7
|
| 4581 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 4582 |
+
const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
|
| 4583 |
+
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
|
| 4584 |
+
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
|
| 4585 |
+
grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
|
| 4586 |
+
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
|
| 4587 |
+
grid32[0] &= 0x0f0f0f0f;
|
| 4588 |
+
#pragma unroll
|
| 4589 |
+
for (int j = 0; j < 8; ++j) {
|
| 4590 |
+
y[j] = d * (q[j] + delta);
|
| 4591 |
+
}
|
| 4592 |
+
#else
|
| 4593 |
+
assert(false);
|
| 4594 |
+
#endif
|
| 4595 |
+
|
| 4596 |
+
}
|
| 4597 |
+
|
| 4598 |
+
template <typename dst_t>
|
| 4599 |
+
__dpct_inline__ static void
|
| 4600 |
+
dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
| 4601 |
+
const sycl::nd_item<3> &item_ct1,
|
| 4602 |
+
const uint32_t *iq1s_grid_gpu) {
|
| 4603 |
+
|
| 4604 |
+
const int i = item_ct1.get_group(2);
|
| 4605 |
+
const block_iq1_m * x = (const block_iq1_m *) vx;
|
| 4606 |
+
|
| 4607 |
+
const int tid = item_ct1.get_local_id(2);
|
| 4608 |
+
#if QK_K == 256
|
| 4609 |
+
const int il = tid/8; // 0...3
|
| 4610 |
+
const int ib = tid%8; // 0...7
|
| 4611 |
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 4612 |
+
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
| 4613 |
+
iq1m_scale_t scale;
|
| 4614 |
+
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
| 4615 |
+
const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
|
| 4616 |
+
const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
|
| 4617 |
+
const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
|
| 4618 |
+
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
|
| 4619 |
+
grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
|
| 4620 |
+
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
|
| 4621 |
+
grid32[0] &= 0x0f0f0f0f;
|
| 4622 |
+
#pragma unroll
|
| 4623 |
+
for (int j = 0; j < 8; ++j) {
|
| 4624 |
+
y[j] = d * (q[j] + delta);
|
| 4625 |
}
|
| 4626 |
#else
|
| 4627 |
assert(false);
|
|
|
|
| 4629 |
|
| 4630 |
}
|
| 4631 |
|
| 4632 |
+
template <typename dst_t>
|
| 4633 |
+
__dpct_inline__ static void
|
| 4634 |
+
dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
| 4635 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 4636 |
+
|
| 4637 |
+
const int i = item_ct1.get_group(2);
|
| 4638 |
+
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
|
| 4639 |
+
|
| 4640 |
+
const int tid = item_ct1.get_local_id(2);
|
| 4641 |
+
const int il = tid/8; // 0...3
|
| 4642 |
+
const int ib = tid%8; // 0...7
|
| 4643 |
+
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
| 4644 |
+
const uint8_t * q4 = x[ib].qs + 4*il;
|
| 4645 |
+
const float d = (float)x[ib].d;
|
| 4646 |
+
#pragma unroll
|
| 4647 |
+
for (int j = 0; j < 4; ++j) {
|
| 4648 |
+
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
| 4649 |
+
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
| 4650 |
+
}
|
| 4651 |
+
|
| 4652 |
+
}
|
| 4653 |
+
|
| 4654 |
+
|
| 4655 |
+
template <typename dst_t>
|
| 4656 |
+
__dpct_inline__ static void
|
| 4657 |
+
dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
| 4658 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 4659 |
+
const int i = item_ct1.get_group(2);
|
| 4660 |
+
const block_iq4_xs * x = (const block_iq4_xs *)vx;
|
| 4661 |
+
|
| 4662 |
+
const int tid = item_ct1.get_local_id(2);
|
| 4663 |
+
const int il = tid/8; // 0...3
|
| 4664 |
+
const int ib = tid%8; // 0...7
|
| 4665 |
+
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
| 4666 |
+
const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
|
| 4667 |
+
const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
|
| 4668 |
+
#pragma unroll
|
| 4669 |
+
for (int j = 0; j < 4; ++j) {
|
| 4670 |
+
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
| 4671 |
+
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
| 4672 |
+
}
|
| 4673 |
+
}
|
| 4674 |
+
|
| 4675 |
+
|
| 4676 |
+
|
| 4677 |
/*
|
| 4678 |
DPCT1110:4: The total declared local variable size in device function
|
| 4679 |
dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
|
|
|
|
| 7480 |
#endif
|
| 7481 |
}
|
| 7482 |
|
| 7483 |
+
static __dpct_inline__ float
|
| 7484 |
+
vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
| 7485 |
+
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
| 7486 |
+
#if QK_K == 256
|
| 7487 |
+
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
|
| 7488 |
+
|
| 7489 |
+
const int ib32 = iqs;
|
| 7490 |
+
const int8_t * q8 = bq8_1[ib32].qs;
|
| 7491 |
+
const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
|
| 7492 |
+
const uint8_t ls1 = bq2->scales[ib32] & 0xf;
|
| 7493 |
+
const uint8_t ls2 = bq2->scales[ib32] >> 4;
|
| 7494 |
+
int sumi1 = 0;
|
| 7495 |
+
for (int l = 0; l < 2; ++l) {
|
| 7496 |
+
const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
|
| 7497 |
+
const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
|
| 7498 |
+
((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
|
| 7499 |
+
std::equal_to<>());
|
| 7500 |
+
const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
|
| 7501 |
+
((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
|
| 7502 |
+
std::equal_to<>());
|
| 7503 |
+
const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
|
| 7504 |
+
grid[0] ^ signs0, signs0, std::minus<>());
|
| 7505 |
+
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
|
| 7506 |
+
grid[1] ^ signs1, signs1, std::minus<>());
|
| 7507 |
+
sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
|
| 7508 |
+
sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
|
| 7509 |
+
q8 += 8;
|
| 7510 |
+
}
|
| 7511 |
+
int sumi2 = 0;
|
| 7512 |
+
for (int l = 2; l < 4; ++l) {
|
| 7513 |
+
const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
|
| 7514 |
+
const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
|
| 7515 |
+
((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
|
| 7516 |
+
std::equal_to<>());
|
| 7517 |
+
const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
|
| 7518 |
+
((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
|
| 7519 |
+
std::equal_to<>());
|
| 7520 |
+
const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
|
| 7521 |
+
grid[0] ^ signs0, signs0, std::minus<>());
|
| 7522 |
+
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
|
| 7523 |
+
grid[1] ^ signs1, signs1, std::minus<>());
|
| 7524 |
+
sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
|
| 7525 |
+
sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
|
| 7526 |
+
q8 += 8;
|
| 7527 |
+
}
|
| 7528 |
+
const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
|
| 7529 |
+
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
| 7530 |
+
#else
|
| 7531 |
+
assert(false);
|
| 7532 |
+
#endif
|
| 7533 |
+
}
|
| 7534 |
+
|
| 7535 |
static __dpct_inline__ float
|
| 7536 |
vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
| 7537 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
|
|
|
| 7574 |
|
| 7575 |
static __dpct_inline__ float
|
| 7576 |
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
| 7577 |
+
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
| 7578 |
+
const uint32_t *iq3s_grid) {
|
|
|
|
|
|
|
| 7579 |
#if QK_K == 256
|
| 7580 |
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
| 7581 |
|
|
|
|
| 7587 |
const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
|
| 7588 |
const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
|
| 7589 |
uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
|
| 7590 |
+
((bq2->signs[4 * ib32 + l] & 0xf) * 0x01010101) & 0x08040201,
|
| 7591 |
+
0x08040201, std::equal_to<>());
|
| 7592 |
uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
|
| 7593 |
+
((bq2->signs[4 * ib32 + l] >> 4) * 0x01010101) & 0x08040201,
|
| 7594 |
+
0x08040201, std::equal_to<>());
|
| 7595 |
const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
|
| 7596 |
grid1[0] ^ signs0, signs0, std::minus<>());
|
| 7597 |
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
|
|
|
|
| 7600 |
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
|
| 7601 |
q8 += 8;
|
| 7602 |
}
|
| 7603 |
+
const float d =
|
| 7604 |
+
(float)bq2->d *
|
| 7605 |
+
(1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
|
| 7606 |
+
bq8_1[ib32].ds[0];
|
| 7607 |
return d * sumi;
|
| 7608 |
#else
|
| 7609 |
assert(false);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7610 |
#endif
|
| 7611 |
}
|
| 7612 |
|
| 7613 |
static __dpct_inline__ float
|
| 7614 |
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
| 7615 |
+
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
| 7616 |
+
const uint32_t *iq1s_grid_gpu) {
|
| 7617 |
#if QK_K == 256
|
| 7618 |
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
| 7619 |
|
| 7620 |
const int ib32 = iqs;
|
|
|
|
|
|
|
| 7621 |
int sumi = 0;
|
| 7622 |
+
const int * q8 = (const int *)bq8_1[ib32].qs;
|
| 7623 |
for (int l = 0; l < 4; ++l) {
|
| 7624 |
+
const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
|
| 7625 |
+
int grid0 = grid[0] & 0x0f0f0f0f;
|
| 7626 |
+
int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
|
| 7627 |
+
sumi = dpct::dp4a(q8[2 * l + 1], grid1,
|
| 7628 |
+
dpct::dp4a(q8[2 * l + 0], grid0, sumi));
|
| 7629 |
+
}
|
| 7630 |
+
|
| 7631 |
+
const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
|
| 7632 |
+
const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
|
| 7633 |
+
const float d = d1q * bq8_1[ib32].ds[0];
|
| 7634 |
+
const float m = d1q * bq8_1[ib32].ds[1];
|
| 7635 |
+
return d * sumi + m * delta;
|
| 7636 |
+
#else
|
| 7637 |
+
assert(false);
|
| 7638 |
+
#endif
|
| 7639 |
+
}
|
| 7640 |
+
|
| 7641 |
+
static __dpct_inline__ float
|
| 7642 |
+
vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
| 7643 |
+
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
| 7644 |
+
#if QK_K == 256
|
| 7645 |
+
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
|
| 7646 |
+
|
| 7647 |
+
const int ib32 = iqs;
|
| 7648 |
+
int sumi[2] = {0, 0};
|
| 7649 |
+
float sumf[2] = {0.f, 0.f};
|
| 7650 |
+
|
| 7651 |
+
const int * q8 = (const int *)bq8_1[ib32].qs;
|
| 7652 |
+
for (int l = 0; l < 4; ++l) {
|
| 7653 |
+
const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8)));
|
| 7654 |
+
int grid0 = grid[0] & 0x0f0f0f0f;
|
| 7655 |
+
int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
|
| 7656 |
+
sumi[l / 2] = dpct::dp4a(q8[2 * l + 1], grid1,
|
| 7657 |
+
dpct::dp4a(q8[2 * l + 0], grid0, sumi[l / 2]));
|
| 7658 |
+
const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
|
| 7659 |
+
const int sumy = dpct::dp4a(q8[2 * l + 1], 0x01010101,
|
| 7660 |
+
dpct::dp4a(q8[2 * l + 0], 0x01010101, 0));
|
| 7661 |
+
sumf[l/2] += delta*sumy;
|
| 7662 |
+
}
|
| 7663 |
+
|
| 7664 |
+
iq1m_scale_t scale;
|
| 7665 |
+
const uint16_t * sc = (const uint16_t *)bq1->scales;
|
| 7666 |
+
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
| 7667 |
+
const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
|
| 7668 |
+
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
|
| 7669 |
+
#else
|
| 7670 |
+
assert(false);
|
| 7671 |
+
#endif
|
| 7672 |
+
}
|
| 7673 |
+
|
| 7674 |
+
static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
|
| 7675 |
+
const uint8_t *values,
|
| 7676 |
+
int &val1, int &val2) {
|
| 7677 |
+
|
| 7678 |
+
uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
|
| 7679 |
+
aux32 = q4 & 0x0f0f0f0f;
|
| 7680 |
+
uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
|
| 7681 |
+
uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
|
| 7682 |
+
val1 = v1 | (v2 << 16);
|
| 7683 |
+
aux32 = (q4 >> 4) & 0x0f0f0f0f;
|
| 7684 |
+
v1 = values[q8[0]] | (values[q8[1]] << 8);
|
| 7685 |
+
v2 = values[q8[2]] | (values[q8[3]] << 8);
|
| 7686 |
+
val2 = v1 | (v2 << 16);
|
| 7687 |
+
}
|
| 7688 |
+
|
| 7689 |
+
|
| 7690 |
+
static __dpct_inline__ float
|
| 7691 |
+
vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq,
|
| 7692 |
+
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
| 7693 |
+
|
| 7694 |
+
const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
|
| 7695 |
+
|
| 7696 |
+
const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
|
| 7697 |
+
const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs;
|
| 7698 |
+
|
| 7699 |
+
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
| 7700 |
+
|
| 7701 |
+
int v1, v2;
|
| 7702 |
+
int sumi1 = 0, sumi2 = 0;
|
| 7703 |
+
for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
|
| 7704 |
+
const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
|
| 7705 |
+
get_int_from_table_16(aux, values, v1, v2);
|
| 7706 |
+
sumi1 = dpct::dp4a(v1, q8[l + 0], sumi1);
|
| 7707 |
+
sumi2 = dpct::dp4a(v2, q8[l + 4], sumi2);
|
| 7708 |
}
|
| 7709 |
+
|
| 7710 |
+
const float d = (float)bq->d * bq8_1->ds[0];
|
| 7711 |
+
return d * (sumi1 + sumi2);
|
| 7712 |
+
}
|
| 7713 |
+
|
| 7714 |
+
|
| 7715 |
+
static __dpct_inline__ float
|
| 7716 |
+
vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
| 7717 |
+
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
| 7718 |
+
|
| 7719 |
+
#if QK_K == 256
|
| 7720 |
+
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
|
| 7721 |
+
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
| 7722 |
+
|
| 7723 |
+
// iqs is 0...7
|
| 7724 |
+
const int ib32 = iqs;
|
| 7725 |
+
const int32_t * q8 = (const int *)bq8_1[ib32].qs;
|
| 7726 |
+
const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
|
| 7727 |
+
const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
|
| 7728 |
+
const float d = (float)bq4->d * (ls - 32) * bq8_1[ib32].ds[0];
|
| 7729 |
+
int v1, v2;
|
| 7730 |
+
int sumi1 = 0, sumi2 = 0;
|
| 7731 |
+
for (int j = 0; j < 4; ++j) {
|
| 7732 |
+
get_int_from_table_16(q4[j], values, v1, v2);
|
| 7733 |
+
sumi1 = dpct::dp4a(v1, q8[j + 0], sumi1);
|
| 7734 |
+
sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
|
| 7735 |
+
}
|
| 7736 |
+
return d * (sumi1 + sumi2);
|
| 7737 |
#else
|
| 7738 |
assert(false);
|
|
|
|
| 7739 |
#endif
|
| 7740 |
}
|
| 7741 |
|
|
|
|
| 8320 |
|
| 8321 |
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
|
| 8322 |
static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
|
| 8323 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 8324 |
+
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8325 |
+
item_ct1.get_local_id(1);
|
| 8326 |
+
|
| 8327 |
+
if (row >= nrows) {
|
| 8328 |
+
return;
|
| 8329 |
+
}
|
| 8330 |
+
|
| 8331 |
+
const int blocks_per_row = ncols / qk;
|
| 8332 |
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
| 8333 |
+
|
| 8334 |
+
// partial sum for each thread
|
| 8335 |
+
float tmp = 0.0f;
|
| 8336 |
+
|
| 8337 |
+
const block_q_t * x = (const block_q_t *) vx;
|
| 8338 |
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
| 8339 |
+
|
| 8340 |
+
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
|
| 8341 |
+
i += blocks_per_warp) {
|
| 8342 |
+
const int ibx = row*blocks_per_row + i; // x block index
|
| 8343 |
+
|
| 8344 |
+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
| 8345 |
+
|
| 8346 |
+
const int iqs =
|
| 8347 |
+
vdr *
|
| 8348 |
+
(item_ct1.get_local_id(2) %
|
| 8349 |
+
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8350 |
+
|
| 8351 |
+
tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
|
| 8352 |
+
}
|
| 8353 |
+
|
| 8354 |
+
// sum up partial sums and write back result
|
| 8355 |
+
#pragma unroll
|
| 8356 |
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
| 8357 |
+
tmp +=
|
| 8358 |
+
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
| 8359 |
+
}
|
| 8360 |
+
|
| 8361 |
+
if (item_ct1.get_local_id(2) == 0) {
|
| 8362 |
+
dst[row] = tmp;
|
| 8363 |
+
}
|
| 8364 |
+
}
|
| 8365 |
+
|
| 8366 |
+
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8367 |
+
static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
|
| 8368 |
+
const void *__restrict__ vy,
|
| 8369 |
+
float *__restrict__ dst, const int ncols,
|
| 8370 |
+
const int nrows,
|
| 8371 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 8372 |
+
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8373 |
+
item_ct1.get_local_id(1);
|
| 8374 |
+
|
| 8375 |
+
if (row >= nrows) {
|
| 8376 |
+
return;
|
| 8377 |
+
}
|
| 8378 |
+
|
| 8379 |
+
const int blocks_per_row = ncols / qk;
|
| 8380 |
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
| 8381 |
+
|
| 8382 |
+
// partial sum for each thread
|
| 8383 |
+
float tmp = 0.0f;
|
| 8384 |
+
|
| 8385 |
+
const block_q_t * x = (const block_q_t *) vx;
|
| 8386 |
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
| 8387 |
+
|
| 8388 |
+
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
|
| 8389 |
+
i += blocks_per_warp) {
|
| 8390 |
+
const int ibx = row*blocks_per_row + i; // x block index
|
| 8391 |
+
|
| 8392 |
+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
| 8393 |
+
|
| 8394 |
+
const int iqs =
|
| 8395 |
+
vdr *
|
| 8396 |
+
(item_ct1.get_local_id(2) %
|
| 8397 |
+
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8398 |
+
|
| 8399 |
+
tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
|
| 8400 |
+
}
|
| 8401 |
+
|
| 8402 |
+
// sum up partial sums and write back result
|
| 8403 |
+
#pragma unroll
|
| 8404 |
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
| 8405 |
+
tmp +=
|
| 8406 |
+
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
| 8407 |
+
}
|
| 8408 |
+
|
| 8409 |
+
if (item_ct1.get_local_id(2) == 0) {
|
| 8410 |
+
dst[row] = tmp;
|
| 8411 |
+
}
|
| 8412 |
+
}
|
| 8413 |
+
|
| 8414 |
+
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8415 |
+
static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
|
| 8416 |
+
const void *__restrict__ vy,
|
| 8417 |
+
float *__restrict__ dst, const int ncols,
|
| 8418 |
+
const int nrows,
|
| 8419 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 8420 |
+
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8421 |
+
item_ct1.get_local_id(1);
|
| 8422 |
+
|
| 8423 |
+
if (row >= nrows) {
|
| 8424 |
+
return;
|
| 8425 |
+
}
|
| 8426 |
+
|
| 8427 |
+
const int blocks_per_row = ncols / qk;
|
| 8428 |
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
| 8429 |
+
|
| 8430 |
+
// partial sum for each thread
|
| 8431 |
+
float tmp = 0.0f;
|
| 8432 |
+
|
| 8433 |
+
const block_q_t * x = (const block_q_t *) vx;
|
| 8434 |
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
| 8435 |
+
|
| 8436 |
+
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
|
| 8437 |
+
i += blocks_per_warp) {
|
| 8438 |
+
const int ibx = row*blocks_per_row + i; // x block index
|
| 8439 |
+
|
| 8440 |
+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
| 8441 |
+
|
| 8442 |
+
const int iqs =
|
| 8443 |
+
vdr *
|
| 8444 |
+
(item_ct1.get_local_id(2) %
|
| 8445 |
+
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8446 |
+
|
| 8447 |
+
tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64);
|
| 8448 |
+
}
|
| 8449 |
+
|
| 8450 |
+
// sum up partial sums and write back result
|
| 8451 |
+
#pragma unroll
|
| 8452 |
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
| 8453 |
+
tmp +=
|
| 8454 |
+
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
| 8455 |
+
}
|
| 8456 |
+
|
| 8457 |
+
if (item_ct1.get_local_id(2) == 0) {
|
| 8458 |
+
dst[row] = tmp;
|
| 8459 |
+
}
|
| 8460 |
+
}
|
| 8461 |
+
|
| 8462 |
+
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8463 |
+
static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
|
| 8464 |
+
const void *__restrict__ vy,
|
| 8465 |
+
float *__restrict__ dst, const int ncols,
|
| 8466 |
+
const int nrows,
|
| 8467 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 8468 |
+
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8469 |
+
item_ct1.get_local_id(1);
|
| 8470 |
+
|
| 8471 |
+
if (row >= nrows) {
|
| 8472 |
+
return;
|
| 8473 |
+
}
|
| 8474 |
+
|
| 8475 |
+
const int blocks_per_row = ncols / qk;
|
| 8476 |
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
| 8477 |
+
|
| 8478 |
+
// partial sum for each thread
|
| 8479 |
+
float tmp = 0.0f;
|
| 8480 |
+
|
| 8481 |
+
const block_q_t * x = (const block_q_t *) vx;
|
| 8482 |
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
| 8483 |
+
|
| 8484 |
+
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
|
| 8485 |
+
i += blocks_per_warp) {
|
| 8486 |
+
const int ibx = row*blocks_per_row + i; // x block index
|
| 8487 |
+
|
| 8488 |
+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
| 8489 |
+
|
| 8490 |
+
const int iqs =
|
| 8491 |
+
vdr *
|
| 8492 |
+
(item_ct1.get_local_id(2) %
|
| 8493 |
+
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8494 |
+
|
| 8495 |
+
tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs);
|
| 8496 |
+
}
|
| 8497 |
+
|
| 8498 |
+
// sum up partial sums and write back result
|
| 8499 |
+
#pragma unroll
|
| 8500 |
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
| 8501 |
+
tmp +=
|
| 8502 |
+
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
| 8503 |
+
}
|
| 8504 |
+
|
| 8505 |
+
if (item_ct1.get_local_id(2) == 0) {
|
| 8506 |
+
dst[row] = tmp;
|
| 8507 |
+
}
|
| 8508 |
+
}
|
| 8509 |
+
|
| 8510 |
+
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8511 |
+
static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
|
| 8512 |
+
const void *__restrict__ vy,
|
| 8513 |
+
float *__restrict__ dst, const int ncols,
|
| 8514 |
+
const int nrows,
|
| 8515 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 8516 |
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8517 |
item_ct1.get_local_id(1);
|
| 8518 |
|
|
|
|
| 8540 |
(item_ct1.get_local_id(2) %
|
| 8541 |
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8542 |
|
| 8543 |
+
tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64);
|
| 8544 |
}
|
| 8545 |
|
| 8546 |
// sum up partial sums and write back result
|
|
|
|
| 8556 |
}
|
| 8557 |
|
| 8558 |
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8559 |
+
static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
|
| 8560 |
+
const void *__restrict__ vy,
|
| 8561 |
+
float *__restrict__ dst, const int ncols,
|
| 8562 |
+
const int nrows,
|
| 8563 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 8564 |
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8565 |
item_ct1.get_local_id(1);
|
| 8566 |
|
|
|
|
| 8588 |
(item_ct1.get_local_id(2) %
|
| 8589 |
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8590 |
|
| 8591 |
+
tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid);
|
| 8592 |
}
|
| 8593 |
|
| 8594 |
// sum up partial sums and write back result
|
|
|
|
| 8604 |
}
|
| 8605 |
|
| 8606 |
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8607 |
+
static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
|
| 8608 |
+
const void *__restrict__ vy,
|
| 8609 |
+
float *__restrict__ dst, const int ncols,
|
| 8610 |
+
const int nrows,
|
| 8611 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 8612 |
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8613 |
item_ct1.get_local_id(1);
|
| 8614 |
|
|
|
|
| 8636 |
(item_ct1.get_local_id(2) %
|
| 8637 |
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8638 |
|
| 8639 |
+
tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu);
|
| 8640 |
}
|
| 8641 |
|
| 8642 |
// sum up partial sums and write back result
|
|
|
|
| 8652 |
}
|
| 8653 |
|
| 8654 |
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8655 |
+
static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
|
| 8656 |
+
const void *__restrict__ vy,
|
| 8657 |
+
float *__restrict__ dst, const int ncols,
|
| 8658 |
+
const int nrows,
|
| 8659 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 8660 |
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8661 |
item_ct1.get_local_id(1);
|
| 8662 |
|
|
|
|
| 8684 |
(item_ct1.get_local_id(2) %
|
| 8685 |
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8686 |
|
| 8687 |
+
tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs);
|
| 8688 |
}
|
| 8689 |
|
| 8690 |
// sum up partial sums and write back result
|
|
|
|
| 8700 |
}
|
| 8701 |
|
| 8702 |
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8703 |
+
static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
|
| 8704 |
+
const void *__restrict__ vy,
|
| 8705 |
+
float *__restrict__ dst, const int ncols,
|
| 8706 |
+
const int nrows,
|
| 8707 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 8708 |
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8709 |
item_ct1.get_local_id(1);
|
| 8710 |
|
|
|
|
| 8732 |
(item_ct1.get_local_id(2) %
|
| 8733 |
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8734 |
|
| 8735 |
+
tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs);
|
| 8736 |
}
|
| 8737 |
|
| 8738 |
// sum up partial sums and write back result
|
|
|
|
| 8747 |
}
|
| 8748 |
}
|
| 8749 |
|
| 8750 |
+
|
| 8751 |
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8752 |
+
static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
|
| 8753 |
+
const void *__restrict__ vy,
|
| 8754 |
+
float *__restrict__ dst, const int ncols,
|
| 8755 |
+
const int nrows,
|
| 8756 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 8757 |
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8758 |
item_ct1.get_local_id(1);
|
| 8759 |
|
|
|
|
| 8781 |
(item_ct1.get_local_id(2) %
|
| 8782 |
(qi / vdr)); // x block quant index when casting the quants to int
|
| 8783 |
|
| 8784 |
+
tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs);
|
| 8785 |
}
|
| 8786 |
|
| 8787 |
// sum up partial sums and write back result
|
|
|
|
| 8796 |
}
|
| 8797 |
}
|
| 8798 |
|
| 8799 |
+
|
| 8800 |
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
| 8801 |
static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
|
| 8802 |
const sycl::nd_item<3> &item_ct1) {
|
|
|
|
| 9358 |
}
|
| 9359 |
}
|
| 9360 |
|
| 9361 |
+
|
| 9362 |
template<typename T>
|
| 9363 |
+
static inline void ggml_sycl_swap(T & a, T & b) {
|
| 9364 |
T tmp = a;
|
| 9365 |
a = b;
|
| 9366 |
b = tmp;
|
| 9367 |
}
|
| 9368 |
|
| 9369 |
+
template <ggml_sort_order order>
|
| 9370 |
+
__dpct_inline__ static void
|
| 9371 |
+
k_argsort_f32_i32(const float *x, int *dst, const int ncols, int ncols_pad,
|
| 9372 |
+
const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) {
|
| 9373 |
// bitonic sort
|
| 9374 |
int col = item_ct1.get_local_id(2);
|
| 9375 |
int row = item_ct1.get_group(1);
|
| 9376 |
|
| 9377 |
+
if (col >= ncols_pad) {
|
| 9378 |
+
return;
|
| 9379 |
+
}
|
| 9380 |
|
| 9381 |
const float * x_row = x + row * ncols;
|
| 9382 |
+
auto dst_row = (int *)dpct_local;
|
| 9383 |
|
| 9384 |
// initialize indices
|
| 9385 |
+
dst_row[col] = col;
|
| 9386 |
+
|
| 9387 |
+
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9388 |
|
| 9389 |
+
for (int k = 2; k <= ncols_pad; k *= 2) {
|
| 9390 |
for (int j = k / 2; j > 0; j /= 2) {
|
| 9391 |
int ixj = col ^ j;
|
| 9392 |
if (ixj > col) {
|
| 9393 |
if ((col & k) == 0) {
|
| 9394 |
+
if (dst_row[col] >= ncols ||
|
| 9395 |
+
(dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
|
| 9396 |
+
x_row[dst_row[col]] > x_row[dst_row[ixj]] :
|
| 9397 |
+
x_row[dst_row[col]] < x_row[dst_row[ixj]]))
|
| 9398 |
+
) {
|
| 9399 |
+
ggml_sycl_swap(dst_row[col], dst_row[ixj]);
|
| 9400 |
}
|
| 9401 |
} else {
|
| 9402 |
+
if (dst_row[ixj] >= ncols ||
|
| 9403 |
+
(dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
|
| 9404 |
+
x_row[dst_row[col]] < x_row[dst_row[ixj]] :
|
| 9405 |
+
x_row[dst_row[col]] > x_row[dst_row[ixj]]))
|
| 9406 |
+
) {
|
| 9407 |
+
ggml_sycl_swap(dst_row[col], dst_row[ixj]);
|
| 9408 |
}
|
| 9409 |
}
|
| 9410 |
}
|
| 9411 |
/*
|
| 9412 |
+
DPCT1118:1: SYCL group functions and algorithms must be encountered
|
| 9413 |
in converged control flow. You may need to adjust the code.
|
| 9414 |
*/
|
| 9415 |
+
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9416 |
}
|
| 9417 |
}
|
| 9418 |
+
|
| 9419 |
+
// copy the result to dst without the padding
|
| 9420 |
+
if (col < ncols) {
|
| 9421 |
+
dst[row * ncols + col] = dst_row[col];
|
| 9422 |
+
}
|
| 9423 |
}
|
| 9424 |
|
| 9425 |
+
|
| 9426 |
static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past,
|
| 9427 |
const sycl::nd_item<3> &item_ct1) {
|
| 9428 |
const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
|
|
|
| 10401 |
#endif
|
| 10402 |
}
|
| 10403 |
|
|
|
|
| 10404 |
template <typename dst_t>
|
| 10405 |
+
static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
|
| 10406 |
dpct::queue_ptr stream) {
|
| 10407 |
const int nb = k / QK_K;
|
| 10408 |
{
|
| 10409 |
+
dpct::has_capability_or_fail(stream->get_device(),
|
| 10410 |
+
{sycl::aspect::fp16});
|
| 10411 |
+
|
| 10412 |
+
stream->submit([&](sycl::handler &cgh) {
|
| 10413 |
+
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10414 |
+
sycl::range<3>(1, 1, 32),
|
| 10415 |
+
sycl::range<3>(1, 1, 32)),
|
| 10416 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 10417 |
+
dequantize_block_iq1_s(
|
| 10418 |
+
vx, y, item_ct1, iq1s_grid_gpu
|
| 10419 |
+
);
|
| 10420 |
+
});
|
| 10421 |
+
});
|
| 10422 |
+
}
|
| 10423 |
+
}
|
| 10424 |
|
| 10425 |
+
template <typename dst_t>
|
| 10426 |
+
static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k,
|
| 10427 |
+
dpct::queue_ptr stream) {
|
| 10428 |
+
const int nb = k / QK_K;
|
| 10429 |
+
{
|
| 10430 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 10431 |
{sycl::aspect::fp16});
|
| 10432 |
|
| 10433 |
stream->submit([&](sycl::handler &cgh) {
|
| 10434 |
+
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10435 |
+
sycl::range<3>(1, 1, 32),
|
| 10436 |
+
sycl::range<3>(1, 1, 32)),
|
| 10437 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 10438 |
+
dequantize_block_iq1_m(
|
| 10439 |
+
vx, y, item_ct1, iq1s_grid_gpu
|
| 10440 |
+
);
|
| 10441 |
+
});
|
| 10442 |
+
});
|
| 10443 |
+
}
|
| 10444 |
+
}
|
| 10445 |
+
|
| 10446 |
+
template <typename dst_t>
|
| 10447 |
+
static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
|
| 10448 |
+
dpct::queue_ptr stream) {
|
| 10449 |
+
const int nb = k / QK_K;
|
| 10450 |
+
{
|
| 10451 |
+
dpct::has_capability_or_fail(stream->get_device(),
|
| 10452 |
+
{sycl::aspect::fp16});
|
| 10453 |
|
| 10454 |
+
stream->submit([&](sycl::handler &cgh) {
|
| 10455 |
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10456 |
sycl::range<3>(1, 1, 32),
|
| 10457 |
sycl::range<3>(1, 1, 32)),
|
| 10458 |
[=](sycl::nd_item<3> item_ct1) {
|
| 10459 |
dequantize_block_iq2_xxs(
|
| 10460 |
+
vx, y, item_ct1, iq2xxs_grid,
|
| 10461 |
+
ksigns_iq2xs, kmask_iq2xs);
|
| 10462 |
});
|
| 10463 |
});
|
| 10464 |
}
|
|
|
|
| 10469 |
dpct::queue_ptr stream) {
|
| 10470 |
const int nb = k / QK_K;
|
| 10471 |
{
|
|
|
|
| 10472 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 10473 |
{sycl::aspect::fp16});
|
| 10474 |
|
| 10475 |
stream->submit([&](sycl::handler &cgh) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10476 |
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10477 |
sycl::range<3>(1, 1, 32),
|
| 10478 |
sycl::range<3>(1, 1, 32)),
|
| 10479 |
[=](sycl::nd_item<3> item_ct1) {
|
| 10480 |
dequantize_block_iq2_xs(
|
| 10481 |
+
vx, y, item_ct1, iq2xs_grid,
|
| 10482 |
+
ksigns_iq2xs, kmask_iq2xs);
|
| 10483 |
});
|
| 10484 |
});
|
| 10485 |
}
|
| 10486 |
}
|
| 10487 |
|
| 10488 |
template <typename dst_t>
|
| 10489 |
+
static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k,
|
| 10490 |
+
dpct::queue_ptr stream) {
|
| 10491 |
const int nb = k / QK_K;
|
| 10492 |
{
|
|
|
|
| 10493 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 10494 |
{sycl::aspect::fp16});
|
| 10495 |
|
| 10496 |
stream->submit([&](sycl::handler &cgh) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10497 |
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10498 |
sycl::range<3>(1, 1, 32),
|
| 10499 |
sycl::range<3>(1, 1, 32)),
|
| 10500 |
[=](sycl::nd_item<3> item_ct1) {
|
| 10501 |
+
dequantize_block_iq2_s(vx, y, item_ct1);
|
|
|
|
|
|
|
| 10502 |
});
|
| 10503 |
});
|
| 10504 |
}
|
| 10505 |
}
|
| 10506 |
|
| 10507 |
+
|
| 10508 |
template <typename dst_t>
|
| 10509 |
+
static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
|
| 10510 |
dpct::queue_ptr stream) {
|
| 10511 |
const int nb = k / QK_K;
|
| 10512 |
{
|
|
|
|
| 10513 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 10514 |
{sycl::aspect::fp16});
|
| 10515 |
|
| 10516 |
stream->submit([&](sycl::handler &cgh) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10517 |
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10518 |
sycl::range<3>(1, 1, 32),
|
| 10519 |
sycl::range<3>(1, 1, 32)),
|
| 10520 |
[=](sycl::nd_item<3> item_ct1) {
|
| 10521 |
+
dequantize_block_iq3_xxs(
|
| 10522 |
+
vx, y, item_ct1, iq3xxs_grid,
|
| 10523 |
+
ksigns_iq2xs, kmask_iq2xs);
|
| 10524 |
});
|
| 10525 |
});
|
| 10526 |
}
|
| 10527 |
}
|
| 10528 |
|
| 10529 |
template <typename dst_t>
|
| 10530 |
+
static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
|
| 10531 |
dpct::queue_ptr stream) {
|
| 10532 |
const int nb = k / QK_K;
|
| 10533 |
{
|
|
|
|
| 10534 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 10535 |
{sycl::aspect::fp16});
|
| 10536 |
|
| 10537 |
stream->submit([&](sycl::handler &cgh) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10538 |
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10539 |
sycl::range<3>(1, 1, 32),
|
| 10540 |
sycl::range<3>(1, 1, 32)),
|
| 10541 |
[=](sycl::nd_item<3> item_ct1) {
|
| 10542 |
+
dequantize_block_iq3_s(
|
| 10543 |
+
vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
|
|
|
|
| 10544 |
});
|
| 10545 |
});
|
| 10546 |
}
|
| 10547 |
}
|
| 10548 |
|
| 10549 |
+
template <typename dst_t>
|
| 10550 |
+
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
| 10551 |
+
dpct::queue_ptr stream) {
|
| 10552 |
+
const int nb = (k + QK_K - 1) / QK_K;
|
| 10553 |
+
#if QK_K == 64
|
| 10554 |
+
dequantize_row_iq4_nl_sycl(vx, y, k, stream);
|
| 10555 |
+
#else
|
| 10556 |
+
{
|
| 10557 |
+
dpct::has_capability_or_fail(stream->get_device(),
|
| 10558 |
+
{sycl::aspect::fp16});
|
| 10559 |
+
|
| 10560 |
+
stream->submit([&](sycl::handler &cgh) {
|
| 10561 |
+
cgh.parallel_for(
|
| 10562 |
+
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10563 |
+
sycl::range<3>(1, 1, 32),
|
| 10564 |
+
sycl::range<3>(1, 1, 32)),
|
| 10565 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 10566 |
+
dequantize_block_iq4_xs(vx, y, item_ct1);
|
| 10567 |
+
});
|
| 10568 |
+
});
|
| 10569 |
+
}
|
| 10570 |
+
#endif
|
| 10571 |
+
}
|
| 10572 |
+
|
| 10573 |
+
|
| 10574 |
+
template <typename dst_t>
|
| 10575 |
+
static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k,
|
| 10576 |
+
dpct::queue_ptr stream) {
|
| 10577 |
+
const int nb = (k + QK_K - 1) / QK_K;
|
| 10578 |
+
{
|
| 10579 |
+
dpct::has_capability_or_fail(stream->get_device(),
|
| 10580 |
+
{sycl::aspect::fp16});
|
| 10581 |
+
|
| 10582 |
+
stream->submit([&](sycl::handler &cgh) {
|
| 10583 |
+
cgh.parallel_for(
|
| 10584 |
+
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10585 |
+
sycl::range<3>(1, 1, 32),
|
| 10586 |
+
sycl::range<3>(1, 1, 32)),
|
| 10587 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 10588 |
+
dequantize_block_iq4_nl(vx, y, item_ct1);
|
| 10589 |
+
});
|
| 10590 |
+
});
|
| 10591 |
+
}
|
| 10592 |
+
}
|
| 10593 |
+
|
| 10594 |
+
|
| 10595 |
+
|
| 10596 |
template <typename src_t, typename dst_t>
|
| 10597 |
static void convert_unary_sycl(const void *__restrict__ vx,
|
| 10598 |
dst_t *__restrict__ y, const int k,
|
|
|
|
| 10637 |
return dequantize_row_q5_K_sycl;
|
| 10638 |
case GGML_TYPE_Q6_K:
|
| 10639 |
return dequantize_row_q6_K_sycl;
|
| 10640 |
+
case GGML_TYPE_IQ1_S:
|
| 10641 |
+
return dequantize_row_iq1_s_sycl;
|
| 10642 |
+
case GGML_TYPE_IQ1_M:
|
| 10643 |
+
return dequantize_row_iq1_m_sycl;
|
| 10644 |
case GGML_TYPE_IQ2_XXS:
|
| 10645 |
return dequantize_row_iq2_xxs_sycl;
|
| 10646 |
case GGML_TYPE_IQ2_XS:
|
| 10647 |
return dequantize_row_iq2_xs_sycl;
|
| 10648 |
+
case GGML_TYPE_IQ2_S:
|
| 10649 |
+
return dequantize_row_iq2_s_sycl;
|
| 10650 |
case GGML_TYPE_IQ3_XXS:
|
| 10651 |
return dequantize_row_iq3_xxs_sycl;
|
| 10652 |
case GGML_TYPE_IQ3_S:
|
| 10653 |
return dequantize_row_iq3_s_sycl;
|
| 10654 |
+
case GGML_TYPE_IQ4_XS:
|
| 10655 |
+
return dequantize_row_iq4_xs_sycl;
|
| 10656 |
+
case GGML_TYPE_IQ4_NL:
|
| 10657 |
+
return dequantize_row_iq4_nl_sycl;
|
| 10658 |
case GGML_TYPE_F32:
|
| 10659 |
return convert_unary_sycl<float>;
|
| 10660 |
default:
|
|
|
|
| 10689 |
return dequantize_row_q5_K_sycl;
|
| 10690 |
case GGML_TYPE_Q6_K:
|
| 10691 |
return dequantize_row_q6_K_sycl;
|
| 10692 |
+
case GGML_TYPE_IQ1_S:
|
| 10693 |
+
return dequantize_row_iq1_s_sycl;
|
| 10694 |
+
case GGML_TYPE_IQ1_M:
|
| 10695 |
+
return dequantize_row_iq1_m_sycl;
|
| 10696 |
case GGML_TYPE_IQ2_XXS:
|
| 10697 |
return dequantize_row_iq2_xxs_sycl;
|
| 10698 |
case GGML_TYPE_IQ2_XS:
|
| 10699 |
return dequantize_row_iq2_xs_sycl;
|
| 10700 |
+
case GGML_TYPE_IQ2_S:
|
| 10701 |
+
return dequantize_row_iq2_s_sycl;
|
| 10702 |
case GGML_TYPE_IQ3_XXS:
|
| 10703 |
return dequantize_row_iq3_xxs_sycl;
|
| 10704 |
case GGML_TYPE_IQ3_S:
|
| 10705 |
return dequantize_row_iq3_s_sycl;
|
| 10706 |
+
case GGML_TYPE_IQ4_XS:
|
| 10707 |
+
return dequantize_row_iq4_xs_sycl;
|
| 10708 |
+
case GGML_TYPE_IQ4_NL:
|
| 10709 |
+
return dequantize_row_iq4_nl_sycl;
|
| 10710 |
case GGML_TYPE_F16:
|
| 10711 |
return convert_unary_sycl<sycl::half>;
|
| 10712 |
default:
|
|
|
|
| 11169 |
const sycl::range<3> block_nums(1, 1, block_num_y);
|
| 11170 |
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
| 11171 |
{
|
|
|
|
| 11172 |
stream->submit([&](sycl::handler &cgh) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11173 |
cgh.parallel_for(
|
| 11174 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 11175 |
[=](sycl::nd_item<3> item_ct1)
|
| 11176 |
[[intel::reqd_sub_group_size(32)]] {
|
| 11177 |
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS, block_iq2_xxs, 1>(
|
| 11178 |
+
vx, vy, dst, ncols, nrows, item_ct1);
|
|
|
|
| 11179 |
});
|
| 11180 |
});
|
| 11181 |
}
|
|
|
|
| 11200 |
[=](sycl::nd_item<3> item_ct1)
|
| 11201 |
[[intel::reqd_sub_group_size(32)]] {
|
| 11202 |
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS, block_iq2_xs, 1>(
|
| 11203 |
+
vx, vy, dst, ncols, nrows, item_ct1);
|
| 11204 |
+
});
|
| 11205 |
+
});
|
| 11206 |
+
}
|
| 11207 |
+
}
|
| 11208 |
+
|
| 11209 |
+
static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
|
| 11210 |
+
float *dst, const int ncols,
|
| 11211 |
+
const int nrows,
|
| 11212 |
+
dpct::queue_ptr stream) {
|
| 11213 |
+
GGML_ASSERT(ncols % QK_K == 0);
|
| 11214 |
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
| 11215 |
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
| 11216 |
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
| 11217 |
+
{
|
| 11218 |
+
|
| 11219 |
+
stream->submit([&](sycl::handler &cgh) {
|
| 11220 |
+
auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
|
| 11221 |
+
auto ksigns64_ptr_ct1 = &ksigns64[0];
|
| 11222 |
+
|
| 11223 |
+
cgh.parallel_for(
|
| 11224 |
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 11225 |
+
[=](sycl::nd_item<3> item_ct1)
|
| 11226 |
+
[[intel::reqd_sub_group_size(32)]] {
|
| 11227 |
+
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S, block_iq2_s, 1>(
|
| 11228 |
+
vx, vy, dst, ncols, nrows, item_ct1);
|
| 11229 |
});
|
| 11230 |
});
|
| 11231 |
}
|
|
|
|
| 11250 |
[=](sycl::nd_item<3> item_ct1)
|
| 11251 |
[[intel::reqd_sub_group_size(32)]] {
|
| 11252 |
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS, block_iq3_xxs, 1>(
|
| 11253 |
+
vx, vy, dst, ncols, nrows, item_ct1);
|
|
|
|
| 11254 |
});
|
| 11255 |
});
|
| 11256 |
}
|
|
|
|
| 11268 |
|
| 11269 |
stream->submit([&](sycl::handler &cgh) {
|
| 11270 |
auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
|
|
|
|
| 11271 |
|
| 11272 |
cgh.parallel_for(
|
| 11273 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 11274 |
[=](sycl::nd_item<3> item_ct1)
|
| 11275 |
[[intel::reqd_sub_group_size(32)]] {
|
| 11276 |
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
|
| 11277 |
+
vx, vy, dst, ncols, nrows, item_ct1);
|
|
|
|
| 11278 |
});
|
| 11279 |
});
|
| 11280 |
}
|
|
|
|
| 11299 |
[=](sycl::nd_item<3> item_ct1)
|
| 11300 |
[[intel::reqd_sub_group_size(32)]] {
|
| 11301 |
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
|
| 11302 |
+
vx, vy, dst, ncols, nrows, item_ct1);
|
| 11303 |
+
});
|
| 11304 |
+
});
|
| 11305 |
+
}
|
| 11306 |
+
}
|
| 11307 |
+
|
| 11308 |
+
static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
|
| 11309 |
+
float *dst, const int ncols,
|
| 11310 |
+
const int nrows,
|
| 11311 |
+
dpct::queue_ptr stream) {
|
| 11312 |
+
GGML_ASSERT(ncols % QK_K == 0);
|
| 11313 |
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
| 11314 |
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
| 11315 |
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
| 11316 |
+
{
|
| 11317 |
+
stream->submit([&](sycl::handler &cgh) {
|
| 11318 |
+
cgh.parallel_for(
|
| 11319 |
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 11320 |
+
[=](sycl::nd_item<3> item_ct1)
|
| 11321 |
+
[[intel::reqd_sub_group_size(32)]] {
|
| 11322 |
+
mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
|
| 11323 |
+
vx, vy, dst, ncols, nrows, item_ct1);
|
| 11324 |
+
});
|
| 11325 |
+
});
|
| 11326 |
+
}
|
| 11327 |
+
}
|
| 11328 |
+
|
| 11329 |
+
static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
|
| 11330 |
+
float *dst, const int ncols,
|
| 11331 |
+
const int nrows,
|
| 11332 |
+
dpct::queue_ptr stream) {
|
| 11333 |
+
GGML_ASSERT(ncols % QK4_NL == 0);
|
| 11334 |
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
| 11335 |
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
| 11336 |
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
| 11337 |
+
{
|
| 11338 |
+
|
| 11339 |
+
stream->submit([&](sycl::handler &cgh) {
|
| 11340 |
+
cgh.parallel_for(
|
| 11341 |
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 11342 |
+
[=](sycl::nd_item<3> item_ct1)
|
| 11343 |
+
[[intel::reqd_sub_group_size(32)]] {
|
| 11344 |
+
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>(
|
| 11345 |
+
vx, vy, dst, ncols, nrows, item_ct1);
|
| 11346 |
+
});
|
| 11347 |
+
});
|
| 11348 |
+
}
|
| 11349 |
+
}
|
| 11350 |
+
|
| 11351 |
+
static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
|
| 11352 |
+
float *dst, const int ncols,
|
| 11353 |
+
const int nrows,
|
| 11354 |
+
dpct::queue_ptr stream) {
|
| 11355 |
+
GGML_ASSERT(ncols % QK_K == 0);
|
| 11356 |
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
| 11357 |
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
| 11358 |
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
| 11359 |
+
{
|
| 11360 |
+
|
| 11361 |
+
stream->submit([&](sycl::handler &cgh) {
|
| 11362 |
+
cgh.parallel_for(
|
| 11363 |
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 11364 |
+
[=](sycl::nd_item<3> item_ct1)
|
| 11365 |
+
[[intel::reqd_sub_group_size(32)]] {
|
| 11366 |
+
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS, block_iq4_xs, 1>(
|
| 11367 |
+
vx, vy, dst, ncols, nrows, item_ct1);
|
| 11368 |
});
|
| 11369 |
});
|
| 11370 |
}
|
|
|
|
| 12988 |
});
|
| 12989 |
}
|
| 12990 |
|
| 12991 |
+
static int next_power_of_2(int x) {
|
| 12992 |
+
int n = 1;
|
| 12993 |
+
while (n < x) {
|
| 12994 |
+
n *= 2;
|
| 12995 |
+
}
|
| 12996 |
+
return n;
|
| 12997 |
+
}
|
| 12998 |
+
|
| 12999 |
static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
| 13000 |
const int nrows, ggml_sort_order order,
|
| 13001 |
dpct::queue_ptr stream) {
|
| 13002 |
// bitonic sort requires ncols to be power of 2
|
| 13003 |
+
const int ncols_pad = next_power_of_2(ncols);
|
| 13004 |
|
| 13005 |
+
const sycl::range<3> block_dims(1, 1, ncols_pad);
|
| 13006 |
const sycl::range<3> block_nums(1, nrows, 1);
|
| 13007 |
+
const size_t shared_mem = ncols_pad * sizeof(int);
|
| 13008 |
+
|
| 13009 |
+
// GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
|
| 13010 |
+
|
| 13011 |
if (order == GGML_SORT_ORDER_ASC) {
|
| 13012 |
+
stream->submit([&](sycl::handler &cgh) {
|
| 13013 |
+
sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
|
| 13014 |
+
sycl::range<1>(shared_mem), cgh);
|
| 13015 |
+
|
| 13016 |
+
cgh.parallel_for(
|
| 13017 |
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 13018 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 13019 |
+
k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(
|
| 13020 |
+
x, dst, ncols, ncols_pad, item_ct1,
|
| 13021 |
+
dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
|
| 13022 |
+
.get());
|
| 13023 |
+
});
|
| 13024 |
+
});
|
| 13025 |
} else if (order == GGML_SORT_ORDER_DESC) {
|
| 13026 |
+
stream->submit([&](sycl::handler &cgh) {
|
| 13027 |
+
sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
|
| 13028 |
+
sycl::range<1>(shared_mem), cgh);
|
| 13029 |
+
|
| 13030 |
+
cgh.parallel_for(
|
| 13031 |
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 13032 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 13033 |
+
k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(
|
| 13034 |
+
x, dst, ncols, ncols_pad, item_ct1,
|
| 13035 |
+
dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
|
| 13036 |
+
.get());
|
| 13037 |
+
});
|
| 13038 |
+
});
|
| 13039 |
} else {
|
| 13040 |
GGML_ASSERT(false);
|
| 13041 |
}
|
|
|
|
| 14163 |
case GGML_TYPE_Q5_K:
|
| 14164 |
case GGML_TYPE_IQ2_XXS:
|
| 14165 |
case GGML_TYPE_IQ2_XS:
|
| 14166 |
+
case GGML_TYPE_IQ2_S:
|
| 14167 |
case GGML_TYPE_IQ1_S:
|
| 14168 |
+
case GGML_TYPE_IQ1_M:
|
| 14169 |
case GGML_TYPE_IQ3_XXS:
|
| 14170 |
+
case GGML_TYPE_IQ4_XS:
|
| 14171 |
+
case GGML_TYPE_IQ4_NL:
|
| 14172 |
return max_compute_capability >= VER_GEN9 ? 128 : 64;
|
| 14173 |
case GGML_TYPE_IQ3_S:
|
| 14174 |
return max_compute_capability >= VER_GEN9 ? 128 : 64;
|
|
|
|
| 14187 |
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
| 14188 |
const dpct::queue_ptr &stream) {
|
| 14189 |
|
| 14190 |
+
const int64_t ne10 = src1->ne[0];
|
| 14191 |
+
GGML_ASSERT(ne10 % QK8_1 == 0);
|
| 14192 |
|
| 14193 |
const int64_t ne00 = src0->ne[0];
|
| 14194 |
const int64_t row_diff = row_high - row_low;
|
| 14195 |
|
| 14196 |
+
int id;
|
| 14197 |
+
SYCL_CHECK(
|
| 14198 |
+
CHECK_TRY_ERROR(id = get_current_device_id()));
|
| 14199 |
+
|
| 14200 |
+
// the main device has a larger memory buffer to hold the results from all GPUs
|
| 14201 |
+
// nrows_dst == nrows of the matrix that the kernel writes into
|
| 14202 |
+
const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne00 : row_diff;
|
| 14203 |
+
|
| 14204 |
switch (src0->type) {
|
| 14205 |
case GGML_TYPE_Q4_0:
|
| 14206 |
mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
|
|
| 14232 |
case GGML_TYPE_Q6_K:
|
| 14233 |
mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 14234 |
break;
|
| 14235 |
+
case GGML_TYPE_IQ1_S:
|
| 14236 |
+
mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 14237 |
+
break;
|
| 14238 |
+
case GGML_TYPE_IQ1_M:
|
| 14239 |
+
mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 14240 |
+
break;
|
| 14241 |
case GGML_TYPE_IQ2_XXS:
|
| 14242 |
mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 14243 |
break;
|
| 14244 |
case GGML_TYPE_IQ2_XS:
|
| 14245 |
mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 14246 |
break;
|
| 14247 |
+
case GGML_TYPE_IQ2_S:
|
| 14248 |
+
mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 14249 |
+
break;
|
| 14250 |
case GGML_TYPE_IQ3_XXS:
|
| 14251 |
mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 14252 |
break;
|
| 14253 |
case GGML_TYPE_IQ3_S:
|
| 14254 |
mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 14255 |
break;
|
| 14256 |
+
case GGML_TYPE_IQ4_NL:
|
| 14257 |
+
mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 14258 |
+
break;
|
| 14259 |
+
case GGML_TYPE_IQ4_XS:
|
| 14260 |
+
mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
| 14261 |
break;
|
| 14262 |
default:
|
| 14263 |
GGML_ASSERT(false);
|
|
|
|
| 14339 |
convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
| 14340 |
break;
|
| 14341 |
default:
|
| 14342 |
+
printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
|
| 14343 |
GGML_ASSERT(false);
|
| 14344 |
break;
|
| 14345 |
}
|
|
|
|
| 15194 |
src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10;
|
| 15195 |
}
|
| 15196 |
// do the computation
|
| 15197 |
+
SYCL_CHECK(CHECK_TRY_ERROR(op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
| 15198 |
+
dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream)));
|
| 15199 |
/*
|
| 15200 |
DPCT1010:93: SYCL uses exceptions to report errors and does not
|
| 15201 |
use the error codes. The call was replaced with 0. You need to
|
|
|
|
| 15776 |
#ifdef GGML_SYCL_FORCE_DMMV
|
| 15777 |
const bool use_mul_mat_vec_q = false;
|
| 15778 |
#else
|
| 15779 |
+
bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
|
| 15780 |
+
use_mul_mat_vec_q = use_mul_mat_vec_q ||
|
| 15781 |
+
(src0->type == GGML_TYPE_IQ2_XXS) || (src0->type == GGML_TYPE_IQ2_XS) || (src0->type == GGML_TYPE_IQ2_S) ||
|
| 15782 |
+
(src0->type == GGML_TYPE_IQ3_XXS) || (src0->type == GGML_TYPE_IQ3_S) ||
|
| 15783 |
+
(src0->type == GGML_TYPE_IQ4_NL) || (src0->type == GGML_TYPE_IQ4_XS) ||
|
| 15784 |
+
(src0->type == GGML_TYPE_IQ1_S) || (src0->type == GGML_TYPE_IQ1_M);
|
| 15785 |
+
|
| 15786 |
+
|
| 15787 |
#endif // GGML_SYCL_FORCE_DMMV
|
| 15788 |
|
| 15789 |
if (use_mul_mat_vec_q) {
|
|
|
|
| 17643 |
return false;
|
| 17644 |
}
|
| 17645 |
ggml_type a_type = a->type;
|
| 17646 |
+
if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS ||
|
| 17647 |
+
a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S ||
|
| 17648 |
+
a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ2_S ||
|
| 17649 |
+
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ1_M
|
| 17650 |
+
) {
|
| 17651 |
+
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
| 17652 |
+
return false;
|
| 17653 |
+
}
|
| 17654 |
}
|
| 17655 |
return true;
|
| 17656 |
} break;
|