Neo Zhang Jianyu committed
Commit 873102e · unverified · 1 parent: 21c143f

support/fix OPs GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M (llama/6521)
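For orientation: the IQ4_NL and IQ4_XS kernels added here decode 4-bit indices through the new kvalues_iq4nl codebook rather than scaling the raw nibble directly. Below is a host-side sketch of that per-block decode in plain C++, not the SYCL kernel itself; the block layout shown is a simplification (the real block_iq4_nl stores its scale as a half), but the lookup and the low-nibble/high-nibble split match dequantize_block_iq4_nl in this commit.

#include <cstdint>

// Non-linear 4-bit codebook introduced by this commit (kvalues_iq4nl).
static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

// Simplified illustration of one IQ4_NL block: one scale plus 16 packed bytes
// = 32 weights. (Hypothetical layout for this sketch; the real struct uses a
// ggml half-precision scale.)
struct iq4_nl_block_sketch {
    float   d;       // per-block scale
    uint8_t qs[16];  // two 4-bit codebook indices per byte
};

// Low nibbles fill y[0..15], high nibbles fill y[16..31], mirroring the
// y[j+0] / y[j+16] split in dequantize_block_iq4_nl.
static void dequantize_iq4_nl_sketch(const iq4_nl_block_sketch &b, float y[32]) {
    for (int j = 0; j < 16; ++j) {
        y[j +  0] = b.d * kvalues_iq4nl[b.qs[j] & 0xf];
        y[j + 16] = b.d * kvalues_iq4nl[b.qs[j] >> 4];
    }
}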

Files changed (1)
  1. ggml-sycl.cpp +876 -196
ggml-sycl.cpp CHANGED
@@ -3038,6 +3038,10 @@ typedef float dfloat; // dequantize float
3038
  typedef sycl::float2 dfloat2;
3039
  #endif //GGML_SYCL_F16
3040
3041
  bool ggml_sycl_loaded(void);
3042
  void * ggml_sycl_host_malloc(size_t size);
3043
  void ggml_sycl_host_free(void * ptr);
@@ -4473,6 +4477,32 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
4473
 
4474
  }
4475
4476
  template<typename dst_t>
4477
  static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
4478
  const sycl::nd_item<3> &item_ct1,
@@ -4505,26 +4535,26 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
4505
 
4506
  }
4507
 
4508
- template<typename dst_t>
4509
- static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
4510
- const sycl::nd_item<3> &item_ct1,
4511
- const uint32_t *iq3s_grid,
4512
- const uint8_t *ksigns_iq2xs,
4513
- const uint8_t *kmask_iq2xs) {
4514
 
4515
  const int i = item_ct1.get_group(2);
4516
- const block_iq3_s * x = (const block_iq3_s *) vx;
4517
 
4518
  const int tid = item_ct1.get_local_id(2);
4519
  #if QK_K == 256
4520
  const int il = tid/8; // 0...3
4521
  const int ib = tid%8; // 0...7
4522
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
4523
- const uint8_t * qs = x[i].qs + 8*ib;
4524
- const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + qs[2*il+0]);
4525
- const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + qs[2*il+1]);
4526
  const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
4527
  const uint8_t signs = x[i].signs[4*ib + il];
 
4528
  for (int j = 0; j < 4; ++j) {
4529
  y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
4530
  y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
@@ -4535,12 +4565,12 @@ static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restr
4535
 
4536
  }
4537
 
4538
- template<typename dst_t>
4539
- static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
4540
- const sycl::nd_item<3> &item_ct1,
4541
- const uint32_t *iq1s_grid,
4542
- const uint8_t *ksigns_iq2xs,
4543
- const uint8_t *kmask_iq2xs) {
4544
  const int i = item_ct1.get_group(2);
4545
  const block_iq1_s * x = (const block_iq1_s *) vx;
4546
 
@@ -4549,14 +4579,49 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr
4549
  const int il = tid/8; // 0...3
4550
  const int ib = tid%8; // 0...7
4551
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
4552
- const uint8_t * qs = x[i].qs + 8*ib;
4553
- const uint8_t * grid1 = (const uint8_t *)(iq1s_grid + qs[2*il+0]);
4554
- const uint8_t * grid2 = (const uint8_t *)(iq1s_grid + qs[2*il+1]);
4555
- const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 0xf) + 1);
4556
- const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7];
4557
- for (int j = 0; j < 4; ++j) {
4558
- y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
4559
- y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
4560
  }
4561
  #else
4562
  assert(false);
@@ -4564,6 +4629,51 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr
4564
 
4565
  }
4566
 
4567
  /*
4568
  DPCT1110:4: The total declared local variable size in device function
4569
  dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
@@ -7370,6 +7480,58 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
7370
  #endif
7371
  }
7372
7373
  static __dpct_inline__ float
7374
  vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
7375
  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
@@ -7412,10 +7574,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
7412
 
7413
  static __dpct_inline__ float
7414
  vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
7415
- const block_q8_1 *__restrict__ bq8_1, const int &iqs,
7416
- const uint32_t *iq3s_grid, const uint64_t *ksigns64) {
7417
- #if DPCT_COMPATIBILITY_TEMP >= \
7418
- MIN_CC_DP4A // lowest compute capability for integer intrinsics
7419
  #if QK_K == 256
7420
  const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
7421
 
@@ -7427,9 +7587,11 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
7427
  const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
7428
  const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
7429
  uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
7430
- ((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
 
7431
  uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
7432
- ((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
 
7433
  const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
7434
  grid1[0] ^ signs0, signs0, std::minus<>());
7435
  const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
@@ -7438,45 +7600,142 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
7438
  sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
7439
  q8 += 8;
7440
  }
7441
- const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * bq8_1[ib32].ds[0];
 
 
 
7442
  return d * sumi;
7443
  #else
7444
  assert(false);
7445
- return 0.f;
7446
- #endif
7447
- #else
7448
- assert(false);
7449
- return 0.f;
7450
  #endif
7451
  }
7452
 
7453
  static __dpct_inline__ float
7454
  vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
7455
- const block_q8_1 *__restrict__ bq8_1, const int &iqs,
7456
- const uint32_t *iq1s_grid, const uint64_t *ksigns64) {
7457
  #if QK_K == 256
7458
  const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
7459
 
7460
  const int ib32 = iqs;
7461
- const uint8_t * qs = bq1->qs + 4*ib32;
7462
- const int8_t * q8 = bq8_1[ib32].qs;
7463
  int sumi = 0;
 
7464
  for (int l = 0; l < 4; ++l) {
7465
- const uint32_t * grid = (const uint32_t *)(iq1s_grid + qs[l]);
7466
- const uint32_t * signs = (const uint32_t *)(ksigns64 + (qs[l] >> 8));
7467
- const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
7468
- grid[0] ^ signs[0], signs[0], std::minus<>());
7469
- const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
7470
- grid[1] ^ signs[1], signs[1], std::minus<>());
7471
- sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
7472
- sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
7473
- q8 += 8;
7474
  }
7475
- const float d = (float)bq1->d * bq8_1[ib32].ds[0] * 0.25f;
7476
- return d * sumi;
7477
  #else
7478
  assert(false);
7479
- return 0.f;
7480
  #endif
7481
  }
7482
 
@@ -8061,8 +8320,199 @@ template <bool need_check> static void
8061
 
8062
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
8063
  static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
8064
- const sycl::nd_item<3> &item_ct1,
8065
- const uint32_t *iq3xxs_grid_ptr=nullptr, const uint64_t *ksigns64_ptr=nullptr) {
8066
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8067
  item_ct1.get_local_id(1);
8068
 
@@ -8090,7 +8540,7 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
8090
  (item_ct1.get_local_id(2) %
8091
  (qi / vdr)); // x block quant index when casting the quants to int
8092
 
8093
- tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
8094
  }
8095
 
8096
  // sum up partial sums and write back result
@@ -8106,10 +8556,11 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
8106
  }
8107
 
8108
  template <int qk, int qi, typename block_q_t, int vdr>
8109
- static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
8110
- const sycl::nd_item<3> &item_ct1,
8111
- const uint64_t *iq2xxs_grid_ptr, const uint8_t *ksigns_iq2xs_ptr,
8112
- const uint8_t *kmask_iq2xs_ptr ) {
 
8113
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8114
  item_ct1.get_local_id(1);
8115
 
@@ -8137,7 +8588,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void
8137
  (item_ct1.get_local_id(2) %
8138
  (qi / vdr)); // x block quant index when casting the quants to int
8139
 
8140
- tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid_ptr, ksigns_iq2xs_ptr, kmask_iq2xs_ptr);
8141
  }
8142
 
8143
  // sum up partial sums and write back result
@@ -8153,9 +8604,11 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void
8153
  }
8154
 
8155
  template <int qk, int qi, typename block_q_t, int vdr>
8156
- static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
8157
- const sycl::nd_item<3> &item_ct1,
8158
- const uint64_t *iq2xs_grid_ptr, const uint64_t *ksigns64_ptr ) {
 
 
8159
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8160
  item_ct1.get_local_id(1);
8161
 
@@ -8183,7 +8636,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void *
8183
  (item_ct1.get_local_id(2) %
8184
  (qi / vdr)); // x block quant index when casting the quants to int
8185
 
8186
- tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid_ptr, ksigns64_ptr);
8187
  }
8188
 
8189
  // sum up partial sums and write back result
@@ -8199,9 +8652,11 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void *
8199
  }
8200
 
8201
  template <int qk, int qi, typename block_q_t, int vdr>
8202
- static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
8203
- const sycl::nd_item<3> &item_ct1,
8204
- const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr ) {
 
 
8205
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8206
  item_ct1.get_local_id(1);
8207
 
@@ -8229,7 +8684,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void
8229
  (item_ct1.get_local_id(2) %
8230
  (qi / vdr)); // x block quant index when casting the quants to int
8231
 
8232
- tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid_ptr, ksigns64_ptr);
8233
  }
8234
 
8235
  // sum up partial sums and write back result
@@ -8245,9 +8700,11 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void
8245
  }
8246
 
8247
  template <int qk, int qi, typename block_q_t, int vdr>
8248
- static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
8249
- const sycl::nd_item<3> &item_ct1,
8250
- const uint32_t *iq3s_grid_ptr, const uint64_t *ksigns64_ptr ) {
 
 
8251
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8252
  item_ct1.get_local_id(1);
8253
 
@@ -8275,7 +8732,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void *
8275
  (item_ct1.get_local_id(2) %
8276
  (qi / vdr)); // x block quant index when casting the quants to int
8277
 
8278
- tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid_ptr, ksigns64_ptr);
8279
  }
8280
 
8281
  // sum up partial sums and write back result
@@ -8290,10 +8747,13 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void *
8290
  }
8291
  }
8292
 
 
8293
  template <int qk, int qi, typename block_q_t, int vdr>
8294
- static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
8295
- const sycl::nd_item<3> &item_ct1,
8296
- const uint32_t *iq1s_grid_ptr, const uint64_t *ksigns64_ptr ) {
 
 
8297
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8298
  item_ct1.get_local_id(1);
8299
 
@@ -8321,7 +8781,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void *
8321
  (item_ct1.get_local_id(2) %
8322
  (qi / vdr)); // x block quant index when casting the quants to int
8323
 
8324
- tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_ptr, ksigns64_ptr);
8325
  }
8326
 
8327
  // sum up partial sums and write back result
@@ -8336,6 +8796,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void *
8336
  }
8337
  }
8338
 
 
8339
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
8340
  static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
8341
  const sycl::nd_item<3> &item_ct1) {
@@ -8897,64 +9358,71 @@ static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
8897
  }
8898
  }
8899
 
 
8900
  template<typename T>
8901
- static inline void swap(T & a, T & b) {
8902
  T tmp = a;
8903
  a = b;
8904
  b = tmp;
8905
  }
8906
 
8907
- template<ggml_sort_order order>
8908
- static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
8909
- const sycl::nd_item<3> &item_ct1) {
 
8910
  // bitonic sort
8911
  int col = item_ct1.get_local_id(2);
8912
  int row = item_ct1.get_group(1);
8913
 
8914
- if (col >= ncols) return;
 
 
8915
 
8916
  const float * x_row = x + row * ncols;
8917
- int * dst_row = dst + row * ncols;
8918
 
8919
  // initialize indices
8920
- if (col < ncols) {
8921
- dst_row[col] = col;
8922
- }
8923
- /*
8924
- DPCT1065:58: Consider replacing sycl::nd_item::barrier() with
8925
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
8926
- performance if there is no access to global memory.
8927
- */
8928
- item_ct1.barrier();
8929
 
8930
- for (int k = 2; k <= ncols; k *= 2) {
8931
  for (int j = k / 2; j > 0; j /= 2) {
8932
  int ixj = col ^ j;
8933
  if (ixj > col) {
8934
  if ((col & k) == 0) {
8935
- if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
8936
- swap(dst_row[col], dst_row[ixj]);
 
 
 
 
8937
  }
8938
  } else {
8939
- if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
8940
- swap(dst_row[col], dst_row[ixj]);
 
 
 
 
8941
  }
8942
  }
8943
  }
8944
  /*
8945
- DPCT1118:11: SYCL group functions and algorithms must be encountered
8946
  in converged control flow. You may need to adjust the code.
8947
  */
8948
- /*
8949
- DPCT1065:59: Consider replacing sycl::nd_item::barrier() with
8950
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
8951
- better performance if there is no access to global memory.
8952
- */
8953
- item_ct1.barrier();
8954
  }
8955
  }
 
 
 
 
 
8956
  }
8957
 
 
8958
  static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past,
8959
  const sycl::nd_item<3> &item_ct1) {
8960
  const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
@@ -9933,28 +10401,64 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
9933
  #endif
9934
  }
9935
 
9936
-
9937
  template <typename dst_t>
9938
- static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
9939
  dpct::queue_ptr stream) {
9940
  const int nb = k / QK_K;
9941
  {
 
9942
9943
  dpct::has_capability_or_fail(stream->get_device(),
9944
  {sycl::aspect::fp16});
9945
 
9946
  stream->submit([&](sycl::handler &cgh) {
9947
- auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
9948
- auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
9949
- auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
9950
 
 
9951
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
9952
  sycl::range<3>(1, 1, 32),
9953
  sycl::range<3>(1, 1, 32)),
9954
  [=](sycl::nd_item<3> item_ct1) {
9955
  dequantize_block_iq2_xxs(
9956
- vx, y, item_ct1, iq2xxs_grid_ptr_ct1,
9957
- ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
9958
  });
9959
  });
9960
  }
@@ -9965,105 +10469,130 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
9965
  dpct::queue_ptr stream) {
9966
  const int nb = k / QK_K;
9967
  {
9968
-
9969
  dpct::has_capability_or_fail(stream->get_device(),
9970
  {sycl::aspect::fp16});
9971
 
9972
  stream->submit([&](sycl::handler &cgh) {
9973
- auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
9974
- auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
9975
- auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
9976
-
9977
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
9978
  sycl::range<3>(1, 1, 32),
9979
  sycl::range<3>(1, 1, 32)),
9980
  [=](sycl::nd_item<3> item_ct1) {
9981
  dequantize_block_iq2_xs(
9982
- vx, y, item_ct1, iq2xs_grid_ptr_ct1,
9983
- ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
9984
  });
9985
  });
9986
  }
9987
  }
9988
 
9989
  template <typename dst_t>
9990
- static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
9991
- dpct::queue_ptr stream) {
9992
  const int nb = k / QK_K;
9993
  {
9994
-
9995
  dpct::has_capability_or_fail(stream->get_device(),
9996
  {sycl::aspect::fp16});
9997
 
9998
  stream->submit([&](sycl::handler &cgh) {
9999
- auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
10000
- auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
10001
- auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
10002
-
10003
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10004
  sycl::range<3>(1, 1, 32),
10005
  sycl::range<3>(1, 1, 32)),
10006
  [=](sycl::nd_item<3> item_ct1) {
10007
- dequantize_block_iq3_xxs(
10008
- vx, y, item_ct1, iq3xxs_grid_ptr_ct1,
10009
- ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
10010
  });
10011
  });
10012
  }
10013
  }
10014
 
 
10015
  template <typename dst_t>
10016
- static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
10017
  dpct::queue_ptr stream) {
10018
  const int nb = k / QK_K;
10019
  {
10020
-
10021
  dpct::has_capability_or_fail(stream->get_device(),
10022
  {sycl::aspect::fp16});
10023
 
10024
  stream->submit([&](sycl::handler &cgh) {
10025
- auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
10026
- auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
10027
- auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
10028
-
10029
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10030
  sycl::range<3>(1, 1, 32),
10031
  sycl::range<3>(1, 1, 32)),
10032
  [=](sycl::nd_item<3> item_ct1) {
10033
- dequantize_block_iq3_s(
10034
- vx, y, item_ct1, iq3s_grid_ptr_ct1,
10035
- ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
10036
  });
10037
  });
10038
  }
10039
  }
10040
 
10041
  template <typename dst_t>
10042
- static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
10043
  dpct::queue_ptr stream) {
10044
  const int nb = k / QK_K;
10045
  {
10046
-
10047
  dpct::has_capability_or_fail(stream->get_device(),
10048
  {sycl::aspect::fp16});
10049
 
10050
  stream->submit([&](sycl::handler &cgh) {
10051
- auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
10052
- auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
10053
- auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
10054
-
10055
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10056
  sycl::range<3>(1, 1, 32),
10057
  sycl::range<3>(1, 1, 32)),
10058
  [=](sycl::nd_item<3> item_ct1) {
10059
- dequantize_block_iq1_s(
10060
- vx, y, item_ct1, iq1s_grid_ptr_ct1,
10061
- ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
10062
  });
10063
  });
10064
  }
10065
  }
10066
 
10067
  template <typename src_t, typename dst_t>
10068
  static void convert_unary_sycl(const void *__restrict__ vx,
10069
  dst_t *__restrict__ y, const int k,
@@ -10108,16 +10637,24 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try {
10108
  return dequantize_row_q5_K_sycl;
10109
  case GGML_TYPE_Q6_K:
10110
  return dequantize_row_q6_K_sycl;
 
 
 
 
10111
  case GGML_TYPE_IQ2_XXS:
10112
  return dequantize_row_iq2_xxs_sycl;
10113
  case GGML_TYPE_IQ2_XS:
10114
  return dequantize_row_iq2_xs_sycl;
 
 
10115
  case GGML_TYPE_IQ3_XXS:
10116
  return dequantize_row_iq3_xxs_sycl;
10117
  case GGML_TYPE_IQ3_S:
10118
  return dequantize_row_iq3_s_sycl;
10119
- case GGML_TYPE_IQ1_S:
10120
- return dequantize_row_iq1_s_sycl;
 
 
10121
  case GGML_TYPE_F32:
10122
  return convert_unary_sycl<float>;
10123
  default:
@@ -10152,16 +10689,24 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
10152
  return dequantize_row_q5_K_sycl;
10153
  case GGML_TYPE_Q6_K:
10154
  return dequantize_row_q6_K_sycl;
 
 
 
 
10155
  case GGML_TYPE_IQ2_XXS:
10156
  return dequantize_row_iq2_xxs_sycl;
10157
  case GGML_TYPE_IQ2_XS:
10158
  return dequantize_row_iq2_xs_sycl;
 
 
10159
  case GGML_TYPE_IQ3_XXS:
10160
  return dequantize_row_iq3_xxs_sycl;
10161
  case GGML_TYPE_IQ3_S:
10162
  return dequantize_row_iq3_s_sycl;
10163
- case GGML_TYPE_IQ1_S:
10164
- return dequantize_row_iq1_s_sycl;
 
 
10165
  case GGML_TYPE_F16:
10166
  return convert_unary_sycl<sycl::half>;
10167
  default:
@@ -10624,19 +11169,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
10624
  const sycl::range<3> block_nums(1, 1, block_num_y);
10625
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10626
  {
10627
-
10628
  stream->submit([&](sycl::handler &cgh) {
10629
- auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
10630
- auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
10631
- auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
10632
-
10633
  cgh.parallel_for(
10634
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
10635
  [=](sycl::nd_item<3> item_ct1)
10636
  [[intel::reqd_sub_group_size(32)]] {
10637
  mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS, block_iq2_xxs, 1>(
10638
- vx, vy, dst, ncols, nrows, item_ct1,
10639
- iq2xxs_grid_ptr_ct1, ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
10640
  });
10641
  });
10642
  }
@@ -10661,8 +11200,32 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
10661
  [=](sycl::nd_item<3> item_ct1)
10662
  [[intel::reqd_sub_group_size(32)]] {
10663
  mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS, block_iq2_xs, 1>(
10664
- vx, vy, dst, ncols, nrows, item_ct1,
10665
- iq2xs_grid_ptr_ct1, ksigns64_ptr_ct1);
10666
  });
10667
  });
10668
  }
@@ -10687,8 +11250,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
10687
  [=](sycl::nd_item<3> item_ct1)
10688
  [[intel::reqd_sub_group_size(32)]] {
10689
  mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS, block_iq3_xxs, 1>(
10690
- vx, vy, dst, ncols, nrows, item_ct1,
10691
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10692
  });
10693
  });
10694
  }
@@ -10706,15 +11268,13 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
10706
 
10707
  stream->submit([&](sycl::handler &cgh) {
10708
  auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
10709
- auto ksigns64_ptr_ct1 = &ksigns64[0];
10710
 
10711
  cgh.parallel_for(
10712
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
10713
  [=](sycl::nd_item<3> item_ct1)
10714
  [[intel::reqd_sub_group_size(32)]] {
10715
  mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
10716
- vx, vy, dst, ncols, nrows, item_ct1,
10717
- iq3s_grid_ptr_ct1, ksigns64_ptr_ct1);
10718
  });
10719
  });
10720
  }
@@ -10739,8 +11299,72 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
10739
  [=](sycl::nd_item<3> item_ct1)
10740
  [[intel::reqd_sub_group_size(32)]] {
10741
  mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
10742
- vx, vy, dst, ncols, nrows, item_ct1,
10743
- iq1s_grid_ptr_ct1, ksigns64_ptr_ct1);
10744
  });
10745
  });
10746
  }
@@ -12364,36 +12988,54 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
12364
  });
12365
  }
12366
 
 
 
 
 
 
 
 
 
12367
  static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
12368
  const int nrows, ggml_sort_order order,
12369
  dpct::queue_ptr stream) {
12370
  // bitonic sort requires ncols to be power of 2
12371
- GGML_ASSERT((ncols & (ncols - 1)) == 0);
12372
 
12373
- const sycl::range<3> block_dims(1, 1, ncols);
12374
  const sycl::range<3> block_nums(1, nrows, 1);
 
 
 
 
12375
  if (order == GGML_SORT_ORDER_ASC) {
12376
- /*
12377
- DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
12378
- the limit. To get the device limit, query
12379
- info::device::max_work_group_size. Adjust the work-group size if needed.
12380
- */
12381
- stream->parallel_for(
12382
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
12383
- [=](sycl::nd_item<3> item_ct1) {
12384
- k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
12385
- });
 
 
 
12386
  } else if (order == GGML_SORT_ORDER_DESC) {
12387
- /*
12388
- DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
12389
- the limit. To get the device limit, query
12390
- info::device::max_work_group_size. Adjust the work-group size if needed.
12391
- */
12392
- stream->parallel_for(
12393
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
12394
- [=](sycl::nd_item<3> item_ct1) {
12395
- k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
12396
- });
 
 
 
12397
  } else {
12398
  GGML_ASSERT(false);
12399
  }
@@ -13521,8 +14163,12 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
13521
  case GGML_TYPE_Q5_K:
13522
  case GGML_TYPE_IQ2_XXS:
13523
  case GGML_TYPE_IQ2_XS:
 
13524
  case GGML_TYPE_IQ1_S:
 
13525
  case GGML_TYPE_IQ3_XXS:
 
 
13526
  return max_compute_capability >= VER_GEN9 ? 128 : 64;
13527
  case GGML_TYPE_IQ3_S:
13528
  return max_compute_capability >= VER_GEN9 ? 128 : 64;
@@ -13541,11 +14187,20 @@ inline void ggml_sycl_op_mul_mat_vec_q(
13541
  const int64_t src1_ncols, const int64_t src1_padded_row_size,
13542
  const dpct::queue_ptr &stream) {
13543
 
13544
- GGML_ASSERT(ggml_nrows(src1) == 1);
 
13545
 
13546
  const int64_t ne00 = src0->ne[0];
13547
  const int64_t row_diff = row_high - row_low;
13548
 
 
 
 
 
 
 
 
 
13549
  switch (src0->type) {
13550
  case GGML_TYPE_Q4_0:
13551
  mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
@@ -13577,20 +14232,32 @@ inline void ggml_sycl_op_mul_mat_vec_q(
13577
  case GGML_TYPE_Q6_K:
13578
  mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
13579
  break;
 
 
 
 
 
 
13580
  case GGML_TYPE_IQ2_XXS:
13581
  mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
13582
  break;
13583
  case GGML_TYPE_IQ2_XS:
13584
  mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
13585
  break;
 
 
 
13586
  case GGML_TYPE_IQ3_XXS:
13587
  mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
13588
  break;
13589
  case GGML_TYPE_IQ3_S:
13590
  mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
13591
  break;
13592
- case GGML_TYPE_IQ1_S:
13593
- mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
 
 
 
13594
  break;
13595
  default:
13596
  GGML_ASSERT(false);
@@ -13672,6 +14339,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
13672
  convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
13673
  break;
13674
  default:
 
13675
  GGML_ASSERT(false);
13676
  break;
13677
  }
@@ -14526,8 +15194,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
14526
  src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10;
14527
  }
14528
  // do the computation
14529
- op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
14530
- dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream);
14531
  /*
14532
  DPCT1010:93: SYCL uses exceptions to report errors and does not
14533
  use the error codes. The call was replaced with 0. You need to
@@ -15108,7 +15776,14 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
15108
  #ifdef GGML_SYCL_FORCE_DMMV
15109
  const bool use_mul_mat_vec_q = false;
15110
  #else
15111
- const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
 
 
 
 
 
 
 
15112
  #endif // GGML_SYCL_FORCE_DMMV
15113
 
15114
  if (use_mul_mat_vec_q) {
@@ -16968,9 +17643,14 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
16968
  return false;
16969
  }
16970
  ggml_type a_type = a->type;
16971
- if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S ||
16972
- a_type == GGML_TYPE_IQ4_XS) {
16973
- return false;
 
 
 
 
 
16974
  }
16975
  return true;
16976
  } break;
 
3038
  typedef sycl::float2 dfloat2;
3039
  #endif //GGML_SYCL_F16
3040
 
3041
+ #define MMVQ_MAX_BATCH_SIZE 8
3042
+
3043
+ static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
3044
+
3045
  bool ggml_sycl_loaded(void);
3046
  void * ggml_sycl_host_malloc(size_t size);
3047
  void ggml_sycl_host_free(void * ptr);
 
4477
 
4478
  }
4479
 
4480
+ template <typename dst_t>
4481
+ __dpct_inline__ static void
4482
+ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
4483
+ const sycl::nd_item<3> &item_ct1) {
4484
+
4485
+ const int i = item_ct1.get_group(2);
4486
+ const block_iq2_s * x = (const block_iq2_s *) vx;
4487
+
4488
+ const int tid = item_ct1.get_local_id(2);
4489
+ #if QK_K == 256
4490
+ const int il = tid/8; // 0...3
4491
+ const int ib = tid%8; // 0...7
4492
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
4493
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
4494
+ const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
4495
+ const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
4496
+ #pragma unroll
4497
+ for (int j = 0; j < 8; ++j)
4498
+ y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
4499
+ #else
4500
+ assert(false);
4501
+
4502
+ #endif
4503
+
4504
+ }
4505
+
4506
  template<typename dst_t>
4507
  static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
4508
  const sycl::nd_item<3> &item_ct1,
 
4535
 
4536
  }
4537
 
4538
+ template <typename dst_t>
4539
+ __dpct_inline__ static void
4540
+ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
4541
+ const sycl::nd_item<3> &item_ct1,
4542
+ const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) {
 
4543
 
4544
  const int i = item_ct1.get_group(2);
4545
+ const block_iq3_s * x = (const block_iq3_s *) vx;
4546
 
4547
  const int tid = item_ct1.get_local_id(2);
4548
  #if QK_K == 256
4549
  const int il = tid/8; // 0...3
4550
  const int ib = tid%8; // 0...7
4551
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
4552
+ const uint8_t * qs = x[i].qs + 8*ib;
4553
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
4554
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
4555
  const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
4556
  const uint8_t signs = x[i].signs[4*ib + il];
4557
+ #pragma unroll
4558
  for (int j = 0; j < 4; ++j) {
4559
  y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
4560
  y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
 
4565
 
4566
  }
4567
 
4568
+ template <typename dst_t>
4569
+ __dpct_inline__ static void
4570
+ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
4571
+ const sycl::nd_item<3> &item_ct1,
4572
+ const uint32_t *iq1s_grid_gpu) {
4573
+
4574
  const int i = item_ct1.get_group(2);
4575
  const block_iq1_s * x = (const block_iq1_s *) vx;
4576
 
 
4579
  const int il = tid/8; // 0...3
4580
  const int ib = tid%8; // 0...7
4581
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
4582
+ const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
4583
+ const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
4584
+ uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
4585
+ grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
4586
+ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
4587
+ grid32[0] &= 0x0f0f0f0f;
4588
+ #pragma unroll
4589
+ for (int j = 0; j < 8; ++j) {
4590
+ y[j] = d * (q[j] + delta);
4591
+ }
4592
+ #else
4593
+ assert(false);
4594
+ #endif
4595
+
4596
+ }
4597
+
4598
+ template <typename dst_t>
4599
+ __dpct_inline__ static void
4600
+ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
4601
+ const sycl::nd_item<3> &item_ct1,
4602
+ const uint32_t *iq1s_grid_gpu) {
4603
+
4604
+ const int i = item_ct1.get_group(2);
4605
+ const block_iq1_m * x = (const block_iq1_m *) vx;
4606
+
4607
+ const int tid = item_ct1.get_local_id(2);
4608
+ #if QK_K == 256
4609
+ const int il = tid/8; // 0...3
4610
+ const int ib = tid%8; // 0...7
4611
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
4612
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
4613
+ iq1m_scale_t scale;
4614
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
4615
+ const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
4616
+ const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
4617
+ const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
4618
+ uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
4619
+ grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
4620
+ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
4621
+ grid32[0] &= 0x0f0f0f0f;
4622
+ #pragma unroll
4623
+ for (int j = 0; j < 8; ++j) {
4624
+ y[j] = d * (q[j] + delta);
4625
  }
4626
  #else
4627
  assert(false);
 
4629
 
4630
  }
4631
 
4632
+ template <typename dst_t>
4633
+ __dpct_inline__ static void
4634
+ dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy,
4635
+ const sycl::nd_item<3> &item_ct1) {
4636
+
4637
+ const int i = item_ct1.get_group(2);
4638
+ const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
4639
+
4640
+ const int tid = item_ct1.get_local_id(2);
4641
+ const int il = tid/8; // 0...3
4642
+ const int ib = tid%8; // 0...7
4643
+ dst_t * y = yy + i*QK_K + 32*ib + 4*il;
4644
+ const uint8_t * q4 = x[ib].qs + 4*il;
4645
+ const float d = (float)x[ib].d;
4646
+ #pragma unroll
4647
+ for (int j = 0; j < 4; ++j) {
4648
+ y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
4649
+ y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
4650
+ }
4651
+
4652
+ }
4653
+
4654
+
4655
+ template <typename dst_t>
4656
+ __dpct_inline__ static void
4657
+ dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy,
4658
+ const sycl::nd_item<3> &item_ct1) {
4659
+ const int i = item_ct1.get_group(2);
4660
+ const block_iq4_xs * x = (const block_iq4_xs *)vx;
4661
+
4662
+ const int tid = item_ct1.get_local_id(2);
4663
+ const int il = tid/8; // 0...3
4664
+ const int ib = tid%8; // 0...7
4665
+ dst_t * y = yy + i*QK_K + 32*ib + 4*il;
4666
+ const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
4667
+ const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
4668
+ #pragma unroll
4669
+ for (int j = 0; j < 4; ++j) {
4670
+ y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
4671
+ y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
4672
+ }
4673
+ }
4674
+
4675
+
4676
+
4677
  /*
4678
  DPCT1110:4: The total declared local variable size in device function
4679
  dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
 
7480
  #endif
7481
  }
7482
 
7483
+ static __dpct_inline__ float
7484
+ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
7485
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
7486
+ #if QK_K == 256
7487
+ const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
7488
+
7489
+ const int ib32 = iqs;
7490
+ const int8_t * q8 = bq8_1[ib32].qs;
7491
+ const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
7492
+ const uint8_t ls1 = bq2->scales[ib32] & 0xf;
7493
+ const uint8_t ls2 = bq2->scales[ib32] >> 4;
7494
+ int sumi1 = 0;
7495
+ for (int l = 0; l < 2; ++l) {
7496
+ const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
7497
+ const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
7498
+ ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
7499
+ std::equal_to<>());
7500
+ const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
7501
+ ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
7502
+ std::equal_to<>());
7503
+ const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
7504
+ grid[0] ^ signs0, signs0, std::minus<>());
7505
+ const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
7506
+ grid[1] ^ signs1, signs1, std::minus<>());
7507
+ sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
7508
+ sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
7509
+ q8 += 8;
7510
+ }
7511
+ int sumi2 = 0;
7512
+ for (int l = 2; l < 4; ++l) {
7513
+ const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
7514
+ const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
7515
+ ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
7516
+ std::equal_to<>());
7517
+ const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
7518
+ ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
7519
+ std::equal_to<>());
7520
+ const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
7521
+ grid[0] ^ signs0, signs0, std::minus<>());
7522
+ const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
7523
+ grid[1] ^ signs1, signs1, std::minus<>());
7524
+ sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
7525
+ sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
7526
+ q8 += 8;
7527
+ }
7528
+ const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
7529
+ return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
7530
+ #else
7531
+ assert(false);
7532
+ #endif
7533
+ }
7534
+
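Both vec_dot_iq2_s_q8_1 above and vec_dot_iq3_s_q8_1 below expand four packed sign bits into a per-byte 0x00/0xFF mask (the `(s * 0x01010101) & 0x08040201` compare) and then negate the selected grid bytes with `(x ^ mask) - mask` before the dp4a dot products. The following is a standalone C++ sketch of that bit trick with made-up example values; a byte-wise nonzero test stands in for dpct::vectorized_binary with std::equal_to, which is equivalent here because each selected byte is either zero or its mask bit.

#include <cstdint>
#include <cstdio>

int main() {
    const uint8_t  signs     = 0b1010;                       // sign bits for 4 weights
    const uint32_t broadcast = (signs & 0xf) * 0x01010101u;  // copy nibble into every byte
    const uint32_t selected  = broadcast & 0x08040201u;      // keep sign bit j in byte j

    const uint8_t grid[4] = {1, 5, 9, 13};                   // example grid magnitudes
    for (int j = 0; j < 4; ++j) {
        // byte j of `selected` is nonzero exactly when sign bit j is set
        const uint8_t m = ((selected >> (8 * j)) & 0xff) ? 0xff : 0x00;
        // (x ^ 0xFF) - 0xFF == -x per byte: branch-free conditional negation
        const int8_t v = (int8_t)(((grid[j] ^ m) - m) & 0xff);
        printf("w[%d] = %d\n", j, v);                         // prints 1, -5, 9, -13
    }
    return 0;
}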
7535
  static __dpct_inline__ float
7536
  vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
7537
  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
 
7574
 
7575
  static __dpct_inline__ float
7576
  vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
7577
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs,
7578
+ const uint32_t *iq3s_grid) {
 
 
7579
  #if QK_K == 256
7580
  const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
7581
 
 
7587
  const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
7588
  const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
7589
  uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
7590
+ ((bq2->signs[4 * ib32 + l] & 0xf) * 0x01010101) & 0x08040201,
7591
+ 0x08040201, std::equal_to<>());
7592
  uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
7593
+ ((bq2->signs[4 * ib32 + l] >> 4) * 0x01010101) & 0x08040201,
7594
+ 0x08040201, std::equal_to<>());
7595
  const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
7596
  grid1[0] ^ signs0, signs0, std::minus<>());
7597
  const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
 
7600
  sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
7601
  q8 += 8;
7602
  }
7603
+ const float d =
7604
+ (float)bq2->d *
7605
+ (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
7606
+ bq8_1[ib32].ds[0];
7607
  return d * sumi;
7608
  #else
7609
  assert(false);
 
 
 
 
 
7610
  #endif
7611
  }
7612
 
7613
  static __dpct_inline__ float
7614
  vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
7615
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs,
7616
+ const uint32_t *iq1s_grid_gpu) {
7617
  #if QK_K == 256
7618
  const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
7619
 
7620
  const int ib32 = iqs;
 
 
7621
  int sumi = 0;
7622
+ const int * q8 = (const int *)bq8_1[ib32].qs;
7623
  for (int l = 0; l < 4; ++l) {
7624
+ const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
7625
+ int grid0 = grid[0] & 0x0f0f0f0f;
7626
+ int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
7627
+ sumi = dpct::dp4a(q8[2 * l + 1], grid1,
7628
+ dpct::dp4a(q8[2 * l + 0], grid0, sumi));
7629
+ }
7630
+
7631
+ const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
7632
+ const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
7633
+ const float d = d1q * bq8_1[ib32].ds[0];
7634
+ const float m = d1q * bq8_1[ib32].ds[1];
7635
+ return d * sumi + m * delta;
7636
+ #else
7637
+ assert(false);
7638
+ #endif
7639
+ }
7640
+
7641
+ static __dpct_inline__ float
7642
+ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
7643
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
7644
+ #if QK_K == 256
7645
+ const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
7646
+
7647
+ const int ib32 = iqs;
7648
+ int sumi[2] = {0, 0};
7649
+ float sumf[2] = {0.f, 0.f};
7650
+
7651
+ const int * q8 = (const int *)bq8_1[ib32].qs;
7652
+ for (int l = 0; l < 4; ++l) {
7653
+ const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8)));
7654
+ int grid0 = grid[0] & 0x0f0f0f0f;
7655
+ int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
7656
+ sumi[l / 2] = dpct::dp4a(q8[2 * l + 1], grid1,
7657
+ dpct::dp4a(q8[2 * l + 0], grid0, sumi[l / 2]));
7658
+ const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
7659
+ const int sumy = dpct::dp4a(q8[2 * l + 1], 0x01010101,
7660
+ dpct::dp4a(q8[2 * l + 0], 0x01010101, 0));
7661
+ sumf[l/2] += delta*sumy;
7662
+ }
7663
+
7664
+ iq1m_scale_t scale;
7665
+ const uint16_t * sc = (const uint16_t *)bq1->scales;
7666
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
7667
+ const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
7668
+ return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
7669
+ #else
7670
+ assert(false);
7671
+ #endif
7672
+ }
7673
+
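vec_dot_iq1_m_q8_1 above (and dequantize_block_iq1_m) rebuild the block's fp16 scale from four 4-bit pieces stored in the high nibble of each 16-bit word of block_iq1_m::scales. A standalone check of that unpacking, with made-up nibble values:

#include <cstdint>
#include <cstdio>

// Reassemble the 16 scale bits exactly as the kernels do.
static uint16_t iq1m_unpack_scale(const uint16_t sc[4]) {
    return (uint16_t)((sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) |
                      ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000));
}

int main() {
    // Example: nibbles 0x1, 0x2, 0x3, 0x4 stored in the high nibble of each word.
    const uint16_t sc[4] = {0x1000, 0x2000, 0x3000, 0x4000};
    printf("0x%04x\n", (unsigned)iq1m_unpack_scale(sc));   // prints 0x4321
    return 0;
}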
7674
+ static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
7675
+ const uint8_t *values,
7676
+ int &val1, int &val2) {
7677
+
7678
+ uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
7679
+ aux32 = q4 & 0x0f0f0f0f;
7680
+ uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
7681
+ uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
7682
+ val1 = v1 | (v2 << 16);
7683
+ aux32 = (q4 >> 4) & 0x0f0f0f0f;
7684
+ v1 = values[q8[0]] | (values[q8[1]] << 8);
7685
+ v2 = values[q8[2]] | (values[q8[3]] << 8);
7686
+ val2 = v1 | (v2 << 16);
7687
+ }
7688
+
7689
+
7690
+ static __dpct_inline__ float
7691
+ vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq,
7692
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
7693
+
7694
+ const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
7695
+
7696
+ const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
7697
+ const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs;
7698
+
7699
+ const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
7700
+
7701
+ int v1, v2;
7702
+ int sumi1 = 0, sumi2 = 0;
7703
+ for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
7704
+ const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
7705
+ get_int_from_table_16(aux, values, v1, v2);
7706
+ sumi1 = dpct::dp4a(v1, q8[l + 0], sumi1);
7707
+ sumi2 = dpct::dp4a(v2, q8[l + 4], sumi2);
7708
  }
7709
+
7710
+ const float d = (float)bq->d * bq8_1->ds[0];
7711
+ return d * (sumi1 + sumi2);
7712
+ }
7713
+
7714
+
7715
+ static __dpct_inline__ float
7716
+ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
7717
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
7718
+
7719
+ #if QK_K == 256
7720
+ const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
7721
+ const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
7722
+
7723
+ // iqs is 0...7
7724
+ const int ib32 = iqs;
7725
+ const int32_t * q8 = (const int *)bq8_1[ib32].qs;
7726
+ const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
7727
+ const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
7728
+ const float d = (float)bq4->d * (ls - 32) * bq8_1[ib32].ds[0];
7729
+ int v1, v2;
7730
+ int sumi1 = 0, sumi2 = 0;
7731
+ for (int j = 0; j < 4; ++j) {
7732
+ get_int_from_table_16(q4[j], values, v1, v2);
7733
+ sumi1 = dpct::dp4a(v1, q8[j + 0], sumi1);
7734
+ sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
7735
+ }
7736
+ return d * (sumi1 + sumi2);
7737
  #else
7738
  assert(false);
 
7739
  #endif
7740
  }
7741
 
 
8320
 
8321
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
8322
  static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
8323
+ const sycl::nd_item<3> &item_ct1) {
8324
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8325
+ item_ct1.get_local_id(1);
8326
+
8327
+ if (row >= nrows) {
8328
+ return;
8329
+ }
8330
+
8331
+ const int blocks_per_row = ncols / qk;
8332
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
8333
+
8334
+ // partial sum for each thread
8335
+ float tmp = 0.0f;
8336
+
8337
+ const block_q_t * x = (const block_q_t *) vx;
8338
+ const block_q8_1 * y = (const block_q8_1 *) vy;
8339
+
8340
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
8341
+ i += blocks_per_warp) {
8342
+ const int ibx = row*blocks_per_row + i; // x block index
8343
+
8344
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
8345
+
8346
+ const int iqs =
8347
+ vdr *
8348
+ (item_ct1.get_local_id(2) %
8349
+ (qi / vdr)); // x block quant index when casting the quants to int
8350
+
8351
+ tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
8352
+ }
8353
+
8354
+ // sum up partial sums and write back result
8355
+ #pragma unroll
8356
+ for (int mask = 16; mask > 0; mask >>= 1) {
8357
+ tmp +=
8358
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
8359
+ }
8360
+
8361
+ if (item_ct1.get_local_id(2) == 0) {
8362
+ dst[row] = tmp;
8363
+ }
8364
+ }
8365
+
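The rewritten mul_mat_vec_q and its iq* variants no longer take grid/sign lookup tables as kernel arguments, but the reduction at the end is unchanged: each work-item holds a partial dot product and the sub-group combines them with dpct::permute_sub_group_by_xor. A plain-C++ simulation of that XOR butterfly follows, assuming the 32-wide sub-group implied by WARP_SIZE here; it is an illustration only, not device code.

#include <cstdio>

int main() {
    const int WARP = 32;
    float lane[WARP];
    for (int i = 0; i < WARP; ++i) lane[i] = (float)i;   // pretend partial sums

    // lane i repeatedly adds the value held by lane (i ^ mask);
    // after log2(32) = 5 steps every lane holds the full sum.
    for (int mask = 16; mask > 0; mask >>= 1) {
        float next[WARP];
        for (int i = 0; i < WARP; ++i) next[i] = lane[i] + lane[i ^ mask];
        for (int i = 0; i < WARP; ++i) lane[i] = next[i];
    }
    printf("reduced sum on lane 0: %g\n", lane[0]);        // 0+1+...+31 = 496
    return 0;
}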
8366
+ template <int qk, int qi, typename block_q_t, int vdr>
8367
+ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
8368
+ const void *__restrict__ vy,
8369
+ float *__restrict__ dst, const int ncols,
8370
+ const int nrows,
8371
+ const sycl::nd_item<3> &item_ct1) {
8372
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8373
+ item_ct1.get_local_id(1);
8374
+
8375
+ if (row >= nrows) {
8376
+ return;
8377
+ }
8378
+
8379
+ const int blocks_per_row = ncols / qk;
8380
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
8381
+
8382
+ // partial sum for each thread
8383
+ float tmp = 0.0f;
8384
+
8385
+ const block_q_t * x = (const block_q_t *) vx;
8386
+ const block_q8_1 * y = (const block_q8_1 *) vy;
8387
+
8388
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
8389
+ i += blocks_per_warp) {
8390
+ const int ibx = row*blocks_per_row + i; // x block index
8391
+
8392
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
8393
+
8394
+ const int iqs =
8395
+ vdr *
8396
+ (item_ct1.get_local_id(2) %
8397
+ (qi / vdr)); // x block quant index when casting the quants to int
8398
+
8399
+ tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
8400
+ }
8401
+
8402
+ // sum up partial sums and write back result
8403
+ #pragma unroll
8404
+ for (int mask = 16; mask > 0; mask >>= 1) {
8405
+ tmp +=
8406
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
8407
+ }
8408
+
8409
+ if (item_ct1.get_local_id(2) == 0) {
8410
+ dst[row] = tmp;
8411
+ }
8412
+ }
8413
+
8414
+ template <int qk, int qi, typename block_q_t, int vdr>
8415
+ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
8416
+ const void *__restrict__ vy,
8417
+ float *__restrict__ dst, const int ncols,
8418
+ const int nrows,
8419
+ const sycl::nd_item<3> &item_ct1) {
8420
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8421
+ item_ct1.get_local_id(1);
8422
+
8423
+ if (row >= nrows) {
8424
+ return;
8425
+ }
8426
+
8427
+ const int blocks_per_row = ncols / qk;
8428
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
8429
+
8430
+ // partial sum for each thread
8431
+ float tmp = 0.0f;
8432
+
8433
+ const block_q_t * x = (const block_q_t *) vx;
8434
+ const block_q8_1 * y = (const block_q8_1 *) vy;
8435
+
8436
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
8437
+ i += blocks_per_warp) {
8438
+ const int ibx = row*blocks_per_row + i; // x block index
8439
+
8440
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
8441
+
8442
+ const int iqs =
8443
+ vdr *
8444
+ (item_ct1.get_local_id(2) %
8445
+ (qi / vdr)); // x block quant index when casting the quants to int
8446
+
8447
+ tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64);
8448
+ }
8449
+
8450
+ // sum up partial sums and write back result
8451
+ #pragma unroll
8452
+ for (int mask = 16; mask > 0; mask >>= 1) {
8453
+ tmp +=
8454
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
8455
+ }
8456
+
8457
+ if (item_ct1.get_local_id(2) == 0) {
8458
+ dst[row] = tmp;
8459
+ }
8460
+ }
8461
+
8462
+ template <int qk, int qi, typename block_q_t, int vdr>
8463
+ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
8464
+ const void *__restrict__ vy,
8465
+ float *__restrict__ dst, const int ncols,
8466
+ const int nrows,
8467
+ const sycl::nd_item<3> &item_ct1) {
8468
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8469
+ item_ct1.get_local_id(1);
8470
+
8471
+ if (row >= nrows) {
8472
+ return;
8473
+ }
8474
+
8475
+ const int blocks_per_row = ncols / qk;
8476
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
8477
+
8478
+ // partial sum for each thread
8479
+ float tmp = 0.0f;
8480
+
8481
+ const block_q_t * x = (const block_q_t *) vx;
8482
+ const block_q8_1 * y = (const block_q8_1 *) vy;
8483
+
8484
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
8485
+ i += blocks_per_warp) {
8486
+ const int ibx = row*blocks_per_row + i; // x block index
8487
+
8488
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
8489
+
8490
+ const int iqs =
8491
+ vdr *
8492
+ (item_ct1.get_local_id(2) %
8493
+ (qi / vdr)); // x block quant index when casting the quants to int
8494
+
8495
+ tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs);
8496
+ }
8497
+
8498
+ // sum up partial sums and write back result
8499
+ #pragma unroll
8500
+ for (int mask = 16; mask > 0; mask >>= 1) {
8501
+ tmp +=
8502
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
8503
+ }
8504
+
8505
+ if (item_ct1.get_local_id(2) == 0) {
8506
+ dst[row] = tmp;
8507
+ }
8508
+ }
8509
+
8510
+ template <int qk, int qi, typename block_q_t, int vdr>
8511
+ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
8512
+ const void *__restrict__ vy,
8513
+ float *__restrict__ dst, const int ncols,
8514
+ const int nrows,
8515
+ const sycl::nd_item<3> &item_ct1) {
8516
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8517
  item_ct1.get_local_id(1);
8518
 
 
8540
  (item_ct1.get_local_id(2) %
8541
  (qi / vdr)); // x block quant index when casting the quants to int
8542
 
8543
+ tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64);
8544
  }
8545
 
8546
  // sum up partial sums and write back result
 
8556
  }
8557
 
8558
  template <int qk, int qi, typename block_q_t, int vdr>
8559
+ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
8560
+ const void *__restrict__ vy,
8561
+ float *__restrict__ dst, const int ncols,
8562
+ const int nrows,
8563
+ const sycl::nd_item<3> &item_ct1) {
8564
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8565
  item_ct1.get_local_id(1);
8566
 
 
8588
  (item_ct1.get_local_id(2) %
8589
  (qi / vdr)); // x block quant index when casting the quants to int
8590
 
8591
+ tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid);
8592
  }
8593
 
8594
  // sum up partial sums and write back result
 
8604
  }
8605
 
8606
  template <int qk, int qi, typename block_q_t, int vdr>
8607
+ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
8608
+ const void *__restrict__ vy,
8609
+ float *__restrict__ dst, const int ncols,
8610
+ const int nrows,
8611
+ const sycl::nd_item<3> &item_ct1) {
8612
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8613
  item_ct1.get_local_id(1);
8614
 
 
8636
  (item_ct1.get_local_id(2) %
8637
  (qi / vdr)); // x block quant index when casting the quants to int
8638
 
8639
+ tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu);
8640
  }
8641
 
8642
  // sum up partial sums and write back result
 
8652
  }
8653
 
8654
  template <int qk, int qi, typename block_q_t, int vdr>
8655
+ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
8656
+ const void *__restrict__ vy,
8657
+ float *__restrict__ dst, const int ncols,
8658
+ const int nrows,
8659
+ const sycl::nd_item<3> &item_ct1) {
8660
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8661
  item_ct1.get_local_id(1);
8662
 
 
8684
  (item_ct1.get_local_id(2) %
8685
  (qi / vdr)); // x block quant index when casting the quants to int
8686
 
8687
+ tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs);
8688
  }
8689
 
8690
  // sum up partial sums and write back result
 
8700
  }
8701
 
8702
  template <int qk, int qi, typename block_q_t, int vdr>
8703
+ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
8704
+ const void *__restrict__ vy,
8705
+ float *__restrict__ dst, const int ncols,
8706
+ const int nrows,
8707
+ const sycl::nd_item<3> &item_ct1) {
8708
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8709
  item_ct1.get_local_id(1);
8710
 
 
8732
  (item_ct1.get_local_id(2) %
8733
  (qi / vdr)); // x block quant index when casting the quants to int
8734
 
8735
+ tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs);
8736
  }
8737
 
8738
  // sum up partial sums and write back result
 
8747
  }
8748
  }
8749
 
8750
+
8751
  template <int qk, int qi, typename block_q_t, int vdr>
8752
+ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
8753
+ const void *__restrict__ vy,
8754
+ float *__restrict__ dst, const int ncols,
8755
+ const int nrows,
8756
+ const sycl::nd_item<3> &item_ct1) {
8757
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8758
  item_ct1.get_local_id(1);
8759
 
 
8781
  (item_ct1.get_local_id(2) %
8782
  (qi / vdr)); // x block quant index when casting the quants to int
8783
 
8784
+ tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs);
8785
  }
8786
 
8787
  // sum up partial sums and write back result
 
8796
  }
8797
  }
8798
 
8799
+
8800
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
                                    const sycl::nd_item<3> &item_ct1) {

     }
 }

+
 template<typename T>
+static inline void ggml_sycl_swap(T & a, T & b) {
     T tmp = a;
     a = b;
     b = tmp;
 }

+template <ggml_sort_order order>
+__dpct_inline__ static void
+k_argsort_f32_i32(const float *x, int *dst, const int ncols, int ncols_pad,
+                  const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) {
     // bitonic sort
     int col = item_ct1.get_local_id(2);
     int row = item_ct1.get_group(1);

+    if (col >= ncols_pad) {
+        return;
+    }

     const float * x_row = x + row * ncols;
+    auto dst_row = (int *)dpct_local;

     // initialize indices
+    dst_row[col] = col;
+
+    item_ct1.barrier(sycl::access::fence_space::local_space);

+    for (int k = 2; k <= ncols_pad; k *= 2) {
         for (int j = k / 2; j > 0; j /= 2) {
             int ixj = col ^ j;
             if (ixj > col) {
                 if ((col & k) == 0) {
+                    if (dst_row[col] >= ncols ||
+                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_sycl_swap(dst_row[col], dst_row[ixj]);
                     }
                 } else {
+                    if (dst_row[ixj] >= ncols ||
+                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_sycl_swap(dst_row[col], dst_row[ixj]);
                     }
                 }
             }
             /*
+            DPCT1118:1: SYCL group functions and algorithms must be encountered
             in converged control flow. You may need to adjust the code.
             */
+            item_ct1.barrier(sycl::access::fence_space::local_space);
         }
     }
+
+    // copy the result to dst without the padding
+    if (col < ncols) {
+        dst[row * ncols + col] = dst_row[col];
+    }
 }

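// For reference, the compare-exchange network used by k_argsort_f32_i32 above, run
// sequentially on the host. This is an illustrative sketch only (the function name is
// a placeholder, <vector> is assumed to be available, and the helper is not part of
// this change); it shows why ncols is padded to a power of two and how pad indices
// behave like the largest values (ascending) or smallest values (descending).
static void argsort_f32_i32_host_ref(const float * x, int * dst, const int ncols,
                                     const int ncols_pad, const bool asc) {
    std::vector<int> row(ncols_pad);
    for (int i = 0; i < ncols_pad; ++i) {
        row[i] = i; // indices >= ncols are padding and sort to the end of the row
    }
    for (int k = 2; k <= ncols_pad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            for (int col = 0; col < ncols_pad; ++col) {
                const int ixj = col ^ j;
                if (ixj <= col) {
                    continue; // handle each pair once, from its lower index
                }
                bool do_swap;
                if ((col & k) == 0) { // this pair sorts in the requested order
                    do_swap = row[col] >= ncols ||
                              (row[ixj] < ncols && (asc ? x[row[col]] > x[row[ixj]]
                                                        : x[row[col]] < x[row[ixj]]));
                } else {              // this pair sorts in the opposite order
                    do_swap = row[ixj] >= ncols ||
                              (row[col] < ncols && (asc ? x[row[col]] < x[row[ixj]]
                                                        : x[row[col]] > x[row[ixj]]));
                }
                if (do_swap) {
                    ggml_sycl_swap(row[col], row[ixj]);
                }
            }
        }
    }
    for (int col = 0; col < ncols; ++col) {
        dst[col] = row[col]; // copy back without the padding
    }
}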
+
 static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past,
                               const sycl::nd_item<3> &item_ct1) {
     const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) +

 #endif
 }

 template <typename dst_t>
+static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
+                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq1_s(
+                                     vx, y, item_ct1, iq1s_grid_gpu);
+                             });
+        });
+    }
+}

+template <typename dst_t>
+static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k,
+                                      dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});

         stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq1_m(
+                                     vx, y, item_ct1, iq1s_grid_gpu);
+                             });
+        });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
+                                        dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});

+        stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
                                                sycl::range<3>(1, 1, 32)),
                              [=](sycl::nd_item<3> item_ct1) {
                                  dequantize_block_iq2_xxs(
+                                     vx, y, item_ct1, iq2xxs_grid,
+                                     ksigns_iq2xs, kmask_iq2xs);
                              });
         });
     }

                                        dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {

         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});

         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
                                                sycl::range<3>(1, 1, 32)),
                              [=](sycl::nd_item<3> item_ct1) {
                                  dequantize_block_iq2_xs(
+                                     vx, y, item_ct1, iq2xs_grid,
+                                     ksigns_iq2xs, kmask_iq2xs);
                              });
         });
     }
 }

 template <typename dst_t>
+static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k,
+                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {

         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});

         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
                                                sycl::range<3>(1, 1, 32)),
                              [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq2_s(vx, y, item_ct1);
                              });
         });
     }
 }

+
 template <typename dst_t>
+static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
                                         dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {

         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});

         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
                                                sycl::range<3>(1, 1, 32)),
                              [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq3_xxs(
+                                     vx, y, item_ct1, iq3xxs_grid,
+                                     ksigns_iq2xs, kmask_iq2xs);
                              });
         });
     }
 }

 template <typename dst_t>
+static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
                                       dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {

         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});

         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
                                                sycl::range<3>(1, 1, 32)),
                              [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq3_s(
+                                     vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
                              });
         });
     }
 }

+template <typename dst_t>
+static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
+                                       dpct::queue_ptr stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+#if QK_K == 64
+    dequantize_row_iq4_nl_sycl(vx, y, k, stream);
+#else
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                      sycl::range<3>(1, 1, 32),
+                                  sycl::range<3>(1, 1, 32)),
+                [=](sycl::nd_item<3> item_ct1) {
+                    dequantize_block_iq4_xs(vx, y, item_ct1);
+                });
+        });
+    }
+#endif
+}
+
+
+template <typename dst_t>
+static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k,
+                                       dpct::queue_ptr stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                      sycl::range<3>(1, 1, 32),
+                                  sycl::range<3>(1, 1, 32)),
+                [=](sycl::nd_item<3> item_ct1) {
+                    dequantize_block_iq4_nl(vx, y, item_ct1);
+                });
+        });
+    }
+}
+
+
+
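// Illustrative usage of the dequantize launchers above (not part of this change;
// q_data, f16_out, nelems and stream stand for a caller-supplied device pointer to
// quantized blocks, a device output buffer, an element count, and a dpct queue):
// each helper enqueues one 32-work-item work-group per block of quantized data, e.g.
//
//     dequantize_row_iq4_nl_sycl<sycl::half>(q_data, f16_out, nelems, stream);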
 template <typename src_t, typename dst_t>
 static void convert_unary_sycl(const void *__restrict__ vx,
                                dst_t *__restrict__ y, const int k,

         return dequantize_row_q5_K_sycl;
     case GGML_TYPE_Q6_K:
         return dequantize_row_q6_K_sycl;
+    case GGML_TYPE_IQ1_S:
+        return dequantize_row_iq1_s_sycl;
+    case GGML_TYPE_IQ1_M:
+        return dequantize_row_iq1_m_sycl;
     case GGML_TYPE_IQ2_XXS:
         return dequantize_row_iq2_xxs_sycl;
     case GGML_TYPE_IQ2_XS:
         return dequantize_row_iq2_xs_sycl;
+    case GGML_TYPE_IQ2_S:
+        return dequantize_row_iq2_s_sycl;
     case GGML_TYPE_IQ3_XXS:
         return dequantize_row_iq3_xxs_sycl;
     case GGML_TYPE_IQ3_S:
         return dequantize_row_iq3_s_sycl;
+    case GGML_TYPE_IQ4_XS:
+        return dequantize_row_iq4_xs_sycl;
+    case GGML_TYPE_IQ4_NL:
+        return dequantize_row_iq4_nl_sycl;
     case GGML_TYPE_F32:
         return convert_unary_sycl<float>;
     default:

         return dequantize_row_q5_K_sycl;
     case GGML_TYPE_Q6_K:
         return dequantize_row_q6_K_sycl;
+    case GGML_TYPE_IQ1_S:
+        return dequantize_row_iq1_s_sycl;
+    case GGML_TYPE_IQ1_M:
+        return dequantize_row_iq1_m_sycl;
     case GGML_TYPE_IQ2_XXS:
         return dequantize_row_iq2_xxs_sycl;
     case GGML_TYPE_IQ2_XS:
         return dequantize_row_iq2_xs_sycl;
+    case GGML_TYPE_IQ2_S:
+        return dequantize_row_iq2_s_sycl;
     case GGML_TYPE_IQ3_XXS:
         return dequantize_row_iq3_xxs_sycl;
     case GGML_TYPE_IQ3_S:
         return dequantize_row_iq3_s_sycl;
+    case GGML_TYPE_IQ4_XS:
+        return dequantize_row_iq4_xs_sycl;
+    case GGML_TYPE_IQ4_NL:
+        return dequantize_row_iq4_nl_sycl;
     case GGML_TYPE_F16:
         return convert_unary_sycl<sycl::half>;
     default:

     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {

         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
                     [[intel::reqd_sub_group_size(32)]] {
                         mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS, block_iq2_xxs, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
                     });
         });
     }

                 [=](sycl::nd_item<3> item_ct1)
                     [[intel::reqd_sub_group_size(32)]] {
                         mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS, block_iq2_xs, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
+                                        float *dst, const int ncols,
+                                        const int nrows,
+                                        dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S, block_iq2_s, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
                     });
         });
     }

                 [=](sycl::nd_item<3> item_ct1)
                     [[intel::reqd_sub_group_size(32)]] {
                         mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS, block_iq3_xxs, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
                     });
         });
     }

         stream->submit([&](sycl::handler &cgh) {
             auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];

             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
                     [[intel::reqd_sub_group_size(32)]] {
                         mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
                     });
         });
     }

                 [=](sycl::nd_item<3> item_ct1)
                     [[intel::reqd_sub_group_size(32)]] {
                         mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
+                                        float *dst, const int ncols,
+                                        const int nrows,
+                                        dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
+                                         float *dst, const int ncols,
+                                         const int nrows,
+                                         dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_NL == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
+                                         float *dst, const int ncols,
+                                         const int nrows,
+                                         dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS, block_iq4_xs, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
                     });
         });
     }
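// Launch-geometry example for the mul_mat_vec_*_q8_1_sycl helpers above (illustrative,
// assuming the default GGML_SYCL_MMV_Y of 1): for a 4096-row weight matrix,
//     block_num_y = (4096 + 1 - 1) / 1 = 4096 work-groups,
//     block_dims  = (1, 1, WARP_SIZE) = 32 work-items each,
// i.e. one 32-wide sub-group per output row, with each lane accumulating a partial
// dot product over its share of that row's quant blocks.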
 
         });
     }

+static int next_power_of_2(int x) {
+    int n = 1;
+    while (n < x) {
+        n *= 2;
+    }
+    return n;
+}
+
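// Worked example for the padding helper above (illustrative): next_power_of_2(1) == 1,
// next_power_of_2(3) == 4 and next_power_of_2(4096) == 4096, so the ncols_pad computed
// below is always a power of two and never smaller than ncols.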
 static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
                                  const int nrows, ggml_sort_order order,
                                  dpct::queue_ptr stream) {
     // bitonic sort requires ncols to be power of 2
+    const int ncols_pad = next_power_of_2(ncols);

+    const sycl::range<3> block_dims(1, 1, ncols_pad);
     const sycl::range<3> block_nums(1, nrows, 1);
+    const size_t shared_mem = ncols_pad * sizeof(int);
+
+    // GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
+
     if (order == GGML_SORT_ORDER_ASC) {
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
+                sycl::range<1>(shared_mem), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(
+                        x, dst, ncols, ncols_pad, item_ct1,
+                        dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
+                            .get());
+                });
+        });
     } else if (order == GGML_SORT_ORDER_DESC) {
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
+                sycl::range<1>(shared_mem), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(
+                        x, dst, ncols, ncols_pad, item_ct1,
+                        dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
+                            .get());
+                });
+        });
     } else {
         GGML_ASSERT(false);
     }
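// Local-memory sizing example for argsort_f32_i32_sycl above (illustrative): sorting a
// 32000-wide row pads to ncols_pad = next_power_of_2(32000) = 32768, so the kernel
// requests shared_mem = 32768 * sizeof(int) = 128 KiB of work-group local memory per
// row, which may exceed the limit on some devices; the commented-out capacity check
// above marks where such an assert would go.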
 
     case GGML_TYPE_Q5_K:
     case GGML_TYPE_IQ2_XXS:
     case GGML_TYPE_IQ2_XS:
+    case GGML_TYPE_IQ2_S:
     case GGML_TYPE_IQ1_S:
+    case GGML_TYPE_IQ1_M:
     case GGML_TYPE_IQ3_XXS:
+    case GGML_TYPE_IQ4_XS:
+    case GGML_TYPE_IQ4_NL:
         return max_compute_capability >= VER_GEN9 ? 128 : 64;
     case GGML_TYPE_IQ3_S:
         return max_compute_capability >= VER_GEN9 ? 128 : 64;

                               const int64_t src1_ncols, const int64_t src1_padded_row_size,
                               const dpct::queue_ptr &stream) {

+    const int64_t ne10 = src1->ne[0];
+    GGML_ASSERT(ne10 % QK8_1 == 0);

     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;

+    int id;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(id = get_current_device_id()));
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // nrows_dst == nrows of the matrix that the kernel writes into
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne00 : row_diff;
+
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
             mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);

         case GGML_TYPE_Q6_K:
             mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ1_M:
+            mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_IQ2_XXS:
             mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_IQ2_XS:
             mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_IQ3_XXS:
             mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_IQ3_S:
             mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
+        case GGML_TYPE_IQ4_NL:
+            mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ4_XS:
+            mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         default:
             GGML_ASSERT(false);

             convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             break;
         default:
+            printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
             GGML_ASSERT(false);
             break;
     }

             src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10;
         }
         // do the computation
+        SYCL_CHECK(CHECK_TRY_ERROR(op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
+            dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream)));
         /*
         DPCT1010:93: SYCL uses exceptions to report errors and does not
         use the error codes. The call was replaced with 0. You need to

 #ifdef GGML_SYCL_FORCE_DMMV
     const bool use_mul_mat_vec_q = false;
 #else
+    bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
+    use_mul_mat_vec_q = use_mul_mat_vec_q ||
+        (src0->type == GGML_TYPE_IQ2_XXS) || (src0->type == GGML_TYPE_IQ2_XS) || (src0->type == GGML_TYPE_IQ2_S) ||
+        (src0->type == GGML_TYPE_IQ3_XXS) || (src0->type == GGML_TYPE_IQ3_S) ||
+        (src0->type == GGML_TYPE_IQ4_NL) || (src0->type == GGML_TYPE_IQ4_XS) ||
+        (src0->type == GGML_TYPE_IQ1_S) || (src0->type == GGML_TYPE_IQ1_M);
+
 #endif // GGML_SYCL_FORCE_DMMV

     if (use_mul_mat_vec_q) {

         return false;
     }
     ggml_type a_type = a->type;
+    if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS ||
+        a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S ||
+        a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ2_S ||
+        a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ1_M
+    ) {
+        if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
+            return false;
+        }
     }
     return true;
 } break;