ggerganov committed
Commit cc06e31 (unverified)
1 Parent: c692ba5

cuda : sync some minor stuff from llama.cpp (#1548)
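One of the synced changes adds a <cinttypes> include and switches a debug fprintf to the PRId64 macro so that int64_t tensor dimensions print portably. Below is a minimal, self-contained sketch of that printf idiom only; it is not code from this commit, and the main/ne3 names are purely illustrative.

#include <cinttypes> // PRId64
#include <cstdint>   // int64_t
#include <cstdio>    // fprintf

int main() {
    const int64_t ne3 = 1;
    // "%" PRId64 expands to the correct conversion specifier for int64_t,
    // unlike a hard-coded "%d", which has the wrong width on 64-bit targets.
    fprintf(stderr, "src0->ne[3] = %" PRId64 "\n", ne3);
    return 0;
}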

Files changed (1)
  1. ggml-cuda.cu +62 -32
ggml-cuda.cu CHANGED
@@ -1,4 +1,5 @@
1
  #include <algorithm>
 
2
  #include <cstddef>
3
  #include <cstdint>
4
  #include <limits>
@@ -235,7 +236,7 @@ typedef float2 dfloat2;
235
  #endif //GGML_CUDA_F16
236
 
237
  static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
238
- const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
239
 
240
  int x32 = 0;
241
  x32 |= x16[0] << 0;
@@ -245,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
245
  }
246
 
247
  static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
248
- const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
249
 
250
  int x32 = 0;
251
  x32 |= x16[0] << 0;
@@ -255,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
255
  }
256
 
257
  static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
258
- return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
259
  }
260
 
261
  static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
262
- return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
263
  }
264
 
265
  template<typename T>
@@ -469,7 +470,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
469
  #define MUL_MAT_SRC1_COL_STRIDE 128
470
 
471
  #define MAX_STREAMS 8
472
- static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
473
 
474
  struct ggml_tensor_extra_gpu {
475
  void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@@ -2248,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
2248
  }
2249
 
2250
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
2251
 
2252
  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
2253
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
@@ -2259,7 +2261,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
2259
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
2260
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2261
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2262
-
2263
  GGML_CUDA_ASSUME(i_offset >= 0);
2264
  GGML_CUDA_ASSUME(i_offset < nwarps);
2265
  GGML_CUDA_ASSUME(k >= 0);
@@ -2268,7 +2270,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2268
  const int kbx = k / QI4_0;
2269
  const int kqsx = k % QI4_0;
2270
 
2271
- const block_q4_0 * bx0 = (block_q4_0 *) vx;
2272
 
2273
  float * x_dmf = (float *) x_dm;
2274
 
@@ -2306,9 +2308,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2306
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
2307
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2308
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
2309
 
2310
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2311
- const float * x_dmf = (float *) x_dm;
2312
 
2313
  int u[2*VDR_Q4_0_Q8_1_MMQ];
2314
 
@@ -2342,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
2342
  }
2343
 
2344
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
2345
 
2346
  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
2347
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2353,6 +2357,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
2353
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
2354
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2355
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
2356
 
2357
  GGML_CUDA_ASSUME(i_offset >= 0);
2358
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2362,7 +2367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2362
  const int kbx = k / QI4_1;
2363
  const int kqsx = k % QI4_1;
2364
 
2365
- const block_q4_1 * bx0 = (block_q4_1 *) vx;
2366
 
2367
  #pragma unroll
2368
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2397,6 +2402,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2397
  static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
2398
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2399
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
2400
 
2401
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2402
 
@@ -2434,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
2434
  }
2435
 
2436
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
2437
 
2438
  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2439
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2445,6 +2452,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
2445
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
2446
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2447
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
2448
 
2449
  GGML_CUDA_ASSUME(i_offset >= 0);
2450
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2454,7 +2462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2454
  const int kbx = k / QI5_0;
2455
  const int kqsx = k % QI5_0;
2456
 
2457
- const block_q5_0 * bx0 = (block_q5_0 *) vx;
2458
 
2459
  #pragma unroll
2460
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2509,6 +2517,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2509
  static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
2510
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2511
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
2512
 
2513
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2514
  const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2548,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
2548
  }
2549
 
2550
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
2551
 
2552
  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2553
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2559,6 +2569,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
2559
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
2560
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2561
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
2562
 
2563
  GGML_CUDA_ASSUME(i_offset >= 0);
2564
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2568,7 +2579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2568
  const int kbx = k / QI5_1;
2569
  const int kqsx = k % QI5_1;
2570
 
2571
- const block_q5_1 * bx0 = (block_q5_1 *) vx;
2572
 
2573
  #pragma unroll
2574
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2620,6 +2631,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2620
  static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
2621
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2622
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
2623
 
2624
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2625
  const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2654,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
2654
  }
2655
 
2656
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
2657
 
2658
  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
2659
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2665,6 +2678,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
2665
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
2666
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2667
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
2668
 
2669
  GGML_CUDA_ASSUME(i_offset >= 0);
2670
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2675,7 +2689,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2675
  const int kqsx = k % QI8_0;
2676
  float * x_dmf = (float *) x_dm;
2677
 
2678
- const block_q8_0 * bx0 = (block_q8_0 *) vx;
2679
 
2680
  #pragma unroll
2681
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2710,6 +2724,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2710
  static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
2711
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2712
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
2713
 
2714
  const float * x_dmf = (const float *) x_dm;
2715
  const float * y_df = (const float *) y_ds;
@@ -2743,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
2743
  }
2744
 
2745
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
2746
 
2747
  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2748
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2756,6 +2772,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
2756
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
2757
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2758
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
2759
 
2760
  GGML_CUDA_ASSUME(i_offset >= 0);
2761
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2765,7 +2782,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2765
  const int kbx = k / QI2_K;
2766
  const int kqsx = k % QI2_K;
2767
 
2768
- const block_q2_K * bx0 = (block_q2_K *) vx;
2769
 
2770
  #pragma unroll
2771
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2813,6 +2830,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2813
  static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
2814
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2815
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
2816
 
2817
  const int kbx = k / QI2_K;
2818
  const int ky = (k % QI2_K) * QR2_K;
@@ -2886,7 +2904,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2886
  const int kbx = k / QI3_K;
2887
  const int kqsx = k % QI3_K;
2888
 
2889
- const block_q3_K * bx0 = (block_q3_K *) vx;
2890
 
2891
  #pragma unroll
2892
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2967,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
2967
  const float * x_dmf = (const float *) x_dm;
2968
  const float * y_df = (const float *) y_ds;
2969
 
2970
- const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
2971
 
2972
  int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
2973
 
@@ -3082,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
3082
  }
3083
 
3084
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
3085
 
3086
  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
3087
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3095,6 +3114,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
3095
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
3096
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
3097
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
3098
 
3099
  GGML_CUDA_ASSUME(i_offset >= 0);
3100
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3104,7 +3124,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
3104
  const int kbx = k / QI4_K; // == 0 if QK_K == 256
3105
  const int kqsx = k % QI4_K; // == k if QK_K == 256
3106
 
3107
- const block_q4_K * bx0 = (block_q4_K *) vx;
3108
 
3109
  #pragma unroll
3110
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3149,7 +3169,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
3149
 
3150
  const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
3151
 
3152
- const int * scales = (int *) bxi->scales;
3153
 
3154
  const int ksc = k % (WARP_SIZE/8);
3155
 
@@ -3164,6 +3184,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
3164
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
3165
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3166
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
3167
 
3168
  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
3169
 
@@ -3263,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
3263
  }
3264
 
3265
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
3266
 
3267
  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
3268
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3276,6 +3298,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
3276
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
3277
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
3278
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
3279
 
3280
  GGML_CUDA_ASSUME(i_offset >= 0);
3281
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3285,7 +3308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
3285
  const int kbx = k / QI5_K; // == 0 if QK_K == 256
3286
  const int kqsx = k % QI5_K; // == k if QK_K == 256
3287
 
3288
- const block_q5_K * bx0 = (block_q5_K *) vx;
3289
 
3290
  #pragma unroll
3291
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3341,7 +3364,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
3341
 
3342
  const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
3343
 
3344
- const int * scales = (int *) bxi->scales;
3345
 
3346
  const int ksc = k % (WARP_SIZE/8);
3347
 
@@ -3356,6 +3379,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
3356
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
3357
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3358
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
3359
 
3360
  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
3361
 
@@ -3392,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
3392
  }
3393
 
3394
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
3395
 
3396
  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
3397
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3405,6 +3430,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
3405
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
3406
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
3407
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
3408
 
3409
  GGML_CUDA_ASSUME(i_offset >= 0);
3410
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3414,7 +3440,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
3414
  const int kbx = k / QI6_K; // == 0 if QK_K == 256
3415
  const int kqsx = k % QI6_K; // == k if QK_K == 256
3416
 
3417
- const block_q6_K * bx0 = (block_q6_K *) vx;
3418
 
3419
  #pragma unroll
3420
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3476,6 +3502,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
3476
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
3477
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3478
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
3479
 
3480
  const float * x_dmf = (const float *) x_dm;
3481
  const float * y_df = (const float *) y_ds;
@@ -3518,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q(
3518
  __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
3519
  __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
3520
 
3521
- float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
3522
 
3523
  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
3524
 
@@ -5840,7 +5867,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
5840
  return ptr;
5841
  }
5842
  #ifdef DEBUG_CUDA_MALLOC
5843
- fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
5844
  (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
5845
  #endif
5846
  void * ptr;
@@ -5978,7 +6005,7 @@ void * ggml_cuda_host_malloc(size_t size) {
5978
  // The allocation error can be bypassed. A null ptr will be assigned out of this function.
5979
  // This can fix the OOM error in WSL.
5980
  cudaGetLastError();
5981
- fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
5982
  size/1024.0/1024.0, cudaGetErrorString(err));
5983
  return nullptr;
5984
  }
@@ -6359,6 +6386,7 @@ static int64_t get_row_rounding(ggml_type type) {
6359
  case GGML_TYPE_Q8_0:
6360
  return max_compute_capability >= CC_RDNA2 ? 128 : 64;
6361
  case GGML_TYPE_F16:
 
6362
  return 1;
6363
  case GGML_TYPE_Q2_K:
6364
  return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6381,6 +6409,7 @@ static int64_t get_row_rounding(ggml_type type) {
6381
  case GGML_TYPE_Q8_0:
6382
  return 64;
6383
  case GGML_TYPE_F16:
 
6384
  return 1;
6385
  case GGML_TYPE_Q2_K:
6386
  case GGML_TYPE_Q3_K:
@@ -6990,7 +7019,7 @@ static void ggml_cuda_op_mul_mat(
6990
  const int64_t ne01 = src0->ne[1];
6991
  const int64_t ne02 = src0->ne[2];
6992
  const int64_t ne03 = src0->ne[3];
6993
- const int64_t nrows0 = ggml_nrows(src0);
6994
 
6995
  const int64_t ne10 = src1->ne[0];
6996
  const int64_t ne11 = src1->ne[1];
@@ -7091,7 +7120,7 @@ static void ggml_cuda_op_mul_mat(
7091
  if (src0_on_device && src0_is_contiguous) {
7092
  src0_dd[id] = (char *) src0_extra->data_device[id];
7093
  } else {
7094
- const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
7095
  src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
7096
  }
7097
 
@@ -7324,7 +7353,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
7324
  }
7325
 
7326
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
7327
- if (!g_cublas_loaded) return false;
7328
 
7329
  const int64_t ne10 = src1->ne[0];
7330
 
@@ -7402,7 +7431,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
7402
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
7403
  }
7404
 
7405
- __global__ void k_compute_batched_ptrs(
7406
  const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
7407
  const void ** ptrs_src, void ** ptrs_dst,
7408
  int ne12, int ne13,
@@ -8018,7 +8047,7 @@ void ggml_cuda_free_scratch() {
8018
  }
8019
 
8020
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
8021
- if (!g_cublas_loaded) return false;
8022
 
8023
  ggml_cuda_func_t func;
8024
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8032,7 +8061,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8032
  if (tensor->op == GGML_OP_MUL_MAT) {
8033
  if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
8034
  #ifndef NDEBUG
8035
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
8036
  #endif
8037
  return false;
8038
  }
@@ -8317,14 +8346,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
8317
  UNUSED(cgraph);
8318
  }
8319
 
8320
- static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
8321
  GGML_ASSERT(!"not implemented");
8322
 
8323
  UNUSED(backend);
8324
  UNUSED(plan);
8325
  }
8326
 
8327
- static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
8328
  GGML_ASSERT(!"not implemented");
8329
 
8330
  UNUSED(backend);
@@ -8340,8 +8369,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
8340
  for (int i = 0; i < cgraph->n_nodes; i++) {
8341
  ggml_tensor * node = cgraph->nodes[i];
8342
 
8343
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
8344
  continue;
 
8345
  assert(node->backend == GGML_BACKEND_GPU);
8346
  for (int j = 0; j < GGML_MAX_SRC; j++) {
8347
  if (node->src[j] != nullptr) {
 
1
  #include <algorithm>
2
+ #include <cinttypes>
3
  #include <cstddef>
4
  #include <cstdint>
5
  #include <limits>
 
236
  #endif //GGML_CUDA_F16
237
 
238
  static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
239
+ const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
240
 
241
  int x32 = 0;
242
  x32 |= x16[0] << 0;
 
246
  }
247
 
248
  static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
249
+ const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
250
 
251
  int x32 = 0;
252
  x32 |= x16[0] << 0;
 
256
  }
257
 
258
  static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
259
+ return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
260
  }
261
 
262
  static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
263
+ return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
264
  }
265
 
266
  template<typename T>
 
470
  #define MUL_MAT_SRC1_COL_STRIDE 128
471
 
472
  #define MAX_STREAMS 8
473
+ static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
474
 
475
  struct ggml_tensor_extra_gpu {
476
  void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
 
2249
  }
2250
 
2251
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2252
+ (void)x_qh; (void)x_sc;
2253
 
2254
  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
2255
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
 
2261
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
2262
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2263
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2264
+ (void)x_qh; (void)x_sc;
2265
  GGML_CUDA_ASSUME(i_offset >= 0);
2266
  GGML_CUDA_ASSUME(i_offset < nwarps);
2267
  GGML_CUDA_ASSUME(k >= 0);
 
2270
  const int kbx = k / QI4_0;
2271
  const int kqsx = k % QI4_0;
2272
 
2273
+ const block_q4_0 * bx0 = (const block_q4_0 *) vx;
2274
 
2275
  float * x_dmf = (float *) x_dm;
2276
 
 
2308
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
2309
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2310
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2311
+ (void)x_qh; (void)x_sc;
2312
 
2313
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2314
+ const float * x_dmf = (const float *) x_dm;
2315
 
2316
  int u[2*VDR_Q4_0_Q8_1_MMQ];
2317
 
 
2345
  }
2346
 
2347
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2348
+ (void)x_qh; (void)x_sc;
2349
 
2350
  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
2351
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
 
2357
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
2358
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2359
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2360
+ (void)x_qh; (void)x_sc;
2361
 
2362
  GGML_CUDA_ASSUME(i_offset >= 0);
2363
  GGML_CUDA_ASSUME(i_offset < nwarps);
 
2367
  const int kbx = k / QI4_1;
2368
  const int kqsx = k % QI4_1;
2369
 
2370
+ const block_q4_1 * bx0 = (const block_q4_1 *) vx;
2371
 
2372
  #pragma unroll
2373
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 
2402
  static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
2403
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2404
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2405
+ (void)x_qh; (void)x_sc;
2406
 
2407
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2408
 
 
2440
  }
2441
 
2442
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2443
+ (void)x_qh; (void)x_sc;
2444
 
2445
  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2446
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
 
2452
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
2453
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2454
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2455
+ (void)x_qh; (void)x_sc;
2456
 
2457
  GGML_CUDA_ASSUME(i_offset >= 0);
2458
  GGML_CUDA_ASSUME(i_offset < nwarps);
 
2462
  const int kbx = k / QI5_0;
2463
  const int kqsx = k % QI5_0;
2464
 
2465
+ const block_q5_0 * bx0 = (const block_q5_0 *) vx;
2466
 
2467
  #pragma unroll
2468
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 
2517
  static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
2518
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2519
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2520
+ (void)x_qh; (void)x_sc;
2521
 
2522
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2523
  const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
 
2557
  }
2558
 
2559
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2560
+ (void)x_qh; (void)x_sc;
2561
 
2562
  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2563
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
 
2569
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
2570
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2571
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2572
+ (void)x_qh; (void)x_sc;
2573
 
2574
  GGML_CUDA_ASSUME(i_offset >= 0);
2575
  GGML_CUDA_ASSUME(i_offset < nwarps);
 
2579
  const int kbx = k / QI5_1;
2580
  const int kqsx = k % QI5_1;
2581
 
2582
+ const block_q5_1 * bx0 = (const block_q5_1 *) vx;
2583
 
2584
  #pragma unroll
2585
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 
2631
  static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
2632
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2633
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2634
+ (void)x_qh; (void)x_sc;
2635
 
2636
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2637
  const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
 
2666
  }
2667
 
2668
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2669
+ (void)x_qh; (void)x_sc;
2670
 
2671
  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
2672
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
 
2678
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
2679
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2680
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2681
+ (void)x_qh; (void)x_sc;
2682
 
2683
  GGML_CUDA_ASSUME(i_offset >= 0);
2684
  GGML_CUDA_ASSUME(i_offset < nwarps);
 
2689
  const int kqsx = k % QI8_0;
2690
  float * x_dmf = (float *) x_dm;
2691
 
2692
+ const block_q8_0 * bx0 = (const block_q8_0 *) vx;
2693
 
2694
  #pragma unroll
2695
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 
2724
  static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
2725
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2726
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2727
+ (void)x_qh; (void)x_sc;
2728
 
2729
  const float * x_dmf = (const float *) x_dm;
2730
  const float * y_df = (const float *) y_ds;
 
2758
  }
2759
 
2760
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2761
+ (void)x_qh;
2762
 
2763
  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2764
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
 
2772
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
2773
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2774
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2775
+ (void)x_qh;
2776
 
2777
  GGML_CUDA_ASSUME(i_offset >= 0);
2778
  GGML_CUDA_ASSUME(i_offset < nwarps);
 
2782
  const int kbx = k / QI2_K;
2783
  const int kqsx = k % QI2_K;
2784
 
2785
+ const block_q2_K * bx0 = (const block_q2_K *) vx;
2786
 
2787
  #pragma unroll
2788
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 
2830
  static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
2831
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2832
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2833
+ (void)x_qh;
2834
 
2835
  const int kbx = k / QI2_K;
2836
  const int ky = (k % QI2_K) * QR2_K;
 
2904
  const int kbx = k / QI3_K;
2905
  const int kqsx = k % QI3_K;
2906
 
2907
+ const block_q3_K * bx0 = (const block_q3_K *) vx;
2908
 
2909
  #pragma unroll
2910
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 
2985
  const float * x_dmf = (const float *) x_dm;
2986
  const float * y_df = (const float *) y_ds;
2987
 
2988
+ const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
2989
 
2990
  int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
2991
 
 
3100
  }
3101
 
3102
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
3103
+ (void)x_qh;
3104
 
3105
  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
3106
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
 
3114
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
3115
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
3116
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
3117
+ (void)x_qh;
3118
 
3119
  GGML_CUDA_ASSUME(i_offset >= 0);
3120
  GGML_CUDA_ASSUME(i_offset < nwarps);
 
3124
  const int kbx = k / QI4_K; // == 0 if QK_K == 256
3125
  const int kqsx = k % QI4_K; // == k if QK_K == 256
3126
 
3127
+ const block_q4_K * bx0 = (const block_q4_K *) vx;
3128
 
3129
  #pragma unroll
3130
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 
3169
 
3170
  const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
3171
 
3172
+ const int * scales = (const int *) bxi->scales;
3173
 
3174
  const int ksc = k % (WARP_SIZE/8);
3175
 
 
3184
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
3185
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3186
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
3187
+ (void)x_qh;
3188
 
3189
  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
3190
 
 
3284
  }
3285
 
3286
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
3287
+ (void)x_qh;
3288
 
3289
  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
3290
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
 
3298
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
3299
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
3300
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
3301
+ (void)x_qh;
3302
 
3303
  GGML_CUDA_ASSUME(i_offset >= 0);
3304
  GGML_CUDA_ASSUME(i_offset < nwarps);
 
3308
  const int kbx = k / QI5_K; // == 0 if QK_K == 256
3309
  const int kqsx = k % QI5_K; // == k if QK_K == 256
3310
 
3311
+ const block_q5_K * bx0 = (const block_q5_K *) vx;
3312
 
3313
  #pragma unroll
3314
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 
3364
 
3365
  const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
3366
 
3367
+ const int * scales = (const int *) bxi->scales;
3368
 
3369
  const int ksc = k % (WARP_SIZE/8);
3370
 
 
3379
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
3380
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3381
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
3382
+ (void)x_qh;
3383
 
3384
  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
3385
 
 
3416
  }
3417
 
3418
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
3419
+ (void)x_qh;
3420
 
3421
  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
3422
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
 
3430
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
3431
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
3432
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
3433
+ (void)x_qh;
3434
 
3435
  GGML_CUDA_ASSUME(i_offset >= 0);
3436
  GGML_CUDA_ASSUME(i_offset < nwarps);
 
3440
  const int kbx = k / QI6_K; // == 0 if QK_K == 256
3441
  const int kqsx = k % QI6_K; // == k if QK_K == 256
3442
 
3443
+ const block_q6_K * bx0 = (const block_q6_K *) vx;
3444
 
3445
  #pragma unroll
3446
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 
3502
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
3503
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3504
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
3505
+ (void)x_qh;
3506
 
3507
  const float * x_dmf = (const float *) x_dm;
3508
  const float * y_df = (const float *) y_ds;
 
3545
  __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
3546
  __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
3547
 
3548
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
3549
 
3550
  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
3551
 
 
5867
  return ptr;
5868
  }
5869
  #ifdef DEBUG_CUDA_MALLOC
5870
+ fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
5871
  (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
5872
  #endif
5873
  void * ptr;
 
6005
  // The allocation error can be bypassed. A null ptr will be assigned out of this function.
6006
  // This can fix the OOM error in WSL.
6007
  cudaGetLastError();
6008
+ fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
6009
  size/1024.0/1024.0, cudaGetErrorString(err));
6010
  return nullptr;
6011
  }
 
6386
  case GGML_TYPE_Q8_0:
6387
  return max_compute_capability >= CC_RDNA2 ? 128 : 64;
6388
  case GGML_TYPE_F16:
6389
+ case GGML_TYPE_F32:
6390
  return 1;
6391
  case GGML_TYPE_Q2_K:
6392
  return max_compute_capability >= CC_RDNA2 ? 128 : 32;
 
6409
  case GGML_TYPE_Q8_0:
6410
  return 64;
6411
  case GGML_TYPE_F16:
6412
+ case GGML_TYPE_F32:
6413
  return 1;
6414
  case GGML_TYPE_Q2_K:
6415
  case GGML_TYPE_Q3_K:
 
7019
  const int64_t ne01 = src0->ne[1];
7020
  const int64_t ne02 = src0->ne[2];
7021
  const int64_t ne03 = src0->ne[3];
7022
+ // const int64_t nrows0 = ggml_nrows(src0);
7023
 
7024
  const int64_t ne10 = src1->ne[0];
7025
  const int64_t ne11 = src1->ne[1];
 
7120
  if (src0_on_device && src0_is_contiguous) {
7121
  src0_dd[id] = (char *) src0_extra->data_device[id];
7122
  } else {
7123
+ // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
7124
  src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
7125
  }
7126
 
 
7353
  }
7354
 
7355
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
7356
+ if (!g_cublas_loaded) { return false; }
7357
 
7358
  const int64_t ne10 = src1->ne[0];
7359
 
 
7431
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
7432
  }
7433
 
7434
+ __global__ static void k_compute_batched_ptrs(
7435
  const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
7436
  const void ** ptrs_src, void ** ptrs_dst,
7437
  int ne12, int ne13,
 
8047
  }
8048
 
8049
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
8050
+ if (!g_cublas_loaded) { return false; }
8051
 
8052
  ggml_cuda_func_t func;
8053
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
 
8061
  if (tensor->op == GGML_OP_MUL_MAT) {
8062
  if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
8063
  #ifndef NDEBUG
8064
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
8065
  #endif
8066
  return false;
8067
  }
 
8346
  UNUSED(cgraph);
8347
  }
8348
 
8349
+ [[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
8350
  GGML_ASSERT(!"not implemented");
8351
 
8352
  UNUSED(backend);
8353
  UNUSED(plan);
8354
  }
8355
 
8356
+ [[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
8357
  GGML_ASSERT(!"not implemented");
8358
 
8359
  UNUSED(backend);
 
8369
  for (int i = 0; i < cgraph->n_nodes; i++) {
8370
  ggml_tensor * node = cgraph->nodes[i];
8371
 
8372
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
8373
  continue;
8374
+ }
8375
  assert(node->backend == GGML_BACKEND_GPU);
8376
  for (int j = 0; j < GGML_MAX_SRC; j++) {
8377
  if (node->src[j] != nullptr) {