cuda : sync some minor stuff from llama.cpp (#1548)
ggml-cuda.cu CHANGED (+62 -32)
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -235,7 +236,7 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_F16
 
 static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
 int x32 = 0;
 x32 |= x16[0] << 0;
@@ -245,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
 }
 
 static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
 int x32 = 0;
 x32 |= x16[0] << 0;
@@ -255,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
 }
 
 static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
-return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 template<typename T>
@@ -469,7 +470,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 #define MUL_MAT_SRC1_COL_STRIDE 128
 
 #define MAX_STREAMS 8
-static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
 
 struct ggml_tensor_extra_gpu {
 void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@@ -2248,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+(void)x_qh; (void)x_sc;
 
 __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
 __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
@@ -2259,7 +2261,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
+(void)x_qh; (void)x_sc;
 GGML_CUDA_ASSUME(i_offset >= 0);
 GGML_CUDA_ASSUME(i_offset < nwarps);
 GGML_CUDA_ASSUME(k >= 0);
@@ -2268,7 +2270,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const int kbx = k / QI4_0;
 const int kqsx = k % QI4_0;
 
-const block_q4_0 * bx0 = (block_q4_0 *) vx;
+const block_q4_0 * bx0 = (const block_q4_0 *) vx;
 
 float * x_dmf = (float *) x_dm;
 
@@ -2306,9 +2308,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
 const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+(void)x_qh; (void)x_sc;
 
 const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-const float * x_dmf = (float *) x_dm;
+const float * x_dmf = (const float *) x_dm;
 
 int u[2*VDR_Q4_0_Q8_1_MMQ];
 
@@ -2342,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+(void)x_qh; (void)x_sc;
 
 __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
 __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2353,6 +2357,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+(void)x_qh; (void)x_sc;
 
 GGML_CUDA_ASSUME(i_offset >= 0);
 GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2362,7 +2367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const int kbx = k / QI4_1;
 const int kqsx = k % QI4_1;
 
-const block_q4_1 * bx0 = (block_q4_1 *) vx;
+const block_q4_1 * bx0 = (const block_q4_1 *) vx;
 
 #pragma unroll
 for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2397,6 +2402,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
 const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+(void)x_qh; (void)x_sc;
 
 const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 
@@ -2434,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+(void)x_qh; (void)x_sc;
 
 __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
 __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2445,6 +2452,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+(void)x_qh; (void)x_sc;
 
 GGML_CUDA_ASSUME(i_offset >= 0);
 GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2454,7 +2462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const int kbx = k / QI5_0;
 const int kqsx = k % QI5_0;
 
-const block_q5_0 * bx0 = (block_q5_0 *) vx;
+const block_q5_0 * bx0 = (const block_q5_0 *) vx;
 
 #pragma unroll
 for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2509,6 +2517,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
 const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+(void)x_qh; (void)x_sc;
 
 const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2548,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+(void)x_qh; (void)x_sc;
 
 __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
 __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2559,6 +2569,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+(void)x_qh; (void)x_sc;
 
 GGML_CUDA_ASSUME(i_offset >= 0);
 GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2568,7 +2579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const int kbx = k / QI5_1;
 const int kqsx = k % QI5_1;
 
-const block_q5_1 * bx0 = (block_q5_1 *) vx;
+const block_q5_1 * bx0 = (const block_q5_1 *) vx;
 
 #pragma unroll
 for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2620,6 +2631,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
 const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+(void)x_qh; (void)x_sc;
 
 const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2654,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+(void)x_qh; (void)x_sc;
 
 __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
 __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2665,6 +2678,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+(void)x_qh; (void)x_sc;
 
 GGML_CUDA_ASSUME(i_offset >= 0);
 GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2675,7 +2689,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const int kqsx = k % QI8_0;
 float * x_dmf = (float *) x_dm;
 
-const block_q8_0 * bx0 = (block_q8_0 *) vx;
+const block_q8_0 * bx0 = (const block_q8_0 *) vx;
 
 #pragma unroll
 for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2710,6 +2724,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
 const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+(void)x_qh; (void)x_sc;
 
 const float * x_dmf = (const float *) x_dm;
 const float * y_df = (const float *) y_ds;
@@ -2743,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+(void)x_qh;
 
 __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
 __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2756,6 +2772,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+(void)x_qh;
 
 GGML_CUDA_ASSUME(i_offset >= 0);
 GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2765,7 +2782,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const int kbx = k / QI2_K;
 const int kqsx = k % QI2_K;
 
-const block_q2_K * bx0 = (block_q2_K *) vx;
+const block_q2_K * bx0 = (const block_q2_K *) vx;
 
 #pragma unroll
 for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2813,6 +2830,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
 const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+(void)x_qh;
 
 const int kbx = k / QI2_K;
 const int ky = (k % QI2_K) * QR2_K;
@@ -2886,7 +2904,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const int kbx = k / QI3_K;
 const int kqsx = k % QI3_K;
 
-const block_q3_K * bx0 = (block_q3_K *) vx;
+const block_q3_K * bx0 = (const block_q3_K *) vx;
 
 #pragma unroll
 for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2967,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
 const float * x_dmf = (const float *) x_dm;
 const float * y_df = (const float *) y_ds;
 
-const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
 
 int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
 
@@ -3082,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+(void)x_qh;
 
 __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
 __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3095,6 +3114,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+(void)x_qh;
 
 GGML_CUDA_ASSUME(i_offset >= 0);
 GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3104,7 +3124,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const int kbx = k / QI4_K; // == 0 if QK_K == 256
 const int kqsx = k % QI4_K; // == k if QK_K == 256
 
-const block_q4_K * bx0 = (block_q4_K *) vx;
+const block_q4_K * bx0 = (const block_q4_K *) vx;
 
 #pragma unroll
 for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3149,7 +3169,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
 const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
 
-const int * scales = (int *) bxi->scales;
+const int * scales = (const int *) bxi->scales;
 
 const int ksc = k % (WARP_SIZE/8);
 
@@ -3164,6 +3184,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
 const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+(void)x_qh;
 
 const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
 
@@ -3263,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+(void)x_qh;
 
 __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
 __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3276,6 +3298,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+(void)x_qh;
 
 GGML_CUDA_ASSUME(i_offset >= 0);
 GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3285,7 +3308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const int kbx = k / QI5_K; // == 0 if QK_K == 256
 const int kqsx = k % QI5_K; // == k if QK_K == 256
 
-const block_q5_K * bx0 = (block_q5_K *) vx;
+const block_q5_K * bx0 = (const block_q5_K *) vx;
 
 #pragma unroll
 for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3341,7 +3364,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
 const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
 
-const int * scales = (int *) bxi->scales;
+const int * scales = (const int *) bxi->scales;
 
 const int ksc = k % (WARP_SIZE/8);
 
@@ -3356,6 +3379,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
 const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+(void)x_qh;
 
 const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
 
@@ -3392,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+(void)x_qh;
 
 __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
 __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3405,6 +3430,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+(void)x_qh;
 
 GGML_CUDA_ASSUME(i_offset >= 0);
 GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3414,7 +3440,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const int kbx = k / QI6_K; // == 0 if QK_K == 256
 const int kqsx = k % QI6_K; // == k if QK_K == 256
 
-const block_q6_K * bx0 = (block_q6_K *) vx;
+const block_q6_K * bx0 = (const block_q6_K *) vx;
 
 #pragma unroll
 for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3476,6 +3502,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
 const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+(void)x_qh;
 
 const float * x_dmf = (const float *) x_dm;
 const float * y_df = (const float *) y_ds;
@@ -3518,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q(
 __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
 __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
 
-float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
 
 for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
 
@@ -5840,7 +5867,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
 return ptr;
 }
 #ifdef DEBUG_CUDA_MALLOC
-fprintf(stderr, "%s: %d buffers, max_size = %u
+fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
 (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
 void * ptr;
@@ -5978,7 +6005,7 @@ void * ggml_cuda_host_malloc(size_t size) {
 // The allocation error can be bypassed. A null ptr will assigned out of this function.
 // This can fixed the OOM error in WSL.
 cudaGetLastError();
-fprintf(stderr, "WARNING: failed to allocate %.2f
+fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
 size/1024.0/1024.0, cudaGetErrorString(err));
 return nullptr;
 }
@@ -6359,6 +6386,7 @@ static int64_t get_row_rounding(ggml_type type) {
 case GGML_TYPE_Q8_0:
 return max_compute_capability >= CC_RDNA2 ? 128 : 64;
 case GGML_TYPE_F16:
+case GGML_TYPE_F32:
 return 1;
 case GGML_TYPE_Q2_K:
 return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6381,6 +6409,7 @@ static int64_t get_row_rounding(ggml_type type) {
 case GGML_TYPE_Q8_0:
 return 64;
 case GGML_TYPE_F16:
+case GGML_TYPE_F32:
 return 1;
 case GGML_TYPE_Q2_K:
 case GGML_TYPE_Q3_K:
@@ -6990,7 +7019,7 @@ static void ggml_cuda_op_mul_mat(
 const int64_t ne01 = src0->ne[1];
 const int64_t ne02 = src0->ne[2];
 const int64_t ne03 = src0->ne[3];
-const int64_t nrows0 = ggml_nrows(src0);
+// const int64_t nrows0 = ggml_nrows(src0);
 
 const int64_t ne10 = src1->ne[0];
 const int64_t ne11 = src1->ne[1];
@@ -7091,7 +7120,7 @@ static void ggml_cuda_op_mul_mat(
 if (src0_on_device && src0_is_contiguous) {
 src0_dd[id] = (char *) src0_extra->data_device[id];
 } else {
-const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+// const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
 src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
 }
 
@@ -7324,7 +7353,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-if (!g_cublas_loaded) return false;
+if (!g_cublas_loaded) { return false; }
 
 const int64_t ne10 = src1->ne[0];
 
@@ -7402,7 +7431,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-__global__ void k_compute_batched_ptrs(
+__global__ static void k_compute_batched_ptrs(
 const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
 const void ** ptrs_src, void ** ptrs_dst,
 int ne12, int ne13,
@@ -8018,7 +8047,7 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-if (!g_cublas_loaded) return false;
+if (!g_cublas_loaded) { return false; }
 
 ggml_cuda_func_t func;
 const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8032,7 +8061,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
 if (tensor->op == GGML_OP_MUL_MAT) {
 if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG
-fprintf(stderr, "%s: cannot compute %s: src0->ne[3] =
+fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
 #endif
 return false;
 }
@@ -8317,14 +8346,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
 UNUSED(cgraph);
 }
 
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
 GGML_ASSERT(!"not implemented");
 
 UNUSED(backend);
 UNUSED(plan);
 }
 
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
 GGML_ASSERT(!"not implemented");
 
 UNUSED(backend);
@@ -8340,8 +8369,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
 for (int i = 0; i < cgraph->n_nodes; i++) {
 ggml_tensor * node = cgraph->nodes[i];
 
-if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
 continue;
+}
 assert(node->backend == GGML_BACKEND_GPU);
 for (int j = 0; j < GGML_MAX_SRC; j++) {
 if (node->src[j] != nullptr) {