ggml : refactor online repacking (llama/10446)
* rename ggml-cpu-aarch64.c to .cpp
* reformat extra cpu backend.
- clean Q4_0_N_M and IQ4_0_N_M
- remove from "file" tensor type
- allow only with dynamic repack
- extract cpu extra bufts and convert to C++
- hbm
- "aarch64"
- more generic use of extra buffer
- generalise extra_supports_op
- new API for "cpu-accel":
- amx
- aarch64
* clang-format
* Clean Q4_0_N_M ref
Enable restrict on C++
* add op GGML_OP_MUL_MAT_ID for Q4_0_N_M with runtime repack
* added/corrected control on tensor size for Q4 repacking.
* Update ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
Co-authored-by: Georgi Gerganov <[email protected]>
* Update ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
Co-authored-by: Georgi Gerganov <[email protected]>
* add debug logs on repacks.
---------
Co-authored-by: Georgi Gerganov <[email protected]>
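At the core of the refactor is a new CPU "extra buffer" interface (ggml-cpu-traits.h, added below): a buffer type publishes a ggml::cpu::extra_buffer_type through its context, and tensors allocated in that buffer carry a ggml::cpu::tensor_traits pointer in tensor->extra. The following is only a sketch of how an accelerator plugs into that interface, modeled on the AMX changes in this commit; the repack_demo names and the commented-out kernel are hypothetical, not part of the commit.

    // Sketch, assuming the ggml-cpu-traits.h interface introduced by this commit.
    #include "ggml-cpu-traits.h"   // ggml::cpu::tensor_traits, ggml::cpu::extra_buffer_type
    #include "ggml-backend-impl.h" // ggml_backend_buffer_type / buffer internals

    // hypothetical buffer type accessor for this example
    ggml_backend_buffer_type_t repack_demo_buffer_type(void);

    namespace ggml::cpu::repack_demo {

    class tensor_traits : public ggml::cpu::tensor_traits {
        // report the scratch size the repacked kernel needs for this op
        bool work_size(int /*n_threads*/, const struct ggml_tensor * /*op*/, size_t & size) override {
            size = 0;
            return true;
        }
        // run the repacked kernel; returning false falls back to the generic CPU path
        bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
            if (op->op == GGML_OP_MUL_MAT) {
                // repack_demo_mul_mat(params, op); // hypothetical kernel
                (void) params;
                return true;
            }
            return false;
        }
    };

    class extra_buffer_type : public ggml::cpu::extra_buffer_type {
        // advertise which ops this buffer type accelerates
        bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
            return op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
                   op->src[0]->buffer->buft == repack_demo_buffer_type();
        }
        // hand back the traits that the buffer's init_tensor stored in tensor->extra
        ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
            if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
                op->src[0]->buffer->buft == repack_demo_buffer_type()) {
                return (ggml::cpu::tensor_traits *) op->src[0]->extra;
            }
            return nullptr;
        }
    };

    } // namespace ggml::cpu::repack_demo

The AMX and aarch64 repack backends in the diff below follow exactly this pattern: the buffer type's context holds the extra_buffer_type, and init_tensor points tensor->extra at the matching tensor_traits.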
- ggml/include/ggml-cpu.h +0 -17
- ggml/include/ggml.h +17 -12
- ggml/src/CMakeLists.txt +1 -3
- ggml/src/ggml-cann/ggml-cann.cpp +1 -1
- ggml/src/ggml-common.h +41 -47
- ggml/src/ggml-cpu/CMakeLists.txt +5 -1
- ggml/src/ggml-cpu/amx/amx.cpp +92 -68
- ggml/src/ggml-cpu/amx/amx.h +1 -13
- ggml/src/ggml-cpu/amx/common.h +5 -14
- ggml/src/ggml-cpu/amx/mmq.cpp +3 -14
- ggml/src/ggml-cpu/amx/mmq.h +1 -7
- ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -0
- ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -26
- ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- ggml/src/ggml-cpu/ggml-cpu.c +150 -292
- ggml/src/ggml-cpu/ggml-cpu.cpp +34 -138
- ggml/src/ggml-cuda/ggml-cuda.cu +1 -1
- ggml/src/ggml-quants.c +0 -9
- ggml/src/ggml-sycl/ggml-sycl.cpp +1 -1
- ggml/src/ggml.c +46 -40
ggml/include/ggml-cpu.h

@@ -103,24 +103,14 @@ extern "C" {
     // Internal types and functions exposed for tests and benchmarks

-    typedef void (*ggml_from_float_to_mat_t)
-                                     (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
     typedef void (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                      const void * GGML_RESTRICT y, size_t by, int nrc);
-    typedef void (*ggml_gemv_t)     (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
-                                     const void * GGML_RESTRICT y, int nr, int nc);
-    typedef void (*ggml_gemm_t)     (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
-                                     const void * GGML_RESTRICT y, int nr, int nc);

     struct ggml_type_traits_cpu {
         ggml_from_float_t        from_float;
-        ggml_from_float_to_mat_t from_float_to_mat;
         ggml_vec_dot_t           vec_dot;
         enum ggml_type           vec_dot_type;
         int64_t                  nrows; // number of rows to process simultaneously
-        int64_t                  ncols; // number of columns to process simultaneously
-        ggml_gemv_t              gemv;
-        ggml_gemm_t              gemm;
     };

     GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);

@@ -140,13 +130,6 @@ extern "C" {
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

-    #ifdef GGML_USE_CPU_HBM
-    GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-    #endif
-
-    GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
-    GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);
-
 #ifdef __cplusplus
 }
 #endif
ggml/include/ggml.h

@@ -384,15 +384,15 @@ extern "C" {
         GGML_TYPE_F64     = 28,
         GGML_TYPE_IQ1_M   = 29,
         GGML_TYPE_BF16    = 30,
-        GGML_TYPE_Q4_0_4_4 = 31,
-        GGML_TYPE_Q4_0_4_8 = 32,
-        GGML_TYPE_Q4_0_8_8 = 33,
+        // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+        // GGML_TYPE_Q4_0_4_8 = 32,
+        // GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0   = 34,
         GGML_TYPE_TQ2_0   = 35,
-        GGML_TYPE_IQ4_NL_4_4 = 36,
+        // GGML_TYPE_IQ4_NL_4_4 = 36,
         // GGML_TYPE_IQ4_NL_4_8 = 37,
         // GGML_TYPE_IQ4_NL_8_8 = 38,
-        GGML_TYPE_COUNT,
+        GGML_TYPE_COUNT = 39,
     };

     // precision

@@ -433,9 +433,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS  = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M   = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
     };

     // available tensor operations:

@@ -2205,11 +2202,19 @@ extern "C" {
     GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
     GGML_API void   gguf_get_meta_data(const struct gguf_context * ctx, void * data);

-#ifdef
-// restrict not standard in C++
-#
+#ifdef __cplusplus
+// restrict not standard in C++
+#    if defined(__GNUC__)
+#        define GGML_RESTRICT __restrict__
+#    elif defined(__clang__)
+#        define GGML_RESTRICT __restrict
+#    elif defined(_MSC_VER)
+#        define GGML_RESTRICT __restrict
+#    else
+#        define GGML_RESTRICT
+#    endif
 #else
-#define GGML_RESTRICT restrict
+#    define GGML_RESTRICT restrict
 #endif
     typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
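The GGML_RESTRICT hunk above lets the same aliasing annotation compile both as C (restrict) and as C++ (__restrict__/__restrict), which matters now that the repack code moved to .cpp files. A small hedged example of the kind of signature that relies on it; the helper name is made up and only assumes ggml.h is on the include path:

    #include <stdint.h>
    #include "ggml.h" // provides GGML_RESTRICT for both C and C++

    // With GGML_RESTRICT in effect, the compiler may assume x and y do not alias.
    static void scale_row(const float * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, float s) {
        for (int64_t i = 0; i < k; ++i) {
            y[i] = s * x[i];
        }
    }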
ggml/src/CMakeLists.txt

@@ -220,9 +220,7 @@ add_library(ggml-base
             ggml-threading.cpp
             ggml-threading.h
             ggml-quants.c
-            ggml-quants.h
-            ggml-aarch64.c
-            ggml-aarch64.h)
+            ggml-quants.h)

 target_include_directories(ggml-base PRIVATE .)
ggml/src/ggml-cann/ggml-cann.cpp

@@ -2089,7 +2089,7 @@ static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, con
 static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
     /* .get_name         = */ ggml_backend_cann_reg_get_name,
     /* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
-    /* .
+    /* .get_device       = */ ggml_backend_cann_reg_get_device,
     /* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
 };
ggml/src/ggml-common.h

@@ -6,7 +6,20 @@
 typedef uint16_t ggml_half;
 typedef uint32_t ggml_half2;

-#define
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S
+
+#define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_CPP)
+#include <cstdint>
+
+typedef uint16_t ggml_half;
+typedef uint32_t ggml_half2;
+
+// std-c++ allow anonymous unions but some compiler warn on it
+#define GGML_COMMON_AGGR_U data
+// std-c++ do not allow it.
+#define GGML_COMMON_AGGR_S data

 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_METAL)

@@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
 typedef half  ggml_half;
 typedef half2 ggml_half2;

-#define
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S

 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_CUDA)

@@ -29,7 +43,8 @@ typedef half2 ggml_half2;
 typedef half  ggml_half;
 typedef half2 ggml_half2;

-#define
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data

 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_HIP)

@@ -39,7 +54,8 @@ typedef half2 ggml_half2;
 typedef half  ggml_half;
 typedef half2 ggml_half2;

-#define
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data

 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_SYCL)

@@ -49,7 +65,8 @@ typedef half2 ggml_half2;
 typedef sycl::half  ggml_half;
 typedef sycl::half2 ggml_half2;

-#define
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data

 #define GGML_COMMON_DECL
 #endif

@@ -154,9 +171,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
-        }
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");

@@ -175,9 +192,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
-        }
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t qh[4];         // 5-th bit of quants
     uint8_t qs[QK5_1 / 2]; // nibbles / quants
 } block_q5_1;

@@ -196,37 +213,13 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half s; // d * sum(qs[i])
-        }
+        } GGML_COMMON_AGGR_S;
         ggml_half2 ds;
-    };
+    } GGML_COMMON_AGGR_U;
     int8_t qs[QK8_1]; // quants
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");

-typedef struct {
-    ggml_half d[4];        // deltas for 4 q4_0 blocks
-    uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
-} block_q4_0x4;
-static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
-
-typedef struct {
-    ggml_half d[8];        // deltas for 8 q4_0 blocks
-    uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
-} block_q4_0x8;
-static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
-
-typedef struct {
-    ggml_half d[4];        // deltas for 4 q8_0 blocks
-    int8_t qs[QK8_0 * 4];  // quants for 4 q8_0 blocks
-} block_q8_0x4;
-static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
-
-typedef struct {
-    ggml_half d[8];        // deltas for 8 q8_0 blocks
-    int8_t qs[QK8_0 * 8];  // quants for 8 q8_0 blocks
-} block_q8_0x8;
-static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
-
 //
 // Ternary quantization
 //

@@ -261,9 +254,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        }
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

@@ -288,9 +281,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        }
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qs[QK_K/2];           // 4--bit quants
 } block_q4_K;

@@ -305,9 +298,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        }
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qh[QK_K/8];           // quants, high bit
     uint8_t qs[QK_K/2];           // quants, low 4 bits

@@ -418,12 +411,6 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");

-typedef struct {
-    ggml_half d[4];          // deltas for 4 iq4_nl blocks
-    uint8_t   qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks
-} block_iq4_nlx4;
-static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
-
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL

@@ -437,6 +424,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
 #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
 #define GGML_TABLE_END() };

+#define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_CPP)
+#include <cstdint>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
+
 #define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_METAL)
 #include <metal_stdlib>
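For reference, this is roughly how the new GGML_COMMON_AGGR_U / GGML_COMMON_AGGR_S macros make block_q4_1 expand. The expansion below is written out by hand from the macro definitions above, not copied from generated code: in C both macros are empty and the aggregates stay anonymous; under GGML_COMMON_DECL_CPP both expand to a named data member so the header also compiles as standard C++ without anonymous-aggregate warnings.

    // C: GGML_COMMON_AGGR_U and GGML_COMMON_AGGR_S expand to nothing
    typedef struct {
        union {
            struct {
                ggml_half d; // delta
                ggml_half m; // min
            };
            ggml_half2 dm;
        };
        uint8_t qs[QK4_1 / 2]; // nibbles / quants
    } block_q4_1;

    // C++ (GGML_COMMON_DECL_CPP): both macros expand to `data`
    typedef struct {
        union {
            struct {
                ggml_half d; // delta
                ggml_half m; // min
            } data;            // GGML_COMMON_AGGR_S
            ggml_half2 dm;
        } data;                // GGML_COMMON_AGGR_U
        uint8_t qs[QK4_1 / 2]; // nibbles / quants
    } block_q4_1;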
ggml/src/ggml-cpu/CMakeLists.txt

@@ -10,10 +10,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     list (APPEND GGML_CPU_SOURCES
         ggml-cpu/ggml-cpu.c
         ggml-cpu/ggml-cpu.cpp
-        ggml-cpu/ggml-cpu-aarch64.c
+        ggml-cpu/ggml-cpu-aarch64.cpp
         ggml-cpu/ggml-cpu-aarch64.h
+        ggml-cpu/ggml-cpu-hbm.cpp
+        ggml-cpu/ggml-cpu-hbm.h
         ggml-cpu/ggml-cpu-quants.c
         ggml-cpu/ggml-cpu-quants.h
+        ggml-cpu/ggml-cpu-traits.cpp
+        ggml-cpu/ggml-cpu-traits.h
         ggml-cpu/amx/amx.cpp
         ggml-cpu/amx/amx.h
         ggml-cpu/amx/mmq.cpp
ggml/src/ggml-cpu/amx/amx.cpp

@@ -5,6 +5,7 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
+#include "ggml-cpu-traits.h"

 #if defined(__gnu_linux__)
 #include <sys/syscall.h>

@@ -17,31 +18,65 @@

 #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)

+// AMX type_trais
+namespace ggml::cpu::amx {
+
+class tensor_traits : public ggml::cpu::tensor_traits {
+    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
+        size = ggml_backend_amx_desired_wsize(op);
+        return true;
+    }
+
+    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
+        if (op->op == GGML_OP_MUL_MAT) {
+            ggml_backend_amx_mul_mat(params, op);
+            return true;
+        }
+        return false;
+    }
+};
+
+static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
+    static tensor_traits traits;
+    return &traits;
+}
+}  // namespace ggml::cpu::amx
+
 // AMX buffer interface
 static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
 }

 static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return (void *)(buffer->context);
+    return (void *) (buffer->context);
 }

-static void
+static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);

     GGML_UNUSED(buffer);
 }

-static void
+static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                                  uint8_t value, size_t offset, size_t size) {
+    memset((char *) tensor->data + offset, value, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                               const void * data, size_t offset, size_t size) {
     if (qtype_has_amx_kernels(tensor->type)) {
+        GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
         ggml_backend_amx_convert_weight(tensor, data, offset, size);
     } else {
-        memcpy((char *)tensor->data + offset, data, size);
+        memcpy((char *) tensor->data + offset, data, size);
     }

     GGML_UNUSED(buffer);
 }

+/*
+// need to figure what we need to do with buffer->extra.
 static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
     memcpy(data, (const char *)tensor->data + offset, size);

@@ -62,6 +97,7 @@ static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con

     GGML_UNUSED(buffer);
 }
+*/

 static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     memset(buffer->context, value, buffer->size);

@@ -70,13 +106,13 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
     /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_amx_buffer_get_base,
-    /* .init_tensor     = */
+    /* .init_tensor     = */ ggml_backend_amx_buffer_init_tensor,
     /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor,
     /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor,
-    /* .get_tensor      = */
-    /* .cpy_tensor      = */
+    /* .get_tensor      = */ nullptr,
+    /* .cpy_tensor      = */ nullptr,
     /* .clear           = */ ggml_backend_amx_buffer_clear,
-    /* .reset           = */
+    /* .reset           = */ nullptr,
 };

 static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {

@@ -101,14 +137,44 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ
     GGML_UNUSED(buft);
 }

+namespace ggml::cpu::amx {
+class extra_buffer_type : ggml::cpu::extra_buffer_type {
+    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
+        // handle only 2d gemm for now
+        auto is_contiguous_2d = [](const struct ggml_tensor * t) {
+            return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
+        };
+
+        if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) &&  // src0 must be contiguous
+            is_contiguous_2d(op->src[1]) &&                               // src1 must be contiguous
+            op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
+            op->ne[0] % (TILE_N * 2) == 0 &&                              // out_features is 32x
+            (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
+            // src1 must be host buffer
+            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
+                return false;
+            }
+            // src1 must be float32
+            if (op->src[1]->type == GGML_TYPE_F32) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
+        if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
+            op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) {
+            return (ggml::cpu::tensor_traits *) op->src[0]->extra;
+        }
+
+        return nullptr;
+    }
+};
+}  // namespace ggml::cpu::amx
+
+static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    return ggml_backend_amx_get_alloc_size(tensor);

     GGML_UNUSED(buft);
 }

@@ -129,68 +195,26 @@ static bool ggml_amx_init() {
     return true;
 #endif
 }
+
 ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
         /* .iface = */ {
+            /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
+            /* .get_max_size     = */ nullptr, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
+            /* .is_host          = */ nullptr,
+            },
         /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context = */
+        /* .context = */ new ggml::cpu::amx::extra_buffer_type(),
     };

     if (!ggml_amx_init()) {
-        return
+        return nullptr;
     }

     return &ggml_backend_buffer_type_amx;
 }

-    return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
-}
-
-bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
-    // handle only 2d gemm for now
-    auto is_contiguous_2d = [](const struct ggml_tensor * t) {
-        return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
-    };
-
-    switch (op->op) {
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-            return true;
-
-        case GGML_OP_MUL_MAT: {
-            const struct ggml_tensor * src0 = op->src[0];
-            const struct ggml_tensor * src1 = op->src[1];
-
-            const enum ggml_type type = src0->type;
-            const int64_t ne0 = op->ne[0];
-
-            // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
-            // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
-            bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
-
-            bool can_use_amx =
-                is_contiguous_2d(src0) &&       // src0 must be contiguous
-                is_contiguous_2d(src1) &&       // src1 must be contiguous
-                src1->type == GGML_TYPE_F32 &&  // src1 must be float32
-                has_amx_kernels &&              // with amx kernel impls
-                ne0 % (TILE_N * 2) == 0;        // out_features is 32x
-
-            return can_use_amx;
-        }
-        default:
-            return false;
-    }
-}
-
 #endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
ggml/src/ggml-cpu/amx/amx.h

@@ -1,20 +1,8 @@
 #include "ggml-backend.h"
 #include "ggml-cpu-impl.h"

-
-extern "C" {
-#endif
+// GGML internal header

 #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
-
 ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
-bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);
-bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);
-void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
-
-#endif
-
-#ifdef __cplusplus
-}
 #endif
ggml/src/ggml-cpu/amx/common.h

@@ -7,7 +7,7 @@
 #include <memory>
 #include <type_traits>

-#if defined(
+#if defined(GGML_USE_OPENMP)
 #include <omp.h>
 #endif

@@ -56,11 +56,11 @@ inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
 }

 template <typename func_t>
-inline void parallel_for(int
-#if defined(
-#pragma omp parallel
+inline void parallel_for(int n, const func_t& f) {
+#if defined(GGML_USE_OPENMP)
+#pragma omp parallel
 {
-
+    int nth = omp_get_num_threads();
     int ith = omp_get_thread_num();
     int tbegin, tend;
     balance211(n, nth, ith, tbegin, tend);

@@ -68,8 +68,6 @@ inline void parallel_for(int nth, int n, const func_t& f) {
 }
 #else
     f(0, n);
-
-    GGML_UNUSED(nth);
 #endif
 }

@@ -91,10 +89,3 @@ inline bool qtype_has_amx_kernels(const enum ggml_type type) {
         (type == GGML_TYPE_Q6_K) ||
         (type == GGML_TYPE_IQ4_XS);
 }
-
-// ggml backend context
-struct ggml_backend_amx_context {
-    int n_threads = GGML_DEFAULT_N_THREADS;
-    std::unique_ptr<char[]> work_data;
-    size_t work_size = 0;
-};
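With the new signature, callers pass only the trip count; the thread count is taken from omp_get_num_threads() inside parallel_for when GGML_USE_OPENMP is defined, and otherwise the body runs once over the whole range. A hedged usage sketch mirroring the convert_B_packed_format call in mmq.cpp below (the pack_all_tiles wrapper is made up):

    // Assumes compilation inside ggml-cpu/amx, where common.h provides parallel_for.
    #include "common.h"

    static void pack_all_tiles(int NB) {
        parallel_for(NB, [&](int begin, int end) {
            for (int n = begin; n < end; ++n) {
                // pack tile n here (placeholder for the real packing work)
            }
        });
    }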
ggml/src/ggml-cpu/amx/mmq.cpp

@@ -18,10 +18,6 @@
 #include <unistd.h>
 #endif

-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-
 #if (defined(_WIN32) || defined(_WIN64))
 #define RESTRICT __restrict
 #else

@@ -1382,13 +1378,13 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
 #define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size

 template<typename TB, int BLOCK_K>
-void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K
+void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K) {
     const int NB = N / TILE_N;
     const int KB = K / BLOCK_K;
     const int TILE_SIZE = get_tile_size<TB>();

     // parallel on NB should be enough
-    parallel_for(
+    parallel_for(NB, [&](int begin, int end) {
         for (int n = begin; n < end; ++n) {
             for (int k = 0; k < KB; ++k) {
                 int n0 = n * TILE_N;

@@ -2334,15 +2330,8 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
     const int K = tensor->ne[0];  // ne0: in_features
     const int N = tensor->ne[1];  // ne1: out_features

-#if defined(_OPENMP)
-    // the buffer ctx is not initialized when .set_tensor is called
-    int n_threads = omp_get_num_threads();
-#else
-    int n_threads = 1;
-#endif
-
     GGML_DISPATCH_QTYPES(TYPE, [&] {
-        convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K
+        convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K);
     });
 }
ggml/src/ggml-cpu/amx/mmq.h

@@ -1,16 +1,10 @@
 #pragma once
 #include "common.h"

-
-extern "C" {
-#endif
+size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);

 size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);

 void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);

 void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-
-#ifdef __cplusplus
-}
-#endif
ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp (renamed from ggml-cpu-aarch64.c)

The diff for this file is too large to render. See raw diff.
ggml/src/ggml-cpu/ggml-cpu-aarch64.h

@@ -1,32 +1,8 @@
 #pragma once

+#include "ggml-cpu-traits.h"
 #include "ggml.h"

 // GGML internal header

-
-extern "C" {
-#endif
-
-// Quantization
-void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
-
-// GEMV
-void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-
-// GEMM
-void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-
-void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
-enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
-
-#ifdef __cplusplus
-}
-#endif
-
+ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
ggml/src/ggml-cpu/ggml-cpu-hbm.cpp (new file)

@@ -0,0 +1,55 @@
+#ifdef GGML_USE_CPU_HBM
+
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-impl.h"
+
+#include "ggml-cpu-hbm.h"
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_HBM";
+
+    GGML_UNUSED(buft);
+}
+
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                                                           size_t size) {
+    void * ptr;
+    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft              = buft;
+    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface    = */ {
+            /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ nullptr, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ nullptr, // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .context  = */ nullptr,
+    };
+
+    return &ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
ggml/src/ggml-cpu/ggml-cpu-hbm.h (new file)

@@ -0,0 +1,8 @@
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+// GGML CPU internal header
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
ggml/src/ggml-cpu/ggml-cpu-traits.cpp (new file)

@@ -0,0 +1,36 @@
+#include "ggml-cpu-traits.h"
+
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+namespace ggml::cpu {
+tensor_traits::~tensor_traits() {}
+
+extra_buffer_type::~extra_buffer_type() {}
+}  // namespace ggml::cpu
+
+bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra && extra->context) {
+            auto buf_extra     = (ggml::cpu::extra_buffer_type *) extra->context;
+            auto tensor_traits = buf_extra->get_tensor_traits(op);
+            if (tensor_traits && tensor_traits->compute_forward(params, op)) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra && extra->context) {
+            auto buf_extra     = (ggml::cpu::extra_buffer_type *) extra->context;
+            auto tensor_traits = buf_extra->get_tensor_traits(op);
+            if (tensor_traits && tensor_traits->work_size(n_threads, op, *size)) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
ggml/src/ggml-cpu/ggml-cpu-traits.h (new file)

@@ -0,0 +1,38 @@
+#pragma once
+#include "ggml-backend-impl.h"
+#include "ggml-cpu-impl.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+#    include <vector>
+extern "C" {
+#endif
+
+// return true if op part of extra "accelerator"
+bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op);
+bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size);
+
+#ifdef __cplusplus
+}
+
+namespace ggml::cpu {
+// register in tensor->extra
+class tensor_traits {
+  public:
+    virtual ~tensor_traits();
+    virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
+    virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
+};
+
+class extra_buffer_type {
+  public:
+    virtual ~extra_buffer_type();
+    virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
+    virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
+};
+}  // namespace ggml::cpu
+
+// implemented in ggml-cpu.cpp.
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
+
+#endif
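ggml_cpu_extra_compute_forward and ggml_cpu_extra_work_size (implemented in ggml-cpu-traits.cpp above) are the C entry points that the generic CPU dispatch is expected to consult before falling back to its built-in kernels. The following is only a hedged sketch of that call pattern, with the surrounding function abbreviated; it is not the actual ggml-cpu.c code:

    #include "ggml-cpu-traits.h" // ggml_cpu_extra_compute_forward

    // Sketch of the dispatch pattern: let a registered extra buffer type
    // (AMX, aarch64 runtime repack, ...) claim the op first.
    static void compute_forward_sketch(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
        if (ggml_cpu_extra_compute_forward(params, tensor)) {
            return; // an accelerator handled this op
        }
        // ... regular type_traits_cpu based dispatch ...
    }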
@@ -3,7 +3,7 @@
|
|
| 3 |
|
| 4 |
#include "ggml-backend-impl.h"
|
| 5 |
#include "ggml-backend.h"
|
| 6 |
-
#include "ggml-cpu-
|
| 7 |
#include "ggml-cpu-impl.h"
|
| 8 |
#include "ggml-cpu.h"
|
| 9 |
#include "ggml-impl.h"
|
|
@@ -224,10 +224,6 @@ typedef void * thread_ret_t;
|
|
| 224 |
|
| 225 |
typedef pthread_t ggml_thread_t;
|
| 226 |
|
| 227 |
-
#ifdef GGML_USE_CPU_HBM
|
| 228 |
-
#include <hbwmalloc.h>
|
| 229 |
-
#endif
|
| 230 |
-
|
| 231 |
#if defined(__APPLE__)
|
| 232 |
#include <unistd.h>
|
| 233 |
#include <mach/mach.h>
|
|
@@ -301,7 +297,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
|
| 301 |
},
|
| 302 |
[GGML_TYPE_Q8_0] = {
|
| 303 |
.from_float = quantize_row_q8_0,
|
| 304 |
-
.from_float_to_mat = quantize_mat_q8_0,
|
| 305 |
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
| 306 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 307 |
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
|
@@ -409,33 +404,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
|
| 409 |
.vec_dot_type = GGML_TYPE_BF16,
|
| 410 |
.nrows = 1,
|
| 411 |
},
|
| 412 |
-
[GGML_TYPE_Q4_0_4_4] = {
|
| 413 |
-
.from_float = NULL,
|
| 414 |
-
.vec_dot = NULL,
|
| 415 |
-
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 416 |
-
.nrows = 1,
|
| 417 |
-
.ncols = 4,
|
| 418 |
-
.gemv = ggml_gemv_q4_0_4x4_q8_0,
|
| 419 |
-
.gemm = ggml_gemm_q4_0_4x4_q8_0,
|
| 420 |
-
},
|
| 421 |
-
[GGML_TYPE_Q4_0_4_8] = {
|
| 422 |
-
.from_float = NULL,
|
| 423 |
-
.vec_dot = NULL,
|
| 424 |
-
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 425 |
-
.nrows = 1,
|
| 426 |
-
.ncols = 4,
|
| 427 |
-
.gemv = ggml_gemv_q4_0_4x8_q8_0,
|
| 428 |
-
.gemm = ggml_gemm_q4_0_4x8_q8_0,
|
| 429 |
-
},
|
| 430 |
-
[GGML_TYPE_Q4_0_8_8] = {
|
| 431 |
-
.from_float = NULL,
|
| 432 |
-
.vec_dot = NULL,
|
| 433 |
-
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 434 |
-
.nrows = 1,
|
| 435 |
-
.ncols = 8,
|
| 436 |
-
.gemv = ggml_gemv_q4_0_8x8_q8_0,
|
| 437 |
-
.gemm = ggml_gemm_q4_0_8x8_q8_0,
|
| 438 |
-
},
|
| 439 |
[GGML_TYPE_TQ1_0] = {
|
| 440 |
.from_float = quantize_row_tq1_0,
|
| 441 |
.vec_dot = ggml_vec_dot_tq1_0_q8_K,
|
|
@@ -448,15 +416,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
|
| 448 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 449 |
.nrows = 1,
|
| 450 |
},
|
| 451 |
-
[GGML_TYPE_IQ4_NL_4_4] = {
|
| 452 |
-
.from_float = NULL,
|
| 453 |
-
.vec_dot = NULL,
|
| 454 |
-
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 455 |
-
.nrows = 1,
|
| 456 |
-
.ncols = 4,
|
| 457 |
-
.gemv = ggml_gemv_iq4_nl_4x4_q8_0,
|
| 458 |
-
.gemm = ggml_gemm_iq4_nl_4x4_q8_0,
|
| 459 |
-
},
|
| 460 |
};
|
| 461 |
|
| 462 |
const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
|
|
@@ -4509,9 +4468,6 @@ static void ggml_compute_forward_add(
|
|
| 4509 |
case GGML_TYPE_IQ4_XS:
|
| 4510 |
case GGML_TYPE_IQ3_S:
|
| 4511 |
case GGML_TYPE_IQ2_S:
|
| 4512 |
-
case GGML_TYPE_Q4_0_4_4:
|
| 4513 |
-
case GGML_TYPE_Q4_0_4_8:
|
| 4514 |
-
case GGML_TYPE_Q4_0_8_8:
|
| 4515 |
{
|
| 4516 |
ggml_compute_forward_add_q_f32(params, dst);
|
| 4517 |
} break;
|
|
@@ -4889,9 +4845,6 @@ static void ggml_compute_forward_add1(
|
|
| 4889 |
case GGML_TYPE_IQ4_XS:
|
| 4890 |
case GGML_TYPE_IQ3_S:
|
| 4891 |
case GGML_TYPE_IQ2_S:
|
| 4892 |
-
case GGML_TYPE_Q4_0_4_4:
|
| 4893 |
-
case GGML_TYPE_Q4_0_4_8:
|
| 4894 |
-
case GGML_TYPE_Q4_0_8_8:
|
| 4895 |
{
|
| 4896 |
ggml_compute_forward_add1_q_f32(params, dst);
|
| 4897 |
} break;
|
|
@@ -5019,9 +4972,6 @@ static void ggml_compute_forward_acc(
|
|
| 5019 |
case GGML_TYPE_IQ4_XS:
|
| 5020 |
case GGML_TYPE_IQ3_S:
|
| 5021 |
case GGML_TYPE_IQ2_S:
|
| 5022 |
-
case GGML_TYPE_Q4_0_4_4:
|
| 5023 |
-
case GGML_TYPE_Q4_0_4_8:
|
| 5024 |
-
case GGML_TYPE_Q4_0_8_8:
|
| 5025 |
default:
|
| 5026 |
{
|
| 5027 |
GGML_ABORT("fatal error");
|
|
@@ -7437,27 +7387,9 @@ static void ggml_compute_forward_mul_mat(
|
|
| 7437 |
const int ith = params->ith;
|
| 7438 |
const int nth = params->nth;
|
| 7439 |
|
| 7440 |
-
enum ggml_type
|
| 7441 |
-
|
| 7442 |
-
if (src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
|
| 7443 |
-
type = (enum ggml_type)(intptr_t)src0->extra;
|
| 7444 |
-
}
|
| 7445 |
-
|
| 7446 |
-
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
|
| 7447 |
-
if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
|
| 7448 |
-
ggml_backend_amx_mul_mat(params, dst);
|
| 7449 |
-
return;
|
| 7450 |
-
}
|
| 7451 |
-
#endif
|
| 7452 |
-
|
| 7453 |
-
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
| 7454 |
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
|
| 7455 |
-
|
| 7456 |
-
int64_t const vec_dot_num_rows = type_traits_cpu[type].nrows;
|
| 7457 |
-
int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
|
| 7458 |
-
int64_t const blck_size_interleave = ggml_get_type_traits(type)->blck_size_interleave;
|
| 7459 |
-
ggml_gemv_t const gemv = type_traits_cpu[type].gemv;
|
| 7460 |
-
ggml_gemm_t const gemm = type_traits_cpu[type].gemm;
|
| 7461 |
|
| 7462 |
GGML_ASSERT(ne0 == ne01);
|
| 7463 |
GGML_ASSERT(ne1 == ne11);
|
|
@@ -7465,7 +7397,7 @@ static void ggml_compute_forward_mul_mat(
|
|
| 7465 |
GGML_ASSERT(ne3 == ne13);
|
| 7466 |
|
| 7467 |
// we don't support permuted src0 or src1
|
| 7468 |
-
GGML_ASSERT(nb00 == ggml_type_size(type));
|
| 7469 |
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
|
| 7470 |
|
| 7471 |
// dst cannot be transposed or permuted
|
|
@@ -7477,6 +7409,7 @@ static void ggml_compute_forward_mul_mat(
|
|
| 7477 |
// nb01 >= nb00 - src0 is not transposed
|
| 7478 |
// compute by src0 rows
|
| 7479 |
|
|
|
|
| 7480 |
#if GGML_USE_LLAMAFILE
|
| 7481 |
// broadcast factors
|
| 7482 |
const int64_t r2 = ne12 / ne02;
|
|
@@ -7487,15 +7420,15 @@ static void ggml_compute_forward_mul_mat(
|
|
| 7487 |
if (src1_cont) {
|
| 7488 |
for (int64_t i13 = 0; i13 < ne13; i13++)
|
| 7489 |
for (int64_t i12 = 0; i12 < ne12; i12++)
|
| 7490 |
-
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
|
| 7491 |
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
| 7492 |
-
nb01/ggml_type_size(type),
|
| 7493 |
(const char *)src1->data + i12*nb12 + i13*nb13,
|
| 7494 |
nb11/ggml_type_size(src1->type),
|
| 7495 |
(char *)dst->data + i12*nb2 + i13*nb3,
|
| 7496 |
nb1/ggml_type_size(dst->type),
|
| 7497 |
ith, nth,
|
| 7498 |
-
type,
|
| 7499 |
src1->type,
|
| 7500 |
dst->type))
|
| 7501 |
goto UseGgmlGemm1;
|
|
@@ -7516,19 +7449,10 @@ UseGgmlGemm1:;
|
|
| 7516 |
|
| 7517 |
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
| 7518 |
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
| 7519 |
-
int64_t
|
| 7520 |
-
if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
|
| 7521 |
-
for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
|
| 7522 |
-
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
| 7523 |
-
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
| 7524 |
-
4, ne10, blck_size_interleave);
|
| 7525 |
-
}
|
| 7526 |
-
i11_processed = ne11 - ne11 % 4;
|
| 7527 |
-
}
|
| 7528 |
-
for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
|
| 7529 |
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
| 7530 |
-
|
| 7531 |
-
|
| 7532 |
}
|
| 7533 |
}
|
| 7534 |
}
|
|
@@ -7548,15 +7472,15 @@ UseGgmlGemm1:;

    for (int64_t i13 = 0; i13 < ne13; i13++)
        for (int64_t i12 = 0; i12 < ne12; i12++)
-            if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
                                 (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                nb01/ggml_type_size(type),
                                 (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
                                 row_size/ggml_type_size(vec_dot_type),
                                 (char *)dst->data + i12*nb2 + i13*nb3,
                                 nb1/ggml_type_size(dst->type),
                                 ith, nth,
-                                type,
                                 vec_dot_type,
                                 dst->type))
                goto UseGgmlGemm2;
@@ -7598,28 +7522,6 @@ UseGgmlGemm2:;
    const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
    const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;

-    if ((ggml_n_dims(src0) == 2) && gemv) {
-        const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-        const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11;
-        int64_t src0_start = (ith * ne01) / nth;
-        int64_t src0_end   = ((ith + 1) * ne01) / nth;
-        src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
-        src0_end   = (src0_end   % matmul_num_cols) ? src0_end   + matmul_num_cols - (src0_end   % matmul_num_cols): src0_end;
-        if (src0_start >= src0_end) return;
-
-        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
-        if (gemm && (ne11 > 3)) {
-            gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01,
-                 (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
-        }
-        for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) {
-            gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01,
-                 (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1,
-                 src0_end - src0_start);
-        }
-        return;
-    }
-
    // The first chunk comes from our thread_id, the rest will get auto-assigned.
    int current_chunk = ith;
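For reference, the deleted block above split the src0 rows across threads and then snapped both bounds up to the kernel's column interleave before calling gemv/gemm. A minimal standalone sketch of that rounding, with hypothetical helper names rather than ggml API:

```cpp
// Sketch only: mirrors the src0_start/src0_end arithmetic removed above.
// Each thread gets [start, end) over ne01 rows; both bounds are rounded up to a
// multiple of ncols so a kernel that produces ncols output columns at a time
// never straddles two threads.
#include <cstdint>

static int64_t round_up_to_multiple(int64_t v, int64_t m) {
    return (v % m) ? v + m - (v % m) : v;
}

static void thread_row_range(int ith, int nth, int64_t ne01, int64_t ncols,
                             int64_t & start, int64_t & end) {
    start = round_up_to_multiple(( (int64_t) ith      * ne01) / nth, ncols);
    end   = round_up_to_multiple((((int64_t) ith + 1) * ne01) / nth, ncols);
    // when start >= end the thread simply has no rows to process
}
```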
@@ -7642,7 +7544,7 @@ UseGgmlGemm2:;
        num_rows_per_vec_dot = 1;
    }

-    ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);

    if (nth >= nchunk0 * nchunk1) {
        break;
@@ -7674,8 +7576,6 @@ static void ggml_compute_forward_mul_mat_id(
    ggml_vec_dot_t    const vec_dot      = type_traits_cpu[type].vec_dot;
    enum ggml_type    const vec_dot_type = type_traits_cpu[type].vec_dot_type;
    ggml_from_float_t const from_float   = type_traits_cpu[vec_dot_type].from_float;
-    int64_t           const matmul_num_cols = type_traits_cpu[type].ncols;
-    ggml_gemv_t       const gemv         = type_traits_cpu[type].gemv;

    // we don't support permuted src0 or src1
    GGML_ASSERT(nb00 == ggml_type_size(type));
@@ -7761,34 +7661,6 @@ static void ggml_compute_forward_mul_mat_id(
    const int64_t nr0 = ne01; // src0 rows
    const int64_t nr1 = cne1; // src1 rows

-    if (((ggml_n_dims(src0) - 1) == 2) && gemv) {
-        int64_t src0_cur_start = (ith * ne01) / nth;
-        int64_t src0_cur_end   = ((ith + 1) * ne01) / nth;
-        src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start;
-        src0_cur_end   = (src0_cur_end   % matmul_num_cols) ? src0_cur_end   + matmul_num_cols - (src0_cur_end   % matmul_num_cols): src0_cur_end;
-        if (src0_cur_start >= src0_cur_end) return;
-
-        for (int ir1 = 0; ir1 < nr1; ir1++) {
-            struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
-            const int id = row_mapping.i1; // selected expert index
-
-            const int64_t i11 = id % ne11;
-            const int64_t i12 = row_mapping.i2; // row index in src1
-
-            const int64_t i1 = id;  // selected expert index
-            const int64_t i2 = i12; // row
-
-            const char * src1_col = (const char *) wdata +
-                (src1_cont || src1->type != vec_dot_type
-                    ? (i11 + i12 * ne11) * row_size
-                    : (i11 * nb11 + i12 * nb12));
-
-            gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
-                 (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
-        }
-        continue;
-    }
-
    // distribute the thread work across the inner or outer loop based on which one is larger

    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
@@ -8096,9 +7968,6 @@ static void ggml_compute_forward_out_prod(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
            {
                ggml_compute_forward_out_prod_q_f32(params, dst);
            } break;
@@ -8361,9 +8230,6 @@ static void ggml_compute_forward_set(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
        default:
            {
                GGML_ABORT("fatal error");
@@ -8625,9 +8491,6 @@ static void ggml_compute_forward_get_rows(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
            {
                ggml_compute_forward_get_rows_q(params, dst);
            } break;
@@ -9217,10 +9080,6 @@ static void ggml_compute_forward_clamp(
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_Q8_K:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
-        case GGML_TYPE_IQ4_NL_4_4:
        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
        case GGML_TYPE_I32:
@@ -12426,6 +12285,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
        return;
    }

    switch (tensor->op) {
        case GGML_OP_DUP:
            {
@@ -13373,146 +13235,142 @@ struct ggml_cplan ggml_graph_plan(

        size_t cur = 0;

-        switch (node->op) {
-            case GGML_OP_CPY:
-            case GGML_OP_DUP:
-                {
-                    if (ggml_is_quantized(node->type) ||
-                        // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
-                        (node->src[0]->type == GGML_TYPE_F16  && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
-                        (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
                        cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
-                    }

-                    }
-                } break;
-            case GGML_OP_MUL_MAT_ID:
-                {
-                    cur = 0;
-                    const struct ggml_tensor * src0 = node->src[0];
-                    const struct ggml_tensor * src1 = node->src[1];
-                    const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
-                    if (src1->type != vec_dot_type) {
-                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
-                    }
-                    const int n_as = src0->ne[2];
-                    cur += GGML_PAD(cur, sizeof(int64_t));       // align
-                    cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                    cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
-                } break;
-            case GGML_OP_OUT_PROD:
-                {
-                    if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
-                    }
-                } break;
-            case GGML_OP_SOFT_MAX:
-            case GGML_OP_ROPE:
-                {
-                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
-                } break;
-            case GGML_OP_CONV_TRANSPOSE_1D:
-                {
-                    GGML_ASSERT(node->src[0]->ne[3] == 1);
-                    GGML_ASSERT(node->src[1]->ne[2] == 1);
-                    GGML_ASSERT(node->src[1]->ne[3] == 1);
-
-                    const int64_t ne00 = node->src[0]->ne[0];  // K
-                    const int64_t ne01 = node->src[0]->ne[1];  // Cout
-                    const int64_t ne02 = node->src[0]->ne[2];  // Cin
-
-                    const int64_t ne10 = node->src[1]->ne[0];  // L
-                    const int64_t ne11 = node->src[1]->ne[1];  // Cin
-
-                    if ((node->src[0]->type == GGML_TYPE_F16 ||
-                         node->src[0]->type == GGML_TYPE_BF16) &&
-                        node->src[1]->type == GGML_TYPE_F32) {
-                        cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
-                        cur += sizeof(ggml_fp16_t)*ne10*ne11;
-                    } else if (node->src[0]->type == GGML_TYPE_F32 &&
-                               node->src[1]->type == GGML_TYPE_F32) {
-                        cur += sizeof(float)*ne00*ne01*ne02;
-                        cur += sizeof(float)*ne10*ne11;
-                    } else {
-                        GGML_ABORT("fatal error");
-                    }
-                } break;
-            case GGML_OP_CONV_TRANSPOSE_2D:
-                {
-                    const int64_t ne00 = node->src[0]->ne[0]; // W
-                    const int64_t ne01 = node->src[0]->ne[1]; // H
-                    const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
-                    const int64_t ne03 = node->src[0]->ne[3]; // Channels In
-
-                    const int64_t ne10 = node->src[1]->ne[0]; // W
-                    const int64_t ne11 = node->src[1]->ne[1]; // H
-                    const int64_t ne12 = node->src[1]->ne[2]; // Channels In
-
-                    cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
-                    cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
-                } break;
-            case GGML_OP_FLASH_ATTN_EXT:
-                {
-                    const int64_t ne00 = node->src[0]->ne[0]; // D
-
-                    cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
-                } break;
-            case GGML_OP_FLASH_ATTN_BACK:
-                {
-                    const int64_t    D = node->src[0]->ne[0];
-                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
-                    const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
-                    if (node->src[1]->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_BF16) {
-                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
-                    }
-                } break;
-
-            case GGML_OP_CROSS_ENTROPY_LOSS:
-                {
-                    cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
-                } break;
-            case GGML_OP_COUNT:
-                {
-                    GGML_ABORT("fatal error");
-                }
-            default:
-                break;
-        }
    }

    work_size = MAX(work_size, cur);
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
+#include "ggml-cpu-traits.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-impl.h"

 typedef pthread_t ggml_thread_t;

 #if defined(__APPLE__)
 #include <unistd.h>
 #include <mach/mach.h>

    },
    [GGML_TYPE_Q8_0] = {
        .from_float   = quantize_row_q8_0,
        .vec_dot      = ggml_vec_dot_q8_0_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
#if defined (__ARM_FEATURE_MATMUL_INT8)

        .vec_dot_type = GGML_TYPE_BF16,
        .nrows        = 1,
    },
    [GGML_TYPE_TQ1_0] = {
        .from_float   = quantize_row_tq1_0,
        .vec_dot      = ggml_vec_dot_tq1_0_q8_K,

        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
    },
};

const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
            {
                ggml_compute_forward_add_q_f32(params, dst);
            } break;

        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
            {
                ggml_compute_forward_add1_q_f32(params, dst);
            } break;

        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
        default:
            {
                GGML_ABORT("fatal error");
    const int ith = params->ith;
    const int nth = params->nth;

+    enum ggml_type    const vec_dot_type     = type_traits_cpu[src0->type].vec_dot_type;
    ggml_from_float_t const from_float        = type_traits_cpu[vec_dot_type].from_float;
+    int64_t           const vec_dot_num_rows = type_traits_cpu[src0->type].nrows;

    GGML_ASSERT(ne0 == ne01);
    GGML_ASSERT(ne1 == ne11);

    GGML_ASSERT(ne3 == ne13);

    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(src0->type));
    GGML_ASSERT(nb10 == ggml_type_size(src1->type));

    // dst cannot be transposed or permuted

    // nb01 >= nb00 - src0 is not transposed
    // compute by src0 rows

+    // TODO: extract to "extra_op"
#if GGML_USE_LLAMAFILE
    // broadcast factors
    const int64_t r2 = ne12 / ne02;

    if (src1_cont) {
        for (int64_t i13 = 0; i13 < ne13; i13++)
            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                    nb01/ggml_type_size(src0->type),
                                     (const char *)src1->data + i12*nb12 + i13*nb13,
                                     nb11/ggml_type_size(src1->type),
                                     (char *)dst->data + i12*nb2 + i13*nb3,
                                     nb1/ggml_type_size(dst->type),
                                     ith, nth,
+                                    src0->type,
                                     src1->type,
                                     dst->type))
                    goto UseGgmlGemm1;

    for (int64_t i13 = 0; i13 < ne13; ++i13) {
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
+            for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
                from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+                           (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+                           ne10);
            }
        }
    }

    for (int64_t i13 = 0; i13 < ne13; i13++)
        for (int64_t i12 = 0; i12 < ne12; i12++)
+            if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
                                 (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                nb01/ggml_type_size(src0->type),
                                 (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
                                 row_size/ggml_type_size(vec_dot_type),
                                 (char *)dst->data + i12*nb2 + i13*nb3,
                                 nb1/ggml_type_size(dst->type),
                                 ith, nth,
+                                src0->type,
                                 vec_dot_type,
                                 dst->type))
                goto UseGgmlGemm2;

    const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
    const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;

    // The first chunk comes from our thread_id, the rest will get auto-assigned.
    int current_chunk = ith;

        num_rows_per_vec_dot = 1;
    }

+    ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);

    if (nth >= nchunk0 * nchunk1) {
        break;
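The conversion loop above writes each src1 row into the shared scratch buffer as one row of `vec_dot_type`; the strides `nbw1`/`nbw2`/`nbw3` it uses are defined just above this hunk and are not part of the diff. A sketch of the assumed layout:

```cpp
#include "ggml.h"

// Sketch (not taken from the diff): dense row-major packing of the converted
// src1 data inside params->wdata, one vec_dot_type row per src1 row.
static void mul_mat_wdata_strides(enum ggml_type vec_dot_type,
                                  int64_t ne10, int64_t ne11, int64_t ne12,
                                  size_t & nbw1, size_t & nbw2, size_t & nbw3) {
    nbw1 = ggml_row_size(vec_dot_type, ne10); // one converted row
    nbw2 = nbw1 * ne11;                       // one src1 matrix
    nbw3 = nbw2 * ne12;                       // one batch of matrices
}
```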
    ggml_vec_dot_t    const vec_dot      = type_traits_cpu[type].vec_dot;
    enum ggml_type    const vec_dot_type = type_traits_cpu[type].vec_dot_type;
    ggml_from_float_t const from_float   = type_traits_cpu[vec_dot_type].from_float;

    // we don't support permuted src0 or src1
    GGML_ASSERT(nb00 == ggml_type_size(type));

    const int64_t nr0 = ne01; // src0 rows
    const int64_t nr1 = cne1; // src1 rows

    // distribute the thread work across the inner or outer loop based on which one is larger

    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
            {
                ggml_compute_forward_out_prod_q_f32(params, dst);
            } break;

        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
        default:
            {
                GGML_ABORT("fatal error");

        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
            {
                ggml_compute_forward_get_rows_q(params, dst);
            } break;

        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_Q8_K:
        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
        case GGML_TYPE_I32:

        return;
    }

+    // extra_buffer op?
+    if (ggml_cpu_extra_compute_forward(params, tensor)) return;
+
    switch (tensor->op) {
        case GGML_OP_DUP:
            {
        size_t cur = 0;

+        if (!ggml_cpu_extra_work_size(n_threads, node, &cur)) {
+
+            switch (node->op) {
+                case GGML_OP_CPY:
+                case GGML_OP_DUP:
+                    {
+                        if (ggml_is_quantized(node->type) ||
+                            // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
+                            (node->src[0]->type == GGML_TYPE_F16  && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
+                            (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
+                            cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+                        }
+                    } break;
+                case GGML_OP_ADD:
+                case GGML_OP_ADD1:
+                    {
+                        if (ggml_is_quantized(node->src[0]->type)) {
+                            cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
+                        }
+                    } break;
+                case GGML_OP_ACC:
+                    {
+                        if (ggml_is_quantized(node->src[0]->type)) {
+                            cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
+                        }
+                    } break;
+                case GGML_OP_COUNT_EQUAL:
+                    {
+                        cur = ggml_type_size(node->type)*n_tasks;
+                    } break;
+                case GGML_OP_MUL_MAT:
+                    {
+                        const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
+
+                        if (node->src[1]->type != vec_dot_type) {
+                            cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                        }
+                    } break;
+                case GGML_OP_MUL_MAT_ID:
+                    {
+                        cur = 0;
+                        const struct ggml_tensor * src0 = node->src[0];
+                        const struct ggml_tensor * src1 = node->src[1];
+                        const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
+                        if (src1->type != vec_dot_type) {
+                            cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
+                        }
+                        const int n_as = src0->ne[2];
+                        cur += GGML_PAD(cur, sizeof(int64_t));       // align
+                        cur += n_as * sizeof(int64_t);               // matrix_row_counts
+                        cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+                    } break;
+                case GGML_OP_OUT_PROD:
+                    {
+                        if (ggml_is_quantized(node->src[0]->type)) {
+                            cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
+                        }
+                    } break;
+                case GGML_OP_SOFT_MAX:
+                case GGML_OP_ROPE:
+                    {
                        cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+                    } break;
+                case GGML_OP_CONV_TRANSPOSE_1D:
+                    {
+                        GGML_ASSERT(node->src[0]->ne[3] == 1);
+                        GGML_ASSERT(node->src[1]->ne[2] == 1);
+                        GGML_ASSERT(node->src[1]->ne[3] == 1);
+
+                        const int64_t ne00 = node->src[0]->ne[0];  // K
+                        const int64_t ne01 = node->src[0]->ne[1];  // Cout
+                        const int64_t ne02 = node->src[0]->ne[2];  // Cin
+                        const int64_t ne10 = node->src[1]->ne[0];  // L
+                        const int64_t ne11 = node->src[1]->ne[1];  // Cin
+
+                        if ((node->src[0]->type == GGML_TYPE_F16 ||
+                             node->src[0]->type == GGML_TYPE_BF16) &&
+                            node->src[1]->type == GGML_TYPE_F32) {
+                            cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
+                            cur += sizeof(ggml_fp16_t)*ne10*ne11;
+                        } else if (node->src[0]->type == GGML_TYPE_F32 &&
+                                   node->src[1]->type == GGML_TYPE_F32) {
+                            cur += sizeof(float)*ne00*ne01*ne02;
+                            cur += sizeof(float)*ne10*ne11;
+                        } else {
+                            GGML_ABORT("fatal error");
+                        }
+                    } break;
+                case GGML_OP_CONV_TRANSPOSE_2D:
+                    {
+                        const int64_t ne00 = node->src[0]->ne[0]; // W
+                        const int64_t ne01 = node->src[0]->ne[1]; // H
+                        const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
+                        const int64_t ne03 = node->src[0]->ne[3]; // Channels In

+                        const int64_t ne10 = node->src[1]->ne[0]; // W
+                        const int64_t ne11 = node->src[1]->ne[1]; // H
+                        const int64_t ne12 = node->src[1]->ne[2]; // Channels In

+                        cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
+                        cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
+                    } break;
+                case GGML_OP_FLASH_ATTN_EXT:
+                    {
+                        const int64_t ne00 = node->src[0]->ne[0]; // D

+                        cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
+                    } break;
+                case GGML_OP_FLASH_ATTN_BACK:
+                    {
+                        const int64_t    D = node->src[0]->ne[0];
+                        const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
+                        const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
+                        if (node->src[1]->type == GGML_TYPE_F32) {
+                            cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                            cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+                        } else if (node->src[1]->type == GGML_TYPE_F16) {
+                            cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                            cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+                        } else if (node->src[1]->type == GGML_TYPE_BF16) {
+                            cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                            cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+                        }
+                    } break;
+
+                case GGML_OP_CROSS_ENTROPY_LOSS:
+                    {
+                        cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
+                    } break;
+                case GGML_OP_COUNT:
+                    {
+                        GGML_ABORT("fatal error");
+                    }
+                default:
+                    break;
+            }
        }

        work_size = MAX(work_size, cur);
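Both hooks used above, `ggml_cpu_extra_compute_forward` during graph execution and `ggml_cpu_extra_work_size` during planning, are implemented in the new ggml-cpu-traits.cpp, which is not shown in this excerpt. A rough sketch of the intended shape, with the `ggml::cpu` interfaces written out as assumptions rather than copied from the diff:

```cpp
// Assumed interfaces -- the real declarations live in ggml-cpu-traits.h.
#include "ggml-backend-impl.h"
#include "ggml-cpu.h"
#include <vector>

namespace ggml::cpu {
    class tensor_traits {
      public:
        virtual ~tensor_traits() = default;
        virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
        virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
    };

    class extra_buffer_type {
      public:
        virtual ~extra_buffer_type() = default;
        virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
        virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
    };
}

// The two C entry points then walk the registered extra buffer types and let the
// first one that claims the op provide the work size / the implementation.
bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
        if (extra && extra->context) {
            auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
            auto traits    = buf_extra->get_tensor_traits(op);
            if (traits && traits->compute_forward(params, op)) {
                return true; // handled by an extra buffer (amx, aarch64 repack, ...)
            }
        }
    }
    return false; // fall back to the generic CPU implementation
}

bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
        if (extra && extra->context) {
            auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
            auto traits    = buf_extra->get_tensor_traits(op);
            if (traits && traits->work_size(n_threads, op, *size)) {
                return true;
            }
        }
    }
    return false;
}
```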
@@ -2,12 +2,18 @@
 #include "ggml-backend-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-cpu-aarch64.h"
 #include "ggml-impl.h"
 #include "amx/amx.h"
 #include <cctype>
 #include <string>
 #include <vector>

 #if defined(__APPLE__)
 #include <sys/types.h>
 #include <sys/sysctl.h>
@@ -23,115 +29,7 @@

 // ggml-backend interface

-#ifdef GGML_USE_CPU_HBM
-
-// buffer type HBM
-
-#include <hbwmalloc.h>
-
-static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "CPU_HBM";
-
-    GGML_UNUSED(buft);
-}
-
-static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    hbw_free(buffer->context);
-}
-
-static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr;
-    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
-    if (result != 0) {
-        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
-        return NULL;
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
-        },
-        /* .context  = */ NULL,
-    };
-
-    return &ggml_backend_cpu_buffer_type_hbm;
-}
-#endif
-
-// buffer type AARCH64
-
-static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset == 0);
-    GGML_ASSERT(size == ggml_nbytes(tensor));
-
-    enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra;
-
-    ggml_aarch64_repack_tensor(tensor, repack_type, data, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "CPU_AARCH64";
-
-    GGML_UNUSED(buft);
-}
-
-static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    auto * buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-
-    if (buffer == NULL) {
-        return NULL;
-    }
-
-    buffer->buft = buft;
-    buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
-    buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_cpu_aarch64_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .is_host          = */ NULL,
-        },
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_cpu_buffer_type_aarch64;
-}
-
-bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) {
-    return buft == ggml_backend_cpu_aarch64_buffer_type();
-}
-
-static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
        std::vector<ggml_backend_buffer_type_t> bufts;
@@ -152,11 +50,22 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
        return bufts;
    }();

-    return bufts.data();

    GGML_UNUSED(device);
}

// CPU backend - backend (stream)

struct ggml_backend_cpu_context {
@@ -465,25 +374,19 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
                return true;
            }
        }

-    }
-    for (int i = 1; i < GGML_MAX_SRC; i++) {
-        if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) {
-            return false;
-        }
-    }
-#endif
-
-    for (int i = 1; i < GGML_MAX_SRC; i++) {
-        if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
            return false;
        }
    }
@@ -506,19 +409,10 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
        default:
            return true;
    }
-
-    GGML_UNUSED(dev);
}

static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
-
-#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
-    supported = supported || ggml_backend_amx_buft_is_amx(buft);
-#endif
-
-    return supported;
-
    GGML_UNUSED(dev);
}
@@ -666,10 +560,12 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg, size_t * count) {

static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
-        return (void *)ggml_backend_cpu_set_n_threads;
    }
    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
-        return (void *)ggml_backend_cpu_get_extra_bufts;
    }
    if (strcmp(name, "ggml_backend_get_features") == 0) {
        return (void *)ggml_backend_cpu_get_features;
 #include "ggml-backend-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-cpu-aarch64.h"
+#include "ggml-cpu-traits.h"
 #include "ggml-impl.h"
 #include "amx/amx.h"
+
 #include <cctype>
 #include <string>
 #include <vector>

+#ifdef GGML_USE_CPU_HBM
+#include "ggml-cpu-hbm.h"
+#endif
+
 #if defined(__APPLE__)
 #include <sys/types.h>
 #include <sys/sysctl.h>

// ggml-backend interface

+std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
        std::vector<ggml_backend_buffer_type_t> bufts;

        return bufts;
    }();

+    return bufts;
+}
+
+static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
+    return ggml_backend_cpu_get_extra_buffers_type().data();

    GGML_UNUSED(device);
}

+static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra && extra == buft) return true;
+    }
+    return false;
+}
+
// CPU backend - backend (stream)

struct ggml_backend_cpu_context {

                return true;
            }

+            // extra_buffer_op?
+            for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+                if (extra) {
+                    auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
+                    if (buf_extra && buf_extra->supports_op(dev, op)) {
+                        return true;
+                    }
                }
            }

+            // the other case need host buffer.
+            for (int i = 0; i < GGML_MAX_SRC; i++) {
+                if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
                    return false;
                }
            }

        default:
            return true;
    }
}

static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_is_extra_buffer_type(buft);
    GGML_UNUSED(dev);
}

static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
+        ggml_backend_set_n_threads_t fct = ggml_backend_cpu_set_n_threads;
+        return (void *)fct;
    }
    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
+        ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_cpu_device_get_extra_buffers_type;
+        return (void *)fct;
    }
    if (strcmp(name, "ggml_backend_get_features") == 0) {
        return (void *)ggml_backend_cpu_get_features;
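The body of the lambda that fills `bufts` is collapsed in this view. Under the build guards that appear elsewhere in the diff it plausibly collects one buffer type per optional acceleration path; a hedged sketch, with ordering and guards assumed rather than taken from the diff:

```cpp
#include "ggml-backend.h"
#include <vector>

// Sketch only: the actual lambda body in ggml-cpu.cpp is collapsed above.
static std::vector<ggml_backend_buffer_type_t> collect_extra_bufts() {
    std::vector<ggml_backend_buffer_type_t> bufts;

#ifdef GGML_USE_CPU_HBM
    bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
#endif

#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
    if (ggml_backend_amx_buffer_type()) {
        bufts.push_back(ggml_backend_amx_buffer_type());
    }
#endif

    if (ggml_backend_cpu_aarch64_buffer_type()) {
        bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
    }

    bufts.push_back(nullptr); // the array handed out via ggml_backend_dev_get_extra_bufts is NULL-terminated
    return bufts;
}
```

Whatever is pushed here is exactly what `ggml_backend_cpu_is_extra_buffer_type` and the `supports_op` loop above iterate over.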
@@ -3210,7 +3210,7 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
 static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
     /* .get_name          = */ ggml_backend_cuda_reg_get_name,
     /* .get_device_count  = */ ggml_backend_cuda_reg_get_device_count,
-    /* .
+    /* .get_device        = */ ggml_backend_cuda_reg_get_device,
     /* .get_proc_address  = */ ggml_backend_cuda_reg_get_proc_address,
 };
@@ -5220,15 +5220,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
        {
            VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
        } break;
-    case GGML_TYPE_Q4_0_4_4:
-    case GGML_TYPE_Q4_0_4_8:
-        {
-            VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
-        } break;
-    case GGML_TYPE_Q4_0_8_8:
-        {
-            VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
-        } break;

    case GGML_TYPE_I8:
    case GGML_TYPE_I16:
@@ -4630,7 +4630,7 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
 static const ggml_backend_reg_i ggml_backend_sycl_reg_interface = {
     /* .get_name          = */ ggml_backend_sycl_reg_get_name,
     /* .get_device_count  = */ ggml_backend_sycl_reg_get_device_count,
-    /* .
+    /* .get_device        = */ ggml_backend_sycl_reg_get_device,
     /* .get_proc_address  = */ ggml_backend_sycl_reg_get_proc_address,
 };
@@ -8,7 +8,10 @@

 // FIXME: required here for quantization functions
 #include "ggml-quants.h"
-

 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -788,32 +791,23 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
        .to_float                 = (ggml_to_float_t) ggml_bf16_to_fp32_row,
        .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
    },
-    [GGML_TYPE_Q4_0_4_4] = {
-        .type_name                = "q4_0_4x4",
-        .blck_size                = QK4_0,
-        .blck_size_interleave     = 4,
-        .type_size                = sizeof(block_q4_0),
-        .is_quantized             = true,
-        .to_float                 = NULL,
-        .from_float_ref           = NULL,
    },
-    [GGML_TYPE_Q4_0_4_8] = {
-        .type_name                = "q4_0_4x8",
-        .blck_size                = QK4_0,
-        .blck_size_interleave     = 8,
-        .type_size                = sizeof(block_q4_0),
-        .is_quantized             = true,
-        .to_float                 = NULL,
-        .from_float_ref           = NULL,
    },
-    [GGML_TYPE_Q4_0_8_8] = {
-        .type_name                = "q4_0_8x8",
-        .blck_size                = QK4_0,
-        .blck_size_interleave     = 8,
-        .type_size                = sizeof(block_q4_0),
-        .is_quantized             = true,
-        .to_float                 = NULL,
-        .from_float_ref           = NULL,
    },
    [GGML_TYPE_TQ1_0] = {
        .type_name                = "tq1_0",
@@ -831,14 +825,23 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
        .to_float                 = (ggml_to_float_t) dequantize_row_tq2_0,
        .from_float_ref           = (ggml_from_float_t) quantize_row_tq2_0_ref,
    },
-    [GGML_TYPE_IQ4_NL_4_4] = {
-        .type_name                = "iq4_nl_4x4",
-        .blck_size                = QK4_NL,
-        .blck_size_interleave     = 4,
-        .type_size                = sizeof(block_iq4_nl),
-        .is_quantized             = true,
-        .to_float                 = NULL,
-        .from_float_ref           = NULL,
    },
};
@@ -1270,9 +1273,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
        case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
        case GGML_FTYPE_MOSTLY_IQ3_S:  wtype = GGML_TYPE_IQ3_S;  break;
        case GGML_FTYPE_MOSTLY_IQ2_S:  wtype = GGML_TYPE_IQ2_S;  break;
-        case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
-        case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
-        case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
    }
@@ -6304,9 +6304,6 @@ size_t ggml_quantize_chunk(
        case GGML_TYPE_IQ1_M:    result = quantize_iq1_m   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_NL:   result = quantize_iq4_nl  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_XS:   result = quantize_iq4_xs  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_F16:
            {
                size_t elemsize = sizeof(ggml_fp16_t);
@@ -6838,7 +6835,16 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
                   (int64_t) info->ne[2] *
                   (int64_t) info->ne[3];

-        if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) {
            fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
                    __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
            fclose(file);
 // FIXME: required here for quantization functions
 #include "ggml-quants.h"
+
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif

 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW

        .to_float                 = (ggml_to_float_t) ggml_bf16_to_fp32_row,
        .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
    },
+    [31] = { // GGML_TYPE_Q4_0_4_4
+        .type_name                = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
    },
+    [32] = { // GGML_TYPE_Q4_0_4_8
+        .type_name                = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
    },
+    [33] = { // GGML_TYPE_Q4_0_8_8
+        .type_name                = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
    },
    [GGML_TYPE_TQ1_0] = {
        .type_name                = "tq1_0",

        .to_float                 = (ggml_to_float_t) dequantize_row_tq2_0,
        .from_float_ref           = (ggml_from_float_t) quantize_row_tq2_0_ref,
    },
+    [36] = { // GGML_TYPE_IQ4_NL_4_4
+        .type_name                = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [37] = { // GGML_TYPE_IQ4_NL_4_8
+        .type_name                = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [38] = { // GGML_TYPE_IQ4_NL_8_8
+        .type_name                = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
    },
};

        case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
        case GGML_FTYPE_MOSTLY_IQ3_S:  wtype = GGML_TYPE_IQ3_S;  break;
        case GGML_FTYPE_MOSTLY_IQ2_S:  wtype = GGML_TYPE_IQ2_S;  break;
        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
    }

        case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_F16:
            {
                size_t elemsize = sizeof(ggml_fp16_t);

                   (int64_t) info->ne[2] *
                   (int64_t) info->ne[3];

+        if (ggml_blck_size(info->type) == 0 ) {
+            // this tensor type support have been removed:
+            fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
+                    __func__, info->name.data, (int) info->type, ggml_type_name(info->type));
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+
+        if (ne % ggml_blck_size(info->type) != 0) {
            fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
                    __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
            fclose(file);
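One consequence of the stub entries and the new check in gguf_init_from_file: a removed on-disk type can be recognized purely from its block size. A minimal illustration, not part of the diff:

```cpp
#include <cstdio>
#include "ggml.h"

// A type whose traits were replaced by a "REMOVED" stub reports blck_size == 0,
// so loaders can reject such tensors before trying to use them.
static bool ggml_type_is_removed(enum ggml_type type) {
    return ggml_blck_size(type) == 0;
}

int main() {
    std::printf("Q4_0 removed? %s\n", ggml_type_is_removed(GGML_TYPE_Q4_0) ? "yes" : "no"); // prints "no"
    return 0;
}
```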