Djip007 ggerganov commited on
Commit
163128e
·
1 Parent(s): 9a4de04

ggml : refactor online repacking (llama/10446)

Browse files

* rename ggml-cpu-aarch64.c to .cpp

* reformat extra cpu backend.

- clean Q4_0_N_M and IQ4_0_N_M
- remove from "file" tensor type
- allow only with dynamic repack

- extract cpu extra bufts and convert to C++
- hbm
- "aarch64"

- more generic use of extra buffer
- generalise extra_supports_op
- new API for "cpu-accel":
- amx
- aarch64

* clang-format

* Clean Q4_0_N_M ref

Enable restrict on C++

* add op GGML_OP_MUL_MAT_ID for Q4_0_N_M with runtime repack

* added/corrected control on tensor size for Q4 repacking.

* Update ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp

Co-authored-by: Georgi Gerganov <[email protected]>

* Update ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp

Co-authored-by: Georgi Gerganov <[email protected]>

* add debug logs on repacks.

---------

Co-authored-by: Georgi Gerganov <[email protected]>

ggml/include/ggml-cpu.h CHANGED
@@ -103,24 +103,14 @@ extern "C" {
103
 
104
  // Internal types and functions exposed for tests and benchmarks
105
 
106
- typedef void (*ggml_from_float_to_mat_t)
107
- (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
108
  typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
109
  const void * GGML_RESTRICT y, size_t by, int nrc);
110
- typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
111
- const void * GGML_RESTRICT y, int nr, int nc);
112
- typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
113
- const void * GGML_RESTRICT y, int nr, int nc);
114
 
115
  struct ggml_type_traits_cpu {
116
  ggml_from_float_t from_float;
117
- ggml_from_float_to_mat_t from_float_to_mat;
118
  ggml_vec_dot_t vec_dot;
119
  enum ggml_type vec_dot_type;
120
  int64_t nrows; // number of rows to process simultaneously
121
- int64_t ncols; // number of columns to process simultaneously
122
- ggml_gemv_t gemv;
123
- ggml_gemm_t gemm;
124
  };
125
 
126
  GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
@@ -140,13 +130,6 @@ extern "C" {
140
 
141
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
142
 
143
- #ifdef GGML_USE_CPU_HBM
144
- GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
145
- #endif
146
-
147
- GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
148
- GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);
149
-
150
  #ifdef __cplusplus
151
  }
152
  #endif
 
103
 
104
  // Internal types and functions exposed for tests and benchmarks
105
 
 
 
106
  typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
107
  const void * GGML_RESTRICT y, size_t by, int nrc);
 
 
 
 
108
 
109
  struct ggml_type_traits_cpu {
110
  ggml_from_float_t from_float;
 
111
  ggml_vec_dot_t vec_dot;
112
  enum ggml_type vec_dot_type;
113
  int64_t nrows; // number of rows to process simultaneously
 
 
 
114
  };
115
 
116
  GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
 
130
 
131
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
132
 
 
 
 
 
 
 
 
133
  #ifdef __cplusplus
134
  }
135
  #endif
ggml/include/ggml.h CHANGED
@@ -384,15 +384,15 @@ extern "C" {
384
  GGML_TYPE_F64 = 28,
385
  GGML_TYPE_IQ1_M = 29,
386
  GGML_TYPE_BF16 = 30,
387
- GGML_TYPE_Q4_0_4_4 = 31,
388
- GGML_TYPE_Q4_0_4_8 = 32,
389
- GGML_TYPE_Q4_0_8_8 = 33,
390
  GGML_TYPE_TQ1_0 = 34,
391
  GGML_TYPE_TQ2_0 = 35,
392
- GGML_TYPE_IQ4_NL_4_4 = 36,
393
  // GGML_TYPE_IQ4_NL_4_8 = 37,
394
  // GGML_TYPE_IQ4_NL_8_8 = 38,
395
- GGML_TYPE_COUNT,
396
  };
397
 
398
  // precision
@@ -433,9 +433,6 @@ extern "C" {
433
  GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
434
  GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
435
  GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
436
- GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
437
- GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
438
- GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
439
  };
440
 
441
  // available tensor operations:
@@ -2205,11 +2202,19 @@ extern "C" {
2205
  GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
2206
  GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
2207
 
2208
- #ifdef __cplusplus
2209
- // restrict not standard in C++
2210
- #define GGML_RESTRICT
 
 
 
 
 
 
 
 
2211
  #else
2212
- #define GGML_RESTRICT restrict
2213
  #endif
2214
  typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
2215
  typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
384
  GGML_TYPE_F64 = 28,
385
  GGML_TYPE_IQ1_M = 29,
386
  GGML_TYPE_BF16 = 30,
387
+ // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
388
+ // GGML_TYPE_Q4_0_4_8 = 32,
389
+ // GGML_TYPE_Q4_0_8_8 = 33,
390
  GGML_TYPE_TQ1_0 = 34,
391
  GGML_TYPE_TQ2_0 = 35,
392
+ // GGML_TYPE_IQ4_NL_4_4 = 36,
393
  // GGML_TYPE_IQ4_NL_4_8 = 37,
394
  // GGML_TYPE_IQ4_NL_8_8 = 38,
395
+ GGML_TYPE_COUNT = 39,
396
  };
397
 
398
  // precision
 
433
  GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
434
  GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
435
  GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
 
 
 
436
  };
437
 
438
  // available tensor operations:
 
2202
  GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
2203
  GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
2204
 
2205
+ #ifdef __cplusplus
2206
+ // restrict not standard in C++
2207
+ # if defined(__GNUC__)
2208
+ # define GGML_RESTRICT __restrict__
2209
+ # elif defined(__clang__)
2210
+ # define GGML_RESTRICT __restrict
2211
+ # elif defined(_MSC_VER)
2212
+ # define GGML_RESTRICT __restrict
2213
+ # else
2214
+ # define GGML_RESTRICT
2215
+ # endif
2216
  #else
2217
+ # define GGML_RESTRICT restrict
2218
  #endif
2219
  typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
2220
  typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
ggml/src/CMakeLists.txt CHANGED
@@ -220,9 +220,7 @@ add_library(ggml-base
220
  ggml-threading.cpp
221
  ggml-threading.h
222
  ggml-quants.c
223
- ggml-quants.h
224
- ggml-aarch64.c
225
- ggml-aarch64.h)
226
 
227
  target_include_directories(ggml-base PRIVATE .)
228
 
 
220
  ggml-threading.cpp
221
  ggml-threading.h
222
  ggml-quants.c
223
+ ggml-quants.h)
 
 
224
 
225
  target_include_directories(ggml-base PRIVATE .)
226
 
ggml/src/ggml-cann/ggml-cann.cpp CHANGED
@@ -2089,7 +2089,7 @@ static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, con
2089
  static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
2090
  /* .get_name = */ ggml_backend_cann_reg_get_name,
2091
  /* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
2092
- /* .get_device_get = */ ggml_backend_cann_reg_get_device,
2093
  /* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
2094
  };
2095
 
 
2089
  static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
2090
  /* .get_name = */ ggml_backend_cann_reg_get_name,
2091
  /* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
2092
+ /* .get_device = */ ggml_backend_cann_reg_get_device,
2093
  /* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
2094
  };
2095
 
ggml/src/ggml-common.h CHANGED
@@ -6,7 +6,20 @@
6
  typedef uint16_t ggml_half;
7
  typedef uint32_t ggml_half2;
8
 
9
- #define GGML_COMMON_AGGR
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  #define GGML_COMMON_DECL
12
  #elif defined(GGML_COMMON_DECL_METAL)
@@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
15
  typedef half ggml_half;
16
  typedef half2 ggml_half2;
17
 
18
- #define GGML_COMMON_AGGR
 
19
 
20
  #define GGML_COMMON_DECL
21
  #elif defined(GGML_COMMON_DECL_CUDA)
@@ -29,7 +43,8 @@ typedef half2 ggml_half2;
29
  typedef half ggml_half;
30
  typedef half2 ggml_half2;
31
 
32
- #define GGML_COMMON_AGGR data
 
33
 
34
  #define GGML_COMMON_DECL
35
  #elif defined(GGML_COMMON_DECL_HIP)
@@ -39,7 +54,8 @@ typedef half2 ggml_half2;
39
  typedef half ggml_half;
40
  typedef half2 ggml_half2;
41
 
42
- #define GGML_COMMON_AGGR data
 
43
 
44
  #define GGML_COMMON_DECL
45
  #elif defined(GGML_COMMON_DECL_SYCL)
@@ -49,7 +65,8 @@ typedef half2 ggml_half2;
49
  typedef sycl::half ggml_half;
50
  typedef sycl::half2 ggml_half2;
51
 
52
- #define GGML_COMMON_AGGR data
 
53
 
54
  #define GGML_COMMON_DECL
55
  #endif
@@ -154,9 +171,9 @@ typedef struct {
154
  struct {
155
  ggml_half d; // delta
156
  ggml_half m; // min
157
- } GGML_COMMON_AGGR;
158
  ggml_half2 dm;
159
- };
160
  uint8_t qs[QK4_1 / 2]; // nibbles / quants
161
  } block_q4_1;
162
  static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
@@ -175,9 +192,9 @@ typedef struct {
175
  struct {
176
  ggml_half d; // delta
177
  ggml_half m; // min
178
- } GGML_COMMON_AGGR;
179
  ggml_half2 dm;
180
- };
181
  uint8_t qh[4]; // 5-th bit of quants
182
  uint8_t qs[QK5_1 / 2]; // nibbles / quants
183
  } block_q5_1;
@@ -196,37 +213,13 @@ typedef struct {
196
  struct {
197
  ggml_half d; // delta
198
  ggml_half s; // d * sum(qs[i])
199
- } GGML_COMMON_AGGR;
200
  ggml_half2 ds;
201
- };
202
  int8_t qs[QK8_1]; // quants
203
  } block_q8_1;
204
  static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
205
 
206
- typedef struct {
207
- ggml_half d[4]; // deltas for 4 q4_0 blocks
208
- uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
209
- } block_q4_0x4;
210
- static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
211
-
212
- typedef struct {
213
- ggml_half d[8]; // deltas for 8 q4_0 blocks
214
- uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
215
- } block_q4_0x8;
216
- static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
217
-
218
- typedef struct {
219
- ggml_half d[4]; // deltas for 4 q8_0 blocks
220
- int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
221
- } block_q8_0x4;
222
- static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
223
-
224
- typedef struct {
225
- ggml_half d[8]; // deltas for 8 q8_0 blocks
226
- int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
227
- } block_q8_0x8;
228
- static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
229
-
230
  //
231
  // Ternary quantization
232
  //
@@ -261,9 +254,9 @@ typedef struct {
261
  struct {
262
  ggml_half d; // super-block scale for quantized scales
263
  ggml_half dmin; // super-block scale for quantized mins
264
- } GGML_COMMON_AGGR;
265
  ggml_half2 dm;
266
- };
267
  } block_q2_K;
268
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
269
 
@@ -288,9 +281,9 @@ typedef struct {
288
  struct {
289
  ggml_half d; // super-block scale for quantized scales
290
  ggml_half dmin; // super-block scale for quantized mins
291
- } GGML_COMMON_AGGR;
292
  ggml_half2 dm;
293
- };
294
  uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
295
  uint8_t qs[QK_K/2]; // 4--bit quants
296
  } block_q4_K;
@@ -305,9 +298,9 @@ typedef struct {
305
  struct {
306
  ggml_half d; // super-block scale for quantized scales
307
  ggml_half dmin; // super-block scale for quantized mins
308
- } GGML_COMMON_AGGR;
309
  ggml_half2 dm;
310
- };
311
  uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
312
  uint8_t qh[QK_K/8]; // quants, high bit
313
  uint8_t qs[QK_K/2]; // quants, low 4 bits
@@ -418,12 +411,6 @@ typedef struct {
418
  } block_iq4_xs;
419
  static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
420
 
421
- typedef struct {
422
- ggml_half d[4]; // deltas for 4 iq4_nl blocks
423
- uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks
424
- } block_iq4_nlx4;
425
- static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
426
-
427
  #endif // GGML_COMMON_DECL
428
  #endif // GGML_COMMON_DECL
429
 
@@ -437,6 +424,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
437
  #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
438
  #define GGML_TABLE_END() };
439
 
 
 
 
 
 
 
 
440
  #define GGML_COMMON_IMPL
441
  #elif defined(GGML_COMMON_IMPL_METAL)
442
  #include <metal_stdlib>
 
6
  typedef uint16_t ggml_half;
7
  typedef uint32_t ggml_half2;
8
 
9
+ #define GGML_COMMON_AGGR_U
10
+ #define GGML_COMMON_AGGR_S
11
+
12
+ #define GGML_COMMON_DECL
13
+ #elif defined(GGML_COMMON_DECL_CPP)
14
+ #include <cstdint>
15
+
16
+ typedef uint16_t ggml_half;
17
+ typedef uint32_t ggml_half2;
18
+
19
+ // std-c++ allow anonymous unions but some compiler warn on it
20
+ #define GGML_COMMON_AGGR_U data
21
+ // std-c++ do not allow it.
22
+ #define GGML_COMMON_AGGR_S data
23
 
24
  #define GGML_COMMON_DECL
25
  #elif defined(GGML_COMMON_DECL_METAL)
 
28
  typedef half ggml_half;
29
  typedef half2 ggml_half2;
30
 
31
+ #define GGML_COMMON_AGGR_U
32
+ #define GGML_COMMON_AGGR_S
33
 
34
  #define GGML_COMMON_DECL
35
  #elif defined(GGML_COMMON_DECL_CUDA)
 
43
  typedef half ggml_half;
44
  typedef half2 ggml_half2;
45
 
46
+ #define GGML_COMMON_AGGR_U
47
+ #define GGML_COMMON_AGGR_S data
48
 
49
  #define GGML_COMMON_DECL
50
  #elif defined(GGML_COMMON_DECL_HIP)
 
54
  typedef half ggml_half;
55
  typedef half2 ggml_half2;
56
 
57
+ #define GGML_COMMON_AGGR_U
58
+ #define GGML_COMMON_AGGR_S data
59
 
60
  #define GGML_COMMON_DECL
61
  #elif defined(GGML_COMMON_DECL_SYCL)
 
65
  typedef sycl::half ggml_half;
66
  typedef sycl::half2 ggml_half2;
67
 
68
+ #define GGML_COMMON_AGGR_U
69
+ #define GGML_COMMON_AGGR_S data
70
 
71
  #define GGML_COMMON_DECL
72
  #endif
 
171
  struct {
172
  ggml_half d; // delta
173
  ggml_half m; // min
174
+ } GGML_COMMON_AGGR_S;
175
  ggml_half2 dm;
176
+ } GGML_COMMON_AGGR_U;
177
  uint8_t qs[QK4_1 / 2]; // nibbles / quants
178
  } block_q4_1;
179
  static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
 
192
  struct {
193
  ggml_half d; // delta
194
  ggml_half m; // min
195
+ } GGML_COMMON_AGGR_S;
196
  ggml_half2 dm;
197
+ } GGML_COMMON_AGGR_U;
198
  uint8_t qh[4]; // 5-th bit of quants
199
  uint8_t qs[QK5_1 / 2]; // nibbles / quants
200
  } block_q5_1;
 
213
  struct {
214
  ggml_half d; // delta
215
  ggml_half s; // d * sum(qs[i])
216
+ } GGML_COMMON_AGGR_S;
217
  ggml_half2 ds;
218
+ } GGML_COMMON_AGGR_U;
219
  int8_t qs[QK8_1]; // quants
220
  } block_q8_1;
221
  static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  //
224
  // Ternary quantization
225
  //
 
254
  struct {
255
  ggml_half d; // super-block scale for quantized scales
256
  ggml_half dmin; // super-block scale for quantized mins
257
+ } GGML_COMMON_AGGR_S;
258
  ggml_half2 dm;
259
+ } GGML_COMMON_AGGR_U;
260
  } block_q2_K;
261
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
262
 
 
281
  struct {
282
  ggml_half d; // super-block scale for quantized scales
283
  ggml_half dmin; // super-block scale for quantized mins
284
+ } GGML_COMMON_AGGR_S;
285
  ggml_half2 dm;
286
+ } GGML_COMMON_AGGR_U;
287
  uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
288
  uint8_t qs[QK_K/2]; // 4--bit quants
289
  } block_q4_K;
 
298
  struct {
299
  ggml_half d; // super-block scale for quantized scales
300
  ggml_half dmin; // super-block scale for quantized mins
301
+ } GGML_COMMON_AGGR_S;
302
  ggml_half2 dm;
303
+ } GGML_COMMON_AGGR_U;
304
  uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
305
  uint8_t qh[QK_K/8]; // quants, high bit
306
  uint8_t qs[QK_K/2]; // quants, low 4 bits
 
411
  } block_iq4_xs;
412
  static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
413
 
 
 
 
 
 
 
414
  #endif // GGML_COMMON_DECL
415
  #endif // GGML_COMMON_DECL
416
 
 
424
  #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
425
  #define GGML_TABLE_END() };
426
 
427
+ #define GGML_COMMON_IMPL
428
+ #elif defined(GGML_COMMON_IMPL_CPP)
429
+ #include <cstdint>
430
+
431
+ #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
432
+ #define GGML_TABLE_END() };
433
+
434
  #define GGML_COMMON_IMPL
435
  #elif defined(GGML_COMMON_IMPL_METAL)
436
  #include <metal_stdlib>
ggml/src/ggml-cpu/CMakeLists.txt CHANGED
@@ -10,10 +10,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
10
  list (APPEND GGML_CPU_SOURCES
11
  ggml-cpu/ggml-cpu.c
12
  ggml-cpu/ggml-cpu.cpp
13
- ggml-cpu/ggml-cpu-aarch64.c
14
  ggml-cpu/ggml-cpu-aarch64.h
 
 
15
  ggml-cpu/ggml-cpu-quants.c
16
  ggml-cpu/ggml-cpu-quants.h
 
 
17
  ggml-cpu/amx/amx.cpp
18
  ggml-cpu/amx/amx.h
19
  ggml-cpu/amx/mmq.cpp
 
10
  list (APPEND GGML_CPU_SOURCES
11
  ggml-cpu/ggml-cpu.c
12
  ggml-cpu/ggml-cpu.cpp
13
+ ggml-cpu/ggml-cpu-aarch64.cpp
14
  ggml-cpu/ggml-cpu-aarch64.h
15
+ ggml-cpu/ggml-cpu-hbm.cpp
16
+ ggml-cpu/ggml-cpu-hbm.h
17
  ggml-cpu/ggml-cpu-quants.c
18
  ggml-cpu/ggml-cpu-quants.h
19
+ ggml-cpu/ggml-cpu-traits.cpp
20
+ ggml-cpu/ggml-cpu-traits.h
21
  ggml-cpu/amx/amx.cpp
22
  ggml-cpu/amx/amx.h
23
  ggml-cpu/amx/mmq.cpp
ggml/src/ggml-cpu/amx/amx.cpp CHANGED
@@ -5,6 +5,7 @@
5
  #include "ggml-backend.h"
6
  #include "ggml-impl.h"
7
  #include "ggml-cpu.h"
 
8
 
9
  #if defined(__gnu_linux__)
10
  #include <sys/syscall.h>
@@ -17,31 +18,65 @@
17
 
18
  #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  // AMX buffer interface
21
  static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
22
  free(buffer->context);
23
  }
24
 
25
  static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
26
- return (void *)(buffer->context);
27
  }
28
 
29
- static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
30
- memset((char *)tensor->data + offset, value, size);
31
 
32
  GGML_UNUSED(buffer);
33
  }
34
 
35
- static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
 
 
 
 
 
 
 
 
36
  if (qtype_has_amx_kernels(tensor->type)) {
 
37
  ggml_backend_amx_convert_weight(tensor, data, offset, size);
38
  } else {
39
- memcpy((char *)tensor->data + offset, data, size);
40
  }
41
 
42
  GGML_UNUSED(buffer);
43
  }
44
 
 
 
45
  static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
46
  GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
47
  memcpy(data, (const char *)tensor->data + offset, size);
@@ -62,6 +97,7 @@ static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
62
 
63
  GGML_UNUSED(buffer);
64
  }
 
65
 
66
  static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
67
  memset(buffer->context, value, buffer->size);
@@ -70,13 +106,13 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
70
  static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
71
  /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
72
  /* .get_base = */ ggml_backend_amx_buffer_get_base,
73
- /* .init_tensor = */ NULL, // no initialization required
74
  /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
75
  /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
76
- /* .get_tensor = */ ggml_backend_amx_buffer_get_tensor,
77
- /* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor,
78
  /* .clear = */ ggml_backend_amx_buffer_clear,
79
- /* .reset = */ NULL,
80
  };
81
 
82
  static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
@@ -101,14 +137,44 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ
101
  GGML_UNUSED(buft);
102
  }
103
 
104
- static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
105
- return ggml_backend_amx_get_alloc_size(tensor);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
- GGML_UNUSED(buft);
108
- }
 
 
 
109
 
110
- static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
111
- return false;
 
 
 
 
 
112
 
113
  GGML_UNUSED(buft);
114
  }
@@ -129,68 +195,26 @@ static bool ggml_amx_init() {
129
  return true;
130
  #endif
131
  }
 
132
  ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
133
  static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
134
  /* .iface = */ {
135
- /* .get_name = */ ggml_backend_amx_buffer_type_get_name,
136
- /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
137
- /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
138
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
139
- /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
140
- /* .is_host = */ ggml_backend_amx_buffer_type_is_host,
141
- },
142
  /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
143
- /* .context = */ NULL,
144
  };
145
 
146
  if (!ggml_amx_init()) {
147
- return NULL;
148
  }
149
 
150
  return &ggml_backend_buffer_type_amx;
151
  }
152
 
153
- bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) {
154
- return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
155
- }
156
-
157
- bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
158
- // handle only 2d gemm for now
159
- auto is_contiguous_2d = [](const struct ggml_tensor * t) {
160
- return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
161
- };
162
-
163
- switch (op->op) {
164
- case GGML_OP_NONE:
165
- case GGML_OP_RESHAPE:
166
- case GGML_OP_VIEW:
167
- case GGML_OP_PERMUTE:
168
- case GGML_OP_TRANSPOSE:
169
- return true;
170
-
171
- case GGML_OP_MUL_MAT: {
172
- const struct ggml_tensor * src0 = op->src[0];
173
- const struct ggml_tensor * src1 = op->src[1];
174
-
175
- const enum ggml_type type = src0->type;
176
- const int64_t ne0 = op->ne[0];
177
-
178
- // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
179
- // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
180
- bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
181
-
182
- bool can_use_amx =
183
- is_contiguous_2d(src0) && // src0 must be contiguous
184
- is_contiguous_2d(src1) && // src1 must be contiguous
185
- src1->type == GGML_TYPE_F32 && // src1 must be float32
186
- has_amx_kernels && // with amx kernel impls
187
- ne0 % (TILE_N * 2) == 0; // out_features is 32x
188
-
189
- return can_use_amx;
190
- }
191
- default:
192
- return false;
193
- }
194
- }
195
-
196
- #endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
 
5
  #include "ggml-backend.h"
6
  #include "ggml-impl.h"
7
  #include "ggml-cpu.h"
8
+ #include "ggml-cpu-traits.h"
9
 
10
  #if defined(__gnu_linux__)
11
  #include <sys/syscall.h>
 
18
 
19
  #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
20
 
21
+ // AMX type_trais
22
+ namespace ggml::cpu::amx {
23
+ class tensor_traits : public ggml::cpu::tensor_traits {
24
+ bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
25
+ size = ggml_backend_amx_desired_wsize(op);
26
+ return true;
27
+ }
28
+
29
+ bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
30
+ if (op->op == GGML_OP_MUL_MAT) {
31
+ ggml_backend_amx_mul_mat(params, op);
32
+ return true;
33
+ }
34
+ return false;
35
+ }
36
+ };
37
+
38
+ static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
39
+ static tensor_traits traits;
40
+ return &traits;
41
+ }
42
+ } // namespace ggml::cpu::amx
43
+
44
  // AMX buffer interface
45
  static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
46
  free(buffer->context);
47
  }
48
 
49
  static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
50
+ return (void *) (buffer->context);
51
  }
52
 
53
+ static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
54
+ tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
55
 
56
  GGML_UNUSED(buffer);
57
  }
58
 
59
+ static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
60
+ uint8_t value, size_t offset, size_t size) {
61
+ memset((char *) tensor->data + offset, value, size);
62
+
63
+ GGML_UNUSED(buffer);
64
+ }
65
+
66
+ static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
67
+ const void * data, size_t offset, size_t size) {
68
  if (qtype_has_amx_kernels(tensor->type)) {
69
+ GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
70
  ggml_backend_amx_convert_weight(tensor, data, offset, size);
71
  } else {
72
+ memcpy((char *) tensor->data + offset, data, size);
73
  }
74
 
75
  GGML_UNUSED(buffer);
76
  }
77
 
78
+ /*
79
+ // need to figure what we need to do with buffer->extra.
80
  static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
81
  GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
82
  memcpy(data, (const char *)tensor->data + offset, size);
 
97
 
98
  GGML_UNUSED(buffer);
99
  }
100
+ */
101
 
102
  static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
103
  memset(buffer->context, value, buffer->size);
 
106
  static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
107
  /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
108
  /* .get_base = */ ggml_backend_amx_buffer_get_base,
109
+ /* .init_tensor = */ ggml_backend_amx_buffer_init_tensor,
110
  /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
111
  /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
112
+ /* .get_tensor = */ nullptr,
113
+ /* .cpy_tensor = */ nullptr,
114
  /* .clear = */ ggml_backend_amx_buffer_clear,
115
+ /* .reset = */ nullptr,
116
  };
117
 
118
  static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
 
137
  GGML_UNUSED(buft);
138
  }
139
 
140
+ namespace ggml::cpu::amx {
141
+ class extra_buffer_type : ggml::cpu::extra_buffer_type {
142
+ bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
143
+ // handle only 2d gemm for now
144
+ auto is_contiguous_2d = [](const struct ggml_tensor * t) {
145
+ return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
146
+ };
147
+
148
+ if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
149
+ is_contiguous_2d(op->src[1]) && // src1 must be contiguous
150
+ op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
151
+ op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
152
+ (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
153
+ // src1 must be host buffer
154
+ if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
155
+ return false;
156
+ }
157
+ // src1 must be float32
158
+ if (op->src[1]->type == GGML_TYPE_F32) {
159
+ return true;
160
+ }
161
+ }
162
+ return false;
163
+ }
164
 
165
+ ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
166
+ if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
167
+ op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) {
168
+ return (ggml::cpu::tensor_traits *) op->src[0]->extra;
169
+ }
170
 
171
+ return nullptr;
172
+ }
173
+ };
174
+ } // namespace ggml::cpu::amx
175
+
176
+ static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
177
+ return ggml_backend_amx_get_alloc_size(tensor);
178
 
179
  GGML_UNUSED(buft);
180
  }
 
195
  return true;
196
  #endif
197
  }
198
+
199
  ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
200
  static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
201
  /* .iface = */ {
202
+ /* .get_name = */ ggml_backend_amx_buffer_type_get_name,
203
+ /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
204
+ /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
205
+ /* .get_max_size = */ nullptr, // defaults to SIZE_MAX
206
+ /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
207
+ /* .is_host = */ nullptr,
208
+ },
209
  /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
210
+ /* .context = */ new ggml::cpu::amx::extra_buffer_type(),
211
  };
212
 
213
  if (!ggml_amx_init()) {
214
+ return nullptr;
215
  }
216
 
217
  return &ggml_backend_buffer_type_amx;
218
  }
219
 
220
+ #endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cpu/amx/amx.h CHANGED
@@ -1,20 +1,8 @@
1
  #include "ggml-backend.h"
2
  #include "ggml-cpu-impl.h"
3
 
4
- #ifdef __cplusplus
5
- extern "C" {
6
- #endif
7
 
8
  #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
9
-
10
  ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
11
- bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);
12
- bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);
13
- void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
14
- size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
15
-
16
- #endif
17
-
18
- #ifdef __cplusplus
19
- }
20
  #endif
 
1
  #include "ggml-backend.h"
2
  #include "ggml-cpu-impl.h"
3
 
4
+ // GGML internal header
 
 
5
 
6
  #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
 
7
  ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
 
 
 
 
 
 
 
 
 
8
  #endif
ggml/src/ggml-cpu/amx/common.h CHANGED
@@ -7,7 +7,7 @@
7
  #include <memory>
8
  #include <type_traits>
9
 
10
- #if defined(_OPENMP)
11
  #include <omp.h>
12
  #endif
13
 
@@ -56,11 +56,11 @@ inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
56
  }
57
 
58
  template <typename func_t>
59
- inline void parallel_for(int nth, int n, const func_t& f) {
60
- #if defined(_OPENMP)
61
- #pragma omp parallel num_threads(nth)
62
  {
63
- //int nth = omp_get_num_threads();
64
  int ith = omp_get_thread_num();
65
  int tbegin, tend;
66
  balance211(n, nth, ith, tbegin, tend);
@@ -68,8 +68,6 @@ inline void parallel_for(int nth, int n, const func_t& f) {
68
  }
69
  #else
70
  f(0, n);
71
-
72
- GGML_UNUSED(nth);
73
  #endif
74
  }
75
 
@@ -91,10 +89,3 @@ inline bool qtype_has_amx_kernels(const enum ggml_type type) {
91
  (type == GGML_TYPE_Q6_K) ||
92
  (type == GGML_TYPE_IQ4_XS);
93
  }
94
-
95
- // ggml backend context
96
- struct ggml_backend_amx_context {
97
- int n_threads = GGML_DEFAULT_N_THREADS;
98
- std::unique_ptr<char[]> work_data;
99
- size_t work_size = 0;
100
- };
 
7
  #include <memory>
8
  #include <type_traits>
9
 
10
+ #if defined(GGML_USE_OPENMP)
11
  #include <omp.h>
12
  #endif
13
 
 
56
  }
57
 
58
  template <typename func_t>
59
+ inline void parallel_for(int n, const func_t& f) {
60
+ #if defined(GGML_USE_OPENMP)
61
+ #pragma omp parallel
62
  {
63
+ int nth = omp_get_num_threads();
64
  int ith = omp_get_thread_num();
65
  int tbegin, tend;
66
  balance211(n, nth, ith, tbegin, tend);
 
68
  }
69
  #else
70
  f(0, n);
 
 
71
  #endif
72
  }
73
 
 
89
  (type == GGML_TYPE_Q6_K) ||
90
  (type == GGML_TYPE_IQ4_XS);
91
  }
 
 
 
 
 
 
 
ggml/src/ggml-cpu/amx/mmq.cpp CHANGED
@@ -18,10 +18,6 @@
18
  #include <unistd.h>
19
  #endif
20
 
21
- #if defined(_OPENMP)
22
- #include <omp.h>
23
- #endif
24
-
25
  #if (defined(_WIN32) || defined(_WIN64))
26
  #define RESTRICT __restrict
27
  #else
@@ -1382,13 +1378,13 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
1382
  #define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size
1383
 
1384
  template<typename TB, int BLOCK_K>
1385
- void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K, int n_threads) {
1386
  const int NB = N / TILE_N;
1387
  const int KB = K / BLOCK_K;
1388
  const int TILE_SIZE = get_tile_size<TB>();
1389
 
1390
  // parallel on NB should be enough
1391
- parallel_for(n_threads, NB, [&](int begin, int end) {
1392
  for (int n = begin; n < end; ++n) {
1393
  for (int k = 0; k < KB; ++k) {
1394
  int n0 = n * TILE_N;
@@ -2334,15 +2330,8 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
2334
  const int K = tensor->ne[0]; // ne0: in_features
2335
  const int N = tensor->ne[1]; // ne1: out_features
2336
 
2337
- #if defined(_OPENMP)
2338
- // the buffer ctx is not initialized when .set_tensor is called
2339
- int n_threads = omp_get_num_threads();
2340
- #else
2341
- int n_threads = 1;
2342
- #endif
2343
-
2344
  GGML_DISPATCH_QTYPES(TYPE, [&] {
2345
- convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K, n_threads);
2346
  });
2347
  }
2348
 
 
18
  #include <unistd.h>
19
  #endif
20
 
 
 
 
 
21
  #if (defined(_WIN32) || defined(_WIN64))
22
  #define RESTRICT __restrict
23
  #else
 
1378
  #define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size
1379
 
1380
  template<typename TB, int BLOCK_K>
1381
+ void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K) {
1382
  const int NB = N / TILE_N;
1383
  const int KB = K / BLOCK_K;
1384
  const int TILE_SIZE = get_tile_size<TB>();
1385
 
1386
  // parallel on NB should be enough
1387
+ parallel_for(NB, [&](int begin, int end) {
1388
  for (int n = begin; n < end; ++n) {
1389
  for (int k = 0; k < KB; ++k) {
1390
  int n0 = n * TILE_N;
 
2330
  const int K = tensor->ne[0]; // ne0: in_features
2331
  const int N = tensor->ne[1]; // ne1: out_features
2332
 
 
 
 
 
 
 
 
2333
  GGML_DISPATCH_QTYPES(TYPE, [&] {
2334
+ convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K);
2335
  });
2336
  }
2337
 
ggml/src/ggml-cpu/amx/mmq.h CHANGED
@@ -1,16 +1,10 @@
1
  #pragma once
2
  #include "common.h"
3
 
4
- #ifdef __cplusplus
5
- extern "C" {
6
- #endif
7
 
8
  size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
9
 
10
  void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
11
 
12
  void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
13
-
14
- #ifdef __cplusplus
15
- }
16
- #endif
 
1
  #pragma once
2
  #include "common.h"
3
 
4
+ size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
 
 
5
 
6
  size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
7
 
8
  void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
9
 
10
  void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
 
 
 
ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp ADDED
The diff for this file is too large to render. See raw diff
 
ggml/src/ggml-cpu/ggml-cpu-aarch64.h CHANGED
@@ -1,32 +1,8 @@
1
  #pragma once
2
 
 
3
  #include "ggml.h"
4
 
5
  // GGML internal header
6
 
7
- #ifdef __cplusplus
8
- extern "C" {
9
- #endif
10
-
11
- // Quantization
12
- void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
13
-
14
- // GEMV
15
- void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
16
- void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
17
- void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
18
- void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
19
-
20
- // GEMM
21
- void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
22
- void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
23
- void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
24
- void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
25
-
26
- void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
27
- enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
28
-
29
- #ifdef __cplusplus
30
- }
31
- #endif
32
-
 
1
  #pragma once
2
 
3
+ #include "ggml-cpu-traits.h"
4
  #include "ggml.h"
5
 
6
  // GGML internal header
7
 
8
+ ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cpu/ggml-cpu-hbm.cpp ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifdef GGML_USE_CPU_HBM
2
+
3
+ #include "ggml-backend.h"
4
+ #include "ggml-backend-impl.h"
5
+ #include "ggml-cpu.h"
6
+ #include "ggml-impl.h"
7
+
8
+ #include "ggml-cpu-hbm.h"
9
+
10
+ // buffer type HBM
11
+
12
+ #include <hbwmalloc.h>
13
+
14
+ static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
15
+ return "CPU_HBM";
16
+
17
+ GGML_UNUSED(buft);
18
+ }
19
+
20
+ static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
21
+ hbw_free(buffer->context);
22
+ }
23
+
24
+ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
25
+ size_t size) {
26
+ void * ptr;
27
+ int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
28
+ if (result != 0) {
29
+ GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
30
+ return NULL;
31
+ }
32
+
33
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
34
+ buffer->buft = buft;
35
+ buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
36
+
37
+ return buffer;
38
+ }
39
+
40
+ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
41
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
42
+ /* .iface = */ {
43
+ /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
44
+ /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
45
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
46
+ /* .get_max_size = */ nullptr, // defaults to SIZE_MAX
47
+ /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
48
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
49
+ },
50
+ /* .context = */ nullptr,
51
+ };
52
+
53
+ return &ggml_backend_cpu_buffer_type_hbm;
54
+ }
55
+ #endif
ggml/src/ggml-cpu/ggml-cpu-hbm.h ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "ggml-backend.h"
4
+ #include "ggml.h"
5
+
6
+ // GGML CPU internal header
7
+
8
+ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
ggml/src/ggml-cpu/ggml-cpu-traits.cpp ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "ggml-cpu-traits.h"
2
+
3
+ #include "ggml-backend-impl.h"
4
+ #include "ggml-backend.h"
5
+
6
+ namespace ggml::cpu {
7
+ tensor_traits::~tensor_traits() {}
8
+
9
+ extra_buffer_type::~extra_buffer_type() {}
10
+ } // namespace ggml::cpu
11
+
12
+ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
13
+ for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
14
+ if (extra && extra->context) {
15
+ auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
16
+ auto tensor_traits = buf_extra->get_tensor_traits(op);
17
+ if (tensor_traits && tensor_traits->compute_forward(params, op)) {
18
+ return true;
19
+ }
20
+ }
21
+ }
22
+ return false;
23
+ }
24
+
25
+ bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
26
+ for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
27
+ if (extra && extra->context) {
28
+ auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
29
+ auto tensor_traits = buf_extra->get_tensor_traits(op);
30
+ if (tensor_traits && tensor_traits->work_size(n_threads, op, *size)) {
31
+ return true;
32
+ }
33
+ }
34
+ }
35
+ return false;
36
+ }
ggml/src/ggml-cpu/ggml-cpu-traits.h ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include "ggml-backend-impl.h"
3
+ #include "ggml-cpu-impl.h"
4
+ #include "ggml.h"
5
+
6
+ #ifdef __cplusplus
7
+ # include <vector>
8
+ extern "C" {
9
+ #endif
10
+
11
+ // return true if op part of extra "accelerator"
12
+ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op);
13
+ bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size);
14
+
15
+ #ifdef __cplusplus
16
+ }
17
+
18
+ namespace ggml::cpu {
19
+ // register in tensor->extra
20
+ class tensor_traits {
21
+ public:
22
+ virtual ~tensor_traits();
23
+ virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
24
+ virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
25
+ };
26
+
27
+ class extra_buffer_type {
28
+ public:
29
+ virtual ~extra_buffer_type();
30
+ virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
31
+ virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
32
+ };
33
+ } // namespace ggml::cpu
34
+
35
+ // implemented in ggml-cpu.cpp.
36
+ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
37
+
38
+ #endif
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -3,7 +3,7 @@
3
 
4
  #include "ggml-backend-impl.h"
5
  #include "ggml-backend.h"
6
- #include "ggml-cpu-aarch64.h"
7
  #include "ggml-cpu-impl.h"
8
  #include "ggml-cpu.h"
9
  #include "ggml-impl.h"
@@ -224,10 +224,6 @@ typedef void * thread_ret_t;
224
 
225
  typedef pthread_t ggml_thread_t;
226
 
227
- #ifdef GGML_USE_CPU_HBM
228
- #include <hbwmalloc.h>
229
- #endif
230
-
231
  #if defined(__APPLE__)
232
  #include <unistd.h>
233
  #include <mach/mach.h>
@@ -301,7 +297,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
301
  },
302
  [GGML_TYPE_Q8_0] = {
303
  .from_float = quantize_row_q8_0,
304
- .from_float_to_mat = quantize_mat_q8_0,
305
  .vec_dot = ggml_vec_dot_q8_0_q8_0,
306
  .vec_dot_type = GGML_TYPE_Q8_0,
307
  #if defined (__ARM_FEATURE_MATMUL_INT8)
@@ -409,33 +404,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
409
  .vec_dot_type = GGML_TYPE_BF16,
410
  .nrows = 1,
411
  },
412
- [GGML_TYPE_Q4_0_4_4] = {
413
- .from_float = NULL,
414
- .vec_dot = NULL,
415
- .vec_dot_type = GGML_TYPE_Q8_0,
416
- .nrows = 1,
417
- .ncols = 4,
418
- .gemv = ggml_gemv_q4_0_4x4_q8_0,
419
- .gemm = ggml_gemm_q4_0_4x4_q8_0,
420
- },
421
- [GGML_TYPE_Q4_0_4_8] = {
422
- .from_float = NULL,
423
- .vec_dot = NULL,
424
- .vec_dot_type = GGML_TYPE_Q8_0,
425
- .nrows = 1,
426
- .ncols = 4,
427
- .gemv = ggml_gemv_q4_0_4x8_q8_0,
428
- .gemm = ggml_gemm_q4_0_4x8_q8_0,
429
- },
430
- [GGML_TYPE_Q4_0_8_8] = {
431
- .from_float = NULL,
432
- .vec_dot = NULL,
433
- .vec_dot_type = GGML_TYPE_Q8_0,
434
- .nrows = 1,
435
- .ncols = 8,
436
- .gemv = ggml_gemv_q4_0_8x8_q8_0,
437
- .gemm = ggml_gemm_q4_0_8x8_q8_0,
438
- },
439
  [GGML_TYPE_TQ1_0] = {
440
  .from_float = quantize_row_tq1_0,
441
  .vec_dot = ggml_vec_dot_tq1_0_q8_K,
@@ -448,15 +416,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
448
  .vec_dot_type = GGML_TYPE_Q8_K,
449
  .nrows = 1,
450
  },
451
- [GGML_TYPE_IQ4_NL_4_4] = {
452
- .from_float = NULL,
453
- .vec_dot = NULL,
454
- .vec_dot_type = GGML_TYPE_Q8_0,
455
- .nrows = 1,
456
- .ncols = 4,
457
- .gemv = ggml_gemv_iq4_nl_4x4_q8_0,
458
- .gemm = ggml_gemm_iq4_nl_4x4_q8_0,
459
- },
460
  };
461
 
462
  const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -4509,9 +4468,6 @@ static void ggml_compute_forward_add(
4509
  case GGML_TYPE_IQ4_XS:
4510
  case GGML_TYPE_IQ3_S:
4511
  case GGML_TYPE_IQ2_S:
4512
- case GGML_TYPE_Q4_0_4_4:
4513
- case GGML_TYPE_Q4_0_4_8:
4514
- case GGML_TYPE_Q4_0_8_8:
4515
  {
4516
  ggml_compute_forward_add_q_f32(params, dst);
4517
  } break;
@@ -4889,9 +4845,6 @@ static void ggml_compute_forward_add1(
4889
  case GGML_TYPE_IQ4_XS:
4890
  case GGML_TYPE_IQ3_S:
4891
  case GGML_TYPE_IQ2_S:
4892
- case GGML_TYPE_Q4_0_4_4:
4893
- case GGML_TYPE_Q4_0_4_8:
4894
- case GGML_TYPE_Q4_0_8_8:
4895
  {
4896
  ggml_compute_forward_add1_q_f32(params, dst);
4897
  } break;
@@ -5019,9 +4972,6 @@ static void ggml_compute_forward_acc(
5019
  case GGML_TYPE_IQ4_XS:
5020
  case GGML_TYPE_IQ3_S:
5021
  case GGML_TYPE_IQ2_S:
5022
- case GGML_TYPE_Q4_0_4_4:
5023
- case GGML_TYPE_Q4_0_4_8:
5024
- case GGML_TYPE_Q4_0_8_8:
5025
  default:
5026
  {
5027
  GGML_ABORT("fatal error");
@@ -7437,27 +7387,9 @@ static void ggml_compute_forward_mul_mat(
7437
  const int ith = params->ith;
7438
  const int nth = params->nth;
7439
 
7440
- enum ggml_type type = src0->type;
7441
-
7442
- if (src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
7443
- type = (enum ggml_type)(intptr_t)src0->extra;
7444
- }
7445
-
7446
- #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
7447
- if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
7448
- ggml_backend_amx_mul_mat(params, dst);
7449
- return;
7450
- }
7451
- #endif
7452
-
7453
- enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
7454
  ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
7455
- ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
7456
- int64_t const vec_dot_num_rows = type_traits_cpu[type].nrows;
7457
- int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
7458
- int64_t const blck_size_interleave = ggml_get_type_traits(type)->blck_size_interleave;
7459
- ggml_gemv_t const gemv = type_traits_cpu[type].gemv;
7460
- ggml_gemm_t const gemm = type_traits_cpu[type].gemm;
7461
 
7462
  GGML_ASSERT(ne0 == ne01);
7463
  GGML_ASSERT(ne1 == ne11);
@@ -7465,7 +7397,7 @@ static void ggml_compute_forward_mul_mat(
7465
  GGML_ASSERT(ne3 == ne13);
7466
 
7467
  // we don't support permuted src0 or src1
7468
- GGML_ASSERT(nb00 == ggml_type_size(type));
7469
  GGML_ASSERT(nb10 == ggml_type_size(src1->type));
7470
 
7471
  // dst cannot be transposed or permuted
@@ -7477,6 +7409,7 @@ static void ggml_compute_forward_mul_mat(
7477
  // nb01 >= nb00 - src0 is not transposed
7478
  // compute by src0 rows
7479
 
 
7480
  #if GGML_USE_LLAMAFILE
7481
  // broadcast factors
7482
  const int64_t r2 = ne12 / ne02;
@@ -7487,15 +7420,15 @@ static void ggml_compute_forward_mul_mat(
7487
  if (src1_cont) {
7488
  for (int64_t i13 = 0; i13 < ne13; i13++)
7489
  for (int64_t i12 = 0; i12 < ne12; i12++)
7490
- if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
7491
  (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
7492
- nb01/ggml_type_size(type),
7493
  (const char *)src1->data + i12*nb12 + i13*nb13,
7494
  nb11/ggml_type_size(src1->type),
7495
  (char *)dst->data + i12*nb2 + i13*nb3,
7496
  nb1/ggml_type_size(dst->type),
7497
  ith, nth,
7498
- type,
7499
  src1->type,
7500
  dst->type))
7501
  goto UseGgmlGemm1;
@@ -7516,19 +7449,10 @@ UseGgmlGemm1:;
7516
 
7517
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
7518
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
7519
- int64_t i11_processed = 0;
7520
- if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
7521
- for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
7522
- from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
7523
- (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
7524
- 4, ne10, blck_size_interleave);
7525
- }
7526
- i11_processed = ne11 - ne11 % 4;
7527
- }
7528
- for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
7529
  from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
7530
- (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
7531
- ne10);
7532
  }
7533
  }
7534
  }
@@ -7548,15 +7472,15 @@ UseGgmlGemm1:;
7548
 
7549
  for (int64_t i13 = 0; i13 < ne13; i13++)
7550
  for (int64_t i12 = 0; i12 < ne12; i12++)
7551
- if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
7552
  (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
7553
- nb01/ggml_type_size(type),
7554
  (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
7555
  row_size/ggml_type_size(vec_dot_type),
7556
  (char *)dst->data + i12*nb2 + i13*nb3,
7557
  nb1/ggml_type_size(dst->type),
7558
  ith, nth,
7559
- type,
7560
  vec_dot_type,
7561
  dst->type))
7562
  goto UseGgmlGemm2;
@@ -7598,28 +7522,6 @@ UseGgmlGemm2:;
7598
  const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
7599
  const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
7600
 
7601
- if ((ggml_n_dims(src0) == 2) && gemv) {
7602
- const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
7603
- const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11;
7604
- int64_t src0_start = (ith * ne01) / nth;
7605
- int64_t src0_end = ((ith + 1) * ne01) / nth;
7606
- src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
7607
- src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
7608
- if (src0_start >= src0_end) return;
7609
-
7610
- // If there are more than three rows in src1, use gemm; otherwise, use gemv.
7611
- if (gemm && (ne11 > 3)) {
7612
- gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01,
7613
- (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
7614
- }
7615
- for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) {
7616
- gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01,
7617
- (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1,
7618
- src0_end - src0_start);
7619
- }
7620
- return;
7621
- }
7622
-
7623
  // The first chunk comes from our thread_id, the rest will get auto-assigned.
7624
  int current_chunk = ith;
7625
 
@@ -7642,7 +7544,7 @@ UseGgmlGemm2:;
7642
  num_rows_per_vec_dot = 1;
7643
  }
7644
 
7645
- ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
7646
 
7647
  if (nth >= nchunk0 * nchunk1) {
7648
  break;
@@ -7674,8 +7576,6 @@ static void ggml_compute_forward_mul_mat_id(
7674
  ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
7675
  enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
7676
  ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
7677
- int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
7678
- ggml_gemv_t const gemv = type_traits_cpu[type].gemv;
7679
 
7680
  // we don't support permuted src0 or src1
7681
  GGML_ASSERT(nb00 == ggml_type_size(type));
@@ -7761,34 +7661,6 @@ static void ggml_compute_forward_mul_mat_id(
7761
  const int64_t nr0 = ne01; // src0 rows
7762
  const int64_t nr1 = cne1; // src1 rows
7763
 
7764
- if (((ggml_n_dims(src0) - 1) == 2) && gemv) {
7765
- int64_t src0_cur_start = (ith * ne01) / nth;
7766
- int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
7767
- src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start;
7768
- src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end;
7769
- if (src0_cur_start >= src0_cur_end) return;
7770
-
7771
- for (int ir1 = 0; ir1 < nr1; ir1++) {
7772
- struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
7773
- const int id = row_mapping.i1; // selected expert index
7774
-
7775
- const int64_t i11 = id % ne11;
7776
- const int64_t i12 = row_mapping.i2; // row index in src1
7777
-
7778
- const int64_t i1 = id; // selected expert index
7779
- const int64_t i2 = i12; // row
7780
-
7781
- const char * src1_col = (const char *) wdata +
7782
- (src1_cont || src1->type != vec_dot_type
7783
- ? (i11 + i12 * ne11) * row_size
7784
- : (i11 * nb11 + i12 * nb12));
7785
-
7786
- gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
7787
- (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
7788
- }
7789
- continue;
7790
- }
7791
-
7792
  // distribute the thread work across the inner or outer loop based on which one is larger
7793
 
7794
  const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
@@ -8096,9 +7968,6 @@ static void ggml_compute_forward_out_prod(
8096
  case GGML_TYPE_IQ4_XS:
8097
  case GGML_TYPE_IQ3_S:
8098
  case GGML_TYPE_IQ2_S:
8099
- case GGML_TYPE_Q4_0_4_4:
8100
- case GGML_TYPE_Q4_0_4_8:
8101
- case GGML_TYPE_Q4_0_8_8:
8102
  {
8103
  ggml_compute_forward_out_prod_q_f32(params, dst);
8104
  } break;
@@ -8361,9 +8230,6 @@ static void ggml_compute_forward_set(
8361
  case GGML_TYPE_IQ4_XS:
8362
  case GGML_TYPE_IQ3_S:
8363
  case GGML_TYPE_IQ2_S:
8364
- case GGML_TYPE_Q4_0_4_4:
8365
- case GGML_TYPE_Q4_0_4_8:
8366
- case GGML_TYPE_Q4_0_8_8:
8367
  default:
8368
  {
8369
  GGML_ABORT("fatal error");
@@ -8625,9 +8491,6 @@ static void ggml_compute_forward_get_rows(
8625
  case GGML_TYPE_IQ4_XS:
8626
  case GGML_TYPE_IQ3_S:
8627
  case GGML_TYPE_IQ2_S:
8628
- case GGML_TYPE_Q4_0_4_4:
8629
- case GGML_TYPE_Q4_0_4_8:
8630
- case GGML_TYPE_Q4_0_8_8:
8631
  {
8632
  ggml_compute_forward_get_rows_q(params, dst);
8633
  } break;
@@ -9217,10 +9080,6 @@ static void ggml_compute_forward_clamp(
9217
  case GGML_TYPE_IQ3_S:
9218
  case GGML_TYPE_IQ2_S:
9219
  case GGML_TYPE_Q8_K:
9220
- case GGML_TYPE_Q4_0_4_4:
9221
- case GGML_TYPE_Q4_0_4_8:
9222
- case GGML_TYPE_Q4_0_8_8:
9223
- case GGML_TYPE_IQ4_NL_4_4:
9224
  case GGML_TYPE_I8:
9225
  case GGML_TYPE_I16:
9226
  case GGML_TYPE_I32:
@@ -12426,6 +12285,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
12426
  return;
12427
  }
12428
 
 
 
 
12429
  switch (tensor->op) {
12430
  case GGML_OP_DUP:
12431
  {
@@ -13373,146 +13235,142 @@ struct ggml_cplan ggml_graph_plan(
13373
 
13374
  size_t cur = 0;
13375
 
13376
- switch (node->op) {
13377
- case GGML_OP_CPY:
13378
- case GGML_OP_DUP:
13379
- {
13380
- if (ggml_is_quantized(node->type) ||
13381
- // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
13382
- (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
13383
- (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
13384
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
13385
- }
13386
- } break;
13387
- case GGML_OP_ADD:
13388
- case GGML_OP_ADD1:
13389
- {
13390
- if (ggml_is_quantized(node->src[0]->type)) {
13391
- cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
13392
- }
13393
- } break;
13394
- case GGML_OP_ACC:
13395
- {
13396
- if (ggml_is_quantized(node->src[0]->type)) {
13397
- cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
13398
- }
13399
- } break;
13400
- case GGML_OP_COUNT_EQUAL:
13401
- {
13402
- cur = ggml_type_size(node->type)*n_tasks;
13403
- } break;
13404
- case GGML_OP_MUL_MAT:
13405
- {
13406
- #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
13407
- if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) {
13408
- cur = ggml_backend_amx_desired_wsize(node);
13409
- }
13410
- #endif
13411
- const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
13412
 
13413
- if (node->src[1]->type != vec_dot_type) {
13414
- size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
13415
- cur = MAX(cur, cur2);
13416
- }
13417
- } break;
13418
- case GGML_OP_MUL_MAT_ID:
13419
- {
13420
- cur = 0;
13421
- const struct ggml_tensor * src0 = node->src[0];
13422
- const struct ggml_tensor * src1 = node->src[1];
13423
- const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
13424
- if (src1->type != vec_dot_type) {
13425
- cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
13426
- }
13427
- const int n_as = src0->ne[2];
13428
- cur += GGML_PAD(cur, sizeof(int64_t)); // align
13429
- cur += n_as * sizeof(int64_t); // matrix_row_counts
13430
- cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
13431
- } break;
13432
- case GGML_OP_OUT_PROD:
13433
- {
13434
- if (ggml_is_quantized(node->src[0]->type)) {
13435
- cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
13436
- }
13437
- } break;
13438
- case GGML_OP_SOFT_MAX:
13439
- case GGML_OP_ROPE:
13440
- {
13441
- cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
13442
- } break;
13443
- case GGML_OP_CONV_TRANSPOSE_1D:
13444
- {
13445
- GGML_ASSERT(node->src[0]->ne[3] == 1);
13446
- GGML_ASSERT(node->src[1]->ne[2] == 1);
13447
- GGML_ASSERT(node->src[1]->ne[3] == 1);
13448
-
13449
- const int64_t ne00 = node->src[0]->ne[0]; // K
13450
- const int64_t ne01 = node->src[0]->ne[1]; // Cout
13451
- const int64_t ne02 = node->src[0]->ne[2]; // Cin
13452
-
13453
- const int64_t ne10 = node->src[1]->ne[0]; // L
13454
- const int64_t ne11 = node->src[1]->ne[1]; // Cin
13455
-
13456
- if ((node->src[0]->type == GGML_TYPE_F16 ||
13457
- node->src[0]->type == GGML_TYPE_BF16) &&
13458
- node->src[1]->type == GGML_TYPE_F32) {
13459
- cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
13460
- cur += sizeof(ggml_fp16_t)*ne10*ne11;
13461
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
13462
- node->src[1]->type == GGML_TYPE_F32) {
13463
- cur += sizeof(float)*ne00*ne01*ne02;
13464
- cur += sizeof(float)*ne10*ne11;
13465
- } else {
13466
- GGML_ABORT("fatal error");
13467
- }
13468
- } break;
13469
- case GGML_OP_CONV_TRANSPOSE_2D:
13470
- {
13471
- const int64_t ne00 = node->src[0]->ne[0]; // W
13472
- const int64_t ne01 = node->src[0]->ne[1]; // H
13473
- const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
13474
- const int64_t ne03 = node->src[0]->ne[3]; // Channels In
13475
-
13476
- const int64_t ne10 = node->src[1]->ne[0]; // W
13477
- const int64_t ne11 = node->src[1]->ne[1]; // H
13478
- const int64_t ne12 = node->src[1]->ne[2]; // Channels In
13479
-
13480
- cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
13481
- cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
13482
- } break;
13483
- case GGML_OP_FLASH_ATTN_EXT:
13484
- {
13485
- const int64_t ne00 = node->src[0]->ne[0]; // D
13486
 
13487
- cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
13488
- } break;
13489
- case GGML_OP_FLASH_ATTN_BACK:
13490
- {
13491
- const int64_t D = node->src[0]->ne[0];
13492
- const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
13493
- const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
13494
- if (node->src[1]->type == GGML_TYPE_F32) {
13495
- cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
13496
- cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
13497
- } else if (node->src[1]->type == GGML_TYPE_F16) {
13498
- cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
13499
- cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
13500
- } else if (node->src[1]->type == GGML_TYPE_BF16) {
13501
- cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
13502
- cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
13503
- }
13504
- } break;
13505
 
13506
- case GGML_OP_CROSS_ENTROPY_LOSS:
13507
- {
13508
- cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
13509
- } break;
13510
- case GGML_OP_COUNT:
13511
- {
13512
- GGML_ABORT("fatal error");
13513
- }
13514
- default:
13515
- break;
13516
  }
13517
 
13518
  work_size = MAX(work_size, cur);
 
3
 
4
  #include "ggml-backend-impl.h"
5
  #include "ggml-backend.h"
6
+ #include "ggml-cpu-traits.h"
7
  #include "ggml-cpu-impl.h"
8
  #include "ggml-cpu.h"
9
  #include "ggml-impl.h"
 
224
 
225
  typedef pthread_t ggml_thread_t;
226
 
227
  #if defined(__APPLE__)
228
  #include <unistd.h>
229
  #include <mach/mach.h>
 
297
  },
298
  [GGML_TYPE_Q8_0] = {
299
  .from_float = quantize_row_q8_0,
 
300
  .vec_dot = ggml_vec_dot_q8_0_q8_0,
301
  .vec_dot_type = GGML_TYPE_Q8_0,
302
  #if defined (__ARM_FEATURE_MATMUL_INT8)
 
404
  .vec_dot_type = GGML_TYPE_BF16,
405
  .nrows = 1,
406
  },
407
  [GGML_TYPE_TQ1_0] = {
408
  .from_float = quantize_row_tq1_0,
409
  .vec_dot = ggml_vec_dot_tq1_0_q8_K,
 
416
  .vec_dot_type = GGML_TYPE_Q8_K,
417
  .nrows = 1,
418
  },
  };
420
 
421
  const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
 
4468
  case GGML_TYPE_IQ4_XS:
4469
  case GGML_TYPE_IQ3_S:
4470
  case GGML_TYPE_IQ2_S:
4471
  {
4472
  ggml_compute_forward_add_q_f32(params, dst);
4473
  } break;
 
4845
  case GGML_TYPE_IQ4_XS:
4846
  case GGML_TYPE_IQ3_S:
4847
  case GGML_TYPE_IQ2_S:
4848
  {
4849
  ggml_compute_forward_add1_q_f32(params, dst);
4850
  } break;
 
4972
  case GGML_TYPE_IQ4_XS:
4973
  case GGML_TYPE_IQ3_S:
4974
  case GGML_TYPE_IQ2_S:
4975
  default:
4976
  {
4977
  GGML_ABORT("fatal error");
 
7387
  const int ith = params->ith;
7388
  const int nth = params->nth;
7389
 
7390
+ enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
7391
  ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
7392
+ int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows;
7393
 
7394
  GGML_ASSERT(ne0 == ne01);
7395
  GGML_ASSERT(ne1 == ne11);
 
7397
  GGML_ASSERT(ne3 == ne13);
7398
 
7399
  // we don't support permuted src0 or src1
7400
+ GGML_ASSERT(nb00 == ggml_type_size(src0->type));
7401
  GGML_ASSERT(nb10 == ggml_type_size(src1->type));
7402
 
7403
  // dst cannot be transposed or permuted
 
7409
  // nb01 >= nb00 - src0 is not transposed
7410
  // compute by src0 rows
7411
 
7412
+ // TODO: extract to "extra_op"
7413
  #if GGML_USE_LLAMAFILE
7414
  // broadcast factors
7415
  const int64_t r2 = ne12 / ne02;
 
7420
  if (src1_cont) {
7421
  for (int64_t i13 = 0; i13 < ne13; i13++)
7422
  for (int64_t i12 = 0; i12 < ne12; i12++)
7423
+ if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
7424
  (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
7425
+ nb01/ggml_type_size(src0->type),
7426
  (const char *)src1->data + i12*nb12 + i13*nb13,
7427
  nb11/ggml_type_size(src1->type),
7428
  (char *)dst->data + i12*nb2 + i13*nb3,
7429
  nb1/ggml_type_size(dst->type),
7430
  ith, nth,
7431
+ src0->type,
7432
  src1->type,
7433
  dst->type))
7434
  goto UseGgmlGemm1;
 
7449
 
7450
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
7451
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
7452
+ for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
7453
  from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
7454
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
7455
+ ne10);
7456
  }
7457
  }
7458
  }
 
7472
 
7473
  for (int64_t i13 = 0; i13 < ne13; i13++)
7474
  for (int64_t i12 = 0; i12 < ne12; i12++)
7475
+ if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
7476
  (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
7477
+ nb01/ggml_type_size(src0->type),
7478
  (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
7479
  row_size/ggml_type_size(vec_dot_type),
7480
  (char *)dst->data + i12*nb2 + i13*nb3,
7481
  nb1/ggml_type_size(dst->type),
7482
  ith, nth,
7483
+ src0->type,
7484
  vec_dot_type,
7485
  dst->type))
7486
  goto UseGgmlGemm2;
 
7522
  const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
7523
  const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
7524
 
7525
  // The first chunk comes from our thread_id, the rest will get auto-assigned.
7526
  int current_chunk = ith;
7527
 
 
7544
  num_rows_per_vec_dot = 1;
7545
  }
7546
 
7547
+ ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
7548
 
7549
  if (nth >= nchunk0 * nchunk1) {
7550
  break;
 
7576
  ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
7577
  enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
7578
  ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
 
 
7579
 
7580
  // we don't support permuted src0 or src1
7581
  GGML_ASSERT(nb00 == ggml_type_size(type));
 
7661
  const int64_t nr0 = ne01; // src0 rows
7662
  const int64_t nr1 = cne1; // src1 rows
7663
 
7664
  // distribute the thread work across the inner or outer loop based on which one is larger
7665
 
7666
  const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
 
7968
  case GGML_TYPE_IQ4_XS:
7969
  case GGML_TYPE_IQ3_S:
7970
  case GGML_TYPE_IQ2_S:
7971
  {
7972
  ggml_compute_forward_out_prod_q_f32(params, dst);
7973
  } break;
 
8230
  case GGML_TYPE_IQ4_XS:
8231
  case GGML_TYPE_IQ3_S:
8232
  case GGML_TYPE_IQ2_S:
8233
  default:
8234
  {
8235
  GGML_ABORT("fatal error");
 
8491
  case GGML_TYPE_IQ4_XS:
8492
  case GGML_TYPE_IQ3_S:
8493
  case GGML_TYPE_IQ2_S:
8494
  {
8495
  ggml_compute_forward_get_rows_q(params, dst);
8496
  } break;
 
9080
  case GGML_TYPE_IQ3_S:
9081
  case GGML_TYPE_IQ2_S:
9082
  case GGML_TYPE_Q8_K:
9083
  case GGML_TYPE_I8:
9084
  case GGML_TYPE_I16:
9085
  case GGML_TYPE_I32:
 
12285
  return;
12286
  }
12287
 
12288
+ // extra_buffer op?
12289
+ if (ggml_cpu_extra_compute_forward(params, tensor)) return;
12290
+
12291
  switch (tensor->op) {
12292
  case GGML_OP_DUP:
12293
  {
 
13235
 
13236
  size_t cur = 0;
13237
 
13238
+ if (!ggml_cpu_extra_work_size(n_threads, node, &cur)) {
13239
+
13240
+ switch (node->op) {
13241
+ case GGML_OP_CPY:
13242
+ case GGML_OP_DUP:
13243
+ {
13244
+ if (ggml_is_quantized(node->type) ||
13245
+ // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
13246
+ (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
13247
+ (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
13248
+ cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
13249
+ }
13250
+ } break;
13251
+ case GGML_OP_ADD:
13252
+ case GGML_OP_ADD1:
13253
+ {
13254
+ if (ggml_is_quantized(node->src[0]->type)) {
13255
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
13256
+ }
13257
+ } break;
13258
+ case GGML_OP_ACC:
13259
+ {
13260
+ if (ggml_is_quantized(node->src[0]->type)) {
13261
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
13262
+ }
13263
+ } break;
13264
+ case GGML_OP_COUNT_EQUAL:
13265
+ {
13266
+ cur = ggml_type_size(node->type)*n_tasks;
13267
+ } break;
13268
+ case GGML_OP_MUL_MAT:
13269
+ {
13270
+ const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
13271
+
13272
+ if (node->src[1]->type != vec_dot_type) {
13273
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
13274
+ }
13275
+ } break;
13276
+ case GGML_OP_MUL_MAT_ID:
13277
+ {
13278
+ cur = 0;
13279
+ const struct ggml_tensor * src0 = node->src[0];
13280
+ const struct ggml_tensor * src1 = node->src[1];
13281
+ const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
13282
+ if (src1->type != vec_dot_type) {
13283
+ cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
13284
+ }
13285
+ const int n_as = src0->ne[2];
13286
+ cur += GGML_PAD(cur, sizeof(int64_t)); // align
13287
+ cur += n_as * sizeof(int64_t); // matrix_row_counts
13288
+ cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
13289
+ } break;
13290
+ case GGML_OP_OUT_PROD:
13291
+ {
13292
+ if (ggml_is_quantized(node->src[0]->type)) {
13293
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
13294
+ }
13295
+ } break;
13296
+ case GGML_OP_SOFT_MAX:
13297
+ case GGML_OP_ROPE:
13298
+ {
13299
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
13300
+ } break;
13301
+ case GGML_OP_CONV_TRANSPOSE_1D:
13302
+ {
13303
+ GGML_ASSERT(node->src[0]->ne[3] == 1);
13304
+ GGML_ASSERT(node->src[1]->ne[2] == 1);
13305
+ GGML_ASSERT(node->src[1]->ne[3] == 1);
13306
+
13307
+ const int64_t ne00 = node->src[0]->ne[0]; // K
13308
+ const int64_t ne01 = node->src[0]->ne[1]; // Cout
13309
+ const int64_t ne02 = node->src[0]->ne[2]; // Cin
13310
+ const int64_t ne10 = node->src[1]->ne[0]; // L
13311
+ const int64_t ne11 = node->src[1]->ne[1]; // Cin
13312
+
13313
+ if ((node->src[0]->type == GGML_TYPE_F16 ||
13314
+ node->src[0]->type == GGML_TYPE_BF16) &&
13315
+ node->src[1]->type == GGML_TYPE_F32) {
13316
+ cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
13317
+ cur += sizeof(ggml_fp16_t)*ne10*ne11;
13318
+ } else if (node->src[0]->type == GGML_TYPE_F32 &&
13319
+ node->src[1]->type == GGML_TYPE_F32) {
13320
+ cur += sizeof(float)*ne00*ne01*ne02;
13321
+ cur += sizeof(float)*ne10*ne11;
13322
+ } else {
13323
+ GGML_ABORT("fatal error");
13324
+ }
13325
+ } break;
13326
+ case GGML_OP_CONV_TRANSPOSE_2D:
13327
+ {
13328
+ const int64_t ne00 = node->src[0]->ne[0]; // W
13329
+ const int64_t ne01 = node->src[0]->ne[1]; // H
13330
+ const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
13331
+ const int64_t ne03 = node->src[0]->ne[3]; // Channels In
13332
 
13333
+ const int64_t ne10 = node->src[1]->ne[0]; // W
13334
+ const int64_t ne11 = node->src[1]->ne[1]; // H
13335
+ const int64_t ne12 = node->src[1]->ne[2]; // Channels In
13336
 
13337
+ cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
13338
+ cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
13339
+ } break;
13340
+ case GGML_OP_FLASH_ATTN_EXT:
13341
+ {
13342
+ const int64_t ne00 = node->src[0]->ne[0]; // D
13343
 
13344
+ cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
13345
+ } break;
13346
+ case GGML_OP_FLASH_ATTN_BACK:
13347
+ {
13348
+ const int64_t D = node->src[0]->ne[0];
13349
+ const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
13350
+ const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
13351
+ if (node->src[1]->type == GGML_TYPE_F32) {
13352
+ cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
13353
+ cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
13354
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
13355
+ cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
13356
+ cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
13357
+ } else if (node->src[1]->type == GGML_TYPE_BF16) {
13358
+ cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
13359
+ cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
13360
+ }
13361
+ } break;
13362
+
13363
+ case GGML_OP_CROSS_ENTROPY_LOSS:
13364
+ {
13365
+ cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
13366
+ } break;
13367
+ case GGML_OP_COUNT:
13368
+ {
13369
+ GGML_ABORT("fatal error");
13370
+ }
13371
+ default:
13372
+ break;
13373
+ }
13374
  }
13375
 
13376
  work_size = MAX(work_size, cur);
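A note on the ggml-cpu.c hunks above: both the forward dispatch (ggml_cpu_extra_compute_forward) and the work-size estimation in ggml_graph_plan (ggml_cpu_extra_work_size) are now guarded by the new extra-buffer hooks, so a node whose weights sit in an extra buffer type (AMX, aarch64 online repack) is handled by that backend and the generic per-op path is skipped. The sketch below only illustrates this short-circuit pattern; the types and names (node_t, extra_work_size, plan_work_size) are simplified, hypothetical stand-ins, not the real ggml API.

    #include <cstddef>
    #include <cstdio>

    // Hypothetical stand-ins for ggml's node/op types, reduced to what the
    // short-circuit needs.
    enum class op_t { mul_mat, soft_max };
    struct node_t {
        op_t   op;
        size_t ne0;      // leading dimension, used by the default estimate
        bool   in_extra; // true if the weights live in an "extra" buffer type
    };

    // Mimics ggml_cpu_extra_work_size(): returns true (and fills *cur) only when
    // an extra-buffer backend claims the node; otherwise the caller falls back to
    // the generic per-op estimate.
    static bool extra_work_size(int n_threads, const node_t & node, size_t * cur) {
        if (!node.in_extra) {
            return false;
        }
        *cur = node.ne0 * sizeof(float) * (size_t) n_threads; // backend-specific size
        return true;
    }

    static size_t plan_work_size(const node_t & node, int n_threads) {
        size_t cur = 0;
        if (!extra_work_size(n_threads, node, &cur)) {
            // generic path, same shape as the switch in ggml_graph_plan()
            switch (node.op) {
                case op_t::mul_mat:
                case op_t::soft_max:
                    cur = sizeof(float) * node.ne0 * (size_t) n_threads;
                    break;
            }
        }
        return cur;
    }

    int main() {
        node_t plain = { op_t::mul_mat, 4096, false };
        node_t extra = { op_t::mul_mat, 4096, true  };
        std::printf("generic: %zu bytes, extra: %zu bytes\n",
                    plan_work_size(plain, 8), plan_work_size(extra, 8));
        return 0;
    }

The same guard shape is used in ggml_compute_forward(): if ggml_cpu_extra_compute_forward() claims the tensor, the big switch over ops is never reached.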
ggml/src/ggml-cpu/ggml-cpu.cpp CHANGED
@@ -2,12 +2,18 @@
2
  #include "ggml-backend-impl.h"
3
  #include "ggml-cpu.h"
4
  #include "ggml-cpu-aarch64.h"
 
5
  #include "ggml-impl.h"
6
  #include "amx/amx.h"
 
7
  #include <cctype>
8
  #include <string>
9
  #include <vector>
10
 
11
  #if defined(__APPLE__)
12
  #include <sys/types.h>
13
  #include <sys/sysctl.h>
@@ -23,115 +29,7 @@
23
 
24
  // ggml-backend interface
25
 
26
- #ifdef GGML_USE_CPU_HBM
27
-
28
- // buffer type HBM
29
-
30
- #include <hbwmalloc.h>
31
-
32
- static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
33
- return "CPU_HBM";
34
-
35
- GGML_UNUSED(buft);
36
- }
37
-
38
- static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
39
- hbw_free(buffer->context);
40
- }
41
-
42
- static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
43
- void * ptr;
44
- int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
45
- if (result != 0) {
46
- GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
47
- return NULL;
48
- }
49
-
50
- ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
51
- buffer->buft = buft;
52
- buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
53
-
54
- return buffer;
55
- }
56
-
57
- ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
58
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
59
- /* .iface = */ {
60
- /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
61
- /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
62
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
63
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
64
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
65
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
66
- },
67
- /* .context = */ NULL,
68
- };
69
-
70
- return &ggml_backend_cpu_buffer_type_hbm;
71
- }
72
- #endif
73
-
74
- // buffer type AARCH64
75
-
76
- static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
77
- tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT
78
-
79
- GGML_UNUSED(buffer);
80
- }
81
-
82
- static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
83
- GGML_ASSERT(offset == 0);
84
- GGML_ASSERT(size == ggml_nbytes(tensor));
85
-
86
- enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra;
87
-
88
- ggml_aarch64_repack_tensor(tensor, repack_type, data, size);
89
-
90
- GGML_UNUSED(buffer);
91
- }
92
-
93
- static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
94
- return "CPU_AARCH64";
95
-
96
- GGML_UNUSED(buft);
97
- }
98
-
99
- static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
100
- auto * buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
101
-
102
- if (buffer == NULL) {
103
- return NULL;
104
- }
105
-
106
- buffer->buft = buft;
107
- buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
108
- buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
109
-
110
- return buffer;
111
- }
112
-
113
- ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
114
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {
115
- /* .iface = */ {
116
- /* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name,
117
- /* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
118
- /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
119
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
120
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
121
- /* .is_host = */ NULL,
122
- },
123
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
124
- /* .context = */ NULL,
125
- };
126
-
127
- return &ggml_backend_cpu_buffer_type_aarch64;
128
- }
129
-
130
- bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) {
131
- return buft == ggml_backend_cpu_aarch64_buffer_type();
132
- }
133
-
134
- static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
135
  static std::vector<ggml_backend_buffer_type_t> bufts = []() {
136
  std::vector<ggml_backend_buffer_type_t> bufts;
137
 
@@ -152,11 +50,22 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen
152
  return bufts;
153
  }();
154
 
155
- return bufts.data();
156
 
157
  GGML_UNUSED(device);
158
  }
159
 
160
  // CPU backend - backend (stream)
161
 
162
  struct ggml_backend_cpu_context {
@@ -465,25 +374,19 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
465
  return true;
466
  }
467
 
468
- if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
469
- if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
470
- return false;
471
  }
472
  }
473
 
474
- #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
475
- if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
476
- return ggml_backend_amx_device_supports_op(op);
477
- }
478
- for (int i = 1; i < GGML_MAX_SRC; i++) {
479
- if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) {
480
- return false;
481
- }
482
- }
483
- #endif
484
-
485
- for (int i = 1; i < GGML_MAX_SRC; i++) {
486
- if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
487
  return false;
488
  }
489
  }
@@ -506,19 +409,10 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
506
  default:
507
  return true;
508
  }
509
-
510
- GGML_UNUSED(dev);
511
  }
512
 
513
  static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
514
- bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
515
-
516
- #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
517
- supported = supported || ggml_backend_amx_buft_is_amx(buft);
518
- #endif
519
-
520
- return supported;
521
-
522
  GGML_UNUSED(dev);
523
  }
524
 
@@ -666,10 +560,12 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
666
 
667
  static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
668
  if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
669
- return (void *)ggml_backend_cpu_set_n_threads;
 
670
  }
671
  if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
672
- return (void *)ggml_backend_cpu_get_extra_bufts;
 
673
  }
674
  if (strcmp(name, "ggml_backend_get_features") == 0) {
675
  return (void *)ggml_backend_cpu_get_features;
 
2
  #include "ggml-backend-impl.h"
3
  #include "ggml-cpu.h"
4
  #include "ggml-cpu-aarch64.h"
5
+ #include "ggml-cpu-traits.h"
6
  #include "ggml-impl.h"
7
  #include "amx/amx.h"
8
+
9
  #include <cctype>
10
  #include <string>
11
  #include <vector>
12
 
13
+ #ifdef GGML_USE_CPU_HBM
14
+ #include "ggml-cpu-hbm.h"
15
+ #endif
16
+
17
  #if defined(__APPLE__)
18
  #include <sys/types.h>
19
  #include <sys/sysctl.h>
 
29
 
30
  // ggml-backend interface
31
 
32
+ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
33
  static std::vector<ggml_backend_buffer_type_t> bufts = []() {
34
  std::vector<ggml_backend_buffer_type_t> bufts;
35
 
 
50
  return bufts;
51
  }();
52
 
53
+ return bufts;
54
+ }
55
+
56
+ static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
57
+ return ggml_backend_cpu_get_extra_buffers_type().data();
58
 
59
  GGML_UNUSED(device);
60
  }
61
 
62
+ static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
63
+ for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
64
+ if (extra && extra == buft) return true;
65
+ }
66
+ return false;
67
+ }
68
+
69
  // CPU backend - backend (stream)
70
 
71
  struct ggml_backend_cpu_context {
 
374
  return true;
375
  }
376
 
377
+ // extra_buffer_op?
378
+ for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
379
+ if (extra) {
380
+ auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
381
+ if (buf_extra && buf_extra->supports_op(dev, op)) {
382
+ return true;
383
+ }
384
  }
385
  }
386
 
387
+ // all other cases need a host buffer.
388
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
389
+ if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
390
  return false;
391
  }
392
  }
 
409
  default:
410
  return true;
411
  }
 
 
412
  }
413
 
414
  static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
415
+ return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_is_extra_buffer_type(buft);
416
  GGML_UNUSED(dev);
417
  }
418
 
 
560
 
561
  static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
562
  if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
563
+ ggml_backend_set_n_threads_t fct = ggml_backend_cpu_set_n_threads;
564
+ return (void *)fct;
565
  }
566
  if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
567
+ ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_cpu_device_get_extra_buffers_type;
568
+ return (void *)fct;
569
  }
570
  if (strcmp(name, "ggml_backend_get_features") == 0) {
571
  return (void *)ggml_backend_cpu_get_features;
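A note on the ggml-cpu.cpp hunks above: the HBM and aarch64 buffer types are moved out of this file (ggml-cpu-hbm.h, ggml-cpu-aarch64.h) and exposed through a single lazily-built list, ggml_backend_cpu_get_extra_buffers_type(); supports_op() now asks each entry's ggml::cpu::extra_buffer_type context before falling back to the host-buffer rule, and supports_buft() accepts any registered extra buffer type. The sketch below mimics that registry pattern with simplified, hypothetical types (the extra_buffer_type interface here is not the real class from ggml-cpu-traits.h).

    #include <cstdio>
    #include <cstring>
    #include <memory>
    #include <vector>

    // Hypothetical stand-ins; in ggml the list holds ggml_backend_buffer_type_t
    // and the per-type hook is ggml::cpu::extra_buffer_type (ggml-cpu-traits.h).
    struct op_desc { const char * name; };

    struct extra_buffer_type {
        virtual ~extra_buffer_type() = default;
        virtual bool supports_op(const op_desc & op) const = 0;
    };

    // Example extra buffer type: only claims matrix multiplications.
    struct repack_buft : extra_buffer_type {
        bool supports_op(const op_desc & op) const override {
            return std::strcmp(op.name, "MUL_MAT") == 0;
        }
    };

    // Mimics ggml_backend_cpu_get_extra_buffers_type(): one lazily-built static
    // list, reused by device_get_extra_buffers_type() and supports_op().
    static std::vector<std::unique_ptr<extra_buffer_type>> & extra_buffer_types() {
        static std::vector<std::unique_ptr<extra_buffer_type>> bufts = [] {
            std::vector<std::unique_ptr<extra_buffer_type>> v;
            v.push_back(std::make_unique<repack_buft>());
            return v;
        }();
        return bufts;
    }

    // Mimics the new supports_op() logic: ask every extra buffer type first, then
    // fall back to the generic "all sources are in host buffers" rule.
    static bool device_supports_op(const op_desc & op, bool all_srcs_in_host_buffers) {
        for (const auto & extra : extra_buffer_types()) {
            if (extra && extra->supports_op(op)) {
                return true;
            }
        }
        return all_srcs_in_host_buffers;
    }

    int main() {
        std::printf("MUL_MAT: %d, ROPE: %d\n",
                    device_supports_op({ "MUL_MAT" }, false),
                    device_supports_op({ "ROPE" },    true));
        return 0;
    }

One design consequence: adding another accelerated buffer type later only requires appending it to the list; supports_op() and supports_buft() pick it up without further changes.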
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -3210,7 +3210,7 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
3210
  static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
3211
  /* .get_name = */ ggml_backend_cuda_reg_get_name,
3212
  /* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
3213
- /* .get_device_get = */ ggml_backend_cuda_reg_get_device,
3214
  /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
3215
  };
3216
 
 
3210
  static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
3211
  /* .get_name = */ ggml_backend_cuda_reg_get_name,
3212
  /* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
3213
+ /* .get_device = */ ggml_backend_cuda_reg_get_device,
3214
  /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
3215
  };
3216
 
ggml/src/ggml-quants.c CHANGED
@@ -5220,15 +5220,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
5220
  {
5221
  VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
5222
  } break;
5223
- case GGML_TYPE_Q4_0_4_4:
5224
- case GGML_TYPE_Q4_0_4_8:
5225
- {
5226
- VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
5227
- } break;
5228
- case GGML_TYPE_Q4_0_8_8:
5229
- {
5230
- VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
5231
- } break;
5232
 
5233
  case GGML_TYPE_I8:
5234
  case GGML_TYPE_I16:
 
5220
  {
5221
  VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
5222
  } break;
 
5223
 
5224
  case GGML_TYPE_I8:
5225
  case GGML_TYPE_I16:
ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -4630,7 +4630,7 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, cons
4630
  static const ggml_backend_reg_i ggml_backend_sycl_reg_interface = {
4631
  /* .get_name = */ ggml_backend_sycl_reg_get_name,
4632
  /* .get_device_count = */ ggml_backend_sycl_reg_get_device_count,
4633
- /* .get_device_get = */ ggml_backend_sycl_reg_get_device,
4634
  /* .get_proc_address = */ ggml_backend_sycl_reg_get_proc_address,
4635
  };
4636
 
 
4630
  static const ggml_backend_reg_i ggml_backend_sycl_reg_interface = {
4631
  /* .get_name = */ ggml_backend_sycl_reg_get_name,
4632
  /* .get_device_count = */ ggml_backend_sycl_reg_get_device_count,
4633
+ /* .get_device = */ ggml_backend_sycl_reg_get_device,
4634
  /* .get_proc_address = */ ggml_backend_sycl_reg_get_proc_address,
4635
  };
4636
 
ggml/src/ggml.c CHANGED
@@ -8,7 +8,10 @@
8
 
9
  // FIXME: required here for quantization functions
10
  #include "ggml-quants.h"
11
- #include "ggml-aarch64.h"
12
 
13
  #if defined(_MSC_VER) || defined(__MINGW32__)
14
  #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -788,32 +791,23 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
788
  .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
789
  .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
790
  },
791
- [GGML_TYPE_Q4_0_4_4] = {
792
- .type_name = "q4_0_4x4",
793
- .blck_size = QK4_0,
794
- .blck_size_interleave = 4,
795
- .type_size = sizeof(block_q4_0),
796
- .is_quantized = true,
797
- .to_float = NULL,
798
- .from_float_ref = NULL,
799
  },
800
- [GGML_TYPE_Q4_0_4_8] = {
801
- .type_name = "q4_0_4x8",
802
- .blck_size = QK4_0,
803
- .blck_size_interleave = 8,
804
- .type_size = sizeof(block_q4_0),
805
- .is_quantized = true,
806
- .to_float = NULL,
807
- .from_float_ref = NULL,
808
  },
809
- [GGML_TYPE_Q4_0_8_8] = {
810
- .type_name = "q4_0_8x8",
811
- .blck_size = QK4_0,
812
- .blck_size_interleave = 8,
813
- .type_size = sizeof(block_q4_0),
814
- .is_quantized = true,
815
- .to_float = NULL,
816
- .from_float_ref = NULL,
817
  },
818
  [GGML_TYPE_TQ1_0] = {
819
  .type_name = "tq1_0",
@@ -831,14 +825,23 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
831
  .to_float = (ggml_to_float_t) dequantize_row_tq2_0,
832
  .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
833
  },
834
- [GGML_TYPE_IQ4_NL_4_4] = {
835
- .type_name = "iq4_nl_4x4",
836
- .blck_size = QK4_NL,
837
- .blck_size_interleave = 4,
838
- .type_size = sizeof(block_iq4_nl),
839
- .is_quantized = true,
840
- .to_float = NULL,
841
- .from_float_ref = NULL,
842
  },
843
  };
844
 
@@ -1270,9 +1273,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
1270
  case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
1271
  case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
1272
  case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
1273
- case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
1274
- case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
1275
- case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
1276
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
1277
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
1278
  }
@@ -6304,9 +6304,6 @@ size_t ggml_quantize_chunk(
6304
  case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6305
  case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6306
  case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6307
- case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6308
- case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6309
- case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6310
  case GGML_TYPE_F16:
6311
  {
6312
  size_t elemsize = sizeof(ggml_fp16_t);
@@ -6838,7 +6835,16 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6838
  (int64_t) info->ne[2] *
6839
  (int64_t) info->ne[3];
6840
 
6841
- if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) {
6842
  fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
6843
  __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
6844
  fclose(file);
 
8
 
9
  // FIXME: required here for quantization functions
10
  #include "ggml-quants.h"
11
+
12
+ #ifdef GGML_USE_CPU_HBM
13
+ #include <hbwmalloc.h>
14
+ #endif
15
 
16
  #if defined(_MSC_VER) || defined(__MINGW32__)
17
  #include <malloc.h> // using malloc.h with MSC/MINGW
 
791
  .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
792
  .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
793
  },
794
+ [31] = { // GGML_TYPE_Q4_0_4_4
795
+ .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
796
+ .blck_size = 0,
797
+ .type_size = 0,
798
+ .is_quantized = false,
799
  },
800
+ [32] = { // GGML_TYPE_Q4_0_4_8
801
+ .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
802
+ .blck_size = 0,
803
+ .type_size = 0,
804
+ .is_quantized = false,
805
  },
806
+ [33] = { // GGML_TYPE_Q4_0_8_8
807
+ .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
808
+ .blck_size = 0,
809
+ .type_size = 0,
810
+ .is_quantized = false,
811
  },
812
  [GGML_TYPE_TQ1_0] = {
813
  .type_name = "tq1_0",
 
825
  .to_float = (ggml_to_float_t) dequantize_row_tq2_0,
826
  .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
827
  },
828
+ [36] = { // GGML_TYPE_IQ4_NL_4_4
829
+ .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
830
+ .blck_size = 0,
831
+ .type_size = 0,
832
+ .is_quantized = false,
833
+ },
834
+ [37] = { // GGML_TYPE_IQ4_NL_4_8
835
+ .type_name = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",
836
+ .blck_size = 0,
837
+ .type_size = 0,
838
+ .is_quantized = false,
839
+ },
840
+ [38] = { // GGML_TYPE_IQ4_NL_8_8
841
+ .type_name = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking",
842
+ .blck_size = 0,
843
+ .type_size = 0,
844
+ .is_quantized = false,
845
  },
846
  };
847
 
 
1273
  case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
1274
  case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
1275
  case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
1276
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
1277
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
1278
  }
 
6304
  case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6305
  case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6306
  case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6307
  case GGML_TYPE_F16:
6308
  {
6309
  size_t elemsize = sizeof(ggml_fp16_t);
 
6835
  (int64_t) info->ne[2] *
6836
  (int64_t) info->ne[3];
6837
 
6838
+ if (ggml_blck_size(info->type) == 0) {
6839
+ // support for this tensor type has been removed:
6840
+ fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
6841
+ __func__, info->name.data, (int) info->type, ggml_type_name(info->type));
6842
+ fclose(file);
6843
+ gguf_free(ctx);
6844
+ return NULL;
6845
+ }
6846
+
6847
+ if (ne % ggml_blck_size(info->type) != 0) {
6848
  fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
6849
  __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
6850
  fclose(file);