Rémy O committed
Commit 05466a9 · Parent: 723b8b4

ggml-cpu: Faster IQ1 mul_mat_vec on AVX2 using BMI2 instructions (llama/12154)


* ggml-cpu: Faster IQ1 mul_mat_vec on AVX2 using BMI2 instructions

* cmake: Add GGML_BMI2 build option

* ggml: enable BMI2 on relevant CPU variants

* ggml-cpu: include BMI2 in backend score

* ggml-cpu: register BMI2 in ggml_backend_cpu_get_features

* ggml-cpu: add __BMI2__ define when using MSVC
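
For context: the kernels below use BMI2's PDEP instruction (_pdep_u64), which scatters the low-order bits of a source word into the bit positions selected by a mask. The sketch that follows is illustrative only (the helper name and I/O layout are not from this commit); it shows how one PDEP per operand can expand four packed IQ1_S indices (an 8-bit low part from qs plus a 3-bit high part from qh) into four 16-bit grid indices, replacing four shift/mask/OR chains.

    // Illustrative sketch, not the committed kernel: expand four IQ1_S
    // grid indices with two PDEPs. Compile with BMI2 enabled (e.g. -mbmi2).
    #include <stdint.h>
    #include <string.h>
    #include <immintrin.h>

    static inline void iq1s_expand_indices(const uint8_t *qs, uint16_t qh,
                                           uint16_t idx[4]) {
        uint32_t lo;
        memcpy(&lo, qs, sizeof(lo));  // four 8-bit low parts of the indices
        // qs bytes land in bits 0..7 of each 16-bit lane, and the four 3-bit
        // groups of qh land in bits 8..10, yielding four 11-bit indices.
        const uint64_t packed = _pdep_u64(lo, 0x00ff00ff00ff00ffULL)
                              | _pdep_u64(qh, 0x0700070007000700ULL);
        memcpy(idx, &packed, sizeof(packed));  // idx[k] == qs[k] | (qh bits << 8)
    }

In the committed ggml_vec_dot_iq1_s_q8_K change further down, two such packed index words are built per loop iteration and their four 16-bit lanes are fed directly into the iq1s_grid lookups.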

ggml/CMakeLists.txt CHANGED
@@ -106,6 +106,7 @@ option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable
 option(GGML_AVX          "ggml: enable AVX"           ${INS_ENB})
 option(GGML_AVX_VNNI     "ggml: enable AVX-VNNI"      OFF)
 option(GGML_AVX2         "ggml: enable AVX2"          ${INS_ENB})
+option(GGML_BMI2         "ggml: enable BMI2"          ${INS_ENB})
 option(GGML_AVX512       "ggml: enable AVX512F"       OFF)
 option(GGML_AVX512_VBMI  "ggml: enable AVX512-VBMI"   OFF)
 option(GGML_AVX512_VNNI  "ggml: enable AVX512-VNNI"   OFF)
ggml/include/ggml-cpu.h CHANGED
@@ -80,6 +80,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_avx         (void);
     GGML_BACKEND_API int ggml_cpu_has_avx_vnni    (void);
     GGML_BACKEND_API int ggml_cpu_has_avx2        (void);
+    GGML_BACKEND_API int ggml_cpu_has_bmi2        (void);
     GGML_BACKEND_API int ggml_cpu_has_f16c        (void);
     GGML_BACKEND_API int ggml_cpu_has_fma         (void);
     GGML_BACKEND_API int ggml_cpu_has_avx512      (void);
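
As a hedged usage note (illustration only, not part of the commit): the new declaration can be queried at runtime like the other ggml_cpu_has_* helpers.

    #include <stdio.h>
    #include "ggml-cpu.h"

    int main(void) {
        // Per the implementation added in ggml-cpu.c below, this returns 1
        // when the backend was compiled with __BMI2__ defined, 0 otherwise.
        printf("BMI2: %d\n", ggml_cpu_has_bmi2());
        return 0;
    }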
ggml/src/CMakeLists.txt CHANGED
@@ -289,7 +289,7 @@ function(ggml_add_cpu_backend_variant tag_name)
     set(GGML_CPU_TAG_NAME ${tag_name})
     # other: OPENMP LLAMAFILE CPU_HBM
     foreach (feat NATIVE
-                  AVX AVX2 AVX_VNNI FMA F16C
+                  AVX AVX2 BMI2 AVX_VNNI FMA F16C
                   AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
                   AMX_TILE AMX_INT8 AMX_BF16)
         set(GGML_${feat} OFF)
@@ -309,13 +309,13 @@ if (GGML_CPU_ALL_VARIANTS)
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
     endif()
     ggml_add_cpu_backend_variant(sandybridge AVX)
-    ggml_add_cpu_backend_variant(haswell     AVX F16C AVX2 FMA)
-    ggml_add_cpu_backend_variant(skylakex    AVX F16C AVX2 FMA AVX512)
-    ggml_add_cpu_backend_variant(icelake     AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-    ggml_add_cpu_backend_variant(alderlake   AVX F16C AVX2 FMA AVX_VNNI)
+    ggml_add_cpu_backend_variant(haswell     AVX F16C AVX2 BMI2 FMA)
+    ggml_add_cpu_backend_variant(skylakex    AVX F16C AVX2 BMI2 FMA AVX512)
+    ggml_add_cpu_backend_variant(icelake     AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+    ggml_add_cpu_backend_variant(alderlake   AVX F16C AVX2 BMI2 FMA AVX_VNNI)
     if (NOT MSVC)
         # MSVC doesn't support AMX
-        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
     endif()
 elseif (GGML_CPU)
     ggml_add_cpu_backend_variant_impl("")
ggml/src/ggml-cpu/CMakeLists.txt CHANGED
@@ -219,6 +219,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             if (GGML_AVX_VNNI)
                 list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
             endif()
+            if (GGML_BMI2)
+                # MSVC does not define macro __BMI2__
+                list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2)
+            endif()
         else ()
             if (GGML_NATIVE)
                 list(APPEND ARCH_FLAGS -march=native)
@@ -233,6 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                 list(APPEND ARCH_FLAGS -mfma)
                 list(APPEND ARCH_DEFINITIONS GGML_FMA)
             endif()
+            if (GGML_BMI2)
+                list(APPEND ARCH_FLAGS -mbmi2)
+                list(APPEND ARCH_DEFINITIONS GGML_BMI2)
+            endif()
             if (GGML_AVX)
                 list(APPEND ARCH_FLAGS -mavx)
                 list(APPEND ARCH_DEFINITIONS GGML_AVX)
ggml/src/ggml-cpu/cpu-feats-x86.cpp CHANGED
@@ -278,6 +278,10 @@ static int ggml_backend_cpu_x86_score() {
     if (!is.SSE42()) { return 0; }
     score += 1<<2;
 #endif
+#ifdef GGML_BMI2
+    if (!is.BMI2()) { return 0; }
+    score += 1<<3;
+#endif
 #ifdef GGML_AVX
     if (!is.AVX()) { return 0; }
     score += 1<<4;
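
For reference, the is.BMI2() check presumably reads the CPUID feature flag for BMI2, which is reported in leaf 7 (sub-leaf 0), EBX bit 8. A standalone sketch of that probe, assuming GCC/Clang and <cpuid.h> (the function name here is hypothetical; ggml's own detection lives in cpu-feats-x86.cpp):

    #include <cpuid.h>
    #include <stdio.h>

    static int cpu_supports_bmi2(void) {
        unsigned int eax, ebx, ecx, edx;
        // Structured Extended Feature Flags: leaf 7, sub-leaf 0; BMI2 is EBX bit 8.
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
            return 0;
        }
        return (ebx >> 8) & 1;
    }

    int main(void) {
        printf("BMI2 supported: %d\n", cpu_supports_bmi2());
        return 0;
    }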
ggml/src/ggml-cpu/ggml-cpu-quants.c CHANGED
@@ -11362,10 +11362,19 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
     __m256i sumi = _mm256_setzero_si256();
     int sumi1 = 0;
     for (int ib = 0; ib < QK_K/32; ib += 2) {
+#ifdef __BMI2__
+        const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
+        const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
+        const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
+        const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
+        const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
+        const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
+#else
         const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
                                                 iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
         const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
                                                 iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+#endif
         qs += 8;
         const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
         const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
@@ -11709,8 +11718,9 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
 
 #elif defined __AVX2__
 
-    const __m256i mask = _mm256_set1_epi16(0x7);
+    const __m256i mask = _mm256_set1_epi16(2 * 0x7);
     const __m256i mone = _mm256_set1_epi16(1);
+    const __m256i mone8 = _mm256_set1_epi8(1);
 
     __m256 accum1 = _mm256_setzero_ps();
     __m256 accum2 = _mm256_setzero_ps();
@@ -11726,6 +11736,21 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
     __m256i sumi1 = _mm256_setzero_si256();
     __m256i sumi2 = _mm256_setzero_si256();
     for (int ib = 0; ib < QK_K/32; ib += 2) {
+#ifdef __BMI2__
+        const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL)
+                                   | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL);
+        const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL)
+                                   | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL);
+        const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
+        const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
+        const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
+        const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
+
+        // Convert signs to bytes 0x81 (negative) or 0x01 (positive)
+        const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL);
+        const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign)));
+        const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32)));
+#else
         const __m256i q1b_1 = _mm256_set_epi64x(
                 iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
                 iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
@@ -11734,11 +11759,6 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
                 iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
                 iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
         );
-        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-        const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
-        const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
 
         const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                  qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
@@ -11748,15 +11768,20 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
                                                  qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
                                                  qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                  qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+#endif
+        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
 
-        const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
-        const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
+        const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+        const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+        const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1));
+        const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2));
 
-        __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
-        __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
+        __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 2), _mm_set1_epi16(sc[ib/2] << 1));
+        __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 8), _mm_set1_epi16(sc[ib/2] >> 5));
 
-        scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
-        scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
+        scale1 = _mm256_add_epi16(_mm256_and_si256(scale1, mask), mone);
+        scale2 = _mm256_add_epi16(_mm256_and_si256(scale2, mask), mone);
         const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
         const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
         const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -15579,6 +15579,14 @@ int ggml_cpu_has_amx_int8(void) {
 #endif
 }
 
+int ggml_cpu_has_bmi2(void) {
+#if defined(__BMI2__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_fma(void) {
 #if defined(__FMA__)
     return 1;
ggml/src/ggml-cpu/ggml-cpu.cpp CHANGED
@@ -511,6 +511,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_fma()) {
            features.push_back({ "FMA", "1" });
         }
+        if (ggml_cpu_has_bmi2()) {
+            features.push_back({ "BMI2", "1" });
+        }
         if (ggml_cpu_has_avx512()) {
             features.push_back({ "AVX512", "1" });
         }