Commit 05466a9 · Rémy O committed
Parent: 723b8b4
ggml-cpu: Faster IQ1 mul_mat_vec on AVX2 using BMI2 instructions (llama/12154)
* ggml-cpu: Faster IQ1 mul_mat_vec on AVX2 using BMI2 instructions
* cmake: Add GGML_BMI2 build option
* ggml: enable BMI2 on relevant CPU variants
* ggml-cpu: include BMI2 in backend score
* ggml-cpu: register BMI2 in ggml_backend_cpu_get_features
* ggml-cpu: add __BMI2__ define when using MSVC
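
The speedup comes from BMI2's PDEP (parallel bit deposit) instruction: _pdep_u64(src, mask) scatters the low bits of src into the bit positions selected by mask, so the per-element shift-and-mask work of building iq1s_grid lookup indices collapses into two deposits per 64-bit word. A minimal standalone demo of PDEP itself, not from the commit, with made-up sample values (compile with -mbmi2):

// Minimal PDEP demo: _pdep_u64(src, mask) walks the mask's set bits from
// low to high and deposits the next low bit of src into each position.
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Scatter the 12 low bits of src into the 3-bit groups at bits 8..10
    // of each 16-bit lane, the same mask shape the commit uses for qh.
    const uint64_t src  = 0xABC;                  // 0b1010_1011_1100
    const uint64_t mask = 0x0700070007000700ULL;
    printf("%016llx\n", (unsigned long long)_pdep_u64(src, mask));
    // prints 0500020007000400: the lanes receive 0b100, 0b111, 0b010, 0b101
    return 0;
}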
- ggml/CMakeLists.txt +1 -0
- ggml/include/ggml-cpu.h +1 -0
- ggml/src/CMakeLists.txt +6 -6
- ggml/src/ggml-cpu/CMakeLists.txt +8 -0
- ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- ggml/src/ggml-cpu/ggml-cpu-quants.c +37 -12
- ggml/src/ggml-cpu/ggml-cpu.c +8 -0
- ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
ggml/CMakeLists.txt
CHANGED

@@ -106,6 +106,7 @@ option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable
 option(GGML_AVX         "ggml: enable AVX"          ${INS_ENB})
 option(GGML_AVX_VNNI    "ggml: enable AVX-VNNI"     OFF)
 option(GGML_AVX2        "ggml: enable AVX2"         ${INS_ENB})
+option(GGML_BMI2        "ggml: enable BMI2"         ${INS_ENB})
 option(GGML_AVX512      "ggml: enable AVX512F"      OFF)
 option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI"  OFF)
 option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI"  OFF)
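
GGML_BMI2 defaults to ${INS_ENB}, the same default used for GGML_AVX and GGML_AVX2, so it tracks the build's existing instruction-set defaults; it can be disabled explicitly at configure time with -DGGML_BMI2=OFF.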
ggml/include/ggml-cpu.h
CHANGED

@@ -80,6 +80,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_avx         (void);
     GGML_BACKEND_API int ggml_cpu_has_avx_vnni    (void);
     GGML_BACKEND_API int ggml_cpu_has_avx2        (void);
+    GGML_BACKEND_API int ggml_cpu_has_bmi2        (void);
     GGML_BACKEND_API int ggml_cpu_has_f16c        (void);
     GGML_BACKEND_API int ggml_cpu_has_fma         (void);
     GGML_BACKEND_API int ggml_cpu_has_avx512      (void);
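
A minimal caller-side sketch of the new accessor (hypothetical usage, not part of the commit); it reports whether the running binary was compiled with BMI2, mirroring the existing ggml_cpu_has_avx2():

// Hypothetical usage sketch: query the compiled-in feature set through the
// public ggml-cpu.h accessors touched by this commit.
#include <stdio.h>
#include "ggml-cpu.h"

int main(void) {
    printf("AVX2: %d\n", ggml_cpu_has_avx2());
    printf("BMI2: %d\n", ggml_cpu_has_bmi2());
    return 0;
}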
ggml/src/CMakeLists.txt
CHANGED

@@ -289,7 +289,7 @@ function(ggml_add_cpu_backend_variant tag_name)
     set(GGML_CPU_TAG_NAME ${tag_name})
     # other: OPENMP LLAMAFILE CPU_HBM
     foreach (feat NATIVE
-                  AVX AVX2 AVX_VNNI FMA F16C
+                  AVX AVX2 BMI2 AVX_VNNI FMA F16C
                   AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
                   AMX_TILE AMX_INT8 AMX_BF16)
         set(GGML_${feat} OFF)

@@ -309,13 +309,13 @@ if (GGML_CPU_ALL_VARIANTS)
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
     endif()
     ggml_add_cpu_backend_variant(sandybridge  AVX)
-    ggml_add_cpu_backend_variant(haswell      AVX F16C AVX2 FMA)
-    ggml_add_cpu_backend_variant(skylakex     AVX F16C AVX2 FMA AVX512)
-    ggml_add_cpu_backend_variant(icelake      AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-    ggml_add_cpu_backend_variant(alderlake    AVX F16C AVX2 FMA AVX_VNNI)
+    ggml_add_cpu_backend_variant(haswell      AVX F16C AVX2 BMI2 FMA)
+    ggml_add_cpu_backend_variant(skylakex     AVX F16C AVX2 BMI2 FMA AVX512)
+    ggml_add_cpu_backend_variant(icelake     AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+    ggml_add_cpu_backend_variant(alderlake    AVX F16C AVX2 BMI2 FMA AVX_VNNI)
     if (NOT MSVC)
         # MSVC doesn't support AMX
-        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
     endif()
 elseif (GGML_CPU)
     ggml_add_cpu_backend_variant_impl("")
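
BMI2 is enabled for every variant from haswell up, which matches the hardware: BMI2 shipped with Haswell, so Haswell, Skylake-X, Ice Lake, Alder Lake and Sapphire Rapids all implement it, while the sandybridge variant predates it and stays unchanged.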
ggml/src/ggml-cpu/CMakeLists.txt
CHANGED

@@ -219,6 +219,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         if (GGML_AVX_VNNI)
             list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
         endif()
+        if (GGML_BMI2)
+            # MSVC does not define macro __BMI2__
+            list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2)
+        endif()
     else ()
         if (GGML_NATIVE)
             list(APPEND ARCH_FLAGS -march=native)

@@ -233,6 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             list(APPEND ARCH_FLAGS -mfma)
             list(APPEND ARCH_DEFINITIONS GGML_FMA)
         endif()
+        if (GGML_BMI2)
+            list(APPEND ARCH_FLAGS -mbmi2)
+            list(APPEND ARCH_DEFINITIONS GGML_BMI2)
+        endif()
         if (GGML_AVX)
             list(APPEND ARCH_FLAGS -mavx)
             list(APPEND ARCH_DEFINITIONS GGML_AVX)
ggml/src/ggml-cpu/cpu-feats-x86.cpp
CHANGED

@@ -278,6 +278,10 @@ static int ggml_backend_cpu_x86_score() {
     if (!is.SSE42()) { return 0; }
     score += 1<<2;
 #endif
+#ifdef GGML_BMI2
+    if (!is.BMI2()) { return 0; }
+    score += 1<<3;
+#endif
 #ifdef GGML_AVX
     if (!is.AVX()) { return 0; }
     score += 1<<4;
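
The score acts as a priority bitmask: every feature a variant was built with claims a fixed bit (BMI2 now takes bit 3, between SSE4.2 at bit 2 and AVX at bit 4), and a variant scores 0 as soon as the host CPU lacks one of its features, so the most capable runnable variant wins. An illustrative sketch of that selection logic, with hypothetical structs rather than the commit's actual types:

// Hypothetical illustration of the variant-scoring scheme used by
// ggml_backend_cpu_x86_score(): each required feature either disqualifies
// the variant or adds a fixed power-of-two weight, so higher-featured
// variants outrank lower ones whenever the host CPU can run them.
#include <stdbool.h>
#include <stdio.h>

struct cpu_caps { bool sse42, bmi2, avx; };

static int score_variant(struct cpu_caps host, struct cpu_caps wants) {
    int score = 1; // nonzero base so a matching variant beats "unsupported"
    if (wants.sse42) { if (!host.sse42) return 0; score += 1 << 2; }
    if (wants.bmi2)  { if (!host.bmi2)  return 0; score += 1 << 3; }
    if (wants.avx)   { if (!host.avx)   return 0; score += 1 << 4; }
    return score;
}

int main(void) {
    struct cpu_caps host        = { .sse42 = true, .bmi2 = true,  .avx = true };
    struct cpu_caps sandybridge = { .sse42 = true, .bmi2 = false, .avx = true };
    struct cpu_caps haswell     = { .sse42 = true, .bmi2 = true,  .avx = true };
    // haswell scores higher (29 vs 21), so it is the variant that gets loaded
    printf("sandybridge: %d, haswell: %d\n",
           score_variant(host, sandybridge), score_variant(host, haswell));
    return 0;
}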
ggml/src/ggml-cpu/ggml-cpu-quants.c
CHANGED

@@ -11362,10 +11362,19 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
         __m256i sumi = _mm256_setzero_si256();
         int sumi1 = 0;
         for (int ib = 0; ib < QK_K/32; ib += 2) {
+#ifdef __BMI2__
+            const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
+            const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
+            const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
+            const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
+            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
+            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
+#else
             const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
                                                     iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
             const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
                                                     iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+#endif
             qs += 8;
             const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
             const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;

@@ -11709,8 +11718,9 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
 
 #elif defined __AVX2__
 
-    const __m256i mask = _mm256_set1_epi16(0x7);
+    const __m256i mask = _mm256_set1_epi16(2 * 0x7);
     const __m256i mone = _mm256_set1_epi16(1);
+    const __m256i mone8 = _mm256_set1_epi8(1);
 
     __m256 accum1 = _mm256_setzero_ps();
     __m256 accum2 = _mm256_setzero_ps();

@@ -11726,6 +11736,21 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
         __m256i sumi1 = _mm256_setzero_si256();
         __m256i sumi2 = _mm256_setzero_si256();
         for (int ib = 0; ib < QK_K/32; ib += 2) {
+#ifdef __BMI2__
+            const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL)
+                                       | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL);
+            const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL)
+                                       | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL);
+            const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
+            const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
+            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
+            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
+
+            // Convert signs to bytes 0x81 (negative) or 0x01 (positive)
+            const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL);
+            const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign)));
+            const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32)));
+#else
             const __m256i q1b_1 = _mm256_set_epi64x(
                     iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
                     iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]

@@ -11734,11 +11759,6 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
                     iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
                     iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
             );
-            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
-            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
 
             const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                      qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,

@@ -11748,15 +11768,20 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
                                                      qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
                                                      qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                      qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+#endif
+            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
 
-            const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
-            const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
+            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+            const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1));
+            const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2));
 
-            __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
-            __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
+            __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 2), _mm_set1_epi16(sc[ib/2] << 1));
+            __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 8), _mm_set1_epi16(sc[ib/2] >> 5));
 
-            scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
-            scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
+            scale1 = _mm256_add_epi16(_mm256_and_si256(scale1, mask), mone);
+            scale2 = _mm256_add_epi16(_mm256_and_si256(scale2, mask), mone);
             const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
             const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
             const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
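
For readers checking the BMI2 path against the #else branch, the following self-contained sketch (not from the repo; arbitrary sample values) verifies that the PDEP-packed iq1_s indices equal the scalar shift-and-mask indices:

// Equivalence check for the iq1_s index packing: one PDEP spreads qs[0..3]
// into the low byte of each 16-bit lane, another deposits the 3-bit qh
// groups into bits 8..10, matching the scalar qs[i] | ((qh << k) & 0x700).
// Compile with: cc -O2 -mbmi2 check.c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    const uint8_t  qs[4] = { 0x12, 0x34, 0x56, 0x78 };
    const uint16_t qh    = 0x0ABC; // four 3-bit high-index groups

    uint32_t qs32;
    memcpy(&qs32, qs, sizeof qs32);

    const uint64_t packed = _pdep_u64(qs32, 0x00ff00ff00ff00ffULL)
                          | _pdep_u64(qh,   0x0700070007000700ULL);
    uint16_t idx[4];
    memcpy(idx, &packed, sizeof idx);

    const uint16_t ref[4] = {
        (uint16_t)(qs[0] | ((qh << 8) & 0x700)),
        (uint16_t)(qs[1] | ((qh << 5) & 0x700)),
        (uint16_t)(qs[2] | ((qh << 2) & 0x700)),
        (uint16_t)(qs[3] | ((qh >> 1) & 0x700)),
    };
    for (int i = 0; i < 4; ++i) {
        printf("lane %d: pdep=0x%03x scalar=0x%03x %s\n",
               i, idx[i], ref[i], idx[i] == ref[i] ? "ok" : "MISMATCH");
    }
    return 0;
}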
ggml/src/ggml-cpu/ggml-cpu.c
CHANGED

@@ -15579,6 +15579,14 @@ int ggml_cpu_has_amx_int8(void) {
 #endif
 }
 
+int ggml_cpu_has_bmi2(void) {
+#if defined(__BMI2__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_fma(void) {
 #if defined(__FMA__)
     return 1;
ggml/src/ggml-cpu/ggml-cpu.cpp
CHANGED

@@ -511,6 +511,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
     if (ggml_cpu_has_fma()) {
         features.push_back({ "FMA", "1" });
     }
+    if (ggml_cpu_has_bmi2()) {
+        features.push_back({ "BMI2", "1" });
+    }
     if (ggml_cpu_has_avx512()) {
         features.push_back({ "AVX512", "1" });
     }