Spaces:
Running
Running
Hong Bo PENG
committed on
Commit
·
e3d09d2
1
Parent(s):
8c3ae74
ggml : fix and optimize ppc64le (ggml/849)
Browse files
* fix compile issues introduced by loongarch_asx
* restore quant changes to merge
* fix compile issues introduced by loongarch_asx
* further optimize by using vec_msum & vec_sum4s on ppc64le
- ggml-quants.c +281 -362
ggml-quants.c
CHANGED
|
@@ -1076,6 +1076,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
|
|
| 1076 |
}
|
| 1077 |
vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
|
| 1078 |
vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
|
|
|
|
| 1079 |
|
| 1080 |
#elif defined(__loongarch_asx)
|
| 1081 |
for (int i = 0; i < nb; i++) {
|
|
@@ -1435,6 +1436,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
|
|
| 1435 |
accv = vec_add(accv, vec_sld(accv, accv, 4));
|
| 1436 |
accv = vec_add(accv, vec_sld(accv, accv, 8));
|
| 1437 |
y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
|
|
|
|
| 1438 |
|
| 1439 |
#elif defined(__loongarch_asx)
|
| 1440 |
for (int i = 0; i < nb; i++) {
|
|
@@ -4111,12 +4113,13 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
| 4111 |
|
| 4112 |
#elif defined(__POWER9_VECTOR__)
|
| 4113 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
|
|
|
| 4114 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
| 4115 |
const vector signed char v8 = vec_splats((signed char)0x8);
|
| 4116 |
|
| 4117 |
vector float vsumf0 = vec_splats(0.0f);
|
| 4118 |
|
| 4119 |
-
#pragma GCC unroll
|
| 4120 |
for (int i = 0; i < nb; i++) {
|
| 4121 |
__builtin_prefetch(x[i].qs, 0, 1);
|
| 4122 |
__builtin_prefetch(y[i].qs, 0, 1);
|
|
@@ -4138,9 +4141,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
| 4138 |
vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
|
| 4139 |
vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
|
| 4140 |
|
| 4141 |
-
|
| 4142 |
|
| 4143 |
-
|
|
|
|
| 4144 |
|
| 4145 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 4146 |
}
|
|
@@ -4514,6 +4518,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4514 |
|
| 4515 |
#elif defined(__POWER9_VECTOR__)
|
| 4516 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
|
|
|
| 4517 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
| 4518 |
|
| 4519 |
vector float vsumf0 = vec_splats(0.0f);
|
|
@@ -4535,15 +4540,13 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4535 |
vector signed char q8y0 = vec_xl( 0, y[i].qs);
|
| 4536 |
vector signed char q8y1 = vec_xl(16, y[i].qs);
|
| 4537 |
|
| 4538 |
-
vector
|
| 4539 |
-
vector
|
| 4540 |
-
|
| 4541 |
-
vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
|
| 4542 |
-
vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
|
| 4543 |
|
| 4544 |
-
|
| 4545 |
|
| 4546 |
-
|
|
|
|
| 4547 |
|
| 4548 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 4549 |
}
|
|
@@ -5245,6 +5248,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 5245 |
|
| 5246 |
#elif defined(__POWER9_VECTOR__)
|
| 5247 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
|
|
|
| 5248 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
| 5249 |
|
| 5250 |
vector float vsumf0 = vec_splats(0.0f);
|
|
@@ -5270,18 +5274,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 5270 |
|
| 5271 |
vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
|
| 5272 |
|
| 5273 |
-
vector
|
| 5274 |
-
vector
|
| 5275 |
|
| 5276 |
vector signed char q8y0 = vec_xl( 0, y[i].qs);
|
| 5277 |
vector signed char q8y1 = vec_xl( 16, y[i].qs);
|
| 5278 |
|
| 5279 |
-
vector signed
|
| 5280 |
-
vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
|
| 5281 |
-
|
| 5282 |
-
qv0 = vec_add(qv0, qv1);
|
| 5283 |
|
| 5284 |
-
|
|
|
|
| 5285 |
|
| 5286 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 5287 |
}
|
|
@@ -5521,9 +5523,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
| 5521 |
*s = sumf;
|
| 5522 |
|
| 5523 |
#elif defined(__POWER9_VECTOR__)
|
|
|
|
| 5524 |
vector float vsumf0 = vec_splats(0.0f);
|
| 5525 |
|
| 5526 |
-
#pragma GCC unroll
|
| 5527 |
for (int i = 0; i < nb; i++) {
|
| 5528 |
__builtin_prefetch(x[i].qs, 0, 1);
|
| 5529 |
__builtin_prefetch(y[i].qs, 0, 1);
|
|
@@ -5542,13 +5545,13 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
| 5542 |
vector signed short qv2 = vec_mule(q8x1, q8y1);
|
| 5543 |
vector signed short qv3 = vec_mulo(q8x1, q8y1);
|
| 5544 |
|
| 5545 |
-
vector signed int vsumi0 =
|
| 5546 |
-
vector signed int vsumi1 =
|
| 5547 |
-
vector signed int vsumi2 = vec_add(vec_unpackh(qv2), vec_unpackh(qv3));
|
| 5548 |
-
vector signed int vsumi3 = vec_add(vec_unpackl(qv2), vec_unpackl(qv3));
|
| 5549 |
|
| 5550 |
-
vsumi0 =
|
| 5551 |
-
vsumi1 =
|
|
|
|
|
|
|
| 5552 |
|
| 5553 |
vsumi0 = vec_add(vsumi0, vsumi1);
|
| 5554 |
|
|
@@ -5936,6 +5939,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 5936 |
#elif defined(__POWER9_VECTOR__)
|
| 5937 |
const vector signed char lowMask = vec_splats((signed char)0x3);
|
| 5938 |
const vector signed char lowScaleMask = vec_splats((signed char)0xF);
|
|
|
|
| 5939 |
const vector unsigned char v2 = vec_splats((unsigned char)0x2);
|
| 5940 |
const vector unsigned char v6 = vec_splats((unsigned char)0x6);
|
| 5941 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
|
@@ -5973,15 +5977,17 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 5973 |
vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
|
| 5974 |
vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
|
| 5975 |
|
| 5976 |
-
vector signed int vsumi0 =
|
| 5977 |
-
vector signed int vsumi1 =
|
| 5978 |
-
vector signed int vsumi2 =
|
| 5979 |
-
vector signed int vsumi3 =
|
| 5980 |
-
vector signed int vsumi4 =
|
| 5981 |
-
vector signed int vsumi5 =
|
| 5982 |
-
vector signed int vsumi6 =
|
| 5983 |
-
vector signed int vsumi7 =
|
| 5984 |
|
|
|
|
|
|
|
| 5985 |
|
| 5986 |
for (int j = 0; j < QK_K/128; ++j) {
|
| 5987 |
__builtin_prefetch(q2, 0, 1);
|
|
@@ -5991,14 +5997,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 5991 |
vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
|
| 5992 |
q2 += 32;
|
| 5993 |
|
| 5994 |
-
vector
|
| 5995 |
-
vector
|
| 5996 |
-
vector
|
| 5997 |
-
vector
|
| 5998 |
-
vector
|
| 5999 |
-
vector
|
| 6000 |
-
vector
|
| 6001 |
-
vector
|
| 6002 |
|
| 6003 |
vector signed char q8y00 = vec_xl( 0, q8);
|
| 6004 |
vector signed char q8y10 = vec_xl( 16, q8);
|
|
@@ -6010,45 +6016,36 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 6010 |
vector signed char q8y13 = vec_xl(112, q8);
|
| 6011 |
q8 += 128;
|
| 6012 |
|
| 6013 |
-
vector signed
|
| 6014 |
-
vector signed
|
| 6015 |
-
vector signed
|
| 6016 |
-
vector signed
|
| 6017 |
-
vector signed
|
| 6018 |
-
vector signed
|
| 6019 |
-
vector signed
|
| 6020 |
-
vector signed
|
| 6021 |
-
|
| 6022 |
-
vector signed short
|
| 6023 |
-
vector signed
|
| 6024 |
-
vector signed
|
| 6025 |
-
vector signed
|
| 6026 |
-
vector signed
|
| 6027 |
-
vector signed
|
| 6028 |
-
vector signed
|
| 6029 |
-
vector signed
|
| 6030 |
-
vector signed
|
|
|
|
|
|
|
| 6031 |
vscales = vec_sld(vscales, vscales, 8);
|
| 6032 |
|
| 6033 |
-
|
| 6034 |
-
|
| 6035 |
-
|
| 6036 |
-
|
| 6037 |
-
|
| 6038 |
-
|
| 6039 |
-
|
| 6040 |
-
|
| 6041 |
-
qv3 = vec_madd(qv7, vs7, qv3);
|
| 6042 |
-
|
| 6043 |
-
vsumi0 = vec_add(vec_unpackh(qv0), vsumi0);
|
| 6044 |
-
vsumi1 = vec_add(vec_unpackh(qv1), vsumi1);
|
| 6045 |
-
vsumi2 = vec_add(vec_unpackh(qv2), vsumi2);
|
| 6046 |
-
vsumi3 = vec_add(vec_unpackh(qv3), vsumi3);
|
| 6047 |
-
|
| 6048 |
-
vsumi4 = vec_add(vec_unpackl(qv0), vsumi4);
|
| 6049 |
-
vsumi5 = vec_add(vec_unpackl(qv1), vsumi5);
|
| 6050 |
-
vsumi6 = vec_add(vec_unpackl(qv2), vsumi6);
|
| 6051 |
-
vsumi7 = vec_add(vec_unpackl(qv3), vsumi7);
|
| 6052 |
}
|
| 6053 |
|
| 6054 |
vsumi0 = vec_add(vsumi0, vsumi4);
|
|
@@ -6639,6 +6636,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 6639 |
|
| 6640 |
#elif defined(__POWER9_VECTOR__)
|
| 6641 |
const vector signed char lowMask = vec_splats((signed char)0x3);
|
|
|
|
|
|
|
|
|
|
| 6642 |
const vector signed char v1 = vec_splats((signed char)0x1);
|
| 6643 |
const vector unsigned char v2 = vec_splats((unsigned char)0x2);
|
| 6644 |
const vector unsigned char v3 = vec_splats((unsigned char)0x3);
|
|
@@ -6656,30 +6656,33 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 6656 |
vector float vyd = vec_splats(y[i].d);
|
| 6657 |
vector float vd = vec_mul(vxd, vyd);
|
| 6658 |
|
| 6659 |
-
|
| 6660 |
-
|
| 6661 |
|
| 6662 |
-
|
| 6663 |
-
|
| 6664 |
-
|
| 6665 |
-
|
| 6666 |
-
|
|
|
|
| 6667 |
|
| 6668 |
-
|
|
|
|
|
|
|
|
|
|
| 6669 |
vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
|
| 6670 |
vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);
|
| 6671 |
|
| 6672 |
vscales = vec_sub(vscales, off);
|
| 6673 |
|
| 6674 |
-
vector signed int vsumi0 =
|
| 6675 |
-
vector signed int vsumi1 =
|
| 6676 |
-
vector signed int vsumi2 =
|
| 6677 |
-
vector signed int vsumi3 =
|
| 6678 |
-
vector signed int vsumi4 =
|
| 6679 |
-
vector signed int vsumi5 =
|
| 6680 |
-
vector signed int vsumi6 =
|
| 6681 |
-
vector signed int vsumi7 =
|
| 6682 |
-
|
| 6683 |
|
| 6684 |
const uint8_t * restrict q3 = x[i].qs;
|
| 6685 |
const int8_t * restrict q8 = y[i].qs;
|
|
@@ -6753,23 +6756,14 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 6753 |
vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
|
| 6754 |
vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));
|
| 6755 |
|
| 6756 |
-
|
| 6757 |
-
|
| 6758 |
-
|
| 6759 |
-
|
| 6760 |
-
|
| 6761 |
-
|
| 6762 |
-
|
| 6763 |
-
|
| 6764 |
-
|
| 6765 |
-
vsumi0 = vec_add(vsum0, vsumi0);
|
| 6766 |
-
vsumi1 = vec_add(vsum1, vsumi1);
|
| 6767 |
-
vsumi2 = vec_add(vsum2, vsumi2);
|
| 6768 |
-
vsumi3 = vec_add(vsum3, vsumi3);
|
| 6769 |
-
vsumi4 = vec_add(vsum4, vsumi4);
|
| 6770 |
-
vsumi5 = vec_add(vsum5, vsumi5);
|
| 6771 |
-
vsumi6 = vec_add(vsum6, vsumi6);
|
| 6772 |
-
vsumi7 = vec_add(vsum7, vsumi7);
|
| 6773 |
}
|
| 6774 |
|
| 6775 |
vsumi0 = vec_add(vsumi0, vsumi4);
|
|
@@ -7268,6 +7262,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 7268 |
|
| 7269 |
#elif defined(__POWER9_VECTOR__)
|
| 7270 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7271 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
| 7272 |
|
| 7273 |
vector float vsumf0 = vec_splats(0.0f);
|
|
@@ -7286,15 +7284,24 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 7286 |
vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
|
| 7287 |
vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
|
| 7288 |
|
| 7289 |
-
|
|
|
|
|
|
|
|
|
|
| 7290 |
|
| 7291 |
-
|
| 7292 |
-
|
| 7293 |
-
|
| 7294 |
-
|
| 7295 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7296 |
|
| 7297 |
-
vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
|
| 7298 |
vector signed short vscales = vec_unpackh(utmps);
|
| 7299 |
vector signed short q4xmins = vec_unpackl(utmps);
|
| 7300 |
vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
|
|
@@ -7310,14 +7317,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 7310 |
vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
|
| 7311 |
vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
|
| 7312 |
|
| 7313 |
-
vector signed int vsumi0 =
|
| 7314 |
-
vector signed int vsumi1 =
|
| 7315 |
-
vector signed int vsumi2 =
|
| 7316 |
-
vector signed int vsumi3 =
|
| 7317 |
-
vector signed int vsumi4 = vec_splats((int32_t)0);
|
| 7318 |
-
vector signed int vsumi5 = vec_splats((int32_t)0);
|
| 7319 |
-
vector signed int vsumi6 = vec_splats((int32_t)0);
|
| 7320 |
-
vector signed int vsumi7 = vec_splats((int32_t)0);
|
| 7321 |
|
| 7322 |
const uint8_t * restrict q4 = x[i].qs;
|
| 7323 |
const int8_t * restrict q8 = y[i].qs;
|
|
@@ -7332,14 +7335,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 7332 |
vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
|
| 7333 |
q4 += 64;
|
| 7334 |
|
| 7335 |
-
vector
|
| 7336 |
-
vector
|
| 7337 |
-
vector
|
| 7338 |
-
vector
|
| 7339 |
-
vector
|
| 7340 |
-
vector
|
| 7341 |
-
vector
|
| 7342 |
-
vector
|
| 7343 |
|
| 7344 |
vector signed char q8y00 = vec_xl( 0, q8);
|
| 7345 |
vector signed char q8y10 = vec_xl( 16, q8);
|
|
@@ -7351,41 +7354,33 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 7351 |
vector signed char q8y31 = vec_xl(112, q8);
|
| 7352 |
q8 += 128;
|
| 7353 |
|
| 7354 |
-
vector signed
|
| 7355 |
-
vector signed
|
| 7356 |
-
vector signed
|
| 7357 |
-
vector signed
|
| 7358 |
-
vector signed
|
| 7359 |
-
vector signed
|
| 7360 |
-
vector signed
|
| 7361 |
-
vector signed
|
| 7362 |
-
|
| 7363 |
-
vector signed
|
| 7364 |
-
vector signed
|
| 7365 |
-
vector signed
|
| 7366 |
-
vector signed
|
|
|
|
| 7367 |
vscales = vec_sld(vscales, vscales, 8);
|
| 7368 |
|
| 7369 |
-
|
| 7370 |
-
|
| 7371 |
-
|
| 7372 |
-
|
| 7373 |
|
| 7374 |
-
vsumi0 = vec_add(
|
| 7375 |
-
vsumi1 = vec_add(
|
| 7376 |
-
vsumi2 = vec_add(
|
| 7377 |
-
vsumi3 = vec_add(
|
| 7378 |
-
vsumi4 = vec_add(vec_mule(qv20, vs2), vsumi4);
|
| 7379 |
-
vsumi5 = vec_add(vec_mulo(qv20, vs2), vsumi5);
|
| 7380 |
-
vsumi6 = vec_add(vec_mule(qv30, vs3), vsumi6);
|
| 7381 |
-
vsumi7 = vec_add(vec_mulo(qv30, vs3), vsumi7);
|
| 7382 |
}
|
| 7383 |
|
| 7384 |
-
vsumi0 = vec_add(vsumi0, vsumi4);
|
| 7385 |
-
vsumi1 = vec_add(vsumi1, vsumi5);
|
| 7386 |
-
vsumi2 = vec_add(vsumi2, vsumi6);
|
| 7387 |
-
vsumi3 = vec_add(vsumi3, vsumi7);
|
| 7388 |
-
|
| 7389 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 7390 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 7391 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
@@ -7887,6 +7882,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 7887 |
|
| 7888 |
#elif defined(__POWER9_VECTOR__)
|
| 7889 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
|
|
|
|
|
|
|
|
|
| 7890 |
const vector unsigned char v1 = vec_splats((unsigned char)0x1);
|
| 7891 |
const vector unsigned char v2 = vec_splats((unsigned char)0x2);
|
| 7892 |
const vector unsigned char v3 = vec_splats((unsigned char)0x3);
|
|
@@ -7905,18 +7903,27 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 7905 |
vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
|
| 7906 |
vector float vdmin = vec_mul(vxmin, vyd);
|
| 7907 |
|
| 7908 |
-
|
|
|
|
|
|
|
|
|
|
| 7909 |
|
| 7910 |
-
|
| 7911 |
-
|
| 7912 |
-
|
| 7913 |
-
|
| 7914 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7915 |
|
| 7916 |
vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
|
| 7917 |
vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
|
| 7918 |
|
| 7919 |
-
vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
|
| 7920 |
vector signed short vscales = vec_unpackh(utmps);
|
| 7921 |
|
| 7922 |
vector signed short q5xmins = vec_unpackl(utmps);
|
|
@@ -7936,10 +7943,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 7936 |
vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
|
| 7937 |
vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
|
| 7938 |
|
| 7939 |
-
vector signed int vsumi0 =
|
| 7940 |
-
vector signed int vsumi1 =
|
| 7941 |
-
vector signed int vsumi2 =
|
| 7942 |
-
vector signed int vsumi3 =
|
| 7943 |
|
| 7944 |
const uint8_t * restrict q5 = x[i].qs;
|
| 7945 |
const int8_t * restrict q8 = y[i].qs;
|
|
@@ -7964,10 +7971,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 7964 |
qxhs0 = vec_sr(qxhs0, v2);
|
| 7965 |
qxhs1 = vec_sr(qxhs1, v2);
|
| 7966 |
|
| 7967 |
-
vector
|
| 7968 |
-
vector
|
| 7969 |
-
vector
|
| 7970 |
-
vector
|
| 7971 |
|
| 7972 |
vector signed char q8y00 = vec_xl( 0, q8);
|
| 7973 |
vector signed char q8y10 = vec_xl(16, q8);
|
|
@@ -7975,22 +7982,20 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 7975 |
vector signed char q8y11 = vec_xl(48, q8);
|
| 7976 |
q8 += 64;
|
| 7977 |
|
| 7978 |
-
vector signed
|
| 7979 |
-
vector signed
|
| 7980 |
-
vector signed
|
| 7981 |
-
vector signed
|
| 7982 |
|
| 7983 |
-
vector signed
|
| 7984 |
-
vector signed
|
|
|
|
| 7985 |
vscales = vec_sld(vscales, vscales, 12);
|
| 7986 |
|
| 7987 |
-
|
| 7988 |
-
|
| 7989 |
-
|
| 7990 |
-
|
| 7991 |
-
vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
|
| 7992 |
-
vsumi2 = vec_add(vec_mule(qv01, vs1), vsumi2);
|
| 7993 |
-
vsumi3 = vec_add(vec_mulo(qv01, vs1), vsumi3);
|
| 7994 |
}
|
| 7995 |
|
| 7996 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
|
@@ -8551,6 +8556,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 8551 |
|
| 8552 |
#elif defined(__POWER9_VECTOR__)
|
| 8553 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
|
|
|
| 8554 |
const vector unsigned char v2 = vec_splats((unsigned char)0x2);
|
| 8555 |
const vector unsigned char v3 = vec_splats((unsigned char)0x3);
|
| 8556 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
|
@@ -8567,14 +8573,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 8567 |
vector float vyd = vec_splats(y[i].d);
|
| 8568 |
vector float vd = vec_mul(vxd, vyd);
|
| 8569 |
|
| 8570 |
-
vector signed int vsumi0 =
|
| 8571 |
-
vector signed int vsumi1 =
|
| 8572 |
-
vector signed int vsumi2 =
|
| 8573 |
-
vector signed int vsumi3 =
|
| 8574 |
-
vector signed int vsumi4 =
|
| 8575 |
-
vector signed int vsumi5 =
|
| 8576 |
-
vector signed int vsumi6 =
|
| 8577 |
-
vector signed int vsumi7 =
|
| 8578 |
|
| 8579 |
const uint8_t * restrict q6 = x[i].ql;
|
| 8580 |
const uint8_t * restrict qh = x[i].qh;
|
|
@@ -8654,23 +8660,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 8654 |
vector signed short vs6 = vec_splat(vscales, 6);
|
| 8655 |
vector signed short vs7 = vec_splat(vscales, 7);
|
| 8656 |
|
| 8657 |
-
vsumi0 =
|
| 8658 |
-
vsumi1 =
|
| 8659 |
-
vsumi2 =
|
| 8660 |
-
vsumi3 =
|
| 8661 |
-
vsumi4 =
|
| 8662 |
-
vsumi5 =
|
| 8663 |
-
vsumi6 =
|
| 8664 |
-
vsumi7 =
|
| 8665 |
-
|
| 8666 |
-
vsumi0 = vec_add(vec_mule(qv20, vs2), vsumi0);
|
| 8667 |
-
vsumi1 = vec_add(vec_mulo(qv20, vs2), vsumi1);
|
| 8668 |
-
vsumi2 = vec_add(vec_mule(qv21, vs6), vsumi2);
|
| 8669 |
-
vsumi3 = vec_add(vec_mulo(qv21, vs6), vsumi3);
|
| 8670 |
-
vsumi4 = vec_add(vec_mule(qv30, vs3), vsumi4);
|
| 8671 |
-
vsumi5 = vec_add(vec_mulo(qv30, vs3), vsumi5);
|
| 8672 |
-
vsumi6 = vec_add(vec_mule(qv31, vs7), vsumi6);
|
| 8673 |
-
vsumi7 = vec_add(vec_mulo(qv31, vs7), vsumi7);
|
| 8674 |
}
|
| 8675 |
|
| 8676 |
vsumi0 = vec_add(vsumi0, vsumi4);
|
|
@@ -8951,6 +8948,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
| 8951 |
*s = 0.125f * hsum_float_8(accumf);
|
| 8952 |
|
| 8953 |
#elif defined(__POWER9_VECTOR__)
|
|
|
|
| 8954 |
vector float vsumf0 = vec_splats(0.0f);
|
| 8955 |
vector float vsumf1 = vec_splats(0.0f);
|
| 8956 |
vector float vsumf2 = vec_splats(0.0f);
|
|
@@ -8963,14 +8961,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
| 8963 |
vector float vyd = vec_splats(y[i].d);
|
| 8964 |
vector float vd = vec_mul(vxd, vyd);
|
| 8965 |
|
| 8966 |
-
vector signed int vsumi0 =
|
| 8967 |
-
vector signed int vsumi1 =
|
| 8968 |
-
vector signed int vsumi2 =
|
| 8969 |
-
vector signed int vsumi3 =
|
| 8970 |
-
vector signed int vsumi4 = vec_splats((int32_t)0);
|
| 8971 |
-
vector signed int vsumi5 = vec_splats((int32_t)0);
|
| 8972 |
-
vector signed int vsumi6 = vec_splats((int32_t)0);
|
| 8973 |
-
vector signed int vsumi7 = vec_splats((int32_t)0);
|
| 8974 |
|
| 8975 |
const uint16_t * restrict q2 = x[i].qs;
|
| 8976 |
const int8_t * restrict q8 = y[i].qs;
|
|
@@ -9017,21 +9011,12 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
| 9017 |
vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
|
| 9018 |
vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));
|
| 9019 |
|
| 9020 |
-
vsumi0 =
|
| 9021 |
-
vsumi1 =
|
| 9022 |
-
vsumi2 =
|
| 9023 |
-
vsumi3 =
|
| 9024 |
-
vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
|
| 9025 |
-
vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
|
| 9026 |
-
vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
|
| 9027 |
-
vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
|
| 9028 |
}
|
| 9029 |
|
| 9030 |
-
vsumi0 = vec_add(vsumi0, vsumi4);
|
| 9031 |
-
vsumi1 = vec_add(vsumi1, vsumi5);
|
| 9032 |
-
vsumi2 = vec_add(vsumi2, vsumi6);
|
| 9033 |
-
vsumi3 = vec_add(vsumi3, vsumi7);
|
| 9034 |
-
|
| 9035 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 9036 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 9037 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
@@ -9423,6 +9408,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
| 9423 |
|
| 9424 |
*s = 0.125f * hsum_float_8(accumf);
|
| 9425 |
#elif defined(__POWER9_VECTOR__)
|
|
|
|
| 9426 |
vector float vsumf0 = vec_splats(0.0f);
|
| 9427 |
vector float vsumf1 = vec_splats(0.0f);
|
| 9428 |
vector float vsumf2 = vec_splats(0.0f);
|
|
@@ -9435,14 +9421,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
| 9435 |
vector float vyd = vec_splats(y[i].d);
|
| 9436 |
vector float vd = vec_mul(vxd, vyd);
|
| 9437 |
|
| 9438 |
-
vector signed int vsumi0 =
|
| 9439 |
-
vector signed int vsumi1 =
|
| 9440 |
-
vector signed int vsumi2 =
|
| 9441 |
-
vector signed int vsumi3 =
|
| 9442 |
-
vector signed int vsumi4 = vec_splats((int32_t)0);
|
| 9443 |
-
vector signed int vsumi5 = vec_splats((int32_t)0);
|
| 9444 |
-
vector signed int vsumi6 = vec_splats((int32_t)0);
|
| 9445 |
-
vector signed int vsumi7 = vec_splats((int32_t)0);
|
| 9446 |
|
| 9447 |
const uint16_t * restrict q2 = x[i].qs;
|
| 9448 |
const uint8_t * restrict sc = x[i].scales;
|
|
@@ -9490,21 +9472,12 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
| 9490 |
vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
|
| 9491 |
vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
|
| 9492 |
|
| 9493 |
-
vsumi0 =
|
| 9494 |
-
vsumi1 =
|
| 9495 |
-
vsumi2 =
|
| 9496 |
-
vsumi3 =
|
| 9497 |
-
vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
|
| 9498 |
-
vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
|
| 9499 |
-
vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
|
| 9500 |
-
vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
|
| 9501 |
}
|
| 9502 |
|
| 9503 |
-
vsumi0 = vec_add(vsumi0, vsumi4);
|
| 9504 |
-
vsumi1 = vec_add(vsumi1, vsumi5);
|
| 9505 |
-
vsumi2 = vec_add(vsumi2, vsumi6);
|
| 9506 |
-
vsumi3 = vec_add(vsumi3, vsumi7);
|
| 9507 |
-
|
| 9508 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 9509 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 9510 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
@@ -9727,6 +9700,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
| 9727 |
|
| 9728 |
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
| 9729 |
|
|
|
|
|
|
|
| 9730 |
vector float vsumf0 = vec_splats(0.0f);
|
| 9731 |
vector float vsumf1 = vec_splats(0.0f);
|
| 9732 |
vector float vsumf2 = vec_splats(0.0f);
|
|
@@ -9741,14 +9716,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
| 9741 |
vector float vyd = vec_splats(y[i].d);
|
| 9742 |
vector float vd = vec_mul(vxd, vyd);
|
| 9743 |
|
| 9744 |
-
vector signed int vsumi0 =
|
| 9745 |
-
vector signed int vsumi1 =
|
| 9746 |
-
vector signed int vsumi2 =
|
| 9747 |
-
vector signed int vsumi3 =
|
| 9748 |
-
vector signed int vsumi4 = vec_splats((int32_t)0);
|
| 9749 |
-
vector signed int vsumi5 = vec_splats((int32_t)0);
|
| 9750 |
-
vector signed int vsumi6 = vec_splats((int32_t)0);
|
| 9751 |
-
vector signed int vsumi7 = vec_splats((int32_t)0);
|
| 9752 |
|
| 9753 |
const uint8_t * restrict q2 = x[i].qs;
|
| 9754 |
const uint8_t * restrict qh = x[i].qh;
|
|
@@ -9808,21 +9779,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
| 9808 |
vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
|
| 9809 |
vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
|
| 9810 |
|
| 9811 |
-
vsumi0 =
|
| 9812 |
-
vsumi1 =
|
| 9813 |
-
vsumi2 =
|
| 9814 |
-
vsumi3 =
|
| 9815 |
-
vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
|
| 9816 |
-
vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
|
| 9817 |
-
vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
|
| 9818 |
-
vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
|
| 9819 |
}
|
| 9820 |
|
| 9821 |
-
vsumi0 = vec_add(vsumi0, vsumi4);
|
| 9822 |
-
vsumi1 = vec_add(vsumi1, vsumi5);
|
| 9823 |
-
vsumi2 = vec_add(vsumi2, vsumi6);
|
| 9824 |
-
vsumi3 = vec_add(vsumi3, vsumi7);
|
| 9825 |
-
|
| 9826 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 9827 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 9828 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
@@ -10060,6 +10022,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
| 10060 |
#elif defined(__POWER9_VECTOR__)
|
| 10061 |
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
| 10062 |
|
|
|
|
|
|
|
| 10063 |
vector float vsumf0 = vec_splats(0.0f);
|
| 10064 |
vector float vsumf1 = vec_splats(0.0f);
|
| 10065 |
vector float vsumf2 = vec_splats(0.0f);
|
|
@@ -10070,14 +10034,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
| 10070 |
vector float vyd = vec_splats(y[i].d);
|
| 10071 |
vector float vd = vec_mul(vxd, vyd);
|
| 10072 |
|
| 10073 |
-
vector signed int vsumi0 =
|
| 10074 |
-
vector signed int vsumi1 =
|
| 10075 |
-
vector signed int vsumi2 =
|
| 10076 |
-
vector signed int vsumi3 =
|
| 10077 |
-
vector signed int vsumi4 = vec_splats((int32_t)0);
|
| 10078 |
-
vector signed int vsumi5 = vec_splats((int32_t)0);
|
| 10079 |
-
vector signed int vsumi6 = vec_splats((int32_t)0);
|
| 10080 |
-
vector signed int vsumi7 = vec_splats((int32_t)0);
|
| 10081 |
|
| 10082 |
const uint8_t * restrict q3 = x[i].qs;
|
| 10083 |
const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
|
|
@@ -10122,21 +10082,12 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
| 10122 |
vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
|
| 10123 |
vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
|
| 10124 |
|
| 10125 |
-
vsumi0 =
|
| 10126 |
-
vsumi1 =
|
| 10127 |
-
vsumi2 =
|
| 10128 |
-
vsumi3 =
|
| 10129 |
-
vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
|
| 10130 |
-
vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
|
| 10131 |
-
vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
|
| 10132 |
-
vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
|
| 10133 |
}
|
| 10134 |
|
| 10135 |
-
vsumi0 = vec_add(vsumi0, vsumi4);
|
| 10136 |
-
vsumi1 = vec_add(vsumi1, vsumi5);
|
| 10137 |
-
vsumi2 = vec_add(vsumi2, vsumi6);
|
| 10138 |
-
vsumi3 = vec_add(vsumi3, vsumi7);
|
| 10139 |
-
|
| 10140 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 10141 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 10142 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
@@ -10426,6 +10377,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
|
| 10426 |
|
| 10427 |
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
| 10428 |
|
|
|
|
|
|
|
| 10429 |
vector float vsumf0 = vec_splats(0.0f);
|
| 10430 |
vector float vsumf1 = vec_splats(0.0f);
|
| 10431 |
vector float vsumf2 = vec_splats(0.0f);
|
|
@@ -10446,14 +10399,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
|
| 10446 |
const uint8_t * restrict sc = x[i].scales;
|
| 10447 |
const int8_t * restrict q8 = y[i].qs;
|
| 10448 |
|
| 10449 |
-
vector signed int vsumi0 =
|
| 10450 |
-
vector signed int vsumi1 =
|
| 10451 |
-
vector signed int vsumi2 =
|
| 10452 |
-
vector signed int vsumi3 =
|
| 10453 |
-
vector signed int vsumi4 = vec_splats((int32_t)0);
|
| 10454 |
-
vector signed int vsumi5 = vec_splats((int32_t)0);
|
| 10455 |
-
vector signed int vsumi6 = vec_splats((int32_t)0);
|
| 10456 |
-
vector signed int vsumi7 = vec_splats((int32_t)0);
|
| 10457 |
|
| 10458 |
for (int j = 0; j < QK_K/32; j += 2) {
|
| 10459 |
__builtin_prefetch(q3, 0, 1);
|
|
@@ -10507,21 +10456,12 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
|
| 10507 |
vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
|
| 10508 |
vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
|
| 10509 |
|
| 10510 |
-
vsumi0 =
|
| 10511 |
-
vsumi1 =
|
| 10512 |
-
vsumi2 =
|
| 10513 |
-
vsumi3 =
|
| 10514 |
-
vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
|
| 10515 |
-
vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
|
| 10516 |
-
vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
|
| 10517 |
-
vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
|
| 10518 |
}
|
| 10519 |
|
| 10520 |
-
vsumi0 = vec_add(vsumi0, vsumi4);
|
| 10521 |
-
vsumi1 = vec_add(vsumi1, vsumi5);
|
| 10522 |
-
vsumi2 = vec_add(vsumi2, vsumi6);
|
| 10523 |
-
vsumi3 = vec_add(vsumi3, vsumi7);
|
| 10524 |
-
|
| 10525 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 10526 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 10527 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
@@ -10802,10 +10742,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
|
|
| 10802 |
vector signed int vsumi1 = vec_splats((int32_t)0);
|
| 10803 |
vector signed int vsumi2 = vec_splats((int32_t)0);
|
| 10804 |
vector signed int vsumi3 = vec_splats((int32_t)0);
|
| 10805 |
-
vector signed int vsumi4 = vec_splats((int32_t)0);
|
| 10806 |
-
vector signed int vsumi5 = vec_splats((int32_t)0);
|
| 10807 |
-
vector signed int vsumi6 = vec_splats((int32_t)0);
|
| 10808 |
-
vector signed int vsumi7 = vec_splats((int32_t)0);
|
| 10809 |
vector signed int vsumi8 = vec_splats((int32_t)0);
|
| 10810 |
|
| 10811 |
const uint8_t * restrict q1 = x[i].qs;
|
|
@@ -10847,14 +10783,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
|
|
| 10847 |
vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
|
| 10848 |
vector signed short vscales = vec_sld(vscales23, vscales01, 8);
|
| 10849 |
|
| 10850 |
-
vsumi0 =
|
| 10851 |
-
vsumi1 =
|
| 10852 |
-
vsumi2 =
|
| 10853 |
-
vsumi3 =
|
| 10854 |
-
vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
|
| 10855 |
-
vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
|
| 10856 |
-
vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
|
| 10857 |
-
vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
|
| 10858 |
|
| 10859 |
vector signed short q8ysums = vec_xl_len(qs, 8);
|
| 10860 |
qs += 4;
|
|
@@ -10869,11 +10801,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
|
|
| 10869 |
vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
|
| 10870 |
}
|
| 10871 |
|
| 10872 |
-
vsumi0 = vec_add(vsumi0, vsumi4);
|
| 10873 |
-
vsumi1 = vec_add(vsumi1, vsumi5);
|
| 10874 |
-
vsumi2 = vec_add(vsumi2, vsumi6);
|
| 10875 |
-
vsumi3 = vec_add(vsumi3, vsumi7);
|
| 10876 |
-
|
| 10877 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 10878 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 10879 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
@@ -11267,6 +11194,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
| 11267 |
|
| 11268 |
#elif defined(__POWER9_VECTOR__)
|
| 11269 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
|
|
|
| 11270 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
| 11271 |
|
| 11272 |
vector float vsumf0 = vec_splats(0.0f);
|
|
@@ -11297,8 +11225,11 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
| 11297 |
vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
|
| 11298 |
vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
|
| 11299 |
|
| 11300 |
-
vector signed int vsumi0 =
|
| 11301 |
-
vector signed int vsumi1 =
|
|
|
|
|
|
|
|
|
|
| 11302 |
|
| 11303 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 11304 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
|
@@ -11453,6 +11384,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
| 11453 |
|
| 11454 |
#elif defined(__POWER9_VECTOR__)
|
| 11455 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
|
|
|
| 11456 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
| 11457 |
|
| 11458 |
vector float vsumf0 = vec_splats(0.0f);
|
|
@@ -11468,14 +11400,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
| 11468 |
vector float vyd = vec_splats(y[ibl].d);
|
| 11469 |
vector float vd = vec_mul(vxd, vyd);
|
| 11470 |
|
| 11471 |
-
vector signed int vsumi0 =
|
| 11472 |
-
vector signed int vsumi1 =
|
| 11473 |
-
vector signed int vsumi2 =
|
| 11474 |
-
vector signed int vsumi3 =
|
| 11475 |
-
vector signed int vsumi4 = vec_splats((int32_t)0);
|
| 11476 |
-
vector signed int vsumi5 = vec_splats((int32_t)0);
|
| 11477 |
-
vector signed int vsumi6 = vec_splats((int32_t)0);
|
| 11478 |
-
vector signed int vsumi7 = vec_splats((int32_t)0);
|
| 11479 |
|
| 11480 |
uint16_t h = x[ibl].scales_h;
|
| 11481 |
|
|
@@ -11520,21 +11448,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
| 11520 |
vector signed short vscales01 = vec_splats((int16_t)ls0);
|
| 11521 |
vector signed short vscales23 = vec_splats((int16_t)ls1);
|
| 11522 |
|
| 11523 |
-
vsumi0 =
|
| 11524 |
-
vsumi1 =
|
| 11525 |
-
vsumi2 =
|
| 11526 |
-
vsumi3 =
|
| 11527 |
-
vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
|
| 11528 |
-
vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
|
| 11529 |
-
vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
|
| 11530 |
-
vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
|
| 11531 |
}
|
| 11532 |
|
| 11533 |
-
vsumi0 = vec_add(vsumi0, vsumi4);
|
| 11534 |
-
vsumi1 = vec_add(vsumi1, vsumi5);
|
| 11535 |
-
vsumi2 = vec_add(vsumi2, vsumi6);
|
| 11536 |
-
vsumi3 = vec_add(vsumi3, vsumi7);
|
| 11537 |
-
|
| 11538 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 11539 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 11540 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
|
|
| 1076 |
}
|
| 1077 |
vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
|
| 1078 |
vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
|
| 1079 |
+
}
|
| 1080 |
|
| 1081 |
#elif defined(__loongarch_asx)
|
| 1082 |
for (int i = 0; i < nb; i++) {
|
|
|
|
| 1436 |
accv = vec_add(accv, vec_sld(accv, accv, 4));
|
| 1437 |
accv = vec_add(accv, vec_sld(accv, accv, 8));
|
| 1438 |
y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
|
| 1439 |
+
}
|
| 1440 |
|
| 1441 |
#elif defined(__loongarch_asx)
|
| 1442 |
for (int i = 0; i < nb; i++) {
|
|
|
|
| 4113 |
|
| 4114 |
#elif defined(__POWER9_VECTOR__)
|
| 4115 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
| 4116 |
+
const vector signed int v0 = vec_splats((int32_t)0);
|
| 4117 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
| 4118 |
const vector signed char v8 = vec_splats((signed char)0x8);
|
| 4119 |
|
| 4120 |
vector float vsumf0 = vec_splats(0.0f);
|
| 4121 |
|
| 4122 |
+
#pragma GCC unroll 8
|
| 4123 |
for (int i = 0; i < nb; i++) {
|
| 4124 |
__builtin_prefetch(x[i].qs, 0, 1);
|
| 4125 |
__builtin_prefetch(y[i].qs, 0, 1);
|
|
|
|
| 4141 |
vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
|
| 4142 |
vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
|
| 4143 |
|
| 4144 |
+
vector signed int vsumi0 = v0;
|
| 4145 |
|
| 4146 |
+
vsumi0 = vec_sum4s(qv0, vsumi0);
|
| 4147 |
+
vsumi0 = vec_sum4s(qv1, vsumi0);
|
| 4148 |
|
| 4149 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 4150 |
}
|
|
|
|
| 4518 |
|
| 4519 |
#elif defined(__POWER9_VECTOR__)
|
| 4520 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
| 4521 |
+
const vector signed int v0 = vec_splats((int32_t)0);
|
| 4522 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
| 4523 |
|
| 4524 |
vector float vsumf0 = vec_splats(0.0f);
|
|
|
|
| 4540 |
vector signed char q8y0 = vec_xl( 0, y[i].qs);
|
| 4541 |
vector signed char q8y1 = vec_xl(16, y[i].qs);
|
| 4542 |
|
| 4543 |
+
vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask);
|
| 4544 |
+
vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4);
|
|
|
|
|
|
|
|
|
|
| 4545 |
|
| 4546 |
+
vector signed int vsumi0 = v0;
|
| 4547 |
|
| 4548 |
+
vsumi0 = vec_msum(q8y0, q4x0, vsumi0);
|
| 4549 |
+
vsumi0 = vec_msum(q8y1, q4x1, vsumi0);
|
| 4550 |
|
| 4551 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 4552 |
}
|
|
|
|
| 5248 |
|
| 5249 |
#elif defined(__POWER9_VECTOR__)
|
| 5250 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
| 5251 |
+
const vector signed int v0 = vec_splats((int32_t)0);
|
| 5252 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
| 5253 |
|
| 5254 |
vector float vsumf0 = vec_splats(0.0f);
|
|
|
|
| 5274 |
|
| 5275 |
vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
|
| 5276 |
|
| 5277 |
+
vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0);
|
| 5278 |
+
vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1);
|
| 5279 |
|
| 5280 |
vector signed char q8y0 = vec_xl( 0, y[i].qs);
|
| 5281 |
vector signed char q8y1 = vec_xl( 16, y[i].qs);
|
| 5282 |
|
| 5283 |
+
vector signed int vsumi0 = v0;
|
|
|
|
|
|
|
|
|
|
| 5284 |
|
| 5285 |
+
vsumi0 = vec_msum(q8y0, q5x0, vsumi0);
|
| 5286 |
+
vsumi0 = vec_msum(q8y1, q5x1, vsumi0);
|
| 5287 |
|
| 5288 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 5289 |
}
|
|
|
|
| 5523 |
*s = sumf;
|
| 5524 |
|
| 5525 |
#elif defined(__POWER9_VECTOR__)
|
| 5526 |
+
const vector signed int v0 = vec_splats((int32_t)0);
|
| 5527 |
vector float vsumf0 = vec_splats(0.0f);
|
| 5528 |
|
| 5529 |
+
#pragma GCC unroll 8
|
| 5530 |
for (int i = 0; i < nb; i++) {
|
| 5531 |
__builtin_prefetch(x[i].qs, 0, 1);
|
| 5532 |
__builtin_prefetch(y[i].qs, 0, 1);
|
|
|
|
| 5545 |
vector signed short qv2 = vec_mule(q8x1, q8y1);
|
| 5546 |
vector signed short qv3 = vec_mulo(q8x1, q8y1);
|
| 5547 |
|
| 5548 |
+
vector signed int vsumi0 = v0;
|
| 5549 |
+
vector signed int vsumi1 = v0;
|
|
|
|
|
|
|
| 5550 |
|
| 5551 |
+
vsumi0 = vec_sum4s(qv0, vsumi0);
|
| 5552 |
+
vsumi1 = vec_sum4s(qv1, vsumi1);
|
| 5553 |
+
vsumi0 = vec_sum4s(qv2, vsumi0);
|
| 5554 |
+
vsumi1 = vec_sum4s(qv3, vsumi1);
|
| 5555 |
|
| 5556 |
vsumi0 = vec_add(vsumi0, vsumi1);
|
| 5557 |
|
|
|
|
| 5939 |
#elif defined(__POWER9_VECTOR__)
|
| 5940 |
const vector signed char lowMask = vec_splats((signed char)0x3);
|
| 5941 |
const vector signed char lowScaleMask = vec_splats((signed char)0xF);
|
| 5942 |
+
const vector int v0 = vec_splats((int32_t)0);
|
| 5943 |
const vector unsigned char v2 = vec_splats((unsigned char)0x2);
|
| 5944 |
const vector unsigned char v6 = vec_splats((unsigned char)0x6);
|
| 5945 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
|
|
|
| 5977 |
vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
|
| 5978 |
vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
|
| 5979 |
|
| 5980 |
+
vector signed int vsumi0 = v0;
|
| 5981 |
+
vector signed int vsumi1 = v0;
|
| 5982 |
+
vector signed int vsumi2 = v0;
|
| 5983 |
+
vector signed int vsumi3 = v0;
|
| 5984 |
+
vector signed int vsumi4 = v0;
|
| 5985 |
+
vector signed int vsumi5 = v0;
|
| 5986 |
+
vector signed int vsumi6 = v0;
|
| 5987 |
+
vector signed int vsumi7 = v0;
|
| 5988 |
|
| 5989 |
+
const uint8_t * restrict q2 = x[i].qs;
|
| 5990 |
+
const int8_t * restrict q8 = y[i].qs;
|
| 5991 |
|
| 5992 |
for (int j = 0; j < QK_K/128; ++j) {
|
| 5993 |
__builtin_prefetch(q2, 0, 1);
|
|
|
|
| 5997 |
vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
|
| 5998 |
q2 += 32;
|
| 5999 |
|
| 6000 |
+
vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask);
|
| 6001 |
+
vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask);
|
| 6002 |
+
vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask);
|
| 6003 |
+
vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask);
|
| 6004 |
+
vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask);
|
| 6005 |
+
vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask);
|
| 6006 |
+
vector unsigned char q2x12 = (vector unsigned char)vec_and(vec_sr(qxs1, v4), lowMask);
|
| 6007 |
+
vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask);
|
| 6008 |
|
| 6009 |
vector signed char q8y00 = vec_xl( 0, q8);
|
| 6010 |
vector signed char q8y10 = vec_xl( 16, q8);
|
|
|
|
| 6016 |
vector signed char q8y13 = vec_xl(112, q8);
|
| 6017 |
q8 += 128;
|
| 6018 |
|
| 6019 |
+
vector signed int qv0 = vec_msum(q8y00, q2x00, v0);
|
| 6020 |
+
vector signed int qv1 = vec_msum(q8y01, q2x01, v0);
|
| 6021 |
+
vector signed int qv2 = vec_msum(q8y02, q2x02, v0);
|
| 6022 |
+
vector signed int qv3 = vec_msum(q8y03, q2x03, v0);
|
| 6023 |
+
vector signed int qv4 = vec_msum(q8y10, q2x10, v0);
|
| 6024 |
+
vector signed int qv5 = vec_msum(q8y11, q2x11, v0);
|
| 6025 |
+
vector signed int qv6 = vec_msum(q8y12, q2x12, v0);
|
| 6026 |
+
vector signed int qv7 = vec_msum(q8y13, q2x13, v0);
|
| 6027 |
+
|
| 6028 |
+
vector signed short vscales_07 = vec_unpackh(vscales);
|
| 6029 |
+
vector signed int vscales_03 = vec_unpackh(vscales_07);
|
| 6030 |
+
vector signed int vscales_47 = vec_unpackl(vscales_07);
|
| 6031 |
+
vector signed int vs0 = vec_splat(vscales_03, 0);
|
| 6032 |
+
vector signed int vs1 = vec_splat(vscales_03, 1);
|
| 6033 |
+
vector signed int vs2 = vec_splat(vscales_03, 2);
|
| 6034 |
+
vector signed int vs3 = vec_splat(vscales_03, 3);
|
| 6035 |
+
vector signed int vs4 = vec_splat(vscales_47, 0);
|
| 6036 |
+
vector signed int vs5 = vec_splat(vscales_47, 1);
|
| 6037 |
+
vector signed int vs6 = vec_splat(vscales_47, 2);
|
| 6038 |
+
vector signed int vs7 = vec_splat(vscales_47, 3);
|
| 6039 |
vscales = vec_sld(vscales, vscales, 8);
|
| 6040 |
|
| 6041 |
+
vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0);
|
| 6042 |
+
vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1);
|
| 6043 |
+
vsumi2 = vec_add(vec_mul(qv2, vs4), vsumi2);
|
| 6044 |
+
vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3);
|
| 6045 |
+
vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4);
|
| 6046 |
+
vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5);
|
| 6047 |
+
vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6);
|
| 6048 |
+
vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6049 |
}
|
| 6050 |
|
| 6051 |
vsumi0 = vec_add(vsumi0, vsumi4);
|
|
|
|
| 6636 |
|
| 6637 |
#elif defined(__POWER9_VECTOR__)
|
| 6638 |
const vector signed char lowMask = vec_splats((signed char)0x3);
|
| 6639 |
+
const vector signed char lowMask1 = vec_splats((int8_t)0xf);
|
| 6640 |
+
const vector signed char lowMask2 = vec_splats((int8_t)0x30);
|
| 6641 |
+
const vector int v0 = vec_splats((int32_t)0);
|
| 6642 |
const vector signed char v1 = vec_splats((signed char)0x1);
|
| 6643 |
const vector unsigned char v2 = vec_splats((unsigned char)0x2);
|
| 6644 |
const vector unsigned char v3 = vec_splats((unsigned char)0x3);
|
|
|
|
| 6656 |
vector float vyd = vec_splats(y[i].d);
|
| 6657 |
vector float vd = vec_mul(vxd, vyd);
|
| 6658 |
|
| 6659 |
+
UNUSED(kmask1);
|
| 6660 |
+
UNUSED(kmask2);
|
| 6661 |
|
| 6662 |
+
vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
|
| 6663 |
+
vector signed char u1 = vec_and(u0, lowMask1);
|
| 6664 |
+
vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
|
| 6665 |
+
vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2));
|
| 6666 |
+
vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4);
|
| 6667 |
+
vector signed char u31 = vec_and(u3, lowMask2);
|
| 6668 |
|
| 6669 |
+
u1 = vec_or(u1, u30);
|
| 6670 |
+
u2 = vec_or(vec_sr(u0, v4), u31);
|
| 6671 |
+
|
| 6672 |
+
vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2);
|
| 6673 |
vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
|
| 6674 |
vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);
|
| 6675 |
|
| 6676 |
vscales = vec_sub(vscales, off);
|
| 6677 |
|
| 6678 |
+
vector signed int vsumi0 = v0;
|
| 6679 |
+
vector signed int vsumi1 = v0;
|
| 6680 |
+
vector signed int vsumi2 = v0;
|
| 6681 |
+
vector signed int vsumi3 = v0;
|
| 6682 |
+
vector signed int vsumi4 = v0;
|
| 6683 |
+
vector signed int vsumi5 = v0;
|
| 6684 |
+
vector signed int vsumi6 = v0;
|
| 6685 |
+
vector signed int vsumi7 = v0;
|
|
|
|
| 6686 |
|
| 6687 |
const uint8_t * restrict q3 = x[i].qs;
|
| 6688 |
const int8_t * restrict q8 = y[i].qs;
|
|
|
|
| 6756 |
vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
|
| 6757 |
vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));
|
| 6758 |
|
| 6759 |
+
vsumi0 = vec_msum(qv00, vs0, vsumi0);
|
| 6760 |
+
vsumi1 = vec_msum(qv01, vs2, vsumi1);
|
| 6761 |
+
vsumi2 = vec_msum(qv02, vs4, vsumi2);
|
| 6762 |
+
vsumi3 = vec_msum(qv03, vs6, vsumi3);
|
| 6763 |
+
vsumi4 = vec_msum(qv10, vs1, vsumi4);
|
| 6764 |
+
vsumi5 = vec_msum(qv11, vs3, vsumi5);
|
| 6765 |
+
vsumi6 = vec_msum(qv12, vs5, vsumi6);
|
| 6766 |
+
vsumi7 = vec_msum(qv13, vs7, vsumi7);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6767 |
}
|
| 6768 |
|
| 6769 |
vsumi0 = vec_add(vsumi0, vsumi4);
|
|
|
|
| 7262 |
|
| 7263 |
#elif defined(__POWER9_VECTOR__)
|
| 7264 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
| 7265 |
+
const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
|
| 7266 |
+
const vector signed char lowMask2 = vec_splats((int8_t)0x30);
|
| 7267 |
+
const vector int v0 = vec_splats((int32_t)0);
|
| 7268 |
+
const vector unsigned char v2 = vec_splats((uint8_t)2);
|
| 7269 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
| 7270 |
|
| 7271 |
vector float vsumf0 = vec_splats(0.0f);
|
|
|
|
| 7284 |
vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
|
| 7285 |
vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
|
| 7286 |
|
| 7287 |
+
UNUSED(kmask1);
|
| 7288 |
+
UNUSED(kmask2);
|
| 7289 |
+
UNUSED(kmask3);
|
| 7290 |
+
UNUSED(utmp);
|
| 7291 |
|
| 7292 |
+
vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
|
| 7293 |
+
vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
|
| 7294 |
+
vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
|
| 7295 |
+
vector signed char u3 = vec_sr(u2, v4);
|
| 7296 |
+
|
| 7297 |
+
vector signed char u30 = u1;
|
| 7298 |
+
vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
|
| 7299 |
+
|
| 7300 |
+
u1 = vec_and(u0, lowMask1);
|
| 7301 |
+
u2 = vec_or(u30, u31);
|
| 7302 |
+
|
| 7303 |
+
vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
|
| 7304 |
|
|
|
|
| 7305 |
vector signed short vscales = vec_unpackh(utmps);
|
| 7306 |
vector signed short q4xmins = vec_unpackl(utmps);
|
| 7307 |
vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
|
|
|
|
| 7317 |
vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
|
| 7318 |
vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
|
| 7319 |
|
| 7320 |
+
vector signed int vsumi0 = v0;
|
| 7321 |
+
vector signed int vsumi1 = v0;
|
| 7322 |
+
vector signed int vsumi2 = v0;
|
| 7323 |
+
vector signed int vsumi3 = v0;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7324 |
|
| 7325 |
const uint8_t * restrict q4 = x[i].qs;
|
| 7326 |
const int8_t * restrict q8 = y[i].qs;
|
|
|
|
| 7335 |
vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
|
| 7336 |
q4 += 64;
|
| 7337 |
|
| 7338 |
+
vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask);
|
| 7339 |
+
vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4);
|
| 7340 |
+
vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask);
|
| 7341 |
+
vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4);
|
| 7342 |
+
vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask);
|
| 7343 |
+
vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4);
|
| 7344 |
+
vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask);
|
| 7345 |
+
vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4);
|
| 7346 |
|
| 7347 |
vector signed char q8y00 = vec_xl( 0, q8);
|
| 7348 |
vector signed char q8y10 = vec_xl( 16, q8);
|
|
|
|
| 7354 |
vector signed char q8y31 = vec_xl(112, q8);
|
| 7355 |
q8 += 128;
|
| 7356 |
|
| 7357 |
+
vector signed int qv00 = vec_msum(q8y00, q4x00, v0);
|
| 7358 |
+
vector signed int qv01 = vec_msum(q8y01, q4x01, v0);
|
| 7359 |
+
vector signed int qv10 = vec_msum(q8y10, q4x10, v0);
|
| 7360 |
+
vector signed int qv11 = vec_msum(q8y11, q4x11, v0);
|
| 7361 |
+
vector signed int qv20 = vec_msum(q8y20, q4x20, v0);
|
| 7362 |
+
vector signed int qv21 = vec_msum(q8y21, q4x21, v0);
|
| 7363 |
+
vector signed int qv30 = vec_msum(q8y30, q4x30, v0);
|
| 7364 |
+
vector signed int qv31 = vec_msum(q8y31, q4x31, v0);
|
| 7365 |
+
|
| 7366 |
+
vector signed int vscales_h = vec_unpackh(vscales);
|
| 7367 |
+
vector signed int vs0 = vec_splat(vscales_h, 0);
|
| 7368 |
+
vector signed int vs1 = vec_splat(vscales_h, 1);
|
| 7369 |
+
vector signed int vs2 = vec_splat(vscales_h, 2);
|
| 7370 |
+
vector signed int vs3 = vec_splat(vscales_h, 3);
|
| 7371 |
vscales = vec_sld(vscales, vscales, 8);
|
| 7372 |
|
| 7373 |
+
vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
|
| 7374 |
+
vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1);
|
| 7375 |
+
vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2);
|
| 7376 |
+
vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3);
|
| 7377 |
|
| 7378 |
+
vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0);
|
| 7379 |
+
vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1);
|
| 7380 |
+
vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2);
|
| 7381 |
+
vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7382 |
}
|
| 7383 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7384 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 7385 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 7386 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
|
|
| 7882 |
|
| 7883 |
#elif defined(__POWER9_VECTOR__)
|
| 7884 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
| 7885 |
+
const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
|
| 7886 |
+
const vector signed char lowMask2 = vec_splats((int8_t)0x30);
|
| 7887 |
+
const vector int v0 = vec_splats((int32_t)0);
|
| 7888 |
const vector unsigned char v1 = vec_splats((unsigned char)0x1);
|
| 7889 |
const vector unsigned char v2 = vec_splats((unsigned char)0x2);
|
| 7890 |
const vector unsigned char v3 = vec_splats((unsigned char)0x3);
|
|
|
|
| 7903 |
vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
|
| 7904 |
vector float vdmin = vec_mul(vxmin, vyd);
|
| 7905 |
|
| 7906 |
+
UNUSED(kmask1);
|
| 7907 |
+
UNUSED(kmask2);
|
| 7908 |
+
UNUSED(kmask3);
|
| 7909 |
+
UNUSED(utmp);
|
| 7910 |
|
| 7911 |
+
vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
|
| 7912 |
+
vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
|
| 7913 |
+
vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
|
| 7914 |
+
vector signed char u3 = vec_sr(u2, v4);
|
| 7915 |
+
|
| 7916 |
+
vector signed char u30 = u1;
|
| 7917 |
+
vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
|
| 7918 |
+
|
| 7919 |
+
u1 = vec_and(u0, lowMask1);
|
| 7920 |
+
u2 = vec_or(u30, u31);
|
| 7921 |
+
|
| 7922 |
+
vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
|
| 7923 |
|
| 7924 |
vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
|
| 7925 |
vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
|
| 7926 |
|
|
|
|
| 7927 |
vector signed short vscales = vec_unpackh(utmps);
|
| 7928 |
|
| 7929 |
vector signed short q5xmins = vec_unpackl(utmps);
|
|
|
|
| 7943 |
vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
|
| 7944 |
vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
|
| 7945 |
|
| 7946 |
+
vector signed int vsumi0 = v0;
|
| 7947 |
+
vector signed int vsumi1 = v0;
|
| 7948 |
+
vector signed int vsumi2 = v0;
|
| 7949 |
+
vector signed int vsumi3 = v0;
|
| 7950 |
|
| 7951 |
const uint8_t * restrict q5 = x[i].qs;
|
| 7952 |
const int8_t * restrict q8 = y[i].qs;
|
|
|
|
| 7971 |
qxhs0 = vec_sr(qxhs0, v2);
|
| 7972 |
qxhs1 = vec_sr(qxhs1, v2);
|
| 7973 |
|
| 7974 |
+
vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00);
|
| 7975 |
+
vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01);
|
| 7976 |
+
vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10);
|
| 7977 |
+
vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11);
|
| 7978 |
|
| 7979 |
vector signed char q8y00 = vec_xl( 0, q8);
|
| 7980 |
vector signed char q8y10 = vec_xl(16, q8);
|
|
|
|
| 7982 |
vector signed char q8y11 = vec_xl(48, q8);
|
| 7983 |
q8 += 64;
|
| 7984 |
|
| 7985 |
+
vector signed int qv00 = vec_msum(q8y00, q5x00, v0);
|
| 7986 |
+
vector signed int qv01 = vec_msum(q8y01, q5x01, v0);
|
| 7987 |
+
vector signed int qv10 = vec_msum(q8y10, q5x10, v0);
|
| 7988 |
+
vector signed int qv11 = vec_msum(q8y11, q5x11, v0);
|
| 7989 |
|
| 7990 |
+
vector signed int vscales_h = vec_unpackh(vscales);
|
| 7991 |
+
vector signed int vs0 = vec_splat(vscales_h, 0);
|
| 7992 |
+
vector signed int vs1 = vec_splat(vscales_h, 1);
|
| 7993 |
vscales = vec_sld(vscales, vscales, 12);
|
| 7994 |
|
| 7995 |
+
vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
|
| 7996 |
+
vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1);
|
| 7997 |
+
vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2);
|
| 7998 |
+
vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3);
|
|
|
|
|
|
|
|
|
|
| 7999 |
}
|
| 8000 |
|
| 8001 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
|
|
|
| 8556 |
|
| 8557 |
#elif defined(__POWER9_VECTOR__)
|
| 8558 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
| 8559 |
+
const vector int v0 = vec_splats((int32_t)0);
|
| 8560 |
const vector unsigned char v2 = vec_splats((unsigned char)0x2);
|
| 8561 |
const vector unsigned char v3 = vec_splats((unsigned char)0x3);
|
| 8562 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
|
|
|
| 8573 |
vector float vyd = vec_splats(y[i].d);
|
| 8574 |
vector float vd = vec_mul(vxd, vyd);
|
| 8575 |
|
| 8576 |
+
vector signed int vsumi0 = v0;
|
| 8577 |
+
vector signed int vsumi1 = v0;
|
| 8578 |
+
vector signed int vsumi2 = v0;
|
| 8579 |
+
vector signed int vsumi3 = v0;
|
| 8580 |
+
vector signed int vsumi4 = v0;
|
| 8581 |
+
vector signed int vsumi5 = v0;
|
| 8582 |
+
vector signed int vsumi6 = v0;
|
| 8583 |
+
vector signed int vsumi7 = v0;
|
| 8584 |
|
| 8585 |
const uint8_t * restrict q6 = x[i].ql;
|
| 8586 |
const uint8_t * restrict qh = x[i].qh;
|
|
|
|
| 8660 |
vector signed short vs6 = vec_splat(vscales, 6);
|
| 8661 |
vector signed short vs7 = vec_splat(vscales, 7);
|
| 8662 |
|
| 8663 |
+
vsumi0 = vec_msum(qv00, vs0, vsumi0);
|
| 8664 |
+
vsumi1 = vec_msum(qv01, vs4, vsumi1);
|
| 8665 |
+
vsumi2 = vec_msum(qv10, vs1, vsumi2);
|
| 8666 |
+
vsumi3 = vec_msum(qv11, vs5, vsumi3);
|
| 8667 |
+
vsumi4 = vec_msum(qv20, vs2, vsumi4);
|
| 8668 |
+
vsumi5 = vec_msum(qv21, vs6, vsumi5);
|
| 8669 |
+
vsumi6 = vec_msum(qv30, vs3, vsumi6);
|
| 8670 |
+
vsumi7 = vec_msum(qv31, vs7, vsumi7);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8671 |
}
|
| 8672 |
|
| 8673 |
vsumi0 = vec_add(vsumi0, vsumi4);
|
|
|
|
| 8948 |
*s = 0.125f * hsum_float_8(accumf);
|
| 8949 |
|
| 8950 |
#elif defined(__POWER9_VECTOR__)
|
| 8951 |
+
const vector int v0 = vec_splats((int32_t)0);
|
| 8952 |
vector float vsumf0 = vec_splats(0.0f);
|
| 8953 |
vector float vsumf1 = vec_splats(0.0f);
|
| 8954 |
vector float vsumf2 = vec_splats(0.0f);
|
|
|
|
| 8961 |
vector float vyd = vec_splats(y[i].d);
|
| 8962 |
vector float vd = vec_mul(vxd, vyd);
|
| 8963 |
|
| 8964 |
+
vector signed int vsumi0 = v0;
|
| 8965 |
+
vector signed int vsumi1 = v0;
|
| 8966 |
+
vector signed int vsumi2 = v0;
|
| 8967 |
+
vector signed int vsumi3 = v0;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8968 |
|
| 8969 |
const uint16_t * restrict q2 = x[i].qs;
|
| 8970 |
const int8_t * restrict q8 = y[i].qs;
|
|
|
|
| 9011 |
vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
|
| 9012 |
vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));
|
| 9013 |
|
| 9014 |
+
vsumi0 = vec_msum(qv0, vscales01, vsumi0);
|
| 9015 |
+
vsumi1 = vec_msum(qv1, vscales01, vsumi1);
|
| 9016 |
+
vsumi2 = vec_msum(qv2, vscales23, vsumi2);
|
| 9017 |
+
vsumi3 = vec_msum(qv3, vscales23, vsumi3);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9018 |
}
|
| 9019 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9020 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 9021 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 9022 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
|
|
| 9408 |
|
| 9409 |
*s = 0.125f * hsum_float_8(accumf);
|
| 9410 |
#elif defined(__POWER9_VECTOR__)
|
| 9411 |
+
const vector int v0 = vec_splats((int32_t)0);
|
| 9412 |
vector float vsumf0 = vec_splats(0.0f);
|
| 9413 |
vector float vsumf1 = vec_splats(0.0f);
|
| 9414 |
vector float vsumf2 = vec_splats(0.0f);
|
|
|
|
| 9421 |
vector float vyd = vec_splats(y[i].d);
|
| 9422 |
vector float vd = vec_mul(vxd, vyd);
|
| 9423 |
|
| 9424 |
+
vector signed int vsumi0 = v0;
|
| 9425 |
+
vector signed int vsumi1 = v0;
|
| 9426 |
+
vector signed int vsumi2 = v0;
|
| 9427 |
+
vector signed int vsumi3 = v0;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9428 |
|
| 9429 |
const uint16_t * restrict q2 = x[i].qs;
|
| 9430 |
const uint8_t * restrict sc = x[i].scales;
|
|
|
|
| 9472 |
vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
|
| 9473 |
vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
|
| 9474 |
|
| 9475 |
+
vsumi0 = vec_msum(qv0, vscales0, vsumi0);
|
| 9476 |
+
vsumi1 = vec_msum(qv1, vscales1, vsumi1);
|
| 9477 |
+
vsumi2 = vec_msum(qv2, vscales2, vsumi2);
|
| 9478 |
+
vsumi3 = vec_msum(qv3, vscales3, vsumi3);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9479 |
}
|
| 9480 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9481 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 9482 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 9483 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
|
|
| 9700 |
|
| 9701 |
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
| 9702 |
|
| 9703 |
+
const vector int v0 = vec_splats((int32_t)0);
|
| 9704 |
+
|
| 9705 |
vector float vsumf0 = vec_splats(0.0f);
|
| 9706 |
vector float vsumf1 = vec_splats(0.0f);
|
| 9707 |
vector float vsumf2 = vec_splats(0.0f);
|
|
|
|
| 9716 |
vector float vyd = vec_splats(y[i].d);
|
| 9717 |
vector float vd = vec_mul(vxd, vyd);
|
| 9718 |
|
| 9719 |
+
vector signed int vsumi0 = v0;
|
| 9720 |
+
vector signed int vsumi1 = v0;
|
| 9721 |
+
vector signed int vsumi2 = v0;
|
| 9722 |
+
vector signed int vsumi3 = v0;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9723 |
|
| 9724 |
const uint8_t * restrict q2 = x[i].qs;
|
| 9725 |
const uint8_t * restrict qh = x[i].qh;
|
|
|
|
| 9779 |
vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
|
| 9780 |
vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
|
| 9781 |
|
| 9782 |
+
vsumi0 = vec_msum(qv0, vscales0, vsumi0);
|
| 9783 |
+
vsumi1 = vec_msum(qv1, vscales1, vsumi1);
|
| 9784 |
+
vsumi2 = vec_msum(qv2, vscales2, vsumi2);
|
| 9785 |
+
vsumi3 = vec_msum(qv3, vscales3, vsumi3);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9786 |
}
|
| 9787 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9788 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 9789 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 9790 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
|
|
| 10022 |
#elif defined(__POWER9_VECTOR__)
|
| 10023 |
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
| 10024 |
|
| 10025 |
+
const vector int v0 = vec_splats((int32_t)0);
|
| 10026 |
+
|
| 10027 |
vector float vsumf0 = vec_splats(0.0f);
|
| 10028 |
vector float vsumf1 = vec_splats(0.0f);
|
| 10029 |
vector float vsumf2 = vec_splats(0.0f);
|
|
|
|
| 10034 |
vector float vyd = vec_splats(y[i].d);
|
| 10035 |
vector float vd = vec_mul(vxd, vyd);
|
| 10036 |
|
| 10037 |
+
vector signed int vsumi0 = v0;
|
| 10038 |
+
vector signed int vsumi1 = v0;
|
| 10039 |
+
vector signed int vsumi2 = v0;
|
| 10040 |
+
vector signed int vsumi3 = v0;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10041 |
|
| 10042 |
const uint8_t * restrict q3 = x[i].qs;
|
| 10043 |
const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
|
|
|
|
| 10082 |
vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
|
| 10083 |
vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
|
| 10084 |
|
| 10085 |
+
vsumi0 = vec_msum(qv0, vscales01, vsumi0);
|
| 10086 |
+
vsumi1 = vec_msum(qv1, vscales01, vsumi1);
|
| 10087 |
+
vsumi2 = vec_msum(qv2, vscales23, vsumi2);
|
| 10088 |
+
vsumi3 = vec_msum(qv3, vscales23, vsumi3);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10089 |
}
|
| 10090 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10091 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 10092 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 10093 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
|
|
| 10377 |
|
| 10378 |
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
| 10379 |
|
| 10380 |
+
const vector int v0 = vec_splats((int32_t)0);
|
| 10381 |
+
|
| 10382 |
vector float vsumf0 = vec_splats(0.0f);
|
| 10383 |
vector float vsumf1 = vec_splats(0.0f);
|
| 10384 |
vector float vsumf2 = vec_splats(0.0f);
|
|
|
|
| 10399 |
const uint8_t * restrict sc = x[i].scales;
|
| 10400 |
const int8_t * restrict q8 = y[i].qs;
|
| 10401 |
|
| 10402 |
+
vector signed int vsumi0 = v0;
|
| 10403 |
+
vector signed int vsumi1 = v0;
|
| 10404 |
+
vector signed int vsumi2 = v0;
|
| 10405 |
+
vector signed int vsumi3 = v0;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10406 |
|
| 10407 |
for (int j = 0; j < QK_K/32; j += 2) {
|
| 10408 |
__builtin_prefetch(q3, 0, 1);
|
|
|
|
| 10456 |
vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
|
| 10457 |
vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
|
| 10458 |
|
| 10459 |
+
vsumi0 = vec_msum(qv0, vscales01, vsumi0);
|
| 10460 |
+
vsumi1 = vec_msum(qv1, vscales01, vsumi1);
|
| 10461 |
+
vsumi2 = vec_msum(qv2, vscales23, vsumi2);
|
| 10462 |
+
vsumi3 = vec_msum(qv3, vscales23, vsumi3);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10463 |
}
|
| 10464 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10465 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 10466 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 10467 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
|
|
| 10742 |
vector signed int vsumi1 = vec_splats((int32_t)0);
|
| 10743 |
vector signed int vsumi2 = vec_splats((int32_t)0);
|
| 10744 |
vector signed int vsumi3 = vec_splats((int32_t)0);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10745 |
vector signed int vsumi8 = vec_splats((int32_t)0);
|
| 10746 |
|
| 10747 |
const uint8_t * restrict q1 = x[i].qs;
|
|
|
|
| 10783 |
vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
|
| 10784 |
vector signed short vscales = vec_sld(vscales23, vscales01, 8);
|
| 10785 |
|
| 10786 |
+
vsumi0 = vec_msum(qv0, vscales01, vsumi0);
|
| 10787 |
+
vsumi1 = vec_msum(qv1, vscales01, vsumi1);
|
| 10788 |
+
vsumi2 = vec_msum(qv2, vscales23, vsumi2);
|
| 10789 |
+
vsumi3 = vec_msum(qv3, vscales23, vsumi3);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10790 |
|
| 10791 |
vector signed short q8ysums = vec_xl_len(qs, 8);
|
| 10792 |
qs += 4;
|
|
|
|
| 10801 |
vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
|
| 10802 |
}
|
| 10803 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10804 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 10805 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 10806 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
|
|
|
| 11194 |
|
| 11195 |
#elif defined(__POWER9_VECTOR__)
|
| 11196 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
| 11197 |
+
const vector signed int v0 = vec_splats((int32_t)0);
|
| 11198 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
| 11199 |
|
| 11200 |
vector float vsumf0 = vec_splats(0.0f);
|
|
|
|
| 11225 |
vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
|
| 11226 |
vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
|
| 11227 |
|
| 11228 |
+
vector signed int vsumi0 = v0;
|
| 11229 |
+
vector signed int vsumi1 = v0;
|
| 11230 |
+
|
| 11231 |
+
vsumi0 = vec_sum4s(qv0, vsumi0);
|
| 11232 |
+
vsumi1 = vec_sum4s(qv1, vsumi1);
|
| 11233 |
|
| 11234 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 11235 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
|
|
|
| 11384 |
|
| 11385 |
#elif defined(__POWER9_VECTOR__)
|
| 11386 |
const vector signed char lowMask = vec_splats((signed char)0xF);
|
| 11387 |
+
const vector int v0 = vec_splats((int32_t)0);
|
| 11388 |
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
| 11389 |
|
| 11390 |
vector float vsumf0 = vec_splats(0.0f);
|
|
|
|
| 11400 |
vector float vyd = vec_splats(y[ibl].d);
|
| 11401 |
vector float vd = vec_mul(vxd, vyd);
|
| 11402 |
|
| 11403 |
+
vector signed int vsumi0 = v0;
|
| 11404 |
+
vector signed int vsumi1 = v0;
|
| 11405 |
+
vector signed int vsumi2 = v0;
|
| 11406 |
+
vector signed int vsumi3 = v0;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11407 |
|
| 11408 |
uint16_t h = x[ibl].scales_h;
|
| 11409 |
|
|
|
|
| 11448 |
vector signed short vscales01 = vec_splats((int16_t)ls0);
|
| 11449 |
vector signed short vscales23 = vec_splats((int16_t)ls1);
|
| 11450 |
|
| 11451 |
+
vsumi0 = vec_msum(qv0, vscales01, vsumi0);
|
| 11452 |
+
vsumi1 = vec_msum(qv1, vscales01, vsumi1);
|
| 11453 |
+
vsumi2 = vec_msum(qv2, vscales23, vsumi2);
|
| 11454 |
+
vsumi3 = vec_msum(qv3, vscales23, vsumi3);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11455 |
}
|
| 11456 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11457 |
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
| 11458 |
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
| 11459 |
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|