Hong Bo PENG committed on
Commit
e3d09d2
·
1 Parent(s): 8c3ae74

ggml : fix and optimize ppc64le (ggml/849)

Browse files

* fix compile issues introduced by loongarch_asx

* restore quant changes to merge

* fix compile issues introduced by loongarch_asx

* further optimize by using vec_msum & vec_sum4s on ppc64le

Files changed (1) hide show
  1. ggml-quants.c +281 -362
ggml-quants.c CHANGED
@@ -1076,6 +1076,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
1076
  }
1077
  vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
1078
  vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
 
1079
 
1080
  #elif defined(__loongarch_asx)
1081
  for (int i = 0; i < nb; i++) {
@@ -1435,6 +1436,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
1435
  accv = vec_add(accv, vec_sld(accv, accv, 4));
1436
  accv = vec_add(accv, vec_sld(accv, accv, 8));
1437
  y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
 
1438
 
1439
  #elif defined(__loongarch_asx)
1440
  for (int i = 0; i < nb; i++) {
@@ -4111,12 +4113,13 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
4111
 
4112
  #elif defined(__POWER9_VECTOR__)
4113
  const vector signed char lowMask = vec_splats((signed char)0xF);
 
4114
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
4115
  const vector signed char v8 = vec_splats((signed char)0x8);
4116
 
4117
  vector float vsumf0 = vec_splats(0.0f);
4118
 
4119
- #pragma GCC unroll 4
4120
  for (int i = 0; i < nb; i++) {
4121
  __builtin_prefetch(x[i].qs, 0, 1);
4122
  __builtin_prefetch(y[i].qs, 0, 1);
@@ -4138,9 +4141,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
4138
  vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
4139
  vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
4140
 
4141
- qv0 = vec_add(qv0, qv1);
4142
 
4143
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
 
4144
 
4145
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
4146
  }
@@ -4514,6 +4518,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4514
 
4515
  #elif defined(__POWER9_VECTOR__)
4516
  const vector signed char lowMask = vec_splats((signed char)0xF);
 
4517
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
4518
 
4519
  vector float vsumf0 = vec_splats(0.0f);
@@ -4535,15 +4540,13 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4535
  vector signed char q8y0 = vec_xl( 0, y[i].qs);
4536
  vector signed char q8y1 = vec_xl(16, y[i].qs);
4537
 
4538
- vector signed char q4x0 = vec_and(qxs, lowMask);
4539
- vector signed char q4x1 = vec_sr(qxs, v4);
4540
-
4541
- vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
4542
- vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
4543
 
4544
- qv0 = vec_add(qv0, qv1);
4545
 
4546
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
 
4547
 
4548
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
4549
  }
@@ -5245,6 +5248,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5245
 
5246
  #elif defined(__POWER9_VECTOR__)
5247
  const vector signed char lowMask = vec_splats((signed char)0xF);
 
5248
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
5249
 
5250
  vector float vsumf0 = vec_splats(0.0f);
@@ -5270,18 +5274,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5270
 
5271
  vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
5272
 
5273
- vector signed char q5x0 = vec_or(vec_and(qxs, lowMask), qh0);
5274
- vector signed char q5x1 = vec_or(vec_sr(qxs, v4), qh1);
5275
 
5276
  vector signed char q8y0 = vec_xl( 0, y[i].qs);
5277
  vector signed char q8y1 = vec_xl( 16, y[i].qs);
5278
 
5279
- vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
5280
- vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
5281
-
5282
- qv0 = vec_add(qv0, qv1);
5283
 
5284
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
 
5285
 
5286
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
5287
  }
@@ -5521,9 +5523,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
5521
  *s = sumf;
5522
 
5523
  #elif defined(__POWER9_VECTOR__)
 
5524
  vector float vsumf0 = vec_splats(0.0f);
5525
 
5526
- #pragma GCC unroll 4
5527
  for (int i = 0; i < nb; i++) {
5528
  __builtin_prefetch(x[i].qs, 0, 1);
5529
  __builtin_prefetch(y[i].qs, 0, 1);
@@ -5542,13 +5545,13 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
5542
  vector signed short qv2 = vec_mule(q8x1, q8y1);
5543
  vector signed short qv3 = vec_mulo(q8x1, q8y1);
5544
 
5545
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackh(qv1));
5546
- vector signed int vsumi1 = vec_add(vec_unpackl(qv0), vec_unpackl(qv1));
5547
- vector signed int vsumi2 = vec_add(vec_unpackh(qv2), vec_unpackh(qv3));
5548
- vector signed int vsumi3 = vec_add(vec_unpackl(qv2), vec_unpackl(qv3));
5549
 
5550
- vsumi0 = vec_add(vsumi0, vsumi2);
5551
- vsumi1 = vec_add(vsumi1, vsumi3);
 
 
5552
 
5553
  vsumi0 = vec_add(vsumi0, vsumi1);
5554
 
@@ -5936,6 +5939,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5936
  #elif defined(__POWER9_VECTOR__)
5937
  const vector signed char lowMask = vec_splats((signed char)0x3);
5938
  const vector signed char lowScaleMask = vec_splats((signed char)0xF);
 
5939
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
5940
  const vector unsigned char v6 = vec_splats((unsigned char)0x6);
5941
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
@@ -5973,15 +5977,17 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5973
  vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
5974
  vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
5975
 
5976
- vector signed int vsumi0 = vec_splats((int32_t)0);
5977
- vector signed int vsumi1 = vec_splats((int32_t)0);
5978
- vector signed int vsumi2 = vec_splats((int32_t)0);
5979
- vector signed int vsumi3 = vec_splats((int32_t)0);
5980
- vector signed int vsumi4 = vec_splats((int32_t)0);
5981
- vector signed int vsumi5 = vec_splats((int32_t)0);
5982
- vector signed int vsumi6 = vec_splats((int32_t)0);
5983
- vector signed int vsumi7 = vec_splats((int32_t)0);
5984
 
 
 
5985
 
5986
  for (int j = 0; j < QK_K/128; ++j) {
5987
  __builtin_prefetch(q2, 0, 1);
@@ -5991,14 +5997,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5991
  vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
5992
  q2 += 32;
5993
 
5994
- vector signed char q2x00 = vec_and(qxs0, lowMask);
5995
- vector signed char q2x01 = vec_and(vec_sr(qxs0, v2), lowMask);
5996
- vector signed char q2x02 = vec_and(vec_sr(qxs0, v4), lowMask);
5997
- vector signed char q2x03 = vec_and(vec_sr(qxs0, v6), lowMask);
5998
- vector signed char q2x10 = vec_and(qxs1, lowMask);
5999
- vector signed char q2x11 = vec_and(vec_sr(qxs1, v2), lowMask);
6000
- vector signed char q2x12 = vec_and(vec_sr(qxs1, v4), lowMask);
6001
- vector signed char q2x13 = vec_and(vec_sr(qxs1, v6), lowMask);
6002
 
6003
  vector signed char q8y00 = vec_xl( 0, q8);
6004
  vector signed char q8y10 = vec_xl( 16, q8);
@@ -6010,45 +6016,36 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6010
  vector signed char q8y13 = vec_xl(112, q8);
6011
  q8 += 128;
6012
 
6013
- vector signed short qv0 = vec_add(vec_mule(q2x00, q8y00), vec_mulo(q2x00, q8y00));
6014
- vector signed short qv1 = vec_add(vec_mule(q2x01, q8y01), vec_mulo(q2x01, q8y01));
6015
- vector signed short qv2 = vec_add(vec_mule(q2x02, q8y02), vec_mulo(q2x02, q8y02));
6016
- vector signed short qv3 = vec_add(vec_mule(q2x03, q8y03), vec_mulo(q2x03, q8y03));
6017
- vector signed short qv4 = vec_add(vec_mule(q2x10, q8y10), vec_mulo(q2x10, q8y10));
6018
- vector signed short qv5 = vec_add(vec_mule(q2x11, q8y11), vec_mulo(q2x11, q8y11));
6019
- vector signed short qv6 = vec_add(vec_mule(q2x12, q8y12), vec_mulo(q2x12, q8y12));
6020
- vector signed short qv7 = vec_add(vec_mule(q2x13, q8y13), vec_mulo(q2x13, q8y13));
6021
-
6022
- vector signed short vscales_h = vec_unpackh(vscales);
6023
- vector signed short vs0 = vec_splat(vscales_h, 0);
6024
- vector signed short vs1 = vec_splat(vscales_h, 1);
6025
- vector signed short vs2 = vec_splat(vscales_h, 2);
6026
- vector signed short vs3 = vec_splat(vscales_h, 3);
6027
- vector signed short vs4 = vec_splat(vscales_h, 4);
6028
- vector signed short vs5 = vec_splat(vscales_h, 5);
6029
- vector signed short vs6 = vec_splat(vscales_h, 6);
6030
- vector signed short vs7 = vec_splat(vscales_h, 7);
 
 
6031
  vscales = vec_sld(vscales, vscales, 8);
6032
 
6033
- qv0 = vec_mul(qv0, vs0);
6034
- qv1 = vec_mul(qv1, vs2);
6035
- qv2 = vec_mul(qv2, vs4);
6036
- qv3 = vec_mul(qv3, vs6);
6037
-
6038
- qv0 = vec_madd(qv4, vs1, qv0);
6039
- qv1 = vec_madd(qv5, vs3, qv1);
6040
- qv2 = vec_madd(qv6, vs5, qv2);
6041
- qv3 = vec_madd(qv7, vs7, qv3);
6042
-
6043
- vsumi0 = vec_add(vec_unpackh(qv0), vsumi0);
6044
- vsumi1 = vec_add(vec_unpackh(qv1), vsumi1);
6045
- vsumi2 = vec_add(vec_unpackh(qv2), vsumi2);
6046
- vsumi3 = vec_add(vec_unpackh(qv3), vsumi3);
6047
-
6048
- vsumi4 = vec_add(vec_unpackl(qv0), vsumi4);
6049
- vsumi5 = vec_add(vec_unpackl(qv1), vsumi5);
6050
- vsumi6 = vec_add(vec_unpackl(qv2), vsumi6);
6051
- vsumi7 = vec_add(vec_unpackl(qv3), vsumi7);
6052
  }
6053
 
6054
  vsumi0 = vec_add(vsumi0, vsumi4);
@@ -6639,6 +6636,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6639
 
6640
  #elif defined(__POWER9_VECTOR__)
6641
  const vector signed char lowMask = vec_splats((signed char)0x3);
 
 
 
6642
  const vector signed char v1 = vec_splats((signed char)0x1);
6643
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
6644
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
@@ -6656,30 +6656,33 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6656
  vector float vyd = vec_splats(y[i].d);
6657
  vector float vd = vec_mul(vxd, vyd);
6658
 
6659
- uint32_t aux[3];
6660
- uint32_t utmp[4];
6661
 
6662
- memcpy(aux, x[i].scales, 12);
6663
- utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
6664
- utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
6665
- utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
6666
- utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
 
6667
 
6668
- vector signed char vscales = (vector signed char)vec_xl( 0, utmp);
 
 
 
6669
  vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
6670
  vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);
6671
 
6672
  vscales = vec_sub(vscales, off);
6673
 
6674
- vector signed int vsumi0 = vec_splats((int32_t)0);
6675
- vector signed int vsumi1 = vec_splats((int32_t)0);
6676
- vector signed int vsumi2 = vec_splats((int32_t)0);
6677
- vector signed int vsumi3 = vec_splats((int32_t)0);
6678
- vector signed int vsumi4 = vec_splats((int32_t)0);
6679
- vector signed int vsumi5 = vec_splats((int32_t)0);
6680
- vector signed int vsumi6 = vec_splats((int32_t)0);
6681
- vector signed int vsumi7 = vec_splats((int32_t)0);
6682
-
6683
 
6684
  const uint8_t * restrict q3 = x[i].qs;
6685
  const int8_t * restrict q8 = y[i].qs;
@@ -6753,23 +6756,14 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6753
  vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
6754
  vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));
6755
 
6756
- vector signed int vsum0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
6757
- vector signed int vsum1 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
6758
- vector signed int vsum2 = vec_add(vec_mule(qv02, vs4), vec_mulo(qv02, vs4));
6759
- vector signed int vsum3 = vec_add(vec_mule(qv03, vs6), vec_mulo(qv03, vs6));
6760
- vector signed int vsum4 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
6761
- vector signed int vsum5 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
6762
- vector signed int vsum6 = vec_add(vec_mule(qv12, vs5), vec_mulo(qv12, vs5));
6763
- vector signed int vsum7 = vec_add(vec_mule(qv13, vs7), vec_mulo(qv13, vs7));
6764
-
6765
- vsumi0 = vec_add(vsum0, vsumi0);
6766
- vsumi1 = vec_add(vsum1, vsumi1);
6767
- vsumi2 = vec_add(vsum2, vsumi2);
6768
- vsumi3 = vec_add(vsum3, vsumi3);
6769
- vsumi4 = vec_add(vsum4, vsumi4);
6770
- vsumi5 = vec_add(vsum5, vsumi5);
6771
- vsumi6 = vec_add(vsum6, vsumi6);
6772
- vsumi7 = vec_add(vsum7, vsumi7);
6773
  }
6774
 
6775
  vsumi0 = vec_add(vsumi0, vsumi4);
@@ -7268,6 +7262,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7268
 
7269
  #elif defined(__POWER9_VECTOR__)
7270
  const vector signed char lowMask = vec_splats((signed char)0xF);
 
 
 
 
7271
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
7272
 
7273
  vector float vsumf0 = vec_splats(0.0f);
@@ -7286,15 +7284,24 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7286
  vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
7287
  vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
7288
 
7289
- memcpy(utmp, x[i].scales, 12);
 
 
 
7290
 
7291
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
7292
- const uint32_t uaux = utmp[1] & kmask1;
7293
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
7294
- utmp[2] = uaux;
7295
- utmp[0] &= kmask1;
 
 
 
 
 
 
 
7296
 
7297
- vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
7298
  vector signed short vscales = vec_unpackh(utmps);
7299
  vector signed short q4xmins = vec_unpackl(utmps);
7300
  vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
@@ -7310,14 +7317,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7310
  vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
7311
  vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
7312
 
7313
- vector signed int vsumi0 = vec_splats((int32_t)0);
7314
- vector signed int vsumi1 = vec_splats((int32_t)0);
7315
- vector signed int vsumi2 = vec_splats((int32_t)0);
7316
- vector signed int vsumi3 = vec_splats((int32_t)0);
7317
- vector signed int vsumi4 = vec_splats((int32_t)0);
7318
- vector signed int vsumi5 = vec_splats((int32_t)0);
7319
- vector signed int vsumi6 = vec_splats((int32_t)0);
7320
- vector signed int vsumi7 = vec_splats((int32_t)0);
7321
 
7322
  const uint8_t * restrict q4 = x[i].qs;
7323
  const int8_t * restrict q8 = y[i].qs;
@@ -7332,14 +7335,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7332
  vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
7333
  q4 += 64;
7334
 
7335
- vector signed char q4x00 = vec_and(qxs0, lowMask);
7336
- vector signed char q4x01 = vec_sr(qxs0, v4);
7337
- vector signed char q4x10 = vec_and(qxs1, lowMask);
7338
- vector signed char q4x11 = vec_sr(qxs1, v4);
7339
- vector signed char q4x20 = vec_and(qxs2, lowMask);
7340
- vector signed char q4x21 = vec_sr(qxs2, v4);
7341
- vector signed char q4x30 = vec_and(qxs3, lowMask);
7342
- vector signed char q4x31 = vec_sr(qxs3, v4);
7343
 
7344
  vector signed char q8y00 = vec_xl( 0, q8);
7345
  vector signed char q8y10 = vec_xl( 16, q8);
@@ -7351,41 +7354,33 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7351
  vector signed char q8y31 = vec_xl(112, q8);
7352
  q8 += 128;
7353
 
7354
- vector signed short qv00 = vec_add(vec_mule(q4x00, q8y00), vec_mulo(q4x00, q8y00));
7355
- vector signed short qv01 = vec_add(vec_mule(q4x01, q8y01), vec_mulo(q4x01, q8y01));
7356
- vector signed short qv10 = vec_add(vec_mule(q4x10, q8y10), vec_mulo(q4x10, q8y10));
7357
- vector signed short qv11 = vec_add(vec_mule(q4x11, q8y11), vec_mulo(q4x11, q8y11));
7358
- vector signed short qv20 = vec_add(vec_mule(q4x20, q8y20), vec_mulo(q4x20, q8y20));
7359
- vector signed short qv21 = vec_add(vec_mule(q4x21, q8y21), vec_mulo(q4x21, q8y21));
7360
- vector signed short qv30 = vec_add(vec_mule(q4x30, q8y30), vec_mulo(q4x30, q8y30));
7361
- vector signed short qv31 = vec_add(vec_mule(q4x31, q8y31), vec_mulo(q4x31, q8y31));
7362
-
7363
- vector signed short vs0 = vec_splat(vscales, 0);
7364
- vector signed short vs1 = vec_splat(vscales, 1);
7365
- vector signed short vs2 = vec_splat(vscales, 2);
7366
- vector signed short vs3 = vec_splat(vscales, 3);
 
7367
  vscales = vec_sld(vscales, vscales, 8);
7368
 
7369
- qv00 = vec_add(qv00, qv10);
7370
- qv10 = vec_add(qv01, qv11);
7371
- qv20 = vec_add(qv20, qv30);
7372
- qv30 = vec_add(qv21, qv31);
7373
 
7374
- vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
7375
- vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
7376
- vsumi2 = vec_add(vec_mule(qv10, vs1), vsumi2);
7377
- vsumi3 = vec_add(vec_mulo(qv10, vs1), vsumi3);
7378
- vsumi4 = vec_add(vec_mule(qv20, vs2), vsumi4);
7379
- vsumi5 = vec_add(vec_mulo(qv20, vs2), vsumi5);
7380
- vsumi6 = vec_add(vec_mule(qv30, vs3), vsumi6);
7381
- vsumi7 = vec_add(vec_mulo(qv30, vs3), vsumi7);
7382
  }
7383
 
7384
- vsumi0 = vec_add(vsumi0, vsumi4);
7385
- vsumi1 = vec_add(vsumi1, vsumi5);
7386
- vsumi2 = vec_add(vsumi2, vsumi6);
7387
- vsumi3 = vec_add(vsumi3, vsumi7);
7388
-
7389
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
7390
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
7391
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -7887,6 +7882,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7887
 
7888
  #elif defined(__POWER9_VECTOR__)
7889
  const vector signed char lowMask = vec_splats((signed char)0xF);
 
 
 
7890
  const vector unsigned char v1 = vec_splats((unsigned char)0x1);
7891
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
7892
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
@@ -7905,18 +7903,27 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7905
  vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
7906
  vector float vdmin = vec_mul(vxmin, vyd);
7907
 
7908
- memcpy(utmp, x[i].scales, 12);
 
 
 
7909
 
7910
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
7911
- const uint32_t uaux = utmp[1] & kmask1;
7912
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
7913
- utmp[2] = uaux;
7914
- utmp[0] &= kmask1;
 
 
 
 
 
 
 
7915
 
7916
  vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
7917
  vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
7918
 
7919
- vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
7920
  vector signed short vscales = vec_unpackh(utmps);
7921
 
7922
  vector signed short q5xmins = vec_unpackl(utmps);
@@ -7936,10 +7943,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7936
  vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
7937
  vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
7938
 
7939
- vector signed int vsumi0 = vec_splats((int32_t)0);
7940
- vector signed int vsumi1 = vec_splats((int32_t)0);
7941
- vector signed int vsumi2 = vec_splats((int32_t)0);
7942
- vector signed int vsumi3 = vec_splats((int32_t)0);
7943
 
7944
  const uint8_t * restrict q5 = x[i].qs;
7945
  const int8_t * restrict q8 = y[i].qs;
@@ -7964,10 +7971,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7964
  qxhs0 = vec_sr(qxhs0, v2);
7965
  qxhs1 = vec_sr(qxhs1, v2);
7966
 
7967
- vector signed char q5x00 = vec_or(q5h00, qxs00);
7968
- vector signed char q5x01 = vec_or(q5h01, qxs01);
7969
- vector signed char q5x10 = vec_or(q5h10, qxs10);
7970
- vector signed char q5x11 = vec_or(q5h11, qxs11);
7971
 
7972
  vector signed char q8y00 = vec_xl( 0, q8);
7973
  vector signed char q8y10 = vec_xl(16, q8);
@@ -7975,22 +7982,20 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7975
  vector signed char q8y11 = vec_xl(48, q8);
7976
  q8 += 64;
7977
 
7978
- vector signed short qv00 = vec_add(vec_mule(q5x00, q8y00), vec_mulo(q5x00, q8y00));
7979
- vector signed short qv01 = vec_add(vec_mule(q5x01, q8y01), vec_mulo(q5x01, q8y01));
7980
- vector signed short qv10 = vec_add(vec_mule(q5x10, q8y10), vec_mulo(q5x10, q8y10));
7981
- vector signed short qv11 = vec_add(vec_mule(q5x11, q8y11), vec_mulo(q5x11, q8y11));
7982
 
7983
- vector signed short vs0 = vec_splat(vscales, 0);
7984
- vector signed short vs1 = vec_splat(vscales, 1);
 
7985
  vscales = vec_sld(vscales, vscales, 12);
7986
 
7987
- qv00 = vec_add(qv00, qv10);
7988
- qv01 = vec_add(qv01, qv11);
7989
-
7990
- vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
7991
- vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
7992
- vsumi2 = vec_add(vec_mule(qv01, vs1), vsumi2);
7993
- vsumi3 = vec_add(vec_mulo(qv01, vs1), vsumi3);
7994
  }
7995
 
7996
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
@@ -8551,6 +8556,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8551
 
8552
  #elif defined(__POWER9_VECTOR__)
8553
  const vector signed char lowMask = vec_splats((signed char)0xF);
 
8554
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
8555
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
8556
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
@@ -8567,14 +8573,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8567
  vector float vyd = vec_splats(y[i].d);
8568
  vector float vd = vec_mul(vxd, vyd);
8569
 
8570
- vector signed int vsumi0 = vec_splats((int32_t)0);
8571
- vector signed int vsumi1 = vec_splats((int32_t)0);
8572
- vector signed int vsumi2 = vec_splats((int32_t)0);
8573
- vector signed int vsumi3 = vec_splats((int32_t)0);
8574
- vector signed int vsumi4 = vec_splats((int32_t)0);
8575
- vector signed int vsumi5 = vec_splats((int32_t)0);
8576
- vector signed int vsumi6 = vec_splats((int32_t)0);
8577
- vector signed int vsumi7 = vec_splats((int32_t)0);
8578
 
8579
  const uint8_t * restrict q6 = x[i].ql;
8580
  const uint8_t * restrict qh = x[i].qh;
@@ -8654,23 +8660,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8654
  vector signed short vs6 = vec_splat(vscales, 6);
8655
  vector signed short vs7 = vec_splat(vscales, 7);
8656
 
8657
- vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
8658
- vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
8659
- vsumi2 = vec_add(vec_mule(qv01, vs4), vsumi2);
8660
- vsumi3 = vec_add(vec_mulo(qv01, vs4), vsumi3);
8661
- vsumi4 = vec_add(vec_mule(qv10, vs1), vsumi4);
8662
- vsumi5 = vec_add(vec_mulo(qv10, vs1), vsumi5);
8663
- vsumi6 = vec_add(vec_mule(qv11, vs5), vsumi6);
8664
- vsumi7 = vec_add(vec_mulo(qv11, vs5), vsumi7);
8665
-
8666
- vsumi0 = vec_add(vec_mule(qv20, vs2), vsumi0);
8667
- vsumi1 = vec_add(vec_mulo(qv20, vs2), vsumi1);
8668
- vsumi2 = vec_add(vec_mule(qv21, vs6), vsumi2);
8669
- vsumi3 = vec_add(vec_mulo(qv21, vs6), vsumi3);
8670
- vsumi4 = vec_add(vec_mule(qv30, vs3), vsumi4);
8671
- vsumi5 = vec_add(vec_mulo(qv30, vs3), vsumi5);
8672
- vsumi6 = vec_add(vec_mule(qv31, vs7), vsumi6);
8673
- vsumi7 = vec_add(vec_mulo(qv31, vs7), vsumi7);
8674
  }
8675
 
8676
  vsumi0 = vec_add(vsumi0, vsumi4);
@@ -8951,6 +8948,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8951
  *s = 0.125f * hsum_float_8(accumf);
8952
 
8953
  #elif defined(__POWER9_VECTOR__)
 
8954
  vector float vsumf0 = vec_splats(0.0f);
8955
  vector float vsumf1 = vec_splats(0.0f);
8956
  vector float vsumf2 = vec_splats(0.0f);
@@ -8963,14 +8961,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8963
  vector float vyd = vec_splats(y[i].d);
8964
  vector float vd = vec_mul(vxd, vyd);
8965
 
8966
- vector signed int vsumi0 = vec_splats((int32_t)0);
8967
- vector signed int vsumi1 = vec_splats((int32_t)0);
8968
- vector signed int vsumi2 = vec_splats((int32_t)0);
8969
- vector signed int vsumi3 = vec_splats((int32_t)0);
8970
- vector signed int vsumi4 = vec_splats((int32_t)0);
8971
- vector signed int vsumi5 = vec_splats((int32_t)0);
8972
- vector signed int vsumi6 = vec_splats((int32_t)0);
8973
- vector signed int vsumi7 = vec_splats((int32_t)0);
8974
 
8975
  const uint16_t * restrict q2 = x[i].qs;
8976
  const int8_t * restrict q8 = y[i].qs;
@@ -9017,21 +9011,12 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9017
  vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
9018
  vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));
9019
 
9020
- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
9021
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
9022
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
9023
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
9024
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
9025
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
9026
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
9027
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
9028
  }
9029
 
9030
- vsumi0 = vec_add(vsumi0, vsumi4);
9031
- vsumi1 = vec_add(vsumi1, vsumi5);
9032
- vsumi2 = vec_add(vsumi2, vsumi6);
9033
- vsumi3 = vec_add(vsumi3, vsumi7);
9034
-
9035
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
9036
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
9037
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -9423,6 +9408,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9423
 
9424
  *s = 0.125f * hsum_float_8(accumf);
9425
  #elif defined(__POWER9_VECTOR__)
 
9426
  vector float vsumf0 = vec_splats(0.0f);
9427
  vector float vsumf1 = vec_splats(0.0f);
9428
  vector float vsumf2 = vec_splats(0.0f);
@@ -9435,14 +9421,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9435
  vector float vyd = vec_splats(y[i].d);
9436
  vector float vd = vec_mul(vxd, vyd);
9437
 
9438
- vector signed int vsumi0 = vec_splats((int32_t)0);
9439
- vector signed int vsumi1 = vec_splats((int32_t)0);
9440
- vector signed int vsumi2 = vec_splats((int32_t)0);
9441
- vector signed int vsumi3 = vec_splats((int32_t)0);
9442
- vector signed int vsumi4 = vec_splats((int32_t)0);
9443
- vector signed int vsumi5 = vec_splats((int32_t)0);
9444
- vector signed int vsumi6 = vec_splats((int32_t)0);
9445
- vector signed int vsumi7 = vec_splats((int32_t)0);
9446
 
9447
  const uint16_t * restrict q2 = x[i].qs;
9448
  const uint8_t * restrict sc = x[i].scales;
@@ -9490,21 +9472,12 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9490
  vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
9491
  vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
9492
 
9493
- vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
9494
- vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
9495
- vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
9496
- vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
9497
- vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
9498
- vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
9499
- vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
9500
- vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
9501
  }
9502
 
9503
- vsumi0 = vec_add(vsumi0, vsumi4);
9504
- vsumi1 = vec_add(vsumi1, vsumi5);
9505
- vsumi2 = vec_add(vsumi2, vsumi6);
9506
- vsumi3 = vec_add(vsumi3, vsumi7);
9507
-
9508
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
9509
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
9510
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -9727,6 +9700,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9727
 
9728
  static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
9729
 
 
 
9730
  vector float vsumf0 = vec_splats(0.0f);
9731
  vector float vsumf1 = vec_splats(0.0f);
9732
  vector float vsumf2 = vec_splats(0.0f);
@@ -9741,14 +9716,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9741
  vector float vyd = vec_splats(y[i].d);
9742
  vector float vd = vec_mul(vxd, vyd);
9743
 
9744
- vector signed int vsumi0 = vec_splats((int32_t)0);
9745
- vector signed int vsumi1 = vec_splats((int32_t)0);
9746
- vector signed int vsumi2 = vec_splats((int32_t)0);
9747
- vector signed int vsumi3 = vec_splats((int32_t)0);
9748
- vector signed int vsumi4 = vec_splats((int32_t)0);
9749
- vector signed int vsumi5 = vec_splats((int32_t)0);
9750
- vector signed int vsumi6 = vec_splats((int32_t)0);
9751
- vector signed int vsumi7 = vec_splats((int32_t)0);
9752
 
9753
  const uint8_t * restrict q2 = x[i].qs;
9754
  const uint8_t * restrict qh = x[i].qh;
@@ -9808,21 +9779,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9808
  vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
9809
  vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
9810
 
9811
- vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
9812
- vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
9813
- vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
9814
- vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
9815
- vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
9816
- vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
9817
- vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
9818
- vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
9819
  }
9820
 
9821
- vsumi0 = vec_add(vsumi0, vsumi4);
9822
- vsumi1 = vec_add(vsumi1, vsumi5);
9823
- vsumi2 = vec_add(vsumi2, vsumi6);
9824
- vsumi3 = vec_add(vsumi3, vsumi7);
9825
-
9826
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
9827
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
9828
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -10060,6 +10022,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10060
  #elif defined(__POWER9_VECTOR__)
10061
  const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
10062
 
 
 
10063
  vector float vsumf0 = vec_splats(0.0f);
10064
  vector float vsumf1 = vec_splats(0.0f);
10065
  vector float vsumf2 = vec_splats(0.0f);
@@ -10070,14 +10034,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10070
  vector float vyd = vec_splats(y[i].d);
10071
  vector float vd = vec_mul(vxd, vyd);
10072
 
10073
- vector signed int vsumi0 = vec_splats((int32_t)0);
10074
- vector signed int vsumi1 = vec_splats((int32_t)0);
10075
- vector signed int vsumi2 = vec_splats((int32_t)0);
10076
- vector signed int vsumi3 = vec_splats((int32_t)0);
10077
- vector signed int vsumi4 = vec_splats((int32_t)0);
10078
- vector signed int vsumi5 = vec_splats((int32_t)0);
10079
- vector signed int vsumi6 = vec_splats((int32_t)0);
10080
- vector signed int vsumi7 = vec_splats((int32_t)0);
10081
 
10082
  const uint8_t * restrict q3 = x[i].qs;
10083
  const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
@@ -10122,21 +10082,12 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10122
  vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
10123
  vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
10124
 
10125
- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
10126
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
10127
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
10128
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
10129
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
10130
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
10131
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
10132
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
10133
  }
10134
 
10135
- vsumi0 = vec_add(vsumi0, vsumi4);
10136
- vsumi1 = vec_add(vsumi1, vsumi5);
10137
- vsumi2 = vec_add(vsumi2, vsumi6);
10138
- vsumi3 = vec_add(vsumi3, vsumi7);
10139
-
10140
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10141
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10142
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -10426,6 +10377,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10426
 
10427
  static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
10428
 
 
 
10429
  vector float vsumf0 = vec_splats(0.0f);
10430
  vector float vsumf1 = vec_splats(0.0f);
10431
  vector float vsumf2 = vec_splats(0.0f);
@@ -10446,14 +10399,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10446
  const uint8_t * restrict sc = x[i].scales;
10447
  const int8_t * restrict q8 = y[i].qs;
10448
 
10449
- vector signed int vsumi0 = vec_splats((int32_t)0);
10450
- vector signed int vsumi1 = vec_splats((int32_t)0);
10451
- vector signed int vsumi2 = vec_splats((int32_t)0);
10452
- vector signed int vsumi3 = vec_splats((int32_t)0);
10453
- vector signed int vsumi4 = vec_splats((int32_t)0);
10454
- vector signed int vsumi5 = vec_splats((int32_t)0);
10455
- vector signed int vsumi6 = vec_splats((int32_t)0);
10456
- vector signed int vsumi7 = vec_splats((int32_t)0);
10457
 
10458
  for (int j = 0; j < QK_K/32; j += 2) {
10459
  __builtin_prefetch(q3, 0, 1);
@@ -10507,21 +10456,12 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10507
  vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
10508
  vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
10509
 
10510
- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
10511
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
10512
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
10513
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
10514
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
10515
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
10516
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
10517
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
10518
  }
10519
 
10520
- vsumi0 = vec_add(vsumi0, vsumi4);
10521
- vsumi1 = vec_add(vsumi1, vsumi5);
10522
- vsumi2 = vec_add(vsumi2, vsumi6);
10523
- vsumi3 = vec_add(vsumi3, vsumi7);
10524
-
10525
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10526
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10527
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -10802,10 +10742,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10802
  vector signed int vsumi1 = vec_splats((int32_t)0);
10803
  vector signed int vsumi2 = vec_splats((int32_t)0);
10804
  vector signed int vsumi3 = vec_splats((int32_t)0);
10805
- vector signed int vsumi4 = vec_splats((int32_t)0);
10806
- vector signed int vsumi5 = vec_splats((int32_t)0);
10807
- vector signed int vsumi6 = vec_splats((int32_t)0);
10808
- vector signed int vsumi7 = vec_splats((int32_t)0);
10809
  vector signed int vsumi8 = vec_splats((int32_t)0);
10810
 
10811
  const uint8_t * restrict q1 = x[i].qs;
@@ -10847,14 +10783,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10847
  vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
10848
  vector signed short vscales = vec_sld(vscales23, vscales01, 8);
10849
 
10850
- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
10851
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
10852
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
10853
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
10854
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
10855
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
10856
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
10857
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
10858
 
10859
  vector signed short q8ysums = vec_xl_len(qs, 8);
10860
  qs += 4;
@@ -10869,11 +10801,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10869
  vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
10870
  }
10871
 
10872
- vsumi0 = vec_add(vsumi0, vsumi4);
10873
- vsumi1 = vec_add(vsumi1, vsumi5);
10874
- vsumi2 = vec_add(vsumi2, vsumi6);
10875
- vsumi3 = vec_add(vsumi3, vsumi7);
10876
-
10877
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10878
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10879
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -11267,6 +11194,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
11267
 
11268
  #elif defined(__POWER9_VECTOR__)
11269
  const vector signed char lowMask = vec_splats((signed char)0xF);
 
11270
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
11271
 
11272
  vector float vsumf0 = vec_splats(0.0f);
@@ -11297,8 +11225,11 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
11297
  vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
11298
  vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
11299
 
11300
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
11301
- vector signed int vsumi1 = vec_add(vec_unpackh(qv1), vec_unpackl(qv1));
 
 
 
11302
 
11303
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
11304
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
@@ -11453,6 +11384,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
11453
 
11454
  #elif defined(__POWER9_VECTOR__)
11455
  const vector signed char lowMask = vec_splats((signed char)0xF);
 
11456
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
11457
 
11458
  vector float vsumf0 = vec_splats(0.0f);
@@ -11468,14 +11400,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
11468
  vector float vyd = vec_splats(y[ibl].d);
11469
  vector float vd = vec_mul(vxd, vyd);
11470
 
11471
- vector signed int vsumi0 = vec_splats((int32_t)0);
11472
- vector signed int vsumi1 = vec_splats((int32_t)0);
11473
- vector signed int vsumi2 = vec_splats((int32_t)0);
11474
- vector signed int vsumi3 = vec_splats((int32_t)0);
11475
- vector signed int vsumi4 = vec_splats((int32_t)0);
11476
- vector signed int vsumi5 = vec_splats((int32_t)0);
11477
- vector signed int vsumi6 = vec_splats((int32_t)0);
11478
- vector signed int vsumi7 = vec_splats((int32_t)0);
11479
 
11480
  uint16_t h = x[ibl].scales_h;
11481
 
@@ -11520,21 +11448,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
11520
  vector signed short vscales01 = vec_splats((int16_t)ls0);
11521
  vector signed short vscales23 = vec_splats((int16_t)ls1);
11522
 
11523
- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
11524
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
11525
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
11526
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
11527
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
11528
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
11529
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
11530
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
11531
  }
11532
 
11533
- vsumi0 = vec_add(vsumi0, vsumi4);
11534
- vsumi1 = vec_add(vsumi1, vsumi5);
11535
- vsumi2 = vec_add(vsumi2, vsumi6);
11536
- vsumi3 = vec_add(vsumi3, vsumi7);
11537
-
11538
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
11539
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
11540
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
 
1076
  }
1077
  vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
1078
  vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
1079
+ }
1080
 
1081
  #elif defined(__loongarch_asx)
1082
  for (int i = 0; i < nb; i++) {
 
1436
  accv = vec_add(accv, vec_sld(accv, accv, 4));
1437
  accv = vec_add(accv, vec_sld(accv, accv, 8));
1438
  y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
1439
+ }
1440
 
1441
  #elif defined(__loongarch_asx)
1442
  for (int i = 0; i < nb; i++) {
 
4113
 
4114
  #elif defined(__POWER9_VECTOR__)
4115
  const vector signed char lowMask = vec_splats((signed char)0xF);
4116
+ const vector signed int v0 = vec_splats((int32_t)0);
4117
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
4118
  const vector signed char v8 = vec_splats((signed char)0x8);
4119
 
4120
  vector float vsumf0 = vec_splats(0.0f);
4121
 
4122
+ #pragma GCC unroll 8
4123
  for (int i = 0; i < nb; i++) {
4124
  __builtin_prefetch(x[i].qs, 0, 1);
4125
  __builtin_prefetch(y[i].qs, 0, 1);
 
4141
  vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
4142
  vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
4143
 
4144
+ vector signed int vsumi0 = v0;
4145
 
4146
+ vsumi0 = vec_sum4s(qv0, vsumi0);
4147
+ vsumi0 = vec_sum4s(qv1, vsumi0);
4148
 
4149
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
4150
  }
 
4518
 
4519
  #elif defined(__POWER9_VECTOR__)
4520
  const vector signed char lowMask = vec_splats((signed char)0xF);
4521
+ const vector signed int v0 = vec_splats((int32_t)0);
4522
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
4523
 
4524
  vector float vsumf0 = vec_splats(0.0f);
 
4540
  vector signed char q8y0 = vec_xl( 0, y[i].qs);
4541
  vector signed char q8y1 = vec_xl(16, y[i].qs);
4542
 
4543
+ vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask);
4544
+ vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4);
 
 
 
4545
 
4546
+ vector signed int vsumi0 = v0;
4547
 
4548
+ vsumi0 = vec_msum(q8y0, q4x0, vsumi0);
4549
+ vsumi0 = vec_msum(q8y1, q4x1, vsumi0);
4550
 
4551
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
4552
  }
 
5248
 
5249
  #elif defined(__POWER9_VECTOR__)
5250
  const vector signed char lowMask = vec_splats((signed char)0xF);
5251
+ const vector signed int v0 = vec_splats((int32_t)0);
5252
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
5253
 
5254
  vector float vsumf0 = vec_splats(0.0f);
 
5274
 
5275
  vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
5276
 
5277
+ vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0);
5278
+ vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1);
5279
 
5280
  vector signed char q8y0 = vec_xl( 0, y[i].qs);
5281
  vector signed char q8y1 = vec_xl( 16, y[i].qs);
5282
 
5283
+ vector signed int vsumi0 = v0;
 
 
 
5284
 
5285
+ vsumi0 = vec_msum(q8y0, q5x0, vsumi0);
5286
+ vsumi0 = vec_msum(q8y1, q5x1, vsumi0);
5287
 
5288
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
5289
  }
 
5523
  *s = sumf;
5524
 
5525
  #elif defined(__POWER9_VECTOR__)
5526
+ const vector signed int v0 = vec_splats((int32_t)0);
5527
  vector float vsumf0 = vec_splats(0.0f);
5528
 
5529
+ #pragma GCC unroll 8
5530
  for (int i = 0; i < nb; i++) {
5531
  __builtin_prefetch(x[i].qs, 0, 1);
5532
  __builtin_prefetch(y[i].qs, 0, 1);
 
5545
  vector signed short qv2 = vec_mule(q8x1, q8y1);
5546
  vector signed short qv3 = vec_mulo(q8x1, q8y1);
5547
 
5548
+ vector signed int vsumi0 = v0;
5549
+ vector signed int vsumi1 = v0;
 
 
5550
 
5551
+ vsumi0 = vec_sum4s(qv0, vsumi0);
5552
+ vsumi1 = vec_sum4s(qv1, vsumi1);
5553
+ vsumi0 = vec_sum4s(qv2, vsumi0);
5554
+ vsumi1 = vec_sum4s(qv3, vsumi1);
5555
 
5556
  vsumi0 = vec_add(vsumi0, vsumi1);
5557
 
 
5939
  #elif defined(__POWER9_VECTOR__)
5940
  const vector signed char lowMask = vec_splats((signed char)0x3);
5941
  const vector signed char lowScaleMask = vec_splats((signed char)0xF);
5942
+ const vector int v0 = vec_splats((int32_t)0);
5943
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
5944
  const vector unsigned char v6 = vec_splats((unsigned char)0x6);
5945
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
 
5977
  vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
5978
  vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
5979
 
5980
+ vector signed int vsumi0 = v0;
5981
+ vector signed int vsumi1 = v0;
5982
+ vector signed int vsumi2 = v0;
5983
+ vector signed int vsumi3 = v0;
5984
+ vector signed int vsumi4 = v0;
5985
+ vector signed int vsumi5 = v0;
5986
+ vector signed int vsumi6 = v0;
5987
+ vector signed int vsumi7 = v0;
5988
 
5989
+ const uint8_t * restrict q2 = x[i].qs;
5990
+ const int8_t * restrict q8 = y[i].qs;
5991
 
5992
  for (int j = 0; j < QK_K/128; ++j) {
5993
  __builtin_prefetch(q2, 0, 1);
 
5997
  vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
5998
  q2 += 32;
5999
 
6000
+ vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask);
6001
+ vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask);
6002
+ vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask);
6003
+ vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask);
6004
+ vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask);
6005
+ vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask);
6006
+ vector unsigned char q2x12 = (vector unsigned char)vec_and(vec_sr(qxs1, v4), lowMask);
6007
+ vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask);
6008
 
6009
  vector signed char q8y00 = vec_xl( 0, q8);
6010
  vector signed char q8y10 = vec_xl( 16, q8);
 
6016
  vector signed char q8y13 = vec_xl(112, q8);
6017
  q8 += 128;
6018
 
6019
+ vector signed int qv0 = vec_msum(q8y00, q2x00, v0);
6020
+ vector signed int qv1 = vec_msum(q8y01, q2x01, v0);
6021
+ vector signed int qv2 = vec_msum(q8y02, q2x02, v0);
6022
+ vector signed int qv3 = vec_msum(q8y03, q2x03, v0);
6023
+ vector signed int qv4 = vec_msum(q8y10, q2x10, v0);
6024
+ vector signed int qv5 = vec_msum(q8y11, q2x11, v0);
6025
+ vector signed int qv6 = vec_msum(q8y12, q2x12, v0);
6026
+ vector signed int qv7 = vec_msum(q8y13, q2x13, v0);
6027
+
6028
+ vector signed short vscales_07 = vec_unpackh(vscales);
6029
+ vector signed int vscales_03 = vec_unpackh(vscales_07);
6030
+ vector signed int vscales_47 = vec_unpackl(vscales_07);
6031
+ vector signed int vs0 = vec_splat(vscales_03, 0);
6032
+ vector signed int vs1 = vec_splat(vscales_03, 1);
6033
+ vector signed int vs2 = vec_splat(vscales_03, 2);
6034
+ vector signed int vs3 = vec_splat(vscales_03, 3);
6035
+ vector signed int vs4 = vec_splat(vscales_47, 0);
6036
+ vector signed int vs5 = vec_splat(vscales_47, 1);
6037
+ vector signed int vs6 = vec_splat(vscales_47, 2);
6038
+ vector signed int vs7 = vec_splat(vscales_47, 3);
6039
  vscales = vec_sld(vscales, vscales, 8);
6040
 
6041
+ vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0);
6042
+ vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1);
6043
+ vsumi2 = vec_add(vec_mul(qv2, vs4), vsumi2);
6044
+ vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3);
6045
+ vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4);
6046
+ vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5);
6047
+ vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6);
6048
+ vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7);
 
 
 
 
 
 
 
 
 
 
 
6049
  }
6050
 
6051
  vsumi0 = vec_add(vsumi0, vsumi4);
 
6636
 
6637
  #elif defined(__POWER9_VECTOR__)
6638
  const vector signed char lowMask = vec_splats((signed char)0x3);
6639
+ const vector signed char lowMask1 = vec_splats((int8_t)0xf);
6640
+ const vector signed char lowMask2 = vec_splats((int8_t)0x30);
6641
+ const vector int v0 = vec_splats((int32_t)0);
6642
  const vector signed char v1 = vec_splats((signed char)0x1);
6643
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
6644
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
 
6656
  vector float vyd = vec_splats(y[i].d);
6657
  vector float vd = vec_mul(vxd, vyd);
6658
 
6659
+ UNUSED(kmask1);
6660
+ UNUSED(kmask2);
6661
 
6662
+ vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
6663
+ vector signed char u1 = vec_and(u0, lowMask1);
6664
+ vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
6665
+ vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2));
6666
+ vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4);
6667
+ vector signed char u31 = vec_and(u3, lowMask2);
6668
 
6669
+ u1 = vec_or(u1, u30);
6670
+ u2 = vec_or(vec_sr(u0, v4), u31);
6671
+
6672
+ vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2);
6673
  vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
6674
  vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);
6675
 
6676
  vscales = vec_sub(vscales, off);
6677
 
6678
+ vector signed int vsumi0 = v0;
6679
+ vector signed int vsumi1 = v0;
6680
+ vector signed int vsumi2 = v0;
6681
+ vector signed int vsumi3 = v0;
6682
+ vector signed int vsumi4 = v0;
6683
+ vector signed int vsumi5 = v0;
6684
+ vector signed int vsumi6 = v0;
6685
+ vector signed int vsumi7 = v0;
 
6686
 
6687
  const uint8_t * restrict q3 = x[i].qs;
6688
  const int8_t * restrict q8 = y[i].qs;
 
6756
  vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
6757
  vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));
6758
 
6759
+ vsumi0 = vec_msum(qv00, vs0, vsumi0);
6760
+ vsumi1 = vec_msum(qv01, vs2, vsumi1);
6761
+ vsumi2 = vec_msum(qv02, vs4, vsumi2);
6762
+ vsumi3 = vec_msum(qv03, vs6, vsumi3);
6763
+ vsumi4 = vec_msum(qv10, vs1, vsumi4);
6764
+ vsumi5 = vec_msum(qv11, vs3, vsumi5);
6765
+ vsumi6 = vec_msum(qv12, vs5, vsumi6);
6766
+ vsumi7 = vec_msum(qv13, vs7, vsumi7);
 
 
 
 
 
 
 
 
 
6767
  }
6768
 
6769
  vsumi0 = vec_add(vsumi0, vsumi4);
 
7262
 
7263
  #elif defined(__POWER9_VECTOR__)
7264
  const vector signed char lowMask = vec_splats((signed char)0xF);
7265
+ const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
7266
+ const vector signed char lowMask2 = vec_splats((int8_t)0x30);
7267
+ const vector int v0 = vec_splats((int32_t)0);
7268
+ const vector unsigned char v2 = vec_splats((uint8_t)2);
7269
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
7270
 
7271
  vector float vsumf0 = vec_splats(0.0f);
 
7284
  vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
7285
  vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
7286
 
7287
+ UNUSED(kmask1);
7288
+ UNUSED(kmask2);
7289
+ UNUSED(kmask3);
7290
+ UNUSED(utmp);
7291
 
7292
+ vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
7293
+ vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
7294
+ vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
7295
+ vector signed char u3 = vec_sr(u2, v4);
7296
+
7297
+ vector signed char u30 = u1;
7298
+ vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
7299
+
7300
+ u1 = vec_and(u0, lowMask1);
7301
+ u2 = vec_or(u30, u31);
7302
+
7303
+ vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
7304
 
 
7305
  vector signed short vscales = vec_unpackh(utmps);
7306
  vector signed short q4xmins = vec_unpackl(utmps);
7307
  vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
 
7317
  vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
7318
  vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
7319
 
7320
+ vector signed int vsumi0 = v0;
7321
+ vector signed int vsumi1 = v0;
7322
+ vector signed int vsumi2 = v0;
7323
+ vector signed int vsumi3 = v0;
 
 
 
 
7324
 
7325
  const uint8_t * restrict q4 = x[i].qs;
7326
  const int8_t * restrict q8 = y[i].qs;
 
7335
  vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
7336
  q4 += 64;
7337
 
7338
+ vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask);
7339
+ vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4);
7340
+ vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask);
7341
+ vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4);
7342
+ vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask);
7343
+ vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4);
7344
+ vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask);
7345
+ vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4);
7346
 
7347
  vector signed char q8y00 = vec_xl( 0, q8);
7348
  vector signed char q8y10 = vec_xl( 16, q8);
 
7354
  vector signed char q8y31 = vec_xl(112, q8);
7355
  q8 += 128;
7356
 
7357
+ vector signed int qv00 = vec_msum(q8y00, q4x00, v0);
7358
+ vector signed int qv01 = vec_msum(q8y01, q4x01, v0);
7359
+ vector signed int qv10 = vec_msum(q8y10, q4x10, v0);
7360
+ vector signed int qv11 = vec_msum(q8y11, q4x11, v0);
7361
+ vector signed int qv20 = vec_msum(q8y20, q4x20, v0);
7362
+ vector signed int qv21 = vec_msum(q8y21, q4x21, v0);
7363
+ vector signed int qv30 = vec_msum(q8y30, q4x30, v0);
7364
+ vector signed int qv31 = vec_msum(q8y31, q4x31, v0);
7365
+
7366
+ vector signed int vscales_h = vec_unpackh(vscales);
7367
+ vector signed int vs0 = vec_splat(vscales_h, 0);
7368
+ vector signed int vs1 = vec_splat(vscales_h, 1);
7369
+ vector signed int vs2 = vec_splat(vscales_h, 2);
7370
+ vector signed int vs3 = vec_splat(vscales_h, 3);
7371
  vscales = vec_sld(vscales, vscales, 8);
7372
 
7373
+ vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
7374
+ vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1);
7375
+ vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2);
7376
+ vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3);
7377
 
7378
+ vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0);
7379
+ vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1);
7380
+ vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2);
7381
+ vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3);
 
 
 
 
7382
  }
7383
 
 
 
 
 
 
7384
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
7385
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
7386
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
 
7882
 
7883
  #elif defined(__POWER9_VECTOR__)
7884
  const vector signed char lowMask = vec_splats((signed char)0xF);
7885
+ const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
7886
+ const vector signed char lowMask2 = vec_splats((int8_t)0x30);
7887
+ const vector int v0 = vec_splats((int32_t)0);
7888
  const vector unsigned char v1 = vec_splats((unsigned char)0x1);
7889
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
7890
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
 
7903
  vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
7904
  vector float vdmin = vec_mul(vxmin, vyd);
7905
 
7906
+ UNUSED(kmask1);
7907
+ UNUSED(kmask2);
7908
+ UNUSED(kmask3);
7909
+ UNUSED(utmp);
7910
 
7911
+ vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
7912
+ vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
7913
+ vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
7914
+ vector signed char u3 = vec_sr(u2, v4);
7915
+
7916
+ vector signed char u30 = u1;
7917
+ vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
7918
+
7919
+ u1 = vec_and(u0, lowMask1);
7920
+ u2 = vec_or(u30, u31);
7921
+
7922
+ vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
7923
 
7924
  vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
7925
  vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
7926
 
 
7927
  vector signed short vscales = vec_unpackh(utmps);
7928
 
7929
  vector signed short q5xmins = vec_unpackl(utmps);
 
7943
  vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
7944
  vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
7945
 
7946
+ vector signed int vsumi0 = v0;
7947
+ vector signed int vsumi1 = v0;
7948
+ vector signed int vsumi2 = v0;
7949
+ vector signed int vsumi3 = v0;
7950
 
7951
  const uint8_t * restrict q5 = x[i].qs;
7952
  const int8_t * restrict q8 = y[i].qs;
 
7971
  qxhs0 = vec_sr(qxhs0, v2);
7972
  qxhs1 = vec_sr(qxhs1, v2);
7973
 
7974
+ vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00);
7975
+ vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01);
7976
+ vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10);
7977
+ vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11);
7978
 
7979
  vector signed char q8y00 = vec_xl( 0, q8);
7980
  vector signed char q8y10 = vec_xl(16, q8);
 
7982
  vector signed char q8y11 = vec_xl(48, q8);
7983
  q8 += 64;
7984
 
7985
+ vector signed int qv00 = vec_msum(q8y00, q5x00, v0);
7986
+ vector signed int qv01 = vec_msum(q8y01, q5x01, v0);
7987
+ vector signed int qv10 = vec_msum(q8y10, q5x10, v0);
7988
+ vector signed int qv11 = vec_msum(q8y11, q5x11, v0);
7989
 
7990
+ vector signed int vscales_h = vec_unpackh(vscales);
7991
+ vector signed int vs0 = vec_splat(vscales_h, 0);
7992
+ vector signed int vs1 = vec_splat(vscales_h, 1);
7993
  vscales = vec_sld(vscales, vscales, 12);
7994
 
7995
+ vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
7996
+ vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1);
7997
+ vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2);
7998
+ vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3);
 
 
 
7999
  }
8000
 
8001
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
 
8556
 
8557
  #elif defined(__POWER9_VECTOR__)
8558
  const vector signed char lowMask = vec_splats((signed char)0xF);
8559
+ const vector int v0 = vec_splats((int32_t)0);
8560
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
8561
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
8562
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
 
8573
  vector float vyd = vec_splats(y[i].d);
8574
  vector float vd = vec_mul(vxd, vyd);
8575
 
8576
+ vector signed int vsumi0 = v0;
8577
+ vector signed int vsumi1 = v0;
8578
+ vector signed int vsumi2 = v0;
8579
+ vector signed int vsumi3 = v0;
8580
+ vector signed int vsumi4 = v0;
8581
+ vector signed int vsumi5 = v0;
8582
+ vector signed int vsumi6 = v0;
8583
+ vector signed int vsumi7 = v0;
8584
 
8585
  const uint8_t * restrict q6 = x[i].ql;
8586
  const uint8_t * restrict qh = x[i].qh;
 
8660
  vector signed short vs6 = vec_splat(vscales, 6);
8661
  vector signed short vs7 = vec_splat(vscales, 7);
8662
 
8663
+ vsumi0 = vec_msum(qv00, vs0, vsumi0);
8664
+ vsumi1 = vec_msum(qv01, vs4, vsumi1);
8665
+ vsumi2 = vec_msum(qv10, vs1, vsumi2);
8666
+ vsumi3 = vec_msum(qv11, vs5, vsumi3);
8667
+ vsumi4 = vec_msum(qv20, vs2, vsumi4);
8668
+ vsumi5 = vec_msum(qv21, vs6, vsumi5);
8669
+ vsumi6 = vec_msum(qv30, vs3, vsumi6);
8670
+ vsumi7 = vec_msum(qv31, vs7, vsumi7);
 
 
 
 
 
 
 
 
 
8671
  }
8672
 
8673
  vsumi0 = vec_add(vsumi0, vsumi4);
 
8948
  *s = 0.125f * hsum_float_8(accumf);
8949
 
8950
  #elif defined(__POWER9_VECTOR__)
8951
+ const vector int v0 = vec_splats((int32_t)0);
8952
  vector float vsumf0 = vec_splats(0.0f);
8953
  vector float vsumf1 = vec_splats(0.0f);
8954
  vector float vsumf2 = vec_splats(0.0f);
 
8961
  vector float vyd = vec_splats(y[i].d);
8962
  vector float vd = vec_mul(vxd, vyd);
8963
 
8964
+ vector signed int vsumi0 = v0;
8965
+ vector signed int vsumi1 = v0;
8966
+ vector signed int vsumi2 = v0;
8967
+ vector signed int vsumi3 = v0;
 
 
 
 
8968
 
8969
  const uint16_t * restrict q2 = x[i].qs;
8970
  const int8_t * restrict q8 = y[i].qs;
 
9011
  vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
9012
  vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));
9013
 
9014
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
9015
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
9016
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
9017
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
 
 
 
 
9018
  }
9019
 
 
 
 
 
 
9020
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
9021
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
9022
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
 
9408
 
9409
  *s = 0.125f * hsum_float_8(accumf);
9410
  #elif defined(__POWER9_VECTOR__)
9411
+ const vector int v0 = vec_splats((int32_t)0);
9412
  vector float vsumf0 = vec_splats(0.0f);
9413
  vector float vsumf1 = vec_splats(0.0f);
9414
  vector float vsumf2 = vec_splats(0.0f);
 
9421
  vector float vyd = vec_splats(y[i].d);
9422
  vector float vd = vec_mul(vxd, vyd);
9423
 
9424
+ vector signed int vsumi0 = v0;
9425
+ vector signed int vsumi1 = v0;
9426
+ vector signed int vsumi2 = v0;
9427
+ vector signed int vsumi3 = v0;
 
 
 
 
9428
 
9429
  const uint16_t * restrict q2 = x[i].qs;
9430
  const uint8_t * restrict sc = x[i].scales;
 
9472
  vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
9473
  vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
9474
 
9475
+ vsumi0 = vec_msum(qv0, vscales0, vsumi0);
9476
+ vsumi1 = vec_msum(qv1, vscales1, vsumi1);
9477
+ vsumi2 = vec_msum(qv2, vscales2, vsumi2);
9478
+ vsumi3 = vec_msum(qv3, vscales3, vsumi3);
 
 
 
 
9479
  }
9480
 
 
 
 
 
 
9481
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
9482
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
9483
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
 
9700
 
9701
  static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
9702
 
9703
+ const vector int v0 = vec_splats((int32_t)0);
9704
+
9705
  vector float vsumf0 = vec_splats(0.0f);
9706
  vector float vsumf1 = vec_splats(0.0f);
9707
  vector float vsumf2 = vec_splats(0.0f);
 
9716
  vector float vyd = vec_splats(y[i].d);
9717
  vector float vd = vec_mul(vxd, vyd);
9718
 
9719
+ vector signed int vsumi0 = v0;
9720
+ vector signed int vsumi1 = v0;
9721
+ vector signed int vsumi2 = v0;
9722
+ vector signed int vsumi3 = v0;
 
 
 
 
9723
 
9724
  const uint8_t * restrict q2 = x[i].qs;
9725
  const uint8_t * restrict qh = x[i].qh;
 
9779
  vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
9780
  vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
9781
 
9782
+ vsumi0 = vec_msum(qv0, vscales0, vsumi0);
9783
+ vsumi1 = vec_msum(qv1, vscales1, vsumi1);
9784
+ vsumi2 = vec_msum(qv2, vscales2, vsumi2);
9785
+ vsumi3 = vec_msum(qv3, vscales3, vsumi3);
 
 
 
 
9786
  }
9787
 
 
 
 
 
 
9788
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
9789
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
9790
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
 
10022
  #elif defined(__POWER9_VECTOR__)
10023
  const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
10024
 
10025
+ const vector int v0 = vec_splats((int32_t)0);
10026
+
10027
  vector float vsumf0 = vec_splats(0.0f);
10028
  vector float vsumf1 = vec_splats(0.0f);
10029
  vector float vsumf2 = vec_splats(0.0f);
 
10034
  vector float vyd = vec_splats(y[i].d);
10035
  vector float vd = vec_mul(vxd, vyd);
10036
 
10037
+ vector signed int vsumi0 = v0;
10038
+ vector signed int vsumi1 = v0;
10039
+ vector signed int vsumi2 = v0;
10040
+ vector signed int vsumi3 = v0;
 
 
 
 
10041
 
10042
  const uint8_t * restrict q3 = x[i].qs;
10043
  const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
 
10082
  vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
10083
  vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
10084
 
10085
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
10086
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
10087
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
10088
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
 
 
 
 
10089
  }
10090
 
 
 
 
 
 
10091
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10092
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10093
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
 
10377
 
10378
  static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
10379
 
10380
+ const vector int v0 = vec_splats((int32_t)0);
10381
+
10382
  vector float vsumf0 = vec_splats(0.0f);
10383
  vector float vsumf1 = vec_splats(0.0f);
10384
  vector float vsumf2 = vec_splats(0.0f);
 
10399
  const uint8_t * restrict sc = x[i].scales;
10400
  const int8_t * restrict q8 = y[i].qs;
10401
 
10402
+ vector signed int vsumi0 = v0;
10403
+ vector signed int vsumi1 = v0;
10404
+ vector signed int vsumi2 = v0;
10405
+ vector signed int vsumi3 = v0;
 
 
 
 
10406
 
10407
  for (int j = 0; j < QK_K/32; j += 2) {
10408
  __builtin_prefetch(q3, 0, 1);
 
10456
  vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
10457
  vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
10458
 
10459
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
10460
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
10461
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
10462
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
 
 
 
 
10463
  }
10464
 
 
 
 
 
 
10465
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10466
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10467
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
 
10742
  vector signed int vsumi1 = vec_splats((int32_t)0);
10743
  vector signed int vsumi2 = vec_splats((int32_t)0);
10744
  vector signed int vsumi3 = vec_splats((int32_t)0);
 
 
 
 
10745
  vector signed int vsumi8 = vec_splats((int32_t)0);
10746
 
10747
  const uint8_t * restrict q1 = x[i].qs;
 
10783
  vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
10784
  vector signed short vscales = vec_sld(vscales23, vscales01, 8);
10785
 
10786
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
10787
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
10788
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
10789
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
 
 
 
 
10790
 
10791
  vector signed short q8ysums = vec_xl_len(qs, 8);
10792
  qs += 4;
 
10801
  vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
10802
  }
10803
 
 
 
 
 
 
10804
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10805
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10806
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
 
11194
 
11195
  #elif defined(__POWER9_VECTOR__)
11196
  const vector signed char lowMask = vec_splats((signed char)0xF);
11197
+ const vector signed int v0 = vec_splats((int32_t)0);
11198
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
11199
 
11200
  vector float vsumf0 = vec_splats(0.0f);
 
11225
  vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
11226
  vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
11227
 
11228
+ vector signed int vsumi0 = v0;
11229
+ vector signed int vsumi1 = v0;
11230
+
11231
+ vsumi0 = vec_sum4s(qv0, vsumi0);
11232
+ vsumi1 = vec_sum4s(qv1, vsumi1);
11233
 
11234
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
11235
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
 
11384
 
11385
  #elif defined(__POWER9_VECTOR__)
11386
  const vector signed char lowMask = vec_splats((signed char)0xF);
11387
+ const vector int v0 = vec_splats((int32_t)0);
11388
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
11389
 
11390
  vector float vsumf0 = vec_splats(0.0f);
 
11400
  vector float vyd = vec_splats(y[ibl].d);
11401
  vector float vd = vec_mul(vxd, vyd);
11402
 
11403
+ vector signed int vsumi0 = v0;
11404
+ vector signed int vsumi1 = v0;
11405
+ vector signed int vsumi2 = v0;
11406
+ vector signed int vsumi3 = v0;
 
 
 
 
11407
 
11408
  uint16_t h = x[ibl].scales_h;
11409
 
 
11448
  vector signed short vscales01 = vec_splats((int16_t)ls0);
11449
  vector signed short vscales23 = vec_splats((int16_t)ls1);
11450
 
11451
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
11452
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
11453
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
11454
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
 
 
 
 
11455
  }
11456
 
 
 
 
 
 
11457
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
11458
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
11459
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);