Spaces:
Running
Running
ggml : fix indentation
Browse files
ggml.c
CHANGED
|
@@ -140,9 +140,6 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
|
| 140 |
#include <immintrin.h>
|
| 141 |
#endif
|
| 142 |
|
| 143 |
-
// FP16 <-> FP32
|
| 144 |
-
// ref: https://github.com/Maratyszcza/FP16
|
| 145 |
-
|
| 146 |
#ifdef __F16C__
|
| 147 |
float ggml_fp16_to_fp32(ggml_fp16_t h) {
|
| 148 |
return _cvtsh_ss(h);
|
|
@@ -156,6 +153,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
|
|
| 156 |
|
| 157 |
#else
|
| 158 |
|
|
|
|
|
|
|
|
|
|
| 159 |
static inline float fp32_from_bits(uint32_t w) {
|
| 160 |
union {
|
| 161 |
uint32_t as_bits;
|
|
@@ -439,10 +439,10 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
|
|
| 439 |
y2 = _mm256_loadu_ps(y + i + 16);
|
| 440 |
y3 = _mm256_loadu_ps(y + i + 24);
|
| 441 |
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
}
|
| 447 |
|
| 448 |
sum0 = _mm256_add_ps(sum0, sum1);
|
|
@@ -680,10 +680,10 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|
| 680 |
y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16)));
|
| 681 |
y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24)));
|
| 682 |
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
}
|
| 688 |
|
| 689 |
const __m256 sum01 = _mm256_add_ps(sum0, sum1);
|
|
@@ -849,10 +849,10 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
|
|
| 849 |
y2 = _mm256_loadu_ps(y + i + 16);
|
| 850 |
y3 = _mm256_loadu_ps(y + i + 24);
|
| 851 |
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
|
| 857 |
_mm256_storeu_ps(y + i + 0, y0);
|
| 858 |
_mm256_storeu_ps(y + i + 8, y1);
|
|
@@ -1046,10 +1046,10 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
|
|
| 1046 |
x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
|
| 1047 |
x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
|
| 1048 |
|
| 1049 |
-
|
| 1050 |
-
|
| 1051 |
-
|
| 1052 |
-
|
| 1053 |
|
| 1054 |
_mm_storeu_si128((__m128i*)(y + i + 0 ), _mm256_cvtps_ph(y0, 0));
|
| 1055 |
_mm_storeu_si128((__m128i*)(y + i + 8 ), _mm256_cvtps_ph(y1, 0));
|
|
|
|
| 140 |
#include <immintrin.h>
|
| 141 |
#endif
|
| 142 |
|
|
|
|
|
|
|
|
|
|
| 143 |
#ifdef __F16C__
|
| 144 |
float ggml_fp16_to_fp32(ggml_fp16_t h) {
|
| 145 |
return _cvtsh_ss(h);
|
|
|
|
| 153 |
|
| 154 |
#else
|
| 155 |
|
| 156 |
+
// FP16 <-> FP32
|
| 157 |
+
// ref: https://github.com/Maratyszcza/FP16
|
| 158 |
+
|
| 159 |
static inline float fp32_from_bits(uint32_t w) {
|
| 160 |
union {
|
| 161 |
uint32_t as_bits;
|
|
|
|
| 439 |
y2 = _mm256_loadu_ps(y + i + 16);
|
| 440 |
y3 = _mm256_loadu_ps(y + i + 24);
|
| 441 |
|
| 442 |
+
sum0 = _mm256_add_ps(_mm256_mul_ps(x0, y0), sum0);
|
| 443 |
+
sum1 = _mm256_add_ps(_mm256_mul_ps(x1, y1), sum1);
|
| 444 |
+
sum2 = _mm256_add_ps(_mm256_mul_ps(x2, y2), sum2);
|
| 445 |
+
sum3 = _mm256_add_ps(_mm256_mul_ps(x3, y3), sum3);
|
| 446 |
}
|
| 447 |
|
| 448 |
sum0 = _mm256_add_ps(sum0, sum1);
|
|
|
|
| 680 |
y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16)));
|
| 681 |
y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24)));
|
| 682 |
|
| 683 |
+
sum0 = _mm256_add_ps(_mm256_mul_ps(x0, y0), sum0);
|
| 684 |
+
sum1 = _mm256_add_ps(_mm256_mul_ps(x1, y1), sum1);
|
| 685 |
+
sum2 = _mm256_add_ps(_mm256_mul_ps(x2, y2), sum2);
|
| 686 |
+
sum3 = _mm256_add_ps(_mm256_mul_ps(x3, y3), sum3);
|
| 687 |
}
|
| 688 |
|
| 689 |
const __m256 sum01 = _mm256_add_ps(sum0, sum1);
|
|
|
|
| 849 |
y2 = _mm256_loadu_ps(y + i + 16);
|
| 850 |
y3 = _mm256_loadu_ps(y + i + 24);
|
| 851 |
|
| 852 |
+
y0 = _mm256_add_ps(_mm256_mul_ps(x0, v4), y0);
|
| 853 |
+
y1 = _mm256_add_ps(_mm256_mul_ps(x1, v4), y1);
|
| 854 |
+
y2 = _mm256_add_ps(_mm256_mul_ps(x2, v4), y2);
|
| 855 |
+
y3 = _mm256_add_ps(_mm256_mul_ps(x3, v4), y3);
|
| 856 |
|
| 857 |
_mm256_storeu_ps(y + i + 0, y0);
|
| 858 |
_mm256_storeu_ps(y + i + 8, y1);
|
|
|
|
| 1046 |
x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
|
| 1047 |
x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
|
| 1048 |
|
| 1049 |
+
y0 = _mm256_add_ps(_mm256_mul_ps(x0, v8), y0);
|
| 1050 |
+
y1 = _mm256_add_ps(_mm256_mul_ps(x1, v8), y1);
|
| 1051 |
+
y2 = _mm256_add_ps(_mm256_mul_ps(x2, v8), y2);
|
| 1052 |
+
y3 = _mm256_add_ps(_mm256_mul_ps(x3, v8), y3);
|
| 1053 |
|
| 1054 |
_mm_storeu_si128((__m128i*)(y + i + 0 ), _mm256_cvtps_ph(y0, 0));
|
| 1055 |
_mm_storeu_si128((__m128i*)(y + i + 8 ), _mm256_cvtps_ph(y1, 0));
|