sync : ggml (VMM, sync-ggml-am, dotprod ARM fixes, CUDA fixes) (#1691)

* scripts : add sync-ggml-am.sh
* sync : ggml (VMM, ARM dot prod fix, etc.)
* build : fix CUDA build
* ggml : fix some mul mat cases + add tests for src1 F16

Upstream ggml commit: https://github.com/ggerganov/ggml/commit/dbd02958fa4f46898f68ca29c27ddcdc58a06f98
- CMakeLists.txt +7 -1
- Makefile +1 -1
- extra/sync-ggml-am.sh +138 -0
- extra/sync-ggml.last +1 -0
- ggml-backend.c +12 -12
- ggml-cuda.cu +533 -404
- ggml-quants.c +23 -342
- ggml.c +4 -7
- ggml.h +2 -0
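
The link in the description points at the upstream ggml revision being synced. A minimal sketch of inspecting it locally, assuming a ggml checkout sits next to whisper.cpp (the same layout the new sync script below expects):

    # assumes ../ggml is a clone of https://github.com/ggerganov/ggml
    cd ../ggml
    git fetch origin
    # per-file stats of the upstream commit referenced by this sync
    git show --stat dbd02958fa4f46898f68ca29c27ddcdc58a06f98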
CMakeLists.txt (CHANGED, +7 -1)

@@ -218,11 +218,17 @@ if (WHISPER_CUBLAS)
         add_compile_definitions(GGML_USE_CUBLAS)

         if (WHISPER_STATIC)
-            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            if (WIN32)
+                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
+                set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+            else ()
+                set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            endif()
         else()
             set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
         endif()

+        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
     else()
         message(FATAL_ERROR "cuBLAS not found")
     endif()
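
For context, a hedged sketch of configuring the static CUDA build that this hunk targets; WHISPER_CUBLAS and WHISPER_STATIC are the options visible in the surrounding CMake code, while the build directory and the exact command line are illustrative:

    # configure a CUDA build with static runtime libraries
    cmake -B build -DWHISPER_CUBLAS=ON -DWHISPER_STATIC=ON
    cmake --build build --config Release
    # on Windows, cuBLAS itself remains dynamically linked, per the comment added above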
Makefile (CHANGED, +1 -1)

@@ -206,7 +206,7 @@ ifdef WHISPER_CUBLAS

 	CFLAGS   += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-	LDFLAGS  += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
+	LDFLAGS  += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
 	WHISPER_OBJ += ggml-cuda.o
 	NVCC        = nvcc
 	NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
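
The only functional change here is that -lcuda (the CUDA driver API library) is now linked in addition to the runtime libraries. A hedged sketch of the Makefile-based CUDA build; WHISPER_CUBLAS is the variable guarding this block, the rest is illustrative:

    # requires both the CUDA toolkit (libcudart, libcublas) and the NVIDIA driver (libcuda)
    make clean
    WHISPER_CUBLAS=1 make -j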
extra/sync-ggml-am.sh (ADDED, 138 lines)

#!/bin/bash
#
# Synchronize ggml changes to whisper.cpp
#
# Usage:
#
#   $ cd /path/to/whisper.cpp
#   $ ./extra/sync-ggml-am.sh
#

set -e

sd=$(dirname $0)
cd $sd/../

SRC_WHISPER=$(pwd)
SRC_GGML=$(cd ../ggml; pwd)

if [ ! -d $SRC_GGML ]; then
    echo "ggml not found at $SRC_GGML"
    exit 1
fi

lc=$(cat $SRC_WHISPER/extra/sync-ggml.last)
echo "Syncing ggml changes since commit $lc"

cd $SRC_GGML

git log --oneline $lc..HEAD

git format-patch $lc --stdout -- \
    include/ggml/ggml*.h \
    src/ggml*.h \
    src/ggml*.c \
    src/ggml*.cpp \
    src/ggml*.m \
    src/ggml*.metal \
    src/ggml*.cu \
    tests/test-opt.cpp \
    tests/test-grad0.cpp \
    tests/test-quantize-fns.cpp \
    tests/test-quantize-perf.cpp \
    tests/test-backend-ops.cpp \
    > $SRC_WHISPER/ggml-src.patch

# delete files if empty
if [ ! -s $SRC_WHISPER/ggml-src.patch ]; then
    rm -v $SRC_WHISPER/ggml-src.patch
fi

cd $SRC_WHISPER

if [ -f $SRC_WHISPER/ggml-src.patch ]; then
    # replace PR numbers
    #
    # Subject: some text (#1234)
    # Subject: some text (ggml/1234)
    cat ggml-src.patch | sed -e 's/^Subject: \(.*\) (#\([0-9]*\))/Subject: \1 (ggml\/\2)/' > ggml-src.patch.tmp
    mv ggml-src.patch.tmp ggml-src.patch

    cat ggml-src.patch | sed -e 's/^\(.*\) (#\([0-9]*\))$/\1 (ggml\/\2)/' > ggml-src.patch.tmp
    mv ggml-src.patch.tmp ggml-src.patch

    # replace filenames:
    #
    # src/ggml.c              -> ggml.c
    # src/ggml-alloc.c        -> ggml-alloc.c
    # src/ggml-backend-impl.h -> ggml-backend-impl.h
    # src/ggml-backend.c      -> ggml-backend.c
    # src/ggml-cuda.cu        -> ggml-cuda.cu
    # src/ggml-cuda.h         -> ggml-cuda.h
    # src/ggml-impl.h         -> ggml-impl.h
    # src/ggml-metal.h        -> ggml-metal.h
    # src/ggml-metal.m        -> ggml-metal.m
    # src/ggml-metal.metal    -> ggml-metal.metal
    # src/ggml-mpi.h          -> ggml-mpi.h
    # src/ggml-mpi.c          -> ggml-mpi.c
    # src/ggml-opencl.cpp     -> ggml-opencl.cpp
    # src/ggml-opencl.h       -> ggml-opencl.h
    # src/ggml-quants.c       -> ggml-quants.c
    # src/ggml-quants.h       -> ggml-quants.h
    # include/ggml/ggml.h         -> ggml.h
    # include/ggml/ggml-alloc.h   -> ggml-alloc.h
    # include/ggml/ggml-backend.h -> ggml-backend.h
    #
    # examples/common.h        -> examples/common.h
    # examples/common.cpp      -> examples/common.cpp
    # examples/common-ggml.h   -> examples/common-ggml.h
    # examples/common-ggml.cpp -> examples/common-ggml.cpp
    #
    # examples/whisper/whisper.h    -> whisper.h
    # examples/whisper/whisper.cpp  -> whisper.cpp
    # examples/whisper/main.cpp     -> examples/main/main.cpp
    # examples/whisper/quantize.cpp -> examples/quantize/quantize.cpp

    cat ggml-src.patch | sed \
        -e 's/src\/ggml\.c/ggml.c/g' \
        -e 's/src\/ggml-alloc\.c/ggml-alloc.c/g' \
        -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
        -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
        -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
        -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
        -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
        -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
        -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
        -e 's/src\/ggml-metal\.metal/ggml-metal.metal/g' \
        -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
        -e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
        -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
        -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
        -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
        -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
        -e 's/include\/ggml\/ggml\.h/ggml.h/g' \
        -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
        -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
        -e 's/examples\/common\.h/examples\/common.h/g' \
        -e 's/examples\/common\.cpp/examples\/common.cpp/g' \
        -e 's/examples\/common-ggml\.h/examples\/common-ggml.h/g' \
        -e 's/examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \
        -e 's/examples\/whisper\/whisper\.h/whisper.h/g' \
        -e 's/examples\/whisper\/whisper\.cpp/whisper.cpp/g' \
        -e 's/examples\/whisper\/main\.cpp/examples\/main\/main.cpp/g' \
        -e 's/examples\/whisper\/quantize\.cpp/examples\/quantize\/quantize.cpp/g' \
        > ggml-src.patch.tmp
    mv ggml-src.patch.tmp ggml-src.patch

    git am ggml-src.patch

    rm -v $SRC_WHISPER/ggml-src.patch
fi

# update last commit
cd $SRC_GGML
git log -1 --format=%H > $SRC_WHISPER/extra/sync-ggml.last

echo "Done"

exit 0
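
A short usage sketch for the script above; the side-by-side layout is what the script itself assumes (it resolves SRC_GGML as ../ggml), while the parent directory name is arbitrary:

    # expected layout:
    #   ~/src/whisper.cpp   <- this repository
    #   ~/src/ggml          <- upstream ggml checkout
    cd ~/src/whisper.cpp
    ./extra/sync-ggml-am.sh
    # on success, the upstream commits are applied with `git am` and
    # extra/sync-ggml.last is advanced to the newest ggml commit hash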
extra/sync-ggml.last (ADDED)

@@ -0,0 +1 @@
+1467a4eb71bdb5ac316d248a7f3f26cdadc56b68
ggml-backend.c (CHANGED, +12 -12)

@@ -297,7 +297,7 @@ static void ggml_backend_registry_init(void) {
 void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
     GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);

+    size_t id = ggml_backend_registry_count;

     ggml_backend_registry[id] = (struct ggml_backend_reg) {
         /* .name = */ {0},

@@ -330,6 +330,8 @@ size_t ggml_backend_reg_find_by_name(const char * name) {
             return i;
         }
     }
+
+    // not found
     return SIZE_MAX;
 }

@@ -340,15 +342,15 @@ ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str)
     const char * params = strchr(backend_str, ':');
     char backend_name[128];
     if (params == NULL) {
+        snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
         params = "";
     } else {
-        backend_name[params - backend_str] = '\0';
+        snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
         params++;
     }

     size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
+
     if (backend_i == SIZE_MAX) {
         fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
         return NULL;

@@ -396,18 +398,12 @@ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 }

 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy((char *)tensor->data + offset, data, size);

     GGML_UNUSED(buffer);
 }

 static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy(data, (const char *)tensor->data + offset, size);

     GGML_UNUSED(buffer);

@@ -618,10 +614,14 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
 }

 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_MUL_MAT:
+            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        default:
+            return true;
+    }

     GGML_UNUSED(backend);
-    GGML_UNUSED(op);
 }

 static struct ggml_backend_i cpu_backend_i = {
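
The new supports_op switch appears to tie in with the "add tests for src1 F16" item from the commit message: mul mat is only reported as supported when src1 is F32 or already matches the vec_dot_type of src0. A hedged sketch of exercising those tests from the upstream ggml checkout; the build layout and binary location are assumptions about ggml's CMake setup and may differ:

    cd ../ggml
    cmake -B build && cmake --build build -j
    # run the backend op tests, which cover the mul mat / F16 src1 cases
    ./build/bin/test-backend-ops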
ggml-cuda.cu
CHANGED
|
@@ -68,8 +68,9 @@
|
|
| 68 |
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
| 69 |
#endif
|
| 70 |
#define cudaMemcpy hipMemcpy
|
| 71 |
-
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
| 72 |
#define cudaMemcpyAsync hipMemcpyAsync
|
|
|
|
|
|
|
| 73 |
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
| 74 |
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
| 75 |
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
|
@@ -86,17 +87,29 @@
|
|
| 86 |
#define cudaStream_t hipStream_t
|
| 87 |
#define cudaSuccess hipSuccess
|
| 88 |
#define __trap abort
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
#else
|
| 90 |
#include <cuda_runtime.h>
|
|
|
|
| 91 |
#include <cublas_v2.h>
|
| 92 |
#include <cuda_fp16.h>
|
| 93 |
-
|
| 94 |
-
#
|
|
|
|
| 95 |
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
|
| 96 |
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
| 97 |
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
| 98 |
#define cublasComputeType_t cudaDataType_t
|
| 99 |
-
#endif
|
|
|
|
| 100 |
#endif // defined(GGML_USE_HIPBLAS)
|
| 101 |
|
| 102 |
#include "ggml-cuda.h"
|
|
@@ -151,7 +164,7 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
|
| 151 |
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
| 152 |
#if __has_builtin(__builtin_elementwise_sub_sat)
|
| 153 |
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
|
| 154 |
-
return reinterpret_cast<const int&>(c);
|
| 155 |
#else
|
| 156 |
int8x4_t c;
|
| 157 |
int16_t tmp;
|
|
@@ -162,7 +175,7 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
|
| 162 |
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
|
| 163 |
c[i] = tmp;
|
| 164 |
}
|
| 165 |
-
return reinterpret_cast<int&>(c);
|
| 166 |
#endif // __has_builtin(__builtin_elementwise_sub_sat)
|
| 167 |
}
|
| 168 |
|
|
@@ -200,45 +213,59 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
|
| 200 |
|
| 201 |
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
| 202 |
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
} while (0)
|
| 215 |
|
|
|
|
|
|
|
| 216 |
#if CUDART_VERSION >= 12000
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
if (err_ != CUBLAS_STATUS_SUCCESS) { \
|
| 221 |
-
int id; \
|
| 222 |
-
cudaGetDevice(&id); \
|
| 223 |
-
fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
|
| 224 |
-
err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
|
| 225 |
-
fprintf(stderr, "current device: %d\n", id); \
|
| 226 |
-
GGML_ASSERT(!"cuBLAS error"); \
|
| 227 |
-
} \
|
| 228 |
-
} while (0)
|
| 229 |
#else
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
#if CUDART_VERSION >= 11100
|
| 244 |
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
|
|
@@ -294,10 +321,10 @@ typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * s
|
|
| 294 |
typedef void (*ggml_cuda_op_mul_mat_t)(
|
| 295 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
| 296 |
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
| 297 |
-
const int64_t src1_padded_row_size,
|
| 298 |
typedef void (*ggml_cuda_op_flatten_t)(
|
| 299 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 300 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 301 |
|
| 302 |
// QK = number of values after dequantization
|
| 303 |
// QR = QK / number of values before dequantization
|
|
@@ -503,22 +530,29 @@ struct ggml_tensor_extra_gpu {
|
|
| 503 |
|
| 504 |
// this is faster on Windows
|
| 505 |
// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
|
| 506 |
-
|
| 507 |
int current_device;
|
| 508 |
CUDA_CHECK(cudaGetDevice(¤t_device));
|
| 509 |
|
| 510 |
if (device == current_device) {
|
| 511 |
-
return
|
| 512 |
}
|
| 513 |
|
| 514 |
-
|
| 515 |
}
|
| 516 |
|
| 517 |
static int g_device_count = -1;
|
| 518 |
static int g_main_device = 0;
|
| 519 |
-
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
|
| 520 |
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
| 521 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
static void * g_scratch_buffer = nullptr;
|
| 523 |
static size_t g_scratch_size = 0; // disabled by default
|
| 524 |
static size_t g_scratch_offset = 0;
|
|
@@ -560,6 +594,7 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
|
|
| 560 |
|
| 561 |
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
| 562 |
return b;
|
|
|
|
| 563 |
}
|
| 564 |
|
| 565 |
static __device__ __forceinline__ float op_add(const float a, const float b) {
|
|
@@ -681,7 +716,7 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
|
|
| 681 |
dst[i] = x[i] / (1.0f + expf(-x[i]));
|
| 682 |
}
|
| 683 |
|
| 684 |
-
static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
|
| 685 |
const float GELU_QUICK_COEF = -1.702f;
|
| 686 |
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
| 687 |
if (i >= k) {
|
|
@@ -690,7 +725,7 @@ static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
|
|
| 690 |
dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
|
| 691 |
}
|
| 692 |
|
| 693 |
-
static __global__ void tanh_f32(const float *x, float *dst, int k) {
|
| 694 |
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
| 695 |
if (i >= k) {
|
| 696 |
return;
|
|
@@ -707,7 +742,7 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
|
|
| 707 |
dst[i] = fmaxf(x[i], 0);
|
| 708 |
}
|
| 709 |
|
| 710 |
-
static __global__ void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope) {
|
| 711 |
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
| 712 |
if (i >= k) {
|
| 713 |
return;
|
|
@@ -760,7 +795,7 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
|
|
| 760 |
}
|
| 761 |
}
|
| 762 |
|
| 763 |
-
static __global__ void concat_f32(const float
|
| 764 |
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
| 765 |
if (nidx >= ne0) {
|
| 766 |
return;
|
|
@@ -785,7 +820,7 @@ static __global__ void concat_f32(const float *x,const float *y, float *dst, c
|
|
| 785 |
}
|
| 786 |
}
|
| 787 |
|
| 788 |
-
static __global__ void upscale_f32(const float
|
| 789 |
int ne0 = ne00 * scale_factor;
|
| 790 |
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
| 791 |
if (nidx >= ne0) {
|
|
@@ -805,7 +840,7 @@ static __global__ void upscale_f32(const float *x, float *dst, const int ne00,
|
|
| 805 |
dst[offset_dst] = x[offset_src];
|
| 806 |
}
|
| 807 |
|
| 808 |
-
static __global__ void pad_f32(const float
|
| 809 |
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
| 810 |
if (nidx >= ne0) {
|
| 811 |
return;
|
|
@@ -4707,7 +4742,6 @@ static __global__ void mul_mat_p021_f16_f32(
|
|
| 4707 |
|
| 4708 |
const int row_y = col_x;
|
| 4709 |
|
| 4710 |
-
|
| 4711 |
// y is not transposed but permuted
|
| 4712 |
const int iy = channel*nrows_y + row_y;
|
| 4713 |
|
|
@@ -5382,7 +5416,7 @@ struct bin_bcast_cuda {
|
|
| 5382 |
cne[3] = 1;
|
| 5383 |
};
|
| 5384 |
|
| 5385 |
-
auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
|
| 5386 |
cnb[1] *= cne[1];
|
| 5387 |
cnb[2] *= cne[2];
|
| 5388 |
cnb[3] *= cne[3];
|
|
@@ -5875,7 +5909,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
|
|
| 5875 |
|
| 5876 |
int id;
|
| 5877 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 5878 |
-
const int compute_capability =
|
| 5879 |
|
| 5880 |
int mmq_x, mmq_y, nwarps;
|
| 5881 |
if (compute_capability >= CC_RDNA2) {
|
|
@@ -5920,7 +5954,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
|
|
| 5920 |
|
| 5921 |
int id;
|
| 5922 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 5923 |
-
const int compute_capability =
|
| 5924 |
|
| 5925 |
int mmq_x, mmq_y, nwarps;
|
| 5926 |
if (compute_capability >= CC_RDNA2) {
|
|
@@ -5965,7 +5999,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
|
|
| 5965 |
|
| 5966 |
int id;
|
| 5967 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 5968 |
-
const int compute_capability =
|
| 5969 |
|
| 5970 |
int mmq_x, mmq_y, nwarps;
|
| 5971 |
if (compute_capability >= CC_RDNA2) {
|
|
@@ -6010,7 +6044,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
|
|
| 6010 |
|
| 6011 |
int id;
|
| 6012 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6013 |
-
const int compute_capability =
|
| 6014 |
|
| 6015 |
int mmq_x, mmq_y, nwarps;
|
| 6016 |
if (compute_capability >= CC_RDNA2) {
|
|
@@ -6055,7 +6089,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
|
|
| 6055 |
|
| 6056 |
int id;
|
| 6057 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6058 |
-
const int compute_capability =
|
| 6059 |
|
| 6060 |
int mmq_x, mmq_y, nwarps;
|
| 6061 |
if (compute_capability >= CC_RDNA2) {
|
|
@@ -6100,7 +6134,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
|
|
| 6100 |
|
| 6101 |
int id;
|
| 6102 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6103 |
-
const int compute_capability =
|
| 6104 |
|
| 6105 |
int mmq_x, mmq_y, nwarps;
|
| 6106 |
if (compute_capability >= CC_RDNA2) {
|
|
@@ -6147,7 +6181,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
|
|
| 6147 |
|
| 6148 |
int id;
|
| 6149 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6150 |
-
const int compute_capability =
|
| 6151 |
|
| 6152 |
int mmq_x, mmq_y, nwarps;
|
| 6153 |
if (compute_capability >= CC_RDNA2) {
|
|
@@ -6193,7 +6227,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
|
|
| 6193 |
|
| 6194 |
int id;
|
| 6195 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6196 |
-
const int compute_capability =
|
| 6197 |
|
| 6198 |
int mmq_x, mmq_y, nwarps;
|
| 6199 |
if (compute_capability >= CC_RDNA2) {
|
|
@@ -6238,7 +6272,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
|
|
| 6238 |
|
| 6239 |
int id;
|
| 6240 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6241 |
-
const int compute_capability =
|
| 6242 |
|
| 6243 |
int mmq_x, mmq_y, nwarps;
|
| 6244 |
if (compute_capability >= CC_RDNA2) {
|
|
@@ -6283,7 +6317,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
|
|
| 6283 |
|
| 6284 |
int id;
|
| 6285 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6286 |
-
const int compute_capability =
|
| 6287 |
|
| 6288 |
int mmq_x, mmq_y, nwarps;
|
| 6289 |
if (compute_capability >= CC_RDNA2) {
|
|
@@ -6543,30 +6577,30 @@ struct scoped_spin_lock {
|
|
| 6543 |
scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
|
| 6544 |
};
|
| 6545 |
|
| 6546 |
-
|
|
|
|
|
|
|
|
|
|
| 6547 |
void * ptr = nullptr;
|
| 6548 |
size_t size = 0;
|
| 6549 |
};
|
| 6550 |
|
| 6551 |
-
static
|
| 6552 |
-
static
|
| 6553 |
|
| 6554 |
-
static void *
|
| 6555 |
scoped_spin_lock lock(g_cuda_pool_lock);
|
| 6556 |
-
int id;
|
| 6557 |
-
CUDA_CHECK(cudaGetDevice(&id));
|
| 6558 |
#ifdef DEBUG_CUDA_MALLOC
|
| 6559 |
int nnz = 0;
|
| 6560 |
-
size_t max_size = 0
|
| 6561 |
#endif
|
| 6562 |
size_t best_diff = 1ull << 36;
|
| 6563 |
int ibest = -1;
|
| 6564 |
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
|
| 6565 |
-
|
| 6566 |
if (b.ptr != nullptr) {
|
| 6567 |
#ifdef DEBUG_CUDA_MALLOC
|
| 6568 |
++nnz;
|
| 6569 |
-
tot_size += b.size;
|
| 6570 |
if (b.size > max_size) max_size = b.size;
|
| 6571 |
#endif
|
| 6572 |
if (b.size >= size) {
|
|
@@ -6586,32 +6620,32 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
|
| 6586 |
}
|
| 6587 |
}
|
| 6588 |
if (ibest >= 0) {
|
| 6589 |
-
|
| 6590 |
void * ptr = b.ptr;
|
| 6591 |
*actual_size = b.size;
|
| 6592 |
b.ptr = nullptr;
|
| 6593 |
b.size = 0;
|
| 6594 |
return ptr;
|
| 6595 |
}
|
| 6596 |
-
#ifdef DEBUG_CUDA_MALLOC
|
| 6597 |
-
fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
|
| 6598 |
-
(uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
|
| 6599 |
-
#endif
|
| 6600 |
void * ptr;
|
| 6601 |
size_t look_ahead_size = (size_t) (1.05 * size);
|
| 6602 |
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
|
|
|
| 6603 |
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
|
| 6604 |
*actual_size = look_ahead_size;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6605 |
return ptr;
|
| 6606 |
}
|
| 6607 |
|
| 6608 |
-
static void
|
| 6609 |
scoped_spin_lock lock(g_cuda_pool_lock);
|
| 6610 |
-
int id;
|
| 6611 |
-
CUDA_CHECK(cudaGetDevice(&id));
|
| 6612 |
|
| 6613 |
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
|
| 6614 |
-
|
| 6615 |
if (b.ptr == nullptr) {
|
| 6616 |
b.ptr = ptr;
|
| 6617 |
b.size = size;
|
|
@@ -6619,9 +6653,149 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
|
|
| 6619 |
}
|
| 6620 |
}
|
| 6621 |
fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
|
|
|
| 6622 |
CUDA_CHECK(cudaFree(ptr));
|
|
|
|
| 6623 |
}
|
| 6624 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6625 |
static bool g_cublas_loaded = false;
|
| 6626 |
|
| 6627 |
bool ggml_cublas_loaded(void) {
|
|
@@ -6660,16 +6834,33 @@ void ggml_init_cublas() {
|
|
| 6660 |
#endif
|
| 6661 |
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
| 6662 |
for (int id = 0; id < g_device_count; ++id) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6663 |
cudaDeviceProp prop;
|
| 6664 |
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
| 6665 |
-
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
|
| 6666 |
|
| 6667 |
g_tensor_split[id] = total_vram;
|
| 6668 |
total_vram += prop.totalGlobalMem;
|
| 6669 |
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
| 6670 |
-
|
| 6671 |
#else
|
| 6672 |
-
|
| 6673 |
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
| 6674 |
}
|
| 6675 |
for (int id = 0; id < g_device_count; ++id) {
|
|
@@ -6677,7 +6868,7 @@ void ggml_init_cublas() {
|
|
| 6677 |
}
|
| 6678 |
|
| 6679 |
for (int id = 0; id < g_device_count; ++id) {
|
| 6680 |
-
|
| 6681 |
|
| 6682 |
// create cuda streams
|
| 6683 |
for (int is = 0; is < MAX_STREAMS; ++is) {
|
|
@@ -6729,8 +6920,7 @@ void * ggml_cuda_host_malloc(size_t size) {
|
|
| 6729 |
void * ptr = nullptr;
|
| 6730 |
cudaError_t err = cudaMallocHost((void **) &ptr, size);
|
| 6731 |
if (err != cudaSuccess) {
|
| 6732 |
-
//
|
| 6733 |
-
// This can fixed the OOM error in WSL.
|
| 6734 |
cudaGetLastError();
|
| 6735 |
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
|
| 6736 |
size/1024.0/1024.0, cudaGetErrorString(err));
|
|
@@ -6793,7 +6983,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
|
|
| 6793 |
|
| 6794 |
static void ggml_cuda_op_get_rows(
|
| 6795 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6796 |
-
const float * src0_d, const float * src1_d, float * dst_d,
|
| 6797 |
|
| 6798 |
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
| 6799 |
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
@@ -6835,9 +7025,9 @@ static void ggml_cuda_op_get_rows(
|
|
| 6835 |
}
|
| 6836 |
|
| 6837 |
template<class op>
|
| 6838 |
-
|
| 6839 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6840 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 6841 |
|
| 6842 |
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
| 6843 |
|
|
@@ -6856,7 +7046,7 @@ inline void ggml_cuda_op_bin_bcast(
|
|
| 6856 |
|
| 6857 |
static void ggml_cuda_op_repeat(
|
| 6858 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6859 |
-
const float * src0_d, const float * src1_d, float * dst_d,
|
| 6860 |
|
| 6861 |
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
|
| 6862 |
|
|
@@ -6864,16 +7054,16 @@ static void ggml_cuda_op_repeat(
|
|
| 6864 |
(void) src1_d;
|
| 6865 |
}
|
| 6866 |
|
| 6867 |
-
|
| 6868 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6869 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 6870 |
|
| 6871 |
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
| 6872 |
}
|
| 6873 |
|
| 6874 |
-
|
| 6875 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6876 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 6877 |
|
| 6878 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 6879 |
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
@@ -6890,23 +7080,23 @@ inline void ggml_cuda_op_acc(
|
|
| 6890 |
(void) dst;
|
| 6891 |
}
|
| 6892 |
|
| 6893 |
-
|
| 6894 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6895 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 6896 |
|
| 6897 |
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
| 6898 |
}
|
| 6899 |
|
| 6900 |
-
|
| 6901 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6902 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 6903 |
|
| 6904 |
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
| 6905 |
}
|
| 6906 |
|
| 6907 |
-
|
| 6908 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6909 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 6910 |
|
| 6911 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 6912 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -6918,9 +7108,9 @@ inline void ggml_cuda_op_gelu(
|
|
| 6918 |
(void) src1_dd;
|
| 6919 |
}
|
| 6920 |
|
| 6921 |
-
|
| 6922 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6923 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 6924 |
|
| 6925 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 6926 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -6932,9 +7122,9 @@ inline void ggml_cuda_op_silu(
|
|
| 6932 |
(void) src1_dd;
|
| 6933 |
}
|
| 6934 |
|
| 6935 |
-
|
| 6936 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6937 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 6938 |
|
| 6939 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 6940 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -6946,9 +7136,9 @@ inline void ggml_cuda_op_gelu_quick(
|
|
| 6946 |
(void) src1_dd;
|
| 6947 |
}
|
| 6948 |
|
| 6949 |
-
|
| 6950 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6951 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 6952 |
|
| 6953 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 6954 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -6960,9 +7150,9 @@ inline void ggml_cuda_op_tanh(
|
|
| 6960 |
(void) src1_dd;
|
| 6961 |
}
|
| 6962 |
|
| 6963 |
-
|
| 6964 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6965 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 6966 |
|
| 6967 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 6968 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -6974,9 +7164,9 @@ inline void ggml_cuda_op_relu(
|
|
| 6974 |
(void) src1_dd;
|
| 6975 |
}
|
| 6976 |
|
| 6977 |
-
|
| 6978 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6979 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 6980 |
|
| 6981 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 6982 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -6991,9 +7181,9 @@ inline void ggml_cuda_op_leaky_relu(
|
|
| 6991 |
(void) src1_dd;
|
| 6992 |
}
|
| 6993 |
|
| 6994 |
-
|
| 6995 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6996 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 6997 |
|
| 6998 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 6999 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -7005,9 +7195,9 @@ inline void ggml_cuda_op_sqr(
|
|
| 7005 |
(void) src1_dd;
|
| 7006 |
}
|
| 7007 |
|
| 7008 |
-
|
| 7009 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7010 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7011 |
|
| 7012 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7013 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -7025,10 +7215,9 @@ inline void ggml_cuda_op_norm(
|
|
| 7025 |
(void) src1_dd;
|
| 7026 |
}
|
| 7027 |
|
| 7028 |
-
|
| 7029 |
-
inline void ggml_cuda_op_group_norm(
|
| 7030 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7031 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7032 |
|
| 7033 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7034 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -7042,9 +7231,9 @@ inline void ggml_cuda_op_group_norm(
|
|
| 7042 |
(void) src1_dd;
|
| 7043 |
}
|
| 7044 |
|
| 7045 |
-
|
| 7046 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7047 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7048 |
|
| 7049 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7050 |
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
@@ -7058,9 +7247,9 @@ inline void ggml_cuda_op_concat(
|
|
| 7058 |
(void) dst;
|
| 7059 |
}
|
| 7060 |
|
| 7061 |
-
|
| 7062 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7063 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7064 |
|
| 7065 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7066 |
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
@@ -7075,9 +7264,9 @@ inline void ggml_cuda_op_upscale(
|
|
| 7075 |
(void) src1_dd;
|
| 7076 |
}
|
| 7077 |
|
| 7078 |
-
|
| 7079 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7080 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7081 |
|
| 7082 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7083 |
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
@@ -7092,9 +7281,9 @@ inline void ggml_cuda_op_pad(
|
|
| 7092 |
(void) src1_dd;
|
| 7093 |
}
|
| 7094 |
|
| 7095 |
-
|
| 7096 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7097 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7098 |
|
| 7099 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7100 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -7112,10 +7301,10 @@ inline void ggml_cuda_op_rms_norm(
|
|
| 7112 |
(void) src1_dd;
|
| 7113 |
}
|
| 7114 |
|
| 7115 |
-
|
| 7116 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
| 7117 |
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
| 7118 |
-
const int64_t src1_padded_row_size,
|
| 7119 |
|
| 7120 |
const int64_t ne00 = src0->ne[0];
|
| 7121 |
|
|
@@ -7177,13 +7366,13 @@ inline void ggml_cuda_op_mul_mat_q(
|
|
| 7177 |
static int64_t get_row_rounding(ggml_type type) {
|
| 7178 |
int64_t min_compute_capability = INT_MAX;
|
| 7179 |
int64_t max_compute_capability = INT_MIN;
|
| 7180 |
-
for (
|
| 7181 |
if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
| 7182 |
-
if (min_compute_capability >
|
| 7183 |
-
min_compute_capability =
|
| 7184 |
}
|
| 7185 |
-
if (max_compute_capability <
|
| 7186 |
-
max_compute_capability =
|
| 7187 |
}
|
| 7188 |
}
|
| 7189 |
}
|
|
@@ -7235,10 +7424,10 @@ static int64_t get_row_rounding(ggml_type type) {
|
|
| 7235 |
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
| 7236 |
}
|
| 7237 |
|
| 7238 |
-
|
| 7239 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
| 7240 |
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
| 7241 |
-
const int64_t src1_padded_row_size,
|
| 7242 |
|
| 7243 |
GGML_ASSERT(ggml_nrows(src1) == 1);
|
| 7244 |
|
|
@@ -7288,18 +7477,20 @@ inline void ggml_cuda_op_mul_mat_vec_q(
|
|
| 7288 |
(void) src1_padded_row_size;
|
| 7289 |
}
|
| 7290 |
|
| 7291 |
-
|
| 7292 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
| 7293 |
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
| 7294 |
-
const int64_t src1_padded_row_size,
|
| 7295 |
|
| 7296 |
const int64_t ne00 = src0->ne[0];
|
| 7297 |
const int64_t row_diff = row_high - row_low;
|
| 7298 |
|
|
|
|
|
|
|
| 7299 |
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
| 7300 |
#ifdef GGML_CUDA_F16
|
| 7301 |
-
|
| 7302 |
-
|
| 7303 |
|
| 7304 |
bool src1_convert_f16 =
|
| 7305 |
src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
|
@@ -7307,7 +7498,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
| 7307 |
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
| 7308 |
|
| 7309 |
if (src1_convert_f16) {
|
| 7310 |
-
src1_dfloat = (
|
| 7311 |
ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
| 7312 |
ne00, 1, sizeof(float), 0, 0,
|
| 7313 |
ne00, 1, sizeof(half), 0, 0, stream);
|
|
@@ -7355,12 +7546,6 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
| 7355 |
break;
|
| 7356 |
}
|
| 7357 |
|
| 7358 |
-
#ifdef GGML_CUDA_F16
|
| 7359 |
-
if (src1_convert_f16) {
|
| 7360 |
-
ggml_cuda_pool_free(src1_dfloat, ash);
|
| 7361 |
-
}
|
| 7362 |
-
#endif // GGML_CUDA_F16
|
| 7363 |
-
|
| 7364 |
(void) src1;
|
| 7365 |
(void) dst;
|
| 7366 |
(void) src1_ddq_i;
|
|
@@ -7368,10 +7553,10 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
| 7368 |
(void) src1_padded_row_size;
|
| 7369 |
}
|
| 7370 |
|
| 7371 |
-
|
| 7372 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
| 7373 |
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
| 7374 |
-
const int64_t src1_padded_row_size,
|
| 7375 |
|
| 7376 |
GGML_ASSERT(src0_dd_i != nullptr);
|
| 7377 |
GGML_ASSERT(src1_ddf_i != nullptr);
|
|
@@ -7391,33 +7576,31 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|
| 7391 |
// ldc == nrows of the matrix that cuBLAS writes into
|
| 7392 |
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
|
| 7393 |
|
| 7394 |
-
const int compute_capability =
|
| 7395 |
|
| 7396 |
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
|
|
|
| 7397 |
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
| 7398 |
-
half
|
| 7399 |
-
size_t src0_as = 0;
|
| 7400 |
if (src0->type != GGML_TYPE_F16) {
|
| 7401 |
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
|
| 7402 |
GGML_ASSERT(to_fp16_cuda != nullptr);
|
| 7403 |
size_t ne = row_diff*ne00;
|
| 7404 |
-
src0_as_f16
|
| 7405 |
-
to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
|
| 7406 |
}
|
| 7407 |
-
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
|
| 7408 |
|
| 7409 |
-
half
|
| 7410 |
-
size_t src1_as = 0;
|
| 7411 |
if (src1->type != GGML_TYPE_F16) {
|
| 7412 |
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
| 7413 |
GGML_ASSERT(to_fp16_cuda != nullptr);
|
| 7414 |
size_t ne = src1_ncols*ne10;
|
| 7415 |
-
src1_as_f16
|
| 7416 |
-
to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
|
| 7417 |
}
|
| 7418 |
-
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
|
| 7419 |
-
|
| 7420 |
-
half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
|
| 7421 |
|
| 7422 |
const half alpha_f16 = 1.0f;
|
| 7423 |
const half beta_f16 = 0.0f;
|
|
@@ -7426,36 +7609,33 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|
| 7426 |
CUBLAS_CHECK(
|
| 7427 |
cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
| 7428 |
row_diff, src1_ncols, ne10,
|
| 7429 |
-
&alpha_f16, src0_ptr,
|
| 7430 |
-
src1_ptr,
|
| 7431 |
-
&beta_f16, dst_f16, CUDA_R_16F, ldc,
|
| 7432 |
CUBLAS_COMPUTE_16F,
|
| 7433 |
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
| 7434 |
|
| 7435 |
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
| 7436 |
-
to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
|
| 7437 |
-
|
| 7438 |
-
|
| 7439 |
-
|
| 7440 |
-
if (src0_as != 0) {
|
| 7441 |
-
ggml_cuda_pool_free(src0_as_f16, src0_as);
|
| 7442 |
-
}
|
| 7443 |
-
|
| 7444 |
-
if (src1_as != 0) {
|
| 7445 |
-
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
| 7446 |
-
}
|
| 7447 |
-
}
|
| 7448 |
-
else {
|
| 7449 |
-
float * src0_ddq_as_f32 = nullptr;
|
| 7450 |
-
size_t src0_as = 0;
|
| 7451 |
|
| 7452 |
if (src0->type != GGML_TYPE_F32) {
|
| 7453 |
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
| 7454 |
GGML_ASSERT(to_fp32_cuda != nullptr);
|
| 7455 |
-
src0_ddq_as_f32
|
| 7456 |
-
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7457 |
}
|
| 7458 |
-
|
|
|
|
|
|
|
| 7459 |
|
| 7460 |
const float alpha = 1.0f;
|
| 7461 |
const float beta = 0.0f;
|
|
@@ -7464,13 +7644,9 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|
| 7464 |
CUBLAS_CHECK(
|
| 7465 |
cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
| 7466 |
row_diff, src1_ncols, ne10,
|
| 7467 |
-
&alpha, src0_ddf_i,
|
| 7468 |
-
|
| 7469 |
-
&beta, dst_dd_i,
|
| 7470 |
-
|
| 7471 |
-
if (src0_as != 0) {
|
| 7472 |
-
ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
|
| 7473 |
-
}
|
| 7474 |
}
|
| 7475 |
|
| 7476 |
(void) dst;
|
|
@@ -7478,9 +7654,9 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|
| 7478 |
(void) src1_padded_row_size;
|
| 7479 |
}
|
| 7480 |
|
| 7481 |
-
|
| 7482 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7483 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7484 |
|
| 7485 |
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
| 7486 |
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
@@ -7558,9 +7734,9 @@ inline void ggml_cuda_op_rope(
|
|
| 7558 |
(void) src1_dd;
|
| 7559 |
}
|
| 7560 |
|
| 7561 |
-
|
| 7562 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7563 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7564 |
|
| 7565 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7566 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -7589,9 +7765,9 @@ inline void ggml_cuda_op_alibi(
|
|
| 7589 |
(void) src1_dd;
|
| 7590 |
}
|
| 7591 |
|
| 7592 |
-
|
| 7593 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7594 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7595 |
|
| 7596 |
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
| 7597 |
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
@@ -7624,10 +7800,9 @@ inline void ggml_cuda_op_im2col(
|
|
| 7624 |
(void) src0_dd;
|
| 7625 |
}
|
| 7626 |
|
| 7627 |
-
|
| 7628 |
-
inline void ggml_cuda_op_sum_rows(
|
| 7629 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7630 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7631 |
|
| 7632 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7633 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -7642,9 +7817,9 @@ inline void ggml_cuda_op_sum_rows(
|
|
| 7642 |
(void) src1_dd;
|
| 7643 |
}
|
| 7644 |
|
| 7645 |
-
|
| 7646 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7647 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7648 |
|
| 7649 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7650 |
GGML_ASSERT( dst->type == GGML_TYPE_I32);
|
|
@@ -7661,9 +7836,9 @@ inline void ggml_cuda_op_argsort(
|
|
| 7661 |
(void) src1_dd;
|
| 7662 |
}
|
| 7663 |
|
| 7664 |
-
|
| 7665 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7666 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7667 |
|
| 7668 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7669 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -7681,9 +7856,9 @@ inline void ggml_cuda_op_diag_mask_inf(
|
|
| 7681 |
(void) src1_dd;
|
| 7682 |
}
|
| 7683 |
|
| 7684 |
-
|
| 7685 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7686 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7687 |
|
| 7688 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7689 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -7702,9 +7877,9 @@ inline void ggml_cuda_op_soft_max(
|
|
| 7702 |
(void) dst;
|
| 7703 |
}
|
| 7704 |
|
| 7705 |
-
|
| 7706 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7707 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7708 |
|
| 7709 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7710 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -7720,9 +7895,9 @@ inline void ggml_cuda_op_scale(
|
|
| 7720 |
(void) src1_dd;
|
| 7721 |
}
|
| 7722 |
|
| 7723 |
-
|
| 7724 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7725 |
-
const float * src0_dd, const float * src1_dd, float * dst_dd,
|
| 7726 |
|
| 7727 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7728 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -7762,18 +7937,17 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
|
|
| 7762 |
float * src1_ddf = nullptr;
|
| 7763 |
float * dst_ddf = nullptr;
|
| 7764 |
|
| 7765 |
-
|
| 7766 |
-
|
| 7767 |
-
|
| 7768 |
-
size_t dst_asf = 0;
|
| 7769 |
|
| 7770 |
ggml_cuda_set_device(g_main_device);
|
| 7771 |
-
|
| 7772 |
|
| 7773 |
if (src0_on_device) {
|
| 7774 |
src0_ddf = (float *) src0_extra->data_device[g_main_device];
|
| 7775 |
} else {
|
| 7776 |
-
src0_ddf = (
|
| 7777 |
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
|
| 7778 |
}
|
| 7779 |
|
|
@@ -7781,14 +7955,14 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
|
|
| 7781 |
if (src1_on_device) {
|
| 7782 |
src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
| 7783 |
} else {
|
| 7784 |
-
src1_ddf = (
|
| 7785 |
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
|
| 7786 |
}
|
| 7787 |
}
|
| 7788 |
if (dst_on_device) {
|
| 7789 |
dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
| 7790 |
} else {
|
| 7791 |
-
dst_ddf = (
|
| 7792 |
}
|
| 7793 |
|
| 7794 |
// do the computation
|
|
@@ -7800,16 +7974,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
|
|
| 7800 |
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
|
| 7801 |
}
|
| 7802 |
|
| 7803 |
-
if (src0_asf > 0) {
|
| 7804 |
-
ggml_cuda_pool_free(src0_ddf, src0_asf);
|
| 7805 |
-
}
|
| 7806 |
-
if (src1_asf > 0) {
|
| 7807 |
-
ggml_cuda_pool_free(src1_ddf, src1_asf);
|
| 7808 |
-
}
|
| 7809 |
-
if (dst_asf > 0) {
|
| 7810 |
-
ggml_cuda_pool_free(dst_ddf, dst_asf);
|
| 7811 |
-
}
|
| 7812 |
-
|
| 7813 |
if (dst->backend == GGML_BACKEND_CPU) {
|
| 7814 |
CUDA_CHECK(cudaDeviceSynchronize());
|
| 7815 |
}
|
|
@@ -7826,12 +7990,12 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
|
|
| 7826 |
|
| 7827 |
#ifdef NDEBUG
|
| 7828 |
for (int id = 0; id < g_device_count; ++id) {
|
| 7829 |
-
|
| 7830 |
CUDA_CHECK(cudaDeviceSynchronize());
|
| 7831 |
}
|
| 7832 |
|
| 7833 |
for (int id = 0; id < g_device_count; ++id) {
|
| 7834 |
-
|
| 7835 |
|
| 7836 |
for (int id_other = 0; id_other < g_device_count; ++id_other) {
|
| 7837 |
if (id == id_other) {
|
|
@@ -7865,7 +8029,6 @@ static void ggml_cuda_op_mul_mat(
|
|
| 7865 |
const int64_t ne01 = src0->ne[1];
|
| 7866 |
const int64_t ne02 = src0->ne[2];
|
| 7867 |
const int64_t ne03 = src0->ne[3];
|
| 7868 |
-
const int64_t nrows0 = ggml_nrows(src0);
|
| 7869 |
|
| 7870 |
const int64_t ne10 = src1->ne[0];
|
| 7871 |
const int64_t ne11 = src1->ne[1];
|
|
@@ -7883,6 +8046,7 @@ static void ggml_cuda_op_mul_mat(
|
|
| 7883 |
|
| 7884 |
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
|
| 7885 |
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
|
|
|
|
| 7886 |
|
| 7887 |
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
| 7888 |
|
|
@@ -7908,27 +8072,29 @@ static void ggml_cuda_op_mul_mat(
|
|
| 7908 |
GGML_ASSERT(!(split && ne03 > 1));
|
| 7909 |
GGML_ASSERT(!(split && ne02 < ne12));
|
| 7910 |
|
| 7911 |
-
|
| 7912 |
-
|
| 7913 |
-
|
| 7914 |
-
|
| 7915 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7916 |
|
| 7917 |
-
|
| 7918 |
-
|
| 7919 |
-
|
| 7920 |
-
size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
|
| 7921 |
-
size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0};
|
| 7922 |
|
| 7923 |
-
|
| 7924 |
-
int64_t row_high[GGML_CUDA_MAX_DEVICES];
|
| 7925 |
|
| 7926 |
int used_devices = 0;
|
| 7927 |
|
| 7928 |
-
for (
|
| 7929 |
// by default, use all rows
|
| 7930 |
-
|
| 7931 |
-
|
| 7932 |
|
| 7933 |
// for multi GPU, get the row boundaries from tensor split
|
| 7934 |
// and round to mul_mat_q tile sizes
|
|
@@ -7936,19 +8102,23 @@ static void ggml_cuda_op_mul_mat(
|
|
| 7936 |
const int64_t rounding = get_row_rounding(src0->type);
|
| 7937 |
|
| 7938 |
if (id != 0) {
|
| 7939 |
-
|
| 7940 |
-
|
|
|
|
|
|
|
| 7941 |
}
|
| 7942 |
|
| 7943 |
if (id != g_device_count - 1) {
|
| 7944 |
-
|
| 7945 |
-
|
|
|
|
|
|
|
| 7946 |
}
|
| 7947 |
}
|
| 7948 |
}
|
| 7949 |
|
| 7950 |
-
for (
|
| 7951 |
-
if ((!split && id != g_main_device) ||
|
| 7952 |
continue;
|
| 7953 |
}
|
| 7954 |
|
|
@@ -7958,42 +8128,41 @@ static void ggml_cuda_op_mul_mat(
|
|
| 7958 |
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
| 7959 |
|
| 7960 |
ggml_cuda_set_device(id);
|
| 7961 |
-
|
| 7962 |
|
| 7963 |
if (src0_on_device && src0_is_contiguous) {
|
| 7964 |
-
|
| 7965 |
} else {
|
| 7966 |
-
|
| 7967 |
-
src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
|
| 7968 |
}
|
| 7969 |
|
| 7970 |
if (src1_on_device && src1_is_contiguous) {
|
| 7971 |
-
|
| 7972 |
} else {
|
| 7973 |
-
|
| 7974 |
}
|
| 7975 |
|
| 7976 |
if (convert_src1_to_q8_1) {
|
| 7977 |
-
|
| 7978 |
|
| 7979 |
if (src1_on_device && src1_is_contiguous) {
|
| 7980 |
-
quantize_row_q8_1_cuda(
|
| 7981 |
CUDA_CHECK(cudaGetLastError());
|
| 7982 |
}
|
| 7983 |
}
|
| 7984 |
|
| 7985 |
if (dst_on_device) {
|
| 7986 |
-
|
| 7987 |
} else {
|
| 7988 |
-
const size_t size_dst_ddf = split ? (
|
| 7989 |
-
|
| 7990 |
}
|
| 7991 |
}
|
| 7992 |
|
| 7993 |
// if multiple devices are used they need to wait for the main device
|
| 7994 |
// here an event is recorded that signals that the main device has finished calculating the input data
|
| 7995 |
if (split && used_devices > 1) {
|
| 7996 |
-
|
| 7997 |
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
|
| 7998 |
}
|
| 7999 |
|
|
@@ -8002,17 +8171,17 @@ static void ggml_cuda_op_mul_mat(
|
|
| 8002 |
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
|
| 8003 |
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
| 8004 |
|
| 8005 |
-
for (
|
| 8006 |
-
if ((!split && id != g_main_device) ||
|
| 8007 |
continue;
|
| 8008 |
}
|
| 8009 |
|
| 8010 |
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
| 8011 |
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
| 8012 |
-
const int64_t row_diff =
|
| 8013 |
|
| 8014 |
ggml_cuda_set_device(id);
|
| 8015 |
-
|
| 8016 |
|
| 8017 |
// wait for main GPU data if necessary
|
| 8018 |
if (split && (id != g_main_device || is != 0)) {
|
|
@@ -8026,34 +8195,34 @@ static void ggml_cuda_op_mul_mat(
|
|
| 8026 |
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
| 8027 |
|
| 8028 |
// for split tensors the data begins at i0 == i0_offset_low
|
| 8029 |
-
char * src0_dd_i =
|
| 8030 |
-
float * src1_ddf_i =
|
| 8031 |
-
char * src1_ddq_i =
|
| 8032 |
-
float * dst_dd_i =
|
| 8033 |
|
| 8034 |
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
| 8035 |
// in that case an offset on dst_ddf_i is needed
|
| 8036 |
if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
|
| 8037 |
-
dst_dd_i +=
|
| 8038 |
}
|
| 8039 |
|
| 8040 |
// copy src0, src1 to device if necessary
|
| 8041 |
if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
|
| 8042 |
if (id != g_main_device) {
|
| 8043 |
if (convert_src1_to_q8_1) {
|
| 8044 |
-
char * src1_ddq_i_source =
|
| 8045 |
-
CUDA_CHECK(
|
| 8046 |
-
|
| 8047 |
} else {
|
| 8048 |
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
|
| 8049 |
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
| 8050 |
-
CUDA_CHECK(
|
| 8051 |
-
|
| 8052 |
}
|
| 8053 |
}
|
| 8054 |
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
|
| 8055 |
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
|
| 8056 |
-
|
| 8057 |
} else {
|
| 8058 |
GGML_ASSERT(false);
|
| 8059 |
}
|
|
@@ -8064,12 +8233,12 @@ static void ggml_cuda_op_mul_mat(
|
|
| 8064 |
}
|
| 8065 |
|
| 8066 |
if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
|
| 8067 |
-
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor,
|
| 8068 |
}
|
| 8069 |
|
| 8070 |
// do the computation
|
| 8071 |
op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
| 8072 |
-
|
| 8073 |
CUDA_CHECK(cudaGetLastError());
|
| 8074 |
|
| 8075 |
// copy dst to host or other device if necessary
|
|
@@ -8093,9 +8262,25 @@ static void ggml_cuda_op_mul_mat(
|
|
| 8093 |
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
|
| 8094 |
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
| 8095 |
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
| 8096 |
-
dhf_dst_i += src1_col_0*ne0 +
|
| 8097 |
-
|
| 8098 |
-
|
|
|
|
|
|
|
|
| 8099 |
} else {
|
| 8100 |
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
| 8101 |
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
|
@@ -8112,35 +8297,14 @@ static void ggml_cuda_op_mul_mat(
|
|
| 8112 |
}
|
| 8113 |
}
|
| 8114 |
|
| 8115 |
-
for (int64_t id = 0; id < g_device_count; ++id) {
|
| 8116 |
-
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
| 8117 |
-
continue;
|
| 8118 |
-
}
|
| 8119 |
-
CUDA_CHECK(ggml_cuda_set_device(id));
|
| 8120 |
-
|
| 8121 |
-
// free buffers again when done
|
| 8122 |
-
if (src0_as[id] > 0) {
|
| 8123 |
-
ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
|
| 8124 |
-
}
|
| 8125 |
-
if (src1_asf[id] > 0) {
|
| 8126 |
-
ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
|
| 8127 |
-
}
|
| 8128 |
-
if (src1_asq[id] > 0) {
|
| 8129 |
-
ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
|
| 8130 |
-
}
|
| 8131 |
-
if (dst_as[id] > 0) {
|
| 8132 |
-
ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
|
| 8133 |
-
}
|
| 8134 |
-
}
|
| 8135 |
-
|
| 8136 |
// main device waits for all other devices to be finished
|
| 8137 |
if (split && g_device_count > 1) {
|
| 8138 |
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
|
| 8139 |
is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
|
| 8140 |
|
| 8141 |
-
|
| 8142 |
-
for (
|
| 8143 |
-
if (
|
| 8144 |
continue;
|
| 8145 |
}
|
| 8146 |
for (int64_t is = 0; is < is_max; ++is) {
|
|
@@ -8150,7 +8314,7 @@ static void ggml_cuda_op_mul_mat(
|
|
| 8150 |
}
|
| 8151 |
|
| 8152 |
if (dst->backend == GGML_BACKEND_CPU) {
|
| 8153 |
-
|
| 8154 |
CUDA_CHECK(cudaDeviceSynchronize());
|
| 8155 |
}
|
| 8156 |
}
|
|
@@ -8260,7 +8424,7 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
|
|
| 8260 |
|
| 8261 |
const int64_t ne12 = src1->ne[2];
|
| 8262 |
|
| 8263 |
-
|
| 8264 |
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
| 8265 |
|
| 8266 |
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
|
@@ -8292,7 +8456,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
|
| 8292 |
|
| 8293 |
const int64_t ne12 = src1->ne[2];
|
| 8294 |
|
| 8295 |
-
|
| 8296 |
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
| 8297 |
|
| 8298 |
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
|
@@ -8329,9 +8493,9 @@ static __global__ void k_compute_batched_ptrs(
|
|
| 8329 |
int64_t i03 = i13 / r3;
|
| 8330 |
int64_t i02 = i12 / r2;
|
| 8331 |
|
| 8332 |
-
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02
|
| 8333 |
-
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12
|
| 8334 |
-
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2
|
| 8335 |
}
|
| 8336 |
|
| 8337 |
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -8340,37 +8504,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
| 8340 |
|
| 8341 |
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
| 8342 |
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
| 8343 |
-
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
| 8344 |
-
|
| 8345 |
-
const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
|
| 8346 |
-
const int64_t ne01 = src0->ne[1];
|
| 8347 |
-
const int64_t ne02 = src0->ne[2];
|
| 8348 |
-
const int64_t ne03 = src0->ne[3];
|
| 8349 |
-
|
| 8350 |
-
const int64_t nb01 = src0->nb[1];
|
| 8351 |
-
const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
|
| 8352 |
-
const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
|
| 8353 |
-
|
| 8354 |
-
const int64_t ne10 = src1->ne[0];
|
| 8355 |
-
const int64_t ne11 = src1->ne[1];
|
| 8356 |
-
const int64_t ne12 = src1->ne[2];
|
| 8357 |
-
const int64_t ne13 = src1->ne[3];
|
| 8358 |
|
| 8359 |
-
|
| 8360 |
-
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
| 8361 |
-
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
| 8362 |
|
| 8363 |
-
const int64_t
|
| 8364 |
-
const int64_t ne = ggml_nelements(dst);
|
| 8365 |
|
| 8366 |
-
|
| 8367 |
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
| 8368 |
|
| 8369 |
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
|
| 8370 |
|
| 8371 |
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
| 8372 |
void * src0_ddq = src0_extra->data_device[g_main_device];
|
| 8373 |
-
half *
|
| 8374 |
|
| 8375 |
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
| 8376 |
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
|
@@ -8379,17 +8525,18 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
| 8379 |
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
| 8380 |
|
| 8381 |
// convert src1 to fp16
|
| 8382 |
-
|
| 8383 |
-
|
| 8384 |
-
|
| 8385 |
-
|
| 8386 |
-
|
| 8387 |
-
|
| 8388 |
-
|
| 8389 |
-
|
|
|
|
| 8390 |
|
| 8391 |
-
half
|
| 8392 |
-
char * dst_t
|
| 8393 |
|
| 8394 |
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
| 8395 |
cudaDataType_t cu_data_type = CUDA_R_16F;
|
|
@@ -8408,8 +8555,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
| 8408 |
const void * beta = &beta_f16;
|
| 8409 |
|
| 8410 |
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
| 8411 |
-
|
| 8412 |
-
dst_t = (char *) dst_f16;
|
| 8413 |
|
| 8414 |
nbd2 /= sizeof(float) / sizeof(half);
|
| 8415 |
nbd3 /= sizeof(float) / sizeof(half);
|
|
@@ -8456,9 +8602,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
| 8456 |
CUBLAS_CHECK(
|
| 8457 |
cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
| 8458 |
ne01, ne11, ne10,
|
| 8459 |
-
alpha, (const char *)
|
| 8460 |
-
(const char *)
|
| 8461 |
-
beta, ( char *)
|
| 8462 |
ne12*ne13,
|
| 8463 |
cu_compute_type,
|
| 8464 |
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
|
@@ -8466,23 +8612,18 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
| 8466 |
// use cublasGemmBatchedEx
|
| 8467 |
const int ne23 = ne12*ne13;
|
| 8468 |
|
| 8469 |
-
const void
|
| 8470 |
-
|
| 8471 |
-
|
| 8472 |
-
size_t ptrs_src_s = 0;
|
| 8473 |
-
size_t ptrs_dst_s = 0;
|
| 8474 |
-
|
| 8475 |
-
ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
|
| 8476 |
-
ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
|
| 8477 |
|
| 8478 |
dim3 block_dims(ne13, ne12);
|
| 8479 |
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
|
| 8480 |
-
|
| 8481 |
-
ptrs_src, ptrs_dst,
|
| 8482 |
ne12, ne13,
|
| 8483 |
ne23,
|
| 8484 |
nb02, nb03,
|
| 8485 |
-
nb12
|
|
|
|
| 8486 |
nbd2, nbd3,
|
| 8487 |
r2, r3);
|
| 8488 |
CUDA_CHECK(cudaGetLastError());
|
|
@@ -8490,30 +8631,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
| 8490 |
CUBLAS_CHECK(
|
| 8491 |
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
| 8492 |
ne01, ne11, ne10,
|
| 8493 |
-
alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/
|
| 8494 |
-
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/
|
| 8495 |
-
beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
|
| 8496 |
ne23,
|
| 8497 |
cu_compute_type,
|
| 8498 |
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
| 8499 |
-
|
| 8500 |
-
if (ptrs_src_s != 0) {
|
| 8501 |
-
ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
|
| 8502 |
-
}
|
| 8503 |
-
if (ptrs_dst_s != 0) {
|
| 8504 |
-
ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
|
| 8505 |
-
}
|
| 8506 |
}
|
| 8507 |
#endif
|
| 8508 |
|
| 8509 |
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
| 8510 |
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
| 8511 |
-
to_fp32_cuda(dst_f16, dst_ddf,
|
| 8512 |
-
|
| 8513 |
-
ggml_cuda_pool_free(dst_f16, dst_as);
|
| 8514 |
}
|
| 8515 |
-
|
| 8516 |
-
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
| 8517 |
}
|
| 8518 |
|
| 8519 |
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -8525,9 +8655,9 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
| 8525 |
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
| 8526 |
|
| 8527 |
int64_t min_compute_capability = INT_MAX;
|
| 8528 |
-
for (
|
| 8529 |
-
if (min_compute_capability >
|
| 8530 |
-
min_compute_capability =
|
| 8531 |
}
|
| 8532 |
}
|
| 8533 |
|
|
@@ -8551,13 +8681,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
| 8551 |
} else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
| 8552 |
// KQV single-batch
|
| 8553 |
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
| 8554 |
-
} else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 &&
|
| 8555 |
// KQ + KQV multi-batch
|
| 8556 |
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
| 8557 |
} else if (src0->type == GGML_TYPE_F32) {
|
| 8558 |
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
| 8559 |
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
| 8560 |
-
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
|
| 8561 |
#ifdef GGML_CUDA_FORCE_DMMV
|
| 8562 |
const bool use_mul_mat_vec_q = false;
|
| 8563 |
#else
|
|
@@ -8668,7 +8798,7 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
|
|
| 8668 |
const int64_t ne1 = ggml_nelements(src1);
|
| 8669 |
const int64_t ne = ggml_nelements(dst);
|
| 8670 |
|
| 8671 |
-
|
| 8672 |
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
| 8673 |
|
| 8674 |
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
|
|
@@ -8786,7 +8916,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
|
| 8786 |
|
| 8787 |
std::vector<char> ids_host(ggml_nbytes(ids));
|
| 8788 |
|
| 8789 |
-
|
| 8790 |
|
| 8791 |
if (ids->backend == GGML_BACKEND_GPU) {
|
| 8792 |
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
|
|
@@ -8840,12 +8970,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
|
| 8840 |
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
|
| 8841 |
}
|
| 8842 |
} else {
|
| 8843 |
-
|
| 8844 |
-
char
|
| 8845 |
-
char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);
|
| 8846 |
|
| 8847 |
-
src1_row_extra.data_device[g_main_device] = src1_contiguous;
|
| 8848 |
-
dst_row_extra.data_device[g_main_device] = dst_contiguous;
|
| 8849 |
|
| 8850 |
const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
|
| 8851 |
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
|
|
@@ -8865,7 +8994,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
|
| 8865 |
|
| 8866 |
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
| 8867 |
|
| 8868 |
-
CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
|
| 8869 |
nb11, src1_kind, stream));
|
| 8870 |
num_src1_rows++;
|
| 8871 |
}
|
|
@@ -8897,14 +9026,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
|
| 8897 |
|
| 8898 |
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
| 8899 |
|
| 8900 |
-
CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
|
| 8901 |
nb1, dst_kind, stream));
|
| 8902 |
num_src1_rows++;
|
| 8903 |
}
|
| 8904 |
}
|
| 8905 |
-
|
| 8906 |
-
ggml_cuda_pool_free(src1_contiguous, as_src1);
|
| 8907 |
-
ggml_cuda_pool_free(dst_contiguous, as_dst);
|
| 8908 |
}
|
| 8909 |
|
| 8910 |
if (dst->backend == GGML_BACKEND_CPU) {
|
|
@@ -8946,7 +9072,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
|
| 8946 |
const int64_t nb11 = src1->nb[1];
|
| 8947 |
const int64_t nb12 = src1->nb[2];
|
| 8948 |
|
| 8949 |
-
|
| 8950 |
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
| 8951 |
|
| 8952 |
const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
|
@@ -9036,7 +9162,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
|
| 9036 |
ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
| 9037 |
memset(extra, 0, sizeof(*extra));
|
| 9038 |
|
| 9039 |
-
for (
|
| 9040 |
if (backend == GGML_BACKEND_GPU && id != g_main_device) {
|
| 9041 |
continue;
|
| 9042 |
}
|
|
@@ -9107,15 +9233,14 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
|
| 9107 |
|
| 9108 |
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
| 9109 |
|
| 9110 |
-
for (
|
|
|
|
| 9111 |
if (extra->data_device[id] != nullptr) {
|
| 9112 |
-
CUDA_CHECK(ggml_cuda_set_device(id));
|
| 9113 |
CUDA_CHECK(cudaFree(extra->data_device[id]));
|
| 9114 |
}
|
| 9115 |
|
| 9116 |
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
| 9117 |
if (extra->events[id][is] != nullptr) {
|
| 9118 |
-
CUDA_CHECK(ggml_cuda_set_device(id));
|
| 9119 |
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
|
| 9120 |
}
|
| 9121 |
}
|
|
@@ -9169,7 +9294,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
|
|
| 9169 |
force_inplace;
|
| 9170 |
const size_t size = ggml_nbytes(tensor);
|
| 9171 |
|
| 9172 |
-
|
| 9173 |
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
| 9174 |
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
| 9175 |
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
|
@@ -9246,7 +9371,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
|
|
| 9246 |
GGML_ASSERT(ggml_is_contiguous(tensor));
|
| 9247 |
|
| 9248 |
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
| 9249 |
-
|
| 9250 |
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
|
| 9251 |
}
|
| 9252 |
|
|
@@ -9670,12 +9795,16 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|
| 9670 |
// host buffer type
|
| 9671 |
|
| 9672 |
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 9673 |
-
|
| 9674 |
}
|
| 9675 |
|
| 9676 |
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 9677 |
-
void * ptr;
|
| 9678 |
-
|
|
|
|
|
| 9679 |
|
| 9680 |
// FIXME: this is a hack to avoid having to implement a new buffer type
|
| 9681 |
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
|
|
|
| 68 |
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
| 69 |
#endif
|
| 70 |
#define cudaMemcpy hipMemcpy
|
|
|
|
| 71 |
#define cudaMemcpyAsync hipMemcpyAsync
|
| 72 |
+
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
| 73 |
+
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
| 74 |
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
| 75 |
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
| 76 |
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
|
|
|
| 87 |
#define cudaStream_t hipStream_t
|
| 88 |
#define cudaSuccess hipSuccess
|
| 89 |
#define __trap abort
|
| 90 |
+
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
| 91 |
+
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
|
| 92 |
+
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
|
| 93 |
+
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
|
| 94 |
+
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
|
| 95 |
+
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
|
| 96 |
+
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
|
| 97 |
+
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
|
| 98 |
+
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
|
| 99 |
#else
|
| 100 |
#include <cuda_runtime.h>
|
| 101 |
+
#include <cuda.h>
|
| 102 |
#include <cublas_v2.h>
|
| 103 |
#include <cuda_fp16.h>
|
| 104 |
+
|
| 105 |
+
#if CUDART_VERSION < 11020
|
| 106 |
+
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
| 107 |
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
|
| 108 |
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
| 109 |
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
| 110 |
#define cublasComputeType_t cudaDataType_t
|
| 111 |
+
#endif // CUDART_VERSION < 11020
|
| 112 |
+
|
| 113 |
#endif // defined(GGML_USE_HIPBLAS)
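The aliases added above (cudaMemcpyPeerAsync, cudaMemcpy2DAsync and the CUBLAS_STATUS_* values) keep the HIP build path in step with the new code below, while the new <cuda.h> include and the CUDART_VERSION < 11020 fallback for CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED give the CUDA path access to the driver API used by the VMM pool further down. A minimal sketch of why the aliases matter; the helper name and call site are illustrative, not part of the patch:

    // Under GGML_USE_HIPBLAS this expands to hipMemcpyPeerAsync,
    // on plain CUDA it is the regular runtime call - one call site, two backends.
    static cudaError_t copy_peer_async(void * dst, int dst_dev, const void * src, int src_dev,
                                       size_t nbytes, cudaStream_t stream) {
        return cudaMemcpyPeerAsync(dst, dst_dev, src, src_dev, nbytes, stream);
    }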
|
| 114 |
|
| 115 |
#include "ggml-cuda.h"
|
|
|
|
| 164 |
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
| 165 |
#if __has_builtin(__builtin_elementwise_sub_sat)
|
| 166 |
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
|
| 167 |
+
return reinterpret_cast<const int &>(c);
|
| 168 |
#else
|
| 169 |
int8x4_t c;
|
| 170 |
int16_t tmp;
|
|
|
|
| 175 |
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
|
| 176 |
c[i] = tmp;
|
| 177 |
}
|
| 178 |
+
return reinterpret_cast<int &>(c);
|
| 179 |
#endif // __has_builtin(__builtin_elementwise_sub_sat)
|
| 180 |
}
|
| 181 |
|
|
|
|
| 213 |
|
| 214 |
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
| 215 |
|
| 216 |
+
[[noreturn]]
|
| 217 |
+
static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
|
| 218 |
+
int id = -1; // in case cudaGetDevice fails
|
| 219 |
+
cudaGetDevice(&id);
|
| 220 |
+
|
| 221 |
+
fprintf(stderr, "CUDA error: %s\n", msg);
|
| 222 |
+
fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
|
| 223 |
+
fprintf(stderr, " %s\n", stmt);
|
| 224 |
+
// abort with GGML_ASSERT to get a stack trace
|
| 225 |
+
GGML_ASSERT(!"CUDA error");
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
#define CUDA_CHECK_GEN(err, success, error_fn) \
|
| 229 |
+
do { \
|
| 230 |
+
auto err_ = (err); \
|
| 231 |
+
if (err_ != (success)) { \
|
| 232 |
+
ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
|
| 233 |
+
} \
|
| 234 |
} while (0)
|
| 235 |
|
| 236 |
+
#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
|
| 237 |
+
|
| 238 |
#if CUDART_VERSION >= 12000
|
| 239 |
+
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
| 240 |
+
return cublasGetStatusString(err);
|
| 241 |
+
}
|
|
|
|
|
|
|
| 242 |
#else
|
| 243 |
+
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
| 244 |
+
switch (err) {
|
| 245 |
+
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
|
| 246 |
+
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
|
| 247 |
+
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
|
| 248 |
+
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
|
| 249 |
+
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
|
| 250 |
+
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
|
| 251 |
+
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
|
| 252 |
+
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
|
| 253 |
+
case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
|
| 254 |
+
default: return "unknown error";
|
| 255 |
+
}
|
| 256 |
+
}
|
| 257 |
+
#endif // CUDART_VERSION >= 12000
|
| 258 |
+
|
| 259 |
+
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
|
| 260 |
+
|
| 261 |
+
#if !defined(GGML_USE_HIPBLAS)
|
| 262 |
+
static const char * cu_get_error_str(CUresult err) {
|
| 263 |
+
const char * err_str;
|
| 264 |
+
cuGetErrorString(err, &err_str);
|
| 265 |
+
return err_str;
|
| 266 |
+
}
|
| 267 |
+
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
|
| 268 |
+
#endif
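CUDA_CHECK_GEN gives the CUDA runtime, cuBLAS and the CUDA driver API a single error path through ggml_cuda_error: each wrapper only supplies its own success value and error-string function. A usage sketch, assuming an already-initialized device context; the specific calls are illustrative, not taken from the patch:

    void * buf = nullptr;
    CUDA_CHECK(cudaMalloc(&buf, 1 << 20));                    // cudaError_t    vs cudaSuccess

    cublasHandle_t handle;
    CUBLAS_CHECK(cublasCreate(&handle));                      // cublasStatus_t vs CUBLAS_STATUS_SUCCESS

    CUdeviceptr addr = 0;
    CU_CHECK(cuMemAddressReserve(&addr, 1 << 20, 0, 0, 0));   // CUresult       vs CUDA_SUCCESS

On failure each macro prints the failing statement, the current device, file and line, then aborts through GGML_ASSERT so a stack trace is produced.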
|
| 269 |
|
| 270 |
#if CUDART_VERSION >= 11100
|
| 271 |
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
|
|
|
|
| 321 |
typedef void (*ggml_cuda_op_mul_mat_t)(
|
| 322 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
| 323 |
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
| 324 |
+
const int64_t src1_padded_row_size, cudaStream_t stream);
|
| 325 |
typedef void (*ggml_cuda_op_flatten_t)(
|
| 326 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 327 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
|
| 328 |
|
| 329 |
// QK = number of values after dequantization
|
| 330 |
// QR = QK / number of values before dequantization
|
|
|
|
| 530 |
|
| 531 |
// this is faster on Windows
|
| 532 |
// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
|
| 533 |
+
static void ggml_cuda_set_device(const int device) {
|
| 534 |
int current_device;
|
| 535 |
CUDA_CHECK(cudaGetDevice(¤t_device));
|
| 536 |
|
| 537 |
if (device == current_device) {
|
| 538 |
+
return;
|
| 539 |
}
|
| 540 |
|
| 541 |
+
CUDA_CHECK(cudaSetDevice(device));
|
| 542 |
}
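ggml_cuda_set_device now returns void and does its own error checking, so call sites elsewhere in the patch drop the CUDA_CHECK wrapper:

    // before:  CUDA_CHECK(ggml_cuda_set_device(id));
    // after:
    ggml_cuda_set_device(id);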
|
| 543 |
|
| 544 |
static int g_device_count = -1;
|
| 545 |
static int g_main_device = 0;
|
|
|
|
| 546 |
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
| 547 |
|
| 548 |
+
struct cuda_device_capabilities {
|
| 549 |
+
int cc; // compute capability
|
| 550 |
+
bool vmm; // virtual memory support
|
| 551 |
+
size_t vmm_granularity; // granularity of virtual memory
|
| 552 |
+
};
|
| 553 |
+
|
| 554 |
+
static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
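The capability table is filled in during cuBLAS initialization further down; the mul-mat paths read the cc field and the pool allocator dispatches on vmm. A small helper shown only for illustration (hypothetical, not part of the patch):

    // Reads the per-device entry recorded at init time.
    static bool device_uses_vmm_pool(int device) {
        return g_device_caps[device].vmm;   // true when the driver reports VMM support
    }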
|
| 555 |
+
|
| 556 |
static void * g_scratch_buffer = nullptr;
|
| 557 |
static size_t g_scratch_size = 0; // disabled by default
|
| 558 |
static size_t g_scratch_offset = 0;
|
|
|
|
| 594 |
|
| 595 |
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
| 596 |
return b;
|
| 597 |
+
GGML_UNUSED(a);
|
| 598 |
}
|
| 599 |
|
| 600 |
static __device__ __forceinline__ float op_add(const float a, const float b) {
|
|
|
|
| 716 |
dst[i] = x[i] / (1.0f + expf(-x[i]));
|
| 717 |
}
|
| 718 |
|
| 719 |
+
static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
|
| 720 |
const float GELU_QUICK_COEF = -1.702f;
|
| 721 |
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
| 722 |
if (i >= k) {
|
|
|
|
| 725 |
dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
|
| 726 |
}
|
| 727 |
|
| 728 |
+
static __global__ void tanh_f32(const float * x, float * dst, int k) {
|
| 729 |
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
| 730 |
if (i >= k) {
|
| 731 |
return;
|
|
|
|
| 742 |
dst[i] = fmaxf(x[i], 0);
|
| 743 |
}
|
| 744 |
|
| 745 |
+
static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
|
| 746 |
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
| 747 |
if (i >= k) {
|
| 748 |
return;
|
|
|
|
| 795 |
}
|
| 796 |
}
|
| 797 |
|
| 798 |
+
static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
|
| 799 |
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
| 800 |
if (nidx >= ne0) {
|
| 801 |
return;
|
|
|
|
| 820 |
}
|
| 821 |
}
|
| 822 |
|
| 823 |
+
static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int nb02, const int scale_factor) {
|
| 824 |
int ne0 = ne00 * scale_factor;
|
| 825 |
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
| 826 |
if (nidx >= ne0) {
|
|
|
|
| 840 |
dst[offset_dst] = x[offset_src];
|
| 841 |
}
|
| 842 |
|
| 843 |
+
static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) {
|
| 844 |
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
| 845 |
if (nidx >= ne0) {
|
| 846 |
return;
|
|
|
|
| 4742 |
|
| 4743 |
const int row_y = col_x;
|
| 4744 |
|
|
|
|
| 4745 |
// y is not transposed but permuted
|
| 4746 |
const int iy = channel*nrows_y + row_y;
|
| 4747 |
|
|
|
|
| 5416 |
cne[3] = 1;
|
| 5417 |
};
|
| 5418 |
|
| 5419 |
+
auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
|
| 5420 |
cnb[1] *= cne[1];
|
| 5421 |
cnb[2] *= cne[2];
|
| 5422 |
cnb[3] *= cne[3];
|
|
|
|
| 5909 |
|
| 5910 |
int id;
|
| 5911 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 5912 |
+
const int compute_capability = g_device_caps[id].cc;
|
| 5913 |
|
| 5914 |
int mmq_x, mmq_y, nwarps;
|
| 5915 |
if (compute_capability >= CC_RDNA2) {
|
|
|
|
| 5954 |
|
| 5955 |
int id;
|
| 5956 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 5957 |
+
const int compute_capability = g_device_caps[id].cc;
|
| 5958 |
|
| 5959 |
int mmq_x, mmq_y, nwarps;
|
| 5960 |
if (compute_capability >= CC_RDNA2) {
|
|
|
|
| 5999 |
|
| 6000 |
int id;
|
| 6001 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6002 |
+
const int compute_capability = g_device_caps[id].cc;
|
| 6003 |
|
| 6004 |
int mmq_x, mmq_y, nwarps;
|
| 6005 |
if (compute_capability >= CC_RDNA2) {
|
|
|
|
| 6044 |
|
| 6045 |
int id;
|
| 6046 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6047 |
+
const int compute_capability = g_device_caps[id].cc;
|
| 6048 |
|
| 6049 |
int mmq_x, mmq_y, nwarps;
|
| 6050 |
if (compute_capability >= CC_RDNA2) {
|
|
|
|
| 6089 |
|
| 6090 |
int id;
|
| 6091 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6092 |
+
const int compute_capability = g_device_caps[id].cc;
|
| 6093 |
|
| 6094 |
int mmq_x, mmq_y, nwarps;
|
| 6095 |
if (compute_capability >= CC_RDNA2) {
|
|
|
|
| 6134 |
|
| 6135 |
int id;
|
| 6136 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6137 |
+
const int compute_capability = g_device_caps[id].cc;
|
| 6138 |
|
| 6139 |
int mmq_x, mmq_y, nwarps;
|
| 6140 |
if (compute_capability >= CC_RDNA2) {
|
|
|
|
| 6181 |
|
| 6182 |
int id;
|
| 6183 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6184 |
+
const int compute_capability = g_device_caps[id].cc;
|
| 6185 |
|
| 6186 |
int mmq_x, mmq_y, nwarps;
|
| 6187 |
if (compute_capability >= CC_RDNA2) {
|
|
|
|
| 6227 |
|
| 6228 |
int id;
|
| 6229 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6230 |
+
const int compute_capability = g_device_caps[id].cc;
|
| 6231 |
|
| 6232 |
int mmq_x, mmq_y, nwarps;
|
| 6233 |
if (compute_capability >= CC_RDNA2) {
|
|
|
|
| 6272 |
|
| 6273 |
int id;
|
| 6274 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6275 |
+
const int compute_capability = g_device_caps[id].cc;
|
| 6276 |
|
| 6277 |
int mmq_x, mmq_y, nwarps;
|
| 6278 |
if (compute_capability >= CC_RDNA2) {
|
|
|
|
| 6317 |
|
| 6318 |
int id;
|
| 6319 |
CUDA_CHECK(cudaGetDevice(&id));
|
| 6320 |
+
const int compute_capability = g_device_caps[id].cc;
|
| 6321 |
|
| 6322 |
int mmq_x, mmq_y, nwarps;
|
| 6323 |
if (compute_capability >= CC_RDNA2) {
|
|
|
|
| 6577 |
scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
|
| 6578 |
};
|
| 6579 |
|
| 6580 |
+
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
|
| 6581 |
+
|
| 6582 |
+
// #define DEBUG_CUDA_MALLOC
|
| 6583 |
+
struct ggml_cuda_buffer {
|
| 6584 |
void * ptr = nullptr;
|
| 6585 |
size_t size = 0;
|
| 6586 |
};
|
| 6587 |
|
| 6588 |
+
static ggml_cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
|
| 6589 |
+
static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
|
| 6590 |
|
| 6591 |
+
static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual_size) {
|
| 6592 |
scoped_spin_lock lock(g_cuda_pool_lock);
|
|
|
|
|
|
|
| 6593 |
#ifdef DEBUG_CUDA_MALLOC
|
| 6594 |
int nnz = 0;
|
| 6595 |
+
size_t max_size = 0;
|
| 6596 |
#endif
|
| 6597 |
size_t best_diff = 1ull << 36;
|
| 6598 |
int ibest = -1;
|
| 6599 |
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
|
| 6600 |
+
ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
|
| 6601 |
if (b.ptr != nullptr) {
|
| 6602 |
#ifdef DEBUG_CUDA_MALLOC
|
| 6603 |
++nnz;
|
|
|
|
| 6604 |
if (b.size > max_size) max_size = b.size;
|
| 6605 |
#endif
|
| 6606 |
if (b.size >= size) {
|
|
|
|
| 6620 |
}
|
| 6621 |
}
|
| 6622 |
if (ibest >= 0) {
|
| 6623 |
+
ggml_cuda_buffer& b = g_cuda_buffer_pool[device][ibest];
|
| 6624 |
void * ptr = b.ptr;
|
| 6625 |
*actual_size = b.size;
|
| 6626 |
b.ptr = nullptr;
|
| 6627 |
b.size = 0;
|
| 6628 |
return ptr;
|
| 6629 |
}
|
|
|
|
|
|
|
|
| 6630 |
void * ptr;
|
| 6631 |
size_t look_ahead_size = (size_t) (1.05 * size);
|
| 6632 |
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
| 6633 |
+
ggml_cuda_set_device(device);
|
| 6634 |
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
|
| 6635 |
*actual_size = look_ahead_size;
|
| 6636 |
+
g_cuda_pool_size[device] += look_ahead_size;
|
| 6637 |
+
#ifdef DEBUG_CUDA_MALLOC
|
| 6638 |
+
fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
|
| 6639 |
+
(uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
|
| 6640 |
+
#endif
|
| 6641 |
return ptr;
|
| 6642 |
}
|
| 6643 |
|
| 6644 |
+
static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
|
| 6645 |
scoped_spin_lock lock(g_cuda_pool_lock);
|
|
|
|
|
|
|
| 6646 |
|
| 6647 |
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
|
| 6648 |
+
ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
|
| 6649 |
if (b.ptr == nullptr) {
|
| 6650 |
b.ptr = ptr;
|
| 6651 |
b.size = size;
|
|
|
|
| 6653 |
}
|
| 6654 |
}
|
| 6655 |
fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
| 6656 |
+
ggml_cuda_set_device(device);
|
| 6657 |
CUDA_CHECK(cudaFree(ptr));
|
| 6658 |
+
g_cuda_pool_size[device] -= size;
|
| 6659 |
}
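On a miss, the legacy pool pads the request by roughly 5% and rounds it up to a multiple of 256 bytes so a slightly larger follow-up request can reuse the buffer once it is returned. A worked example of the sizing, illustrative only:

    // A request for 1000 bytes:
    //   (size_t)(1.05 * 1000)          -> 1050
    //   256 * ((1050 + 255) / 256)     -> 1280
    // cudaMalloc is therefore called with 1280 bytes and *actual_size reports 1280.
    size_t size            = 1000;
    size_t look_ahead_size = (size_t) (1.05 * size);
    look_ahead_size        = 256 * ((look_ahead_size + 255) / 256);   // == 1280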
|
| 6660 |
|
| 6661 |
+
#if !defined(GGML_USE_HIPBLAS)
|
| 6662 |
+
// pool with virtual memory
|
| 6663 |
+
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
|
| 6664 |
+
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
|
| 6665 |
+
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
|
| 6666 |
+
|
| 6667 |
+
static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
|
| 6668 |
+
scoped_spin_lock lock(g_cuda_pool_lock);
|
| 6669 |
+
|
| 6670 |
+
// round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
|
| 6671 |
+
const size_t alignment = 128;
|
| 6672 |
+
size = alignment * ((size + alignment - 1) / alignment);
|
| 6673 |
+
|
| 6674 |
+
size_t avail = g_cuda_pool_size[device] - g_cuda_pool_used[device];
|
| 6675 |
+
|
| 6676 |
+
if (size > avail) {
|
| 6677 |
+
// round up to the next multiple of the granularity
|
| 6678 |
+
size_t reserve_size = size - avail;
|
| 6679 |
+
const size_t granularity = g_device_caps[device].vmm_granularity;
|
| 6680 |
+
reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
|
| 6681 |
+
|
| 6682 |
+
GGML_ASSERT(g_cuda_pool_size[device] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
|
| 6683 |
+
|
| 6684 |
+
// allocate more physical memory
|
| 6685 |
+
CUmemAllocationProp prop = {};
|
| 6686 |
+
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
| 6687 |
+
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
| 6688 |
+
prop.location.id = device;
|
| 6689 |
+
CUmemGenericAllocationHandle handle;
|
| 6690 |
+
CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
|
| 6691 |
+
|
| 6692 |
+
// reserve virtual address space (if not already reserved)
|
| 6693 |
+
if (g_cuda_pool_addr[device] == 0) {
|
| 6694 |
+
CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[device], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
|
| 6695 |
+
}
|
| 6696 |
+
|
| 6697 |
+
// map at the end of the pool
|
| 6698 |
+
CU_CHECK(cuMemMap(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, 0, handle, 0));
|
| 6699 |
+
|
| 6700 |
+
// the memory allocation handle is no longer needed after mapping
|
| 6701 |
+
CU_CHECK(cuMemRelease(handle));
|
| 6702 |
+
|
| 6703 |
+
// set access
|
| 6704 |
+
CUmemAccessDesc access = {};
|
| 6705 |
+
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
| 6706 |
+
access.location.id = device;
|
| 6707 |
+
access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
| 6708 |
+
CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, &access, 1));
|
| 6709 |
+
|
| 6710 |
+
// add to the pool
|
| 6711 |
+
g_cuda_pool_size[device] += reserve_size;
|
| 6712 |
+
|
| 6713 |
+
//printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
| 6714 |
+
// id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
|
| 6715 |
+
// (unsigned long long) (reserve_size/1024/1024));
|
| 6716 |
+
}
|
| 6717 |
+
|
| 6718 |
+
GGML_ASSERT(g_cuda_pool_addr[device] != 0);
|
| 6719 |
+
|
| 6720 |
+
void * ptr = (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]);
|
| 6721 |
+
*actual_size = size;
|
| 6722 |
+
g_cuda_pool_used[device] += size;
|
| 6723 |
+
|
| 6724 |
+
#ifdef DEBUG_CUDA_MALLOC
|
| 6725 |
+
printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
|
| 6726 |
+
#endif
|
| 6727 |
+
|
| 6728 |
+
return ptr;
|
| 6729 |
+
}
|
| 6730 |
+
|
| 6731 |
+
static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
|
| 6732 |
+
scoped_spin_lock lock(g_cuda_pool_lock);
|
| 6733 |
+
|
| 6734 |
+
#ifdef DEBUG_CUDA_MALLOC
|
| 6735 |
+
printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
|
| 6736 |
+
#endif
|
| 6737 |
+
|
| 6738 |
+
g_cuda_pool_used[device] -= size;
|
| 6739 |
+
|
| 6740 |
+
// all deallocations must be in reverse order of the allocations
|
| 6741 |
+
GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]));
|
| 6742 |
+
}
|
| 6743 |
+
|
| 6744 |
+
static void * ggml_cuda_pool_malloc(int device, size_t size, size_t * actual_size) {
|
| 6745 |
+
if (g_device_caps[device].vmm) {
|
| 6746 |
+
return ggml_cuda_pool_malloc_vmm(device, size, actual_size);
|
| 6747 |
+
} else {
|
| 6748 |
+
return ggml_cuda_pool_malloc_leg(device, size, actual_size);
|
| 6749 |
+
}
|
| 6750 |
+
}
|
| 6751 |
+
|
| 6752 |
+
static void ggml_cuda_pool_free(int device, void * ptr, size_t size) {
|
| 6753 |
+
if (g_device_caps[device].vmm) {
|
| 6754 |
+
ggml_cuda_pool_free_vmm(device, ptr, size);
|
| 6755 |
+
} else {
|
| 6756 |
+
ggml_cuda_pool_free_leg(device, ptr, size);
|
| 6757 |
+
}
|
| 6758 |
+
}
|
| 6759 |
+
#else
|
| 6760 |
+
#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
|
| 6761 |
+
#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
|
| 6762 |
+
#endif // !defined(GGML_USE_HIPBLAS)
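The VMM pool hands out consecutive offsets inside a single growing virtual-address reservation per device, which is why ggml_cuda_pool_free_vmm asserts that frees arrive in reverse order of the allocations. A usage sketch under that assumption; device index and sizes are arbitrary, and on HIP builds the same names fall back to the legacy pool via the defines above:

    size_t a_actual = 0, b_actual = 0;
    void * a = ggml_cuda_pool_malloc(0, 1u << 20, &a_actual);   // allocated first
    void * b = ggml_cuda_pool_malloc(0, 1u << 22, &b_actual);   // allocated second
    // ... use a and b on the device ...
    ggml_cuda_pool_free(0, b, b_actual);                        // most recent allocation freed first
    ggml_cuda_pool_free(0, a, a_actual);                        // then the older one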
|
| 6763 |
+
|
| 6764 |
+
template<typename T>
|
| 6765 |
+
struct cuda_pool_alloc {
|
| 6766 |
+
int device = -1;
|
| 6767 |
+
T * ptr = nullptr;
|
| 6768 |
+
size_t actual_size = 0;
|
| 6769 |
+
|
| 6770 |
+
// size is in number of elements
|
| 6771 |
+
T * alloc(size_t size) {
|
| 6772 |
+
GGML_ASSERT(ptr == nullptr);
|
| 6773 |
+
CUDA_CHECK(cudaGetDevice(&device));
|
| 6774 |
+
ptr = (T *) ggml_cuda_pool_malloc(device, size * sizeof(T), &this->actual_size);
|
| 6775 |
+
return ptr;
|
| 6776 |
+
}
|
| 6777 |
+
|
| 6778 |
+
cuda_pool_alloc(size_t size) {
|
| 6779 |
+
alloc(size);
|
| 6780 |
+
}
|
| 6781 |
+
|
| 6782 |
+
~cuda_pool_alloc() {
|
| 6783 |
+
if (ptr != nullptr) {
|
| 6784 |
+
ggml_cuda_pool_free(device, ptr, actual_size);
|
| 6785 |
+
}
|
| 6786 |
+
}
|
| 6787 |
+
|
| 6788 |
+
T * get() {
|
| 6789 |
+
return ptr;
|
| 6790 |
+
}
|
| 6791 |
+
|
| 6792 |
+
cuda_pool_alloc() = default;
|
| 6793 |
+
cuda_pool_alloc(const cuda_pool_alloc &) = delete;
|
| 6794 |
+
cuda_pool_alloc(cuda_pool_alloc &&) = delete;
|
| 6795 |
+
cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
|
| 6796 |
+
cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
|
| 6797 |
+
};
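cuda_pool_alloc<T> wraps the pool in an RAII object: it records the current device when the allocation is made and returns the memory in its destructor, so locals going out of scope in reverse declaration order naturally satisfy the reverse-order free rule of the VMM pool. A usage sketch; the element count is a placeholder and the memset stands in for real kernel work:

    {
        ggml_cuda_set_device(g_main_device);
        cudaStream_t stream = g_cudaStreams[g_main_device][0];
        const size_t n = 1024;                // placeholder element count

        cuda_pool_alloc<half>  tmp_f16(n);    // allocates n*sizeof(half) immediately
        cuda_pool_alloc<float> tmp_f32;       // deferred until alloc() is called
        tmp_f32.alloc(n);

        CUDA_CHECK(cudaMemsetAsync(tmp_f32.get(), 0, n*sizeof(float), stream));
    }   // tmp_f32 is returned to the pool first, then tmp_f16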
|
| 6798 |
+
|
| 6799 |
static bool g_cublas_loaded = false;
|
| 6800 |
|
| 6801 |
bool ggml_cublas_loaded(void) {
|
|
|
|
| 6834 |
#endif
|
| 6835 |
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
| 6836 |
for (int id = 0; id < g_device_count; ++id) {
|
| 6837 |
+
int device_vmm = 0;
|
| 6838 |
+
|
| 6839 |
+
#if !defined(GGML_USE_HIPBLAS)
|
| 6840 |
+
CUdevice device;
|
| 6841 |
+
CU_CHECK(cuDeviceGet(&device, id));
|
| 6842 |
+
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
| 6843 |
+
|
| 6844 |
+
if (device_vmm) {
|
| 6845 |
+
CUmemAllocationProp alloc_prop = {};
|
| 6846 |
+
alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
| 6847 |
+
alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
| 6848 |
+
alloc_prop.location.id = id;
|
| 6849 |
+
CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
| 6850 |
+
}
|
| 6851 |
+
#endif // !defined(GGML_USE_HIPBLAS)
|
| 6852 |
+
g_device_caps[id].vmm = !!device_vmm;
|
| 6853 |
+
|
| 6854 |
cudaDeviceProp prop;
|
| 6855 |
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
| 6856 |
+
fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
| 6857 |
|
| 6858 |
g_tensor_split[id] = total_vram;
|
| 6859 |
total_vram += prop.totalGlobalMem;
|
| 6860 |
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
| 6861 |
+
g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
| 6862 |
#else
|
| 6863 |
+
g_device_caps[id].cc = 100*prop.major + 10*prop.minor;
|
| 6864 |
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
| 6865 |
}
|
| 6866 |
for (int id = 0; id < g_device_count; ++id) {
|
|
|
|
| 6868 |
}
|
| 6869 |
|
| 6870 |
for (int id = 0; id < g_device_count; ++id) {
|
| 6871 |
+
ggml_cuda_set_device(id);
|
| 6872 |
|
| 6873 |
// create cuda streams
|
| 6874 |
for (int is = 0; is < MAX_STREAMS; ++is) {
|
|
|
|
| 6920 |
void * ptr = nullptr;
|
| 6921 |
cudaError_t err = cudaMallocHost((void **) &ptr, size);
|
| 6922 |
if (err != cudaSuccess) {
|
| 6923 |
+
// clear the error
|
|
|
|
| 6924 |
cudaGetLastError();
|
| 6925 |
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
|
| 6926 |
size/1024.0/1024.0, cudaGetErrorString(err));
|
|
|
|
| 6983 |
|
| 6984 |
static void ggml_cuda_op_get_rows(
|
| 6985 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 6986 |
+
const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t stream) {
|
| 6987 |
|
| 6988 |
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
| 6989 |
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
|
|
| 7025 |
}
|
| 7026 |
|
| 7027 |
template<class op>
|
| 7028 |
+
static void ggml_cuda_op_bin_bcast(
|
| 7029 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7030 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7031 |
|
| 7032 |
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
| 7033 |
|
|
|
|
| 7046 |
|
| 7047 |
static void ggml_cuda_op_repeat(
|
| 7048 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7049 |
+
const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t main_stream) {
|
| 7050 |
|
| 7051 |
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
|
| 7052 |
|
|
|
|
| 7054 |
(void) src1_d;
|
| 7055 |
}
|
| 7056 |
|
| 7057 |
+
static void ggml_cuda_op_add(
|
| 7058 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7059 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7060 |
|
| 7061 |
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
| 7062 |
}
|
| 7063 |
|
| 7064 |
+
static void ggml_cuda_op_acc(
|
| 7065 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7066 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7067 |
|
| 7068 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7069 |
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
| 7080 |
(void) dst;
|
| 7081 |
}
|
| 7082 |
|
| 7083 |
+
static void ggml_cuda_op_mul(
|
| 7084 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7085 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7086 |
|
| 7087 |
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
| 7088 |
}
|
| 7089 |
|
| 7090 |
+
static void ggml_cuda_op_div(
|
| 7091 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7092 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7093 |
|
| 7094 |
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
| 7095 |
}
|
| 7096 |
|
| 7097 |
+
static void ggml_cuda_op_gelu(
|
| 7098 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7099 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7100 |
|
| 7101 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7102 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7108 |
(void) src1_dd;
|
| 7109 |
}
|
| 7110 |
|
| 7111 |
+
static void ggml_cuda_op_silu(
|
| 7112 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7113 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7114 |
|
| 7115 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7116 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7122 |
(void) src1_dd;
|
| 7123 |
}
|
| 7124 |
|
| 7125 |
+
static void ggml_cuda_op_gelu_quick(
|
| 7126 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7127 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7128 |
|
| 7129 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7130 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7136 |
(void) src1_dd;
|
| 7137 |
}
|
| 7138 |
|
| 7139 |
+
static void ggml_cuda_op_tanh(
|
| 7140 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7141 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7142 |
|
| 7143 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7144 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7150 |
(void) src1_dd;
|
| 7151 |
}
|
| 7152 |
|
| 7153 |
+
static void ggml_cuda_op_relu(
|
| 7154 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7155 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7156 |
|
| 7157 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7158 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7164 |
(void) src1_dd;
|
| 7165 |
}
|
| 7166 |
|
| 7167 |
+
static void ggml_cuda_op_leaky_relu(
|
| 7168 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7169 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7170 |
|
| 7171 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7172 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7181 |
(void) src1_dd;
|
| 7182 |
}
|
| 7183 |
|
| 7184 |
+
static void ggml_cuda_op_sqr(
|
| 7185 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7186 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7187 |
|
| 7188 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7189 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7195 |
(void) src1_dd;
|
| 7196 |
}
|
| 7197 |
|
| 7198 |
+
static void ggml_cuda_op_norm(
|
| 7199 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7200 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7201 |
|
| 7202 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7203 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7215 |
(void) src1_dd;
|
| 7216 |
}
|
| 7217 |
|
| 7218 |
+
static void ggml_cuda_op_group_norm(
|
|
|
|
| 7219 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7220 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7221 |
|
| 7222 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7223 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7231 |
(void) src1_dd;
|
| 7232 |
}
|
| 7233 |
|
| 7234 |
+
static void ggml_cuda_op_concat(
|
| 7235 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7236 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7237 |
|
| 7238 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7239 |
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
| 7247 |
(void) dst;
|
| 7248 |
}
|
| 7249 |
|
| 7250 |
+
static void ggml_cuda_op_upscale(
|
| 7251 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7252 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7253 |
|
| 7254 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7255 |
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
|
|
| 7264 |
(void) src1_dd;
|
| 7265 |
}
|
| 7266 |
|
| 7267 |
+
static void ggml_cuda_op_pad(
|
| 7268 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7269 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7270 |
|
| 7271 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7272 |
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
|
|
| 7281 |
(void) src1_dd;
|
| 7282 |
}
|
| 7283 |
|
| 7284 |
+
static void ggml_cuda_op_rms_norm(
|
| 7285 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7286 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7287 |
|
| 7288 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7289 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7301 |
(void) src1_dd;
|
| 7302 |
}
|
| 7303 |
|
| 7304 |
+
static void ggml_cuda_op_mul_mat_q(
|
| 7305 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
| 7306 |
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
| 7307 |
+
const int64_t src1_padded_row_size, cudaStream_t stream) {
|
| 7308 |
|
| 7309 |
const int64_t ne00 = src0->ne[0];
|
| 7310 |
|
|
|
|
| 7366 |
static int64_t get_row_rounding(ggml_type type) {
|
| 7367 |
int64_t min_compute_capability = INT_MAX;
|
| 7368 |
int64_t max_compute_capability = INT_MIN;
|
| 7369 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 7370 |
if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
| 7371 |
+
if (min_compute_capability > g_device_caps[id].cc) {
|
| 7372 |
+
min_compute_capability = g_device_caps[id].cc;
|
| 7373 |
}
|
| 7374 |
+
if (max_compute_capability < g_device_caps[id].cc) {
|
| 7375 |
+
max_compute_capability = g_device_caps[id].cc;
|
| 7376 |
}
|
| 7377 |
}
|
| 7378 |
}
|
|
|
|
| 7424 |
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
| 7425 |
}
|
| 7426 |
|
| 7427 |
+
static void ggml_cuda_op_mul_mat_vec_q(
|
| 7428 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
| 7429 |
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
| 7430 |
+
const int64_t src1_padded_row_size, cudaStream_t stream) {
|
| 7431 |
|
| 7432 |
GGML_ASSERT(ggml_nrows(src1) == 1);
|
| 7433 |
|
|
|
|
| 7477 |
(void) src1_padded_row_size;
|
| 7478 |
}
|
| 7479 |
|
| 7480 |
+
static void ggml_cuda_op_dequantize_mul_mat_vec(
|
| 7481 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
| 7482 |
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
| 7483 |
+
const int64_t src1_padded_row_size, cudaStream_t stream) {
|
| 7484 |
|
| 7485 |
const int64_t ne00 = src0->ne[0];
|
| 7486 |
const int64_t row_diff = row_high - row_low;
|
| 7487 |
|
| 7488 |
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
| 7489 |
+
|
| 7490 |
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
| 7491 |
#ifdef GGML_CUDA_F16
|
| 7492 |
+
cuda_pool_alloc<half> src1_dfloat_a;
|
| 7493 |
+
half * src1_dfloat = nullptr; // dfloat == half
|
| 7494 |
|
| 7495 |
bool src1_convert_f16 =
|
| 7496 |
src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
|
|
|
| 7498 |
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
| 7499 |
|
| 7500 |
if (src1_convert_f16) {
|
| 7501 |
+
src1_dfloat = src1_dfloat_a.alloc(ne00);
|
| 7502 |
ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
| 7503 |
ne00, 1, sizeof(float), 0, 0,
|
| 7504 |
ne00, 1, sizeof(half), 0, 0, stream);
|
|
|
|
| 7546 |
break;
|
| 7547 |
}
|
| 7548 |
|
|
|
|
|
|
| 7549 |
(void) src1;
|
| 7550 |
(void) dst;
|
| 7551 |
(void) src1_ddq_i;
|
|
|
|
| 7553 |
(void) src1_padded_row_size;
|
| 7554 |
}
|
| 7555 |
|
| 7556 |
+
static void ggml_cuda_op_mul_mat_cublas(
|
| 7557 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
| 7558 |
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
| 7559 |
+
const int64_t src1_padded_row_size, cudaStream_t stream) {
|
| 7560 |
|
| 7561 |
GGML_ASSERT(src0_dd_i != nullptr);
|
| 7562 |
GGML_ASSERT(src1_ddf_i != nullptr);
|
|
|
|
| 7576 |
// ldc == nrows of the matrix that cuBLAS writes into
|
| 7577 |
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
|
| 7578 |
|
| 7579 |
+
const int compute_capability = g_device_caps[id].cc;
|
| 7580 |
|
| 7581 |
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
| 7582 |
+
//printf("this branch\n");
|
| 7583 |
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
| 7584 |
+
cuda_pool_alloc<half> src0_as_f16;
|
|
|
|
| 7585 |
if (src0->type != GGML_TYPE_F16) {
|
| 7586 |
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
|
| 7587 |
GGML_ASSERT(to_fp16_cuda != nullptr);
|
| 7588 |
size_t ne = row_diff*ne00;
|
| 7589 |
+
src0_as_f16.alloc(ne);
|
| 7590 |
+
to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
|
| 7591 |
}
|
| 7592 |
+
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
|
| 7593 |
|
| 7594 |
+
cuda_pool_alloc<half> src1_as_f16;
|
|
|
|
| 7595 |
if (src1->type != GGML_TYPE_F16) {
|
| 7596 |
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
| 7597 |
GGML_ASSERT(to_fp16_cuda != nullptr);
|
| 7598 |
size_t ne = src1_ncols*ne10;
|
| 7599 |
+
src1_as_f16.alloc(ne);
|
| 7600 |
+
to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
|
| 7601 |
}
|
| 7602 |
+
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
|
| 7603 |
+
cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);
|
|
|
|
| 7604 |
|
| 7605 |
const half alpha_f16 = 1.0f;
|
| 7606 |
const half beta_f16 = 0.0f;
|
|
|
|
| 7609 |
CUBLAS_CHECK(
|
| 7610 |
cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
| 7611 |
row_diff, src1_ncols, ne10,
|
| 7612 |
+
&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
|
| 7613 |
+
src1_ptr, CUDA_R_16F, ne10,
|
| 7614 |
+
&beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
|
| 7615 |
CUBLAS_COMPUTE_16F,
|
| 7616 |
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
| 7617 |
|
| 7618 |
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
| 7619 |
+
to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
| 7620 |
+
} else {
|
| 7621 |
+
cuda_pool_alloc<float> src0_ddq_as_f32;
|
| 7622 |
+
cuda_pool_alloc<float> src1_ddq_as_f32;
|
|
|
|
|
|
|
| 7623 |
|
| 7624 |
if (src0->type != GGML_TYPE_F32) {
|
| 7625 |
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
| 7626 |
GGML_ASSERT(to_fp32_cuda != nullptr);
|
| 7627 |
+
src0_ddq_as_f32.alloc(row_diff*ne00);
|
| 7628 |
+
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
|
| 7629 |
+
}
|
| 7630 |
+
if (src1->type != GGML_TYPE_F32) {
|
| 7631 |
+
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
|
| 7632 |
+
GGML_ASSERT(to_fp32_cuda != nullptr);
|
| 7633 |
+
src1_ddq_as_f32.alloc(src1_ncols*ne10);
|
| 7634 |
+
to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
|
| 7635 |
}
|
| 7636 |
+
|
| 7637 |
+
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
|
| 7638 |
+
const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
|
| 7639 |
|
| 7640 |
const float alpha = 1.0f;
|
| 7641 |
const float beta = 0.0f;
|
|
|
|
| 7644 |
CUBLAS_CHECK(
|
| 7645 |
cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
| 7646 |
row_diff, src1_ncols, ne10,
|
| 7647 |
+
&alpha, src0_ddf_i, ne00,
|
| 7648 |
+
src1_ddf1_i, ne10,
|
| 7649 |
+
&beta, dst_dd_i, ldc));
|
|
|
|
|
|
| 7650 |
}
|
| 7651 |
|
| 7652 |
(void) dst;
|
|
|
|
| 7654 |
(void) src1_padded_row_size;
|
| 7655 |
}
|
| 7656 |
|
| 7657 |
+
static void ggml_cuda_op_rope(
|
| 7658 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7659 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7660 |
|
| 7661 |
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
| 7662 |
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
|
|
| 7734 |
(void) src1_dd;
|
| 7735 |
}
|
| 7736 |
|
| 7737 |
+
static void ggml_cuda_op_alibi(
|
| 7738 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7739 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7740 |
|
| 7741 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7742 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7765 |
(void) src1_dd;
|
| 7766 |
}
|
| 7767 |
|
| 7768 |
+
static void ggml_cuda_op_im2col(
|
| 7769 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7770 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7771 |
|
| 7772 |
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
| 7773 |
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
| 7800 |
(void) src0_dd;
|
| 7801 |
}
|
| 7802 |
|
| 7803 |
+
static void ggml_cuda_op_sum_rows(
|
|
|
|
| 7804 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7805 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7806 |
|
| 7807 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7808 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7817 |
(void) src1_dd;
|
| 7818 |
}
|
| 7819 |
|
| 7820 |
+
static void ggml_cuda_op_argsort(
|
| 7821 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7822 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7823 |
|
| 7824 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7825 |
GGML_ASSERT( dst->type == GGML_TYPE_I32);
|
|
|
|
| 7836 |
(void) src1_dd;
|
| 7837 |
}
|
| 7838 |
|
| 7839 |
+
static void ggml_cuda_op_diag_mask_inf(
|
| 7840 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7841 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7842 |
|
| 7843 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7844 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7856 |
(void) src1_dd;
|
| 7857 |
}
|
| 7858 |
|
| 7859 |
+
static void ggml_cuda_op_soft_max(
|
| 7860 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7861 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7862 |
|
| 7863 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7864 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7877 |
(void) dst;
|
| 7878 |
}
|
| 7879 |
|
| 7880 |
+
static void ggml_cuda_op_scale(
|
| 7881 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7882 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7883 |
|
| 7884 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7885 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7895 |
(void) src1_dd;
|
| 7896 |
}
|
| 7897 |
|
| 7898 |
+
static void ggml_cuda_op_clamp(
|
| 7899 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 7900 |
+
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
| 7901 |
|
| 7902 |
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 7903 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
| 7937 |
float * src1_ddf = nullptr;
|
| 7938 |
float * dst_ddf = nullptr;
|
| 7939 |
|
| 7940 |
+
cuda_pool_alloc<float> src0_f;
|
| 7941 |
+
cuda_pool_alloc<float> src1_f;
|
| 7942 |
+
cuda_pool_alloc<float> dst_f;
|
|
|
|
| 7943 |
|
| 7944 |
ggml_cuda_set_device(g_main_device);
|
| 7945 |
+
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
| 7946 |
|
| 7947 |
if (src0_on_device) {
|
| 7948 |
src0_ddf = (float *) src0_extra->data_device[g_main_device];
|
| 7949 |
} else {
|
| 7950 |
+
src0_ddf = src0_f.alloc(ggml_nelements(src0));
|
| 7951 |
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
|
| 7952 |
}
|
| 7953 |
|
|
|
|
| 7955 |
if (src1_on_device) {
|
| 7956 |
src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
| 7957 |
} else {
|
| 7958 |
+
src1_ddf = src1_f.alloc(ggml_nelements(src1));
|
| 7959 |
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
|
| 7960 |
}
|
| 7961 |
}
|
| 7962 |
if (dst_on_device) {
|
| 7963 |
dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
| 7964 |
} else {
|
| 7965 |
+
dst_ddf = dst_f.alloc(ggml_nelements(dst));
|
| 7966 |
}
|
| 7967 |
|
| 7968 |
// do the computation
|
|
|
|
| 7974 |
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
|
| 7975 |
}
|
| 7976 |
|
| 7977 |
if (dst->backend == GGML_BACKEND_CPU) {
|
| 7978 |
CUDA_CHECK(cudaDeviceSynchronize());
|
| 7979 |
}
|
|
|
|
| 7990 |
|
| 7991 |
#ifdef NDEBUG
|
| 7992 |
for (int id = 0; id < g_device_count; ++id) {
|
| 7993 |
+
ggml_cuda_set_device(id);
|
| 7994 |
CUDA_CHECK(cudaDeviceSynchronize());
|
| 7995 |
}
|
| 7996 |
|
| 7997 |
for (int id = 0; id < g_device_count; ++id) {
|
| 7998 |
+
ggml_cuda_set_device(id);
|
| 7999 |
|
| 8000 |
for (int id_other = 0; id_other < g_device_count; ++id_other) {
|
| 8001 |
if (id == id_other) {
|
|
|
|
| 8029 |
const int64_t ne01 = src0->ne[1];
|
| 8030 |
const int64_t ne02 = src0->ne[2];
|
| 8031 |
const int64_t ne03 = src0->ne[3];
|
|
|
|
| 8032 |
|
| 8033 |
const int64_t ne10 = src1->ne[0];
|
| 8034 |
const int64_t ne11 = src1->ne[1];
|
|
|
|
| 8046 |
|
| 8047 |
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
|
| 8048 |
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
|
| 8049 |
+
GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
|
| 8050 |
|
| 8051 |
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
| 8052 |
|
|
|
|
| 8072 |
GGML_ASSERT(!(split && ne03 > 1));
|
| 8073 |
GGML_ASSERT(!(split && ne02 < ne12));
|
| 8074 |
|
| 8075 |
+
struct dev_data {
|
| 8076 |
+
cuda_pool_alloc<char> src0_dd_alloc;
|
| 8077 |
+
cuda_pool_alloc<float> src1_ddf_alloc;
|
| 8078 |
+
cuda_pool_alloc<char> src1_ddq_alloc;
|
| 8079 |
+
cuda_pool_alloc<float> dst_dd_alloc;
|
| 8080 |
+
|
| 8081 |
+
char * src0_dd = nullptr;
|
| 8082 |
+
float * src1_ddf = nullptr; // float
|
| 8083 |
+
char * src1_ddq = nullptr; // q8_1
|
| 8084 |
+
float * dst_dd = nullptr;
|
| 8085 |
|
| 8086 |
+
int64_t row_low;
|
| 8087 |
+
int64_t row_high;
|
| 8088 |
+
};
|
| 8089 |
|
| 8090 |
+
dev_data dev[GGML_CUDA_MAX_DEVICES];
|
|
|
|
| 8091 |
|
| 8092 |
int used_devices = 0;
|
| 8093 |
|
| 8094 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 8095 |
// by default, use all rows
|
| 8096 |
+
dev[id].row_low = 0;
|
| 8097 |
+
dev[id].row_high = ne01;
|
| 8098 |
|
| 8099 |
// for multi GPU, get the row boundaries from tensor split
|
| 8100 |
// and round to mul_mat_q tile sizes
|
|
|
|
| 8102 |
const int64_t rounding = get_row_rounding(src0->type);
|
| 8103 |
|
| 8104 |
if (id != 0) {
|
| 8105 |
+
dev[id].row_low = ne01*g_tensor_split[id];
|
| 8106 |
+
if (dev[id].row_low < ne01) {
|
| 8107 |
+
dev[id].row_low -= dev[id].row_low % rounding;
|
| 8108 |
+
}
|
| 8109 |
}
|
| 8110 |
|
| 8111 |
if (id != g_device_count - 1) {
|
| 8112 |
+
dev[id].row_high = ne01*g_tensor_split[id + 1];
|
| 8113 |
+
if (dev[id].row_high < ne01) {
|
| 8114 |
+
dev[id].row_high -= dev[id].row_high % rounding;
|
| 8115 |
+
}
|
| 8116 |
}
|
| 8117 |
}
|
| 8118 |
}
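
For reference, a stand-alone illustration of the tensor-split boundary arithmetic above, with assumed values (2 GPUs, ne01 = 4096 rows, tensor split {0.0f, 0.55f}, rounding = 64); the numbers are illustrative only.

// Stand-alone version of the row-boundary computation, with made-up inputs.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne01     = 4096;              // rows of src0
    const float   split[2] = {0.0f, 0.55f};     // per-device start fractions
    const int64_t rounding = 64;                // mul_mat_q tile rounding

    int64_t row_low[2];
    int64_t row_high[2];

    for (int id = 0; id < 2; ++id) {
        row_low[id]  = id == 0 ? 0    : (int64_t) (ne01*split[id]);
        row_high[id] = id == 1 ? ne01 : (int64_t) (ne01*split[id + 1]);

        // round interior boundaries down to a multiple of the tile size
        if (id != 0 && row_low[id]  < ne01) { row_low[id]  -= row_low[id]  % rounding; }
        if (id != 1 && row_high[id] < ne01) { row_high[id] -= row_high[id] % rounding; }
    }

    // prints: dev0 rows [0, 2240)   dev1 rows [2240, 4096)
    for (int id = 0; id < 2; ++id) {
        printf("dev%d rows [%lld, %lld)\n", id, (long long) row_low[id], (long long) row_high[id]);
    }
    return 0;
}
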
|
| 8119 |
|
| 8120 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 8121 |
+
if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
|
| 8122 |
continue;
|
| 8123 |
}
|
| 8124 |
|
|
|
|
| 8128 |
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
| 8129 |
|
| 8130 |
ggml_cuda_set_device(id);
|
| 8131 |
+
cudaStream_t stream = g_cudaStreams[id][0];
|
| 8132 |
|
| 8133 |
if (src0_on_device && src0_is_contiguous) {
|
| 8134 |
+
dev[id].src0_dd = (char *) src0_extra->data_device[id];
|
| 8135 |
} else {
|
| 8136 |
+
dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0));
|
|
|
|
| 8137 |
}
|
| 8138 |
|
| 8139 |
if (src1_on_device && src1_is_contiguous) {
|
| 8140 |
+
dev[id].src1_ddf = (float *) src1_extra->data_device[id];
|
| 8141 |
} else {
|
| 8142 |
+
dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ggml_nelements(src1));
|
| 8143 |
}
|
| 8144 |
|
| 8145 |
if (convert_src1_to_q8_1) {
|
| 8146 |
+
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
|
| 8147 |
|
| 8148 |
if (src1_on_device && src1_is_contiguous) {
|
| 8149 |
+
quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
|
| 8150 |
CUDA_CHECK(cudaGetLastError());
|
| 8151 |
}
|
| 8152 |
}
|
| 8153 |
|
| 8154 |
if (dst_on_device) {
|
| 8155 |
+
dev[id].dst_dd = (float *) dst_extra->data_device[id];
|
| 8156 |
} else {
|
| 8157 |
+
const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
|
| 8158 |
+
dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(size_dst_ddf);
|
| 8159 |
}
|
| 8160 |
}
|
| 8161 |
|
| 8162 |
// if multiple devices are used they need to wait for the main device
|
| 8163 |
// here an event is recorded that signals that the main device has finished calculating the input data
|
| 8164 |
if (split && used_devices > 1) {
|
| 8165 |
+
ggml_cuda_set_device(g_main_device);
|
| 8166 |
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
|
| 8167 |
}
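
The comment above describes the usual CUDA record/wait pattern. A generic, self-contained sketch of that pattern follows; the device and stream parameters are placeholders, not names from this file.

// Generic record/wait sketch: the main device records an event once the input data is
// ready, and the other device's stream is ordered after that event before it may compute.
#include <cuda_runtime.h>

void wait_for_main_device(int main_device, cudaStream_t main_stream,
                          int other_device, cudaStream_t other_stream) {
    cudaEvent_t ev;

    cudaSetDevice(main_device);
    cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
    cudaEventRecord(ev, main_stream);          // signals: input data finished on the main device

    cudaSetDevice(other_device);
    cudaStreamWaitEvent(other_stream, ev, 0);  // other device's stream waits for the event

    cudaEventDestroy(ev);                      // destruction is deferred until the event completes
}
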
|
| 8168 |
|
|
|
|
| 8171 |
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
|
| 8172 |
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
| 8173 |
|
| 8174 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 8175 |
+
if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
|
| 8176 |
continue;
|
| 8177 |
}
|
| 8178 |
|
| 8179 |
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
| 8180 |
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
| 8181 |
+
const int64_t row_diff = dev[id].row_high - dev[id].row_low;
|
| 8182 |
|
| 8183 |
ggml_cuda_set_device(id);
|
| 8184 |
+
cudaStream_t stream = g_cudaStreams[id][is];
|
| 8185 |
|
| 8186 |
// wait for main GPU data if necessary
|
| 8187 |
if (split && (id != g_main_device || is != 0)) {
|
|
|
|
| 8195 |
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
| 8196 |
|
| 8197 |
// for split tensors the data begins at i0 == i0_offset_low
|
| 8198 |
+
char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
|
| 8199 |
+
float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
|
| 8200 |
+
char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
|
| 8201 |
+
float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
|
| 8202 |
|
| 8203 |
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
| 8204 |
// in that case an offset on dst_ddf_i is needed
|
| 8205 |
if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
|
| 8206 |
+
dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
|
| 8207 |
}
|
| 8208 |
|
| 8209 |
// copy src0, src1 to device if necessary
|
| 8210 |
if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
|
| 8211 |
if (id != g_main_device) {
|
| 8212 |
if (convert_src1_to_q8_1) {
|
| 8213 |
+
char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
|
| 8214 |
+
CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, g_main_device,
|
| 8215 |
+
src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
|
| 8216 |
} else {
|
| 8217 |
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
|
| 8218 |
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
| 8219 |
+
CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, g_main_device,
|
| 8220 |
+
src1_ncols*ne10*sizeof(float), stream));
|
| 8221 |
}
|
| 8222 |
}
|
| 8223 |
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
|
| 8224 |
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
|
| 8225 |
+
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
| 8226 |
} else {
|
| 8227 |
GGML_ASSERT(false);
|
| 8228 |
}
|
|
|
|
| 8233 |
}
|
| 8234 |
|
| 8235 |
if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
|
| 8236 |
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
|
| 8237 |
}
|
| 8238 |
|
| 8239 |
// do the computation
|
| 8240 |
op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
| 8241 |
+
dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
|
| 8242 |
CUDA_CHECK(cudaGetLastError());
|
| 8243 |
|
| 8244 |
// copy dst to host or other device if necessary
|
|
|
|
| 8262 |
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
|
| 8263 |
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
| 8264 |
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
| 8265 |
+
dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
|
| 8266 |
+
#if !defined(GGML_USE_HIPBLAS)
|
| 8267 |
+
if (kind == cudaMemcpyDeviceToDevice) {
|
| 8268 |
+
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
|
| 8269 |
+
cudaMemcpy3DPeerParms p = {};
|
| 8270 |
+
p.dstDevice = g_main_device;
|
| 8271 |
+
p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
|
| 8272 |
+
p.srcDevice = id;
|
| 8273 |
+
p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
|
| 8274 |
+
p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
|
| 8275 |
+
CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
|
| 8276 |
+
} else
|
| 8277 |
+
#endif
|
| 8278 |
+
{
|
| 8279 |
+
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
|
| 8280 |
+
dst_dd_i, row_diff*sizeof(float),
|
| 8281 |
+
row_diff*sizeof(float), src1_ncols,
|
| 8282 |
+
kind, stream));
|
| 8283 |
+
}
|
| 8284 |
} else {
|
| 8285 |
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
| 8286 |
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
|
|
|
| 8297 |
}
|
| 8298 |
}
|
| 8299 |
|
| 8300 |
// main device waits for all other devices to be finished
|
| 8301 |
if (split && g_device_count > 1) {
|
| 8302 |
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
|
| 8303 |
is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
|
| 8304 |
|
| 8305 |
+
ggml_cuda_set_device(g_main_device);
|
| 8306 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 8307 |
+
if (dev[id].row_low == dev[id].row_high) {
|
| 8308 |
continue;
|
| 8309 |
}
|
| 8310 |
for (int64_t is = 0; is < is_max; ++is) {
|
|
|
|
| 8314 |
}
|
| 8315 |
|
| 8316 |
if (dst->backend == GGML_BACKEND_CPU) {
|
| 8317 |
+
ggml_cuda_set_device(g_main_device);
|
| 8318 |
CUDA_CHECK(cudaDeviceSynchronize());
|
| 8319 |
}
|
| 8320 |
}
|
|
|
|
| 8424 |
|
| 8425 |
const int64_t ne12 = src1->ne[2];
|
| 8426 |
|
| 8427 |
+
ggml_cuda_set_device(g_main_device);
|
| 8428 |
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
| 8429 |
|
| 8430 |
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
|
|
|
| 8456 |
|
| 8457 |
const int64_t ne12 = src1->ne[2];
|
| 8458 |
|
| 8459 |
+
ggml_cuda_set_device(g_main_device);
|
| 8460 |
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
| 8461 |
|
| 8462 |
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
|
|
|
| 8493 |
int64_t i03 = i13 / r3;
|
| 8494 |
int64_t i02 = i12 / r2;
|
| 8495 |
|
| 8496 |
+
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
| 8497 |
+
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
|
| 8498 |
+
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
| 8499 |
}
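
Only the body of k_compute_batched_ptrs is visible in the hunk above; based on the launch further down (one thread per (i12, i13) pair in a dim3(ne13, ne12) block), the full kernel plausibly looks like the following sketch. The parameter list is inferred, not copied from the diff.

// Plausible reconstruction of k_compute_batched_ptrs (lives inside ggml-cuda.cu, so half,
// blockIdx, etc. are available). It fills per-batch pointer arrays for cublasGemmBatchedEx.
static __global__ void k_compute_batched_ptrs(
        const half * src0_as_f16, const half * src1_as_f16, char * dst,
        const void ** ptrs_src, void ** ptrs_dst,
        int64_t ne12, int64_t ne13, int64_t ne23,
        size_t nb02, size_t nb03, size_t nb12, size_t nb13,
        size_t nbd2, size_t nbd3,
        int64_t r2, int64_t r3) {
    // one thread per (i12, i13) batch index
    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;

    if (i13 >= ne13 || i12 >= ne12) {
        return;
    }

    // broadcast: r2/r3 src1 batches share a single src0 batch
    int64_t i03 = i13 / r3;
    int64_t i02 = i12 / r2;

    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *) dst         + i12*nbd2 + i13*nbd3;
}
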
|
| 8500 |
|
| 8501 |
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
|
|
| 8504 |
|
| 8505 |
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
| 8506 |
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
| 8507 |
|
| 8508 |
+
GGML_TENSOR_BINARY_OP_LOCALS
|
| 8509 |
|
| 8510 |
+
const int64_t ne_dst = ggml_nelements(dst);
|
|
|
|
| 8511 |
|
| 8512 |
+
ggml_cuda_set_device(g_main_device);
|
| 8513 |
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
| 8514 |
|
| 8515 |
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
|
| 8516 |
|
| 8517 |
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
| 8518 |
void * src0_ddq = src0_extra->data_device[g_main_device];
|
| 8519 |
+
half * src0_f16 = (half *) src0_ddq;
|
| 8520 |
|
| 8521 |
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
| 8522 |
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
|
|
|
| 8525 |
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
| 8526 |
|
| 8527 |
// convert src1 to fp16
|
| 8528 |
+
cuda_pool_alloc<half> src1_f16_alloc;
|
| 8529 |
+
if (src1->type != GGML_TYPE_F16) {
|
| 8530 |
+
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
| 8531 |
+
const int64_t ne_src1 = ggml_nelements(src1);
|
| 8532 |
+
src1_f16_alloc.alloc(ne_src1);
|
| 8533 |
+
GGML_ASSERT(to_fp16_cuda != nullptr);
|
| 8534 |
+
to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
|
| 8535 |
+
}
|
| 8536 |
+
half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
|
| 8537 |
|
| 8538 |
+
cuda_pool_alloc<half> dst_f16;
|
| 8539 |
+
char * dst_t;
|
| 8540 |
|
| 8541 |
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
| 8542 |
cudaDataType_t cu_data_type = CUDA_R_16F;
|
|
|
|
| 8555 |
const void * beta = &beta_f16;
|
| 8556 |
|
| 8557 |
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
| 8558 |
+
dst_t = (char *) dst_f16.alloc(ne_dst);
|
|
|
|
| 8559 |
|
| 8560 |
nbd2 /= sizeof(float) / sizeof(half);
|
| 8561 |
nbd3 /= sizeof(float) / sizeof(half);
|
|
|
|
| 8602 |
CUBLAS_CHECK(
|
| 8603 |
cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
| 8604 |
ne01, ne11, ne10,
|
| 8605 |
+
alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
|
| 8606 |
+
(const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
|
| 8607 |
+
beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC
|
| 8608 |
ne12*ne13,
|
| 8609 |
cu_compute_type,
|
| 8610 |
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
|
|
|
| 8612 |
// use cublasGemmBatchedEx
|
| 8613 |
const int ne23 = ne12*ne13;
|
| 8614 |
|
| 8615 |
+
cuda_pool_alloc<const void *> ptrs_src(2*ne23);
|
| 8616 |
+
cuda_pool_alloc< void *> ptrs_dst(1*ne23);
|
| 8617 |
|
| 8618 |
dim3 block_dims(ne13, ne12);
|
| 8619 |
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
|
| 8620 |
+
src0_f16, src1_f16, dst_t,
|
| 8621 |
+
ptrs_src.get(), ptrs_dst.get(),
|
| 8622 |
ne12, ne13,
|
| 8623 |
ne23,
|
| 8624 |
nb02, nb03,
|
| 8625 |
+
src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
|
| 8626 |
+
src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
|
| 8627 |
nbd2, nbd3,
|
| 8628 |
r2, r3);
|
| 8629 |
CUDA_CHECK(cudaGetLastError());
|
|
|
|
| 8631 |
CUBLAS_CHECK(
|
| 8632 |
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
| 8633 |
ne01, ne11, ne10,
|
| 8634 |
+
alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
|
| 8635 |
+
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
|
| 8636 |
+
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
|
| 8637 |
ne23,
|
| 8638 |
cu_compute_type,
|
| 8639 |
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
| 8640 |
}
|
| 8641 |
#endif
|
| 8642 |
|
| 8643 |
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
| 8644 |
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
| 8645 |
+
to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
|
| 8646 |
}
|
| 8647 |
}
|
| 8648 |
|
| 8649 |
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
|
|
| 8655 |
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
| 8656 |
|
| 8657 |
int64_t min_compute_capability = INT_MAX;
|
| 8658 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 8659 |
+
if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
| 8660 |
+
min_compute_capability = g_device_caps[id].cc;
|
| 8661 |
}
|
| 8662 |
}
|
| 8663 |
|
|
|
|
| 8681 |
} else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
| 8682 |
// KQV single-batch
|
| 8683 |
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
| 8684 |
+
} else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
| 8685 |
// KQ + KQV multi-batch
|
| 8686 |
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
| 8687 |
} else if (src0->type == GGML_TYPE_F32) {
|
| 8688 |
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
| 8689 |
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
| 8690 |
+
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
|
| 8691 |
#ifdef GGML_CUDA_FORCE_DMMV
|
| 8692 |
const bool use_mul_mat_vec_q = false;
|
| 8693 |
#else
|
|
|
|
| 8798 |
const int64_t ne1 = ggml_nelements(src1);
|
| 8799 |
const int64_t ne = ggml_nelements(dst);
|
| 8800 |
|
| 8801 |
+
ggml_cuda_set_device(g_main_device);
|
| 8802 |
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
| 8803 |
|
| 8804 |
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
|
|
|
|
| 8916 |
|
| 8917 |
std::vector<char> ids_host(ggml_nbytes(ids));
|
| 8918 |
|
| 8919 |
+
cudaStream_t stream = g_cudaStreams[g_main_device][0];
|
| 8920 |
|
| 8921 |
if (ids->backend == GGML_BACKEND_GPU) {
|
| 8922 |
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
|
|
|
|
| 8970 |
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
|
| 8971 |
}
|
| 8972 |
} else {
|
| 8973 |
+
cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
|
| 8974 |
+
cuda_pool_alloc<char> dst_contiguous(sizeof(float)*ggml_nelements(dst));
|
|
|
|
| 8975 |
|
| 8976 |
+
src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
|
| 8977 |
+
dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
|
| 8978 |
|
| 8979 |
const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
|
| 8980 |
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
|
|
|
|
| 8994 |
|
| 8995 |
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
| 8996 |
|
| 8997 |
+
CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
|
| 8998 |
nb11, src1_kind, stream));
|
| 8999 |
num_src1_rows++;
|
| 9000 |
}
|
|
|
|
| 9026 |
|
| 9027 |
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
| 9028 |
|
| 9029 |
+
CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
|
| 9030 |
nb1, dst_kind, stream));
|
| 9031 |
num_src1_rows++;
|
| 9032 |
}
|
| 9033 |
}
|
| 9034 |
}
|
| 9035 |
|
| 9036 |
if (dst->backend == GGML_BACKEND_CPU) {
|
|
|
|
| 9072 |
const int64_t nb11 = src1->nb[1];
|
| 9073 |
const int64_t nb12 = src1->nb[2];
|
| 9074 |
|
| 9075 |
+
ggml_cuda_set_device(g_main_device);
|
| 9076 |
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
| 9077 |
|
| 9078 |
const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
|
|
|
| 9162 |
ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
| 9163 |
memset(extra, 0, sizeof(*extra));
|
| 9164 |
|
| 9165 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 9166 |
if (backend == GGML_BACKEND_GPU && id != g_main_device) {
|
| 9167 |
continue;
|
| 9168 |
}
|
|
|
|
| 9233 |
|
| 9234 |
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
| 9235 |
|
| 9236 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 9237 |
+
ggml_cuda_set_device(id);
|
| 9238 |
if (extra->data_device[id] != nullptr) {
|
|
|
|
| 9239 |
CUDA_CHECK(cudaFree(extra->data_device[id]));
|
| 9240 |
}
|
| 9241 |
|
| 9242 |
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
| 9243 |
if (extra->events[id][is] != nullptr) {
|
|
|
|
| 9244 |
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
|
| 9245 |
}
|
| 9246 |
}
|
|
|
|
| 9294 |
force_inplace;
|
| 9295 |
const size_t size = ggml_nbytes(tensor);
|
| 9296 |
|
| 9297 |
+
ggml_cuda_set_device(g_main_device);
|
| 9298 |
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
| 9299 |
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
| 9300 |
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
|
|
|
| 9371 |
GGML_ASSERT(ggml_is_contiguous(tensor));
|
| 9372 |
|
| 9373 |
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
| 9374 |
+
ggml_cuda_set_device(g_main_device);
|
| 9375 |
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
|
| 9376 |
}
|
| 9377 |
|
|
|
|
| 9795 |
// host buffer type
|
| 9796 |
|
| 9797 |
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 9798 |
+
ggml_cuda_host_free(buffer->context);
|
| 9799 |
}
|
| 9800 |
|
| 9801 |
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 9802 |
+
void * ptr = ggml_cuda_host_malloc(size);
|
| 9803 |
+
|
| 9804 |
+
if (ptr == nullptr) {
|
| 9805 |
+
// fallback to cpu buffer
|
| 9806 |
+
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
| 9807 |
+
}
|
| 9808 |
|
| 9809 |
// FIXME: this is a hack to avoid having to implement a new buffer type
|
| 9810 |
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
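The host-buffer changes above free pinned memory with ggml_cuda_host_free and fall back to a plain CPU buffer when ggml_cuda_host_malloc fails. From the caller's side, usage would look roughly like this sketch, assuming the public ggml_backend_cuda_host_buffer_type() entry point declared in ggml-cuda.h:

// Sketch: allocate a staging buffer through the pinned-host buffer type.
// If pinned allocation fails inside the backend, a normal CPU buffer is returned instead.
#include "ggml-backend.h"
#include "ggml-cuda.h"

static ggml_backend_buffer_t alloc_host_staging(size_t size) {
    return ggml_backend_buft_alloc_buffer(ggml_backend_cuda_host_buffer_type(), size);
}
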
ggml-quants.c
CHANGED
|
@@ -407,6 +407,18 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
|
| 407 |
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
| 408 |
|
| 409 |
#endif
|
| 410 |
#endif
|
| 411 |
|
| 412 |
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
|
|
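The twelve lines added by the hunk above are not shown here, but given that every #if defined(__ARM_FEATURE_DOTPROD)/#else fallback in the vec_dot kernels below is removed, the addition is very likely a vdotq_s32 compatibility shim for ARM targets without the dot-product extension, roughly along these lines (a sketch, not the verbatim patch):

// Inside the __ARM_NEON section of ggml-quants.c: emulate SDOT with widening multiplies
// and pairwise adds so the kernels can call vdotq_s32 unconditionally.
#if !defined(__ARM_FEATURE_DOTPROD)
inline static int32x4_t vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));

    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
}
#endif
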
@@ -2468,32 +2480,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
|
|
| 2468 |
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
| 2469 |
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
| 2470 |
|
| 2471 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 2472 |
// dot product into int32x4_t
|
| 2473 |
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
|
| 2474 |
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
|
| 2475 |
|
| 2476 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 2477 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 2478 |
-
#else
|
| 2479 |
-
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
|
| 2480 |
-
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
|
| 2481 |
-
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h));
|
| 2482 |
-
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
|
| 2483 |
-
|
| 2484 |
-
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l));
|
| 2485 |
-
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l));
|
| 2486 |
-
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h));
|
| 2487 |
-
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h));
|
| 2488 |
-
|
| 2489 |
-
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
|
| 2490 |
-
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
|
| 2491 |
-
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
| 2492 |
-
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
| 2493 |
-
|
| 2494 |
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 2495 |
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 2496 |
-
#endif
|
| 2497 |
}
|
| 2498 |
|
| 2499 |
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
|
@@ -2776,32 +2768,12 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
|
|
| 2776 |
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
| 2777 |
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
| 2778 |
|
| 2779 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 2780 |
// dot product into int32x4_t
|
| 2781 |
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
| 2782 |
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
| 2783 |
|
| 2784 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
| 2785 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
| 2786 |
-
#else
|
| 2787 |
-
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
|
| 2788 |
-
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
|
| 2789 |
-
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h));
|
| 2790 |
-
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h));
|
| 2791 |
-
|
| 2792 |
-
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l));
|
| 2793 |
-
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l));
|
| 2794 |
-
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h));
|
| 2795 |
-
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h));
|
| 2796 |
-
|
| 2797 |
-
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
|
| 2798 |
-
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
|
| 2799 |
-
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
| 2800 |
-
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
| 2801 |
-
|
| 2802 |
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
| 2803 |
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
| 2804 |
-
#endif
|
| 2805 |
}
|
| 2806 |
|
| 2807 |
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
|
|
@@ -2963,32 +2935,12 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
|
|
| 2963 |
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
| 2964 |
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
| 2965 |
|
| 2966 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 2967 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
| 2968 |
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
| 2969 |
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 2970 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
| 2971 |
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
| 2972 |
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 2973 |
-
#else
|
| 2974 |
-
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
|
| 2975 |
-
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
|
| 2976 |
-
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
|
| 2977 |
-
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
|
| 2978 |
-
|
| 2979 |
-
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
|
| 2980 |
-
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
|
| 2981 |
-
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
|
| 2982 |
-
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
|
| 2983 |
-
|
| 2984 |
-
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
|
| 2985 |
-
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
|
| 2986 |
-
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
| 2987 |
-
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
| 2988 |
-
|
| 2989 |
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 2990 |
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 2991 |
-
#endif
|
| 2992 |
}
|
| 2993 |
|
| 2994 |
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
|
@@ -3275,32 +3227,12 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
|
|
| 3275 |
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
| 3276 |
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
| 3277 |
|
| 3278 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 3279 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
| 3280 |
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
| 3281 |
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
| 3282 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
| 3283 |
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
| 3284 |
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
| 3285 |
-
#else
|
| 3286 |
-
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
|
| 3287 |
-
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
|
| 3288 |
-
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
|
| 3289 |
-
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
|
| 3290 |
-
|
| 3291 |
-
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
|
| 3292 |
-
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
|
| 3293 |
-
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
|
| 3294 |
-
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
|
| 3295 |
-
|
| 3296 |
-
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
|
| 3297 |
-
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
|
| 3298 |
-
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
| 3299 |
-
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
| 3300 |
-
|
| 3301 |
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
| 3302 |
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
| 3303 |
-
#endif
|
| 3304 |
}
|
| 3305 |
|
| 3306 |
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
|
|
@@ -3550,7 +3482,6 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
|
|
| 3550 |
const int8x16_t y1_0 = vld1q_s8(y1->qs);
|
| 3551 |
const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
|
| 3552 |
|
| 3553 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 3554 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
| 3555 |
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
| 3556 |
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
|
@@ -3558,26 +3489,6 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
|
|
| 3558 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
| 3559 |
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
| 3560 |
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 3561 |
-
|
| 3562 |
-
#else
|
| 3563 |
-
const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
|
| 3564 |
-
const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
|
| 3565 |
-
const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1));
|
| 3566 |
-
const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
|
| 3567 |
-
|
| 3568 |
-
const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0));
|
| 3569 |
-
const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
|
| 3570 |
-
const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1));
|
| 3571 |
-
const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
|
| 3572 |
-
|
| 3573 |
-
const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
|
| 3574 |
-
const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
|
| 3575 |
-
const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
|
| 3576 |
-
const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
|
| 3577 |
-
|
| 3578 |
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 3579 |
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 3580 |
-
#endif
|
| 3581 |
}
|
| 3582 |
|
| 3583 |
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
|
@@ -3650,12 +3561,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 3650 |
const int nb = n / QK_K;
|
| 3651 |
|
| 3652 |
#ifdef __ARM_NEON
|
| 3653 |
-
|
| 3654 |
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
| 3655 |
const uint8x16_t m4 = vdupq_n_u8(0xF);
|
| 3656 |
-
|
| 3657 |
-
const int32x4_t
|
| 3658 |
-
#endif
|
| 3659 |
|
| 3660 |
ggml_int8x16x2_t q2bytes;
|
| 3661 |
uint8_t aux[16];
|
|
@@ -3663,7 +3572,6 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 3663 |
float sum = 0;
|
| 3664 |
|
| 3665 |
for (int i = 0; i < nb; ++i) {
|
| 3666 |
-
|
| 3667 |
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
| 3668 |
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
| 3669 |
|
|
@@ -3689,20 +3597,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 3689 |
|
| 3690 |
// We use this macro instead of a function call because for some reason
|
| 3691 |
// the code runs 2-3% slower, even if the function is declared inline
|
| 3692 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 3693 |
#define MULTIPLY_ACCUM_WITH_SCALE(index)\
|
| 3694 |
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
|
| 3695 |
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
|
| 3696 |
-
#else
|
| 3697 |
-
#define MULTIPLY_ACCUM_WITH_SCALE(index)\
|
| 3698 |
-
{\
|
| 3699 |
-
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])),\
|
| 3700 |
-
vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));\
|
| 3701 |
-
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])),\
|
| 3702 |
-
vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));\
|
| 3703 |
-
isum += vaddvq_s16(p1) * aux[is+(index)] + vaddvq_s16(p2) * aux[is+1+(index)];\
|
| 3704 |
-
}
|
| 3705 |
-
#endif
|
| 3706 |
|
| 3707 |
#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
|
| 3708 |
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
|
|
@@ -3710,26 +3607,23 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 3710 |
q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
|
| 3711 |
MULTIPLY_ACCUM_WITH_SCALE((index));
|
| 3712 |
|
| 3713 |
-
|
| 3714 |
for (int j = 0; j < QK_K/128; ++j) {
|
| 3715 |
-
|
| 3716 |
const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
|
| 3717 |
|
| 3718 |
ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
|
| 3719 |
q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
|
| 3720 |
q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
|
|
|
|
| 3721 |
MULTIPLY_ACCUM_WITH_SCALE(0);
|
| 3722 |
|
| 3723 |
SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2);
|
| 3724 |
-
|
| 3725 |
SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4);
|
| 3726 |
-
|
| 3727 |
SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6);
|
| 3728 |
|
| 3729 |
is += 8;
|
| 3730 |
}
|
| 3731 |
-
sum += d * isum;
|
| 3732 |
|
|
|
|
| 3733 |
}
|
| 3734 |
|
| 3735 |
*s = sum;
|
|
@@ -4043,11 +3937,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 4043 |
const int nb = n / QK_K;
|
| 4044 |
|
| 4045 |
#ifdef __ARM_NEON
|
| 4046 |
-
|
| 4047 |
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
| 4048 |
-
|
| 4049 |
-
const int32x4_t
|
| 4050 |
-
#endif
|
| 4051 |
|
| 4052 |
ggml_int8x16x4_t q2bytes;
|
| 4053 |
|
|
@@ -4081,28 +3973,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 4081 |
q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
|
| 4082 |
q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));
|
| 4083 |
|
| 4084 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 4085 |
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
|
| 4086 |
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
|
| 4087 |
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
|
| 4088 |
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
|
| 4089 |
-
#else
|
| 4090 |
-
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
| 4091 |
-
vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
| 4092 |
-
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
| 4093 |
-
vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
| 4094 |
-
isum1 += vaddvq_s16(p1) * scales[0];
|
| 4095 |
-
isum2 += vaddvq_s16(p2) * scales[1];
|
| 4096 |
-
|
| 4097 |
-
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
| 4098 |
-
vmull_s8(vget_high_s8(q2bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
| 4099 |
-
const int16x8_t p4 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
| 4100 |
-
vmull_s8(vget_high_s8(q2bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
| 4101 |
-
isum1 += vaddvq_s16(p3) * scales[2];
|
| 4102 |
-
isum2 += vaddvq_s16(p4) * scales[3];
|
| 4103 |
-
#endif
|
| 4104 |
-
sum += d * (isum1 + isum2);
|
| 4105 |
|
|
|
|
| 4106 |
}
|
| 4107 |
|
| 4108 |
*s = sum;
|
|
@@ -4328,9 +4204,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 4328 |
uint32_t utmp[4];
|
| 4329 |
|
| 4330 |
const uint8x16_t m3b = vdupq_n_u8(0x3);
|
| 4331 |
-
#ifdef __ARM_FEATURE_DOTPROD
|
| 4332 |
const int32x4_t vzero = vdupq_n_s32(0);
|
| 4333 |
-
#endif
|
| 4334 |
|
| 4335 |
const uint8x16_t m0 = vdupq_n_u8(1);
|
| 4336 |
const uint8x16_t m1 = vshlq_n_u8(m0, 1);
|
|
@@ -4382,22 +4256,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 4382 |
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
|
| 4383 |
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
|
| 4384 |
|
| 4385 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 4386 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
|
| 4387 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
|
| 4388 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
|
| 4389 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
|
| 4390 |
-
|
| 4391 |
-
int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_1.val[0])),
|
| 4392 |
-
vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_1.val[0])));
|
| 4393 |
-
int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_1.val[1])),
|
| 4394 |
-
vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_1.val[1])));
|
| 4395 |
-
int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_1.val[2])),
|
| 4396 |
-
vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_1.val[2])));
|
| 4397 |
-
int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_1.val[3])),
|
| 4398 |
-
vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_1.val[3])));
|
| 4399 |
-
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
|
| 4400 |
-
#endif
|
| 4401 |
scale += 4;
|
| 4402 |
|
| 4403 |
q3h.val[0] = vbicq_u8(m2, qhbits.val[0]);
|
|
@@ -4410,22 +4273,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 4410 |
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
|
| 4411 |
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
|
| 4412 |
|
| 4413 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 4414 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
|
| 4415 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
|
| 4416 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
|
| 4417 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
|
| 4418 |
-
|
| 4419 |
-
p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_2.val[0])),
|
| 4420 |
-
vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_2.val[0])));
|
| 4421 |
-
p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_2.val[1])),
|
| 4422 |
-
vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_2.val[1])));
|
| 4423 |
-
p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_2.val[2])),
|
| 4424 |
-
vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_2.val[2])));
|
| 4425 |
-
p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_2.val[3])),
|
| 4426 |
-
vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_2.val[3])));
|
| 4427 |
-
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
|
| 4428 |
-
#endif
|
| 4429 |
scale += 4;
|
| 4430 |
|
| 4431 |
if (j == 0) {
|
|
@@ -4864,10 +4716,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 4864 |
const int nb = n / QK_K;
|
| 4865 |
|
| 4866 |
#ifdef __ARM_NEON
|
| 4867 |
-
|
| 4868 |
-
#ifdef __ARM_FEATURE_DOTPROD
|
| 4869 |
-
const int32x4_t vzero = vdupq_n_s32(0);
|
| 4870 |
-
#endif
|
| 4871 |
|
| 4872 |
const uint8x16_t m3b = vdupq_n_u8(0x3);
|
| 4873 |
const uint8x16_t mh = vdupq_n_u8(4);
|
|
@@ -4908,22 +4757,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 4908 |
q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
|
| 4909 |
q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));
|
| 4910 |
|
| 4911 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 4912 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
|
| 4913 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
|
| 4914 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
|
| 4915 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
|
| 4916 |
-
#else
|
| 4917 |
-
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
| 4918 |
-
vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
| 4919 |
-
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
| 4920 |
-
vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
| 4921 |
-
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
| 4922 |
-
vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
| 4923 |
-
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
| 4924 |
-
vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
| 4925 |
-
isum += vaddvq_s16(p0) * scales[0] + vaddvq_s16(p1) * scales[2] + vaddvq_s16(p2) * scales[1] + vaddvq_s16(p3) * scales[3];
|
| 4926 |
-
#endif
|
| 4927 |
|
| 4928 |
sum += d * isum;
|
| 4929 |
|
|
@@ -5228,11 +5065,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 5228 |
uint32_t utmp[4];
|
| 5229 |
|
| 5230 |
#ifdef __ARM_NEON
|
| 5231 |
-
|
| 5232 |
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
| 5233 |
-
#ifdef __ARM_FEATURE_DOTPROD
|
| 5234 |
const int32x4_t mzero = vdupq_n_s32(0);
|
| 5235 |
-
#endif
|
| 5236 |
|
| 5237 |
ggml_int8x16x2_t q4bytes;
|
| 5238 |
ggml_int8x16x2_t q8bytes;
|
|
@@ -5269,10 +5103,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 5269 |
int32_t sumi2 = 0;
|
| 5270 |
|
| 5271 |
for (int j = 0; j < QK_K/64; ++j) {
|
| 5272 |
-
|
| 5273 |
const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
|
| 5274 |
|
| 5275 |
-
#ifdef __ARM_FEATURE_DOTPROD
|
| 5276 |
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
|
| 5277 |
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
| 5278 |
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
|
@@ -5287,26 +5119,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 5287 |
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
| 5288 |
|
| 5289 |
sumi2 += vaddvq_s32(p2) * scales[2*j+1];
|
| 5290 |
-
#else
|
| 5291 |
-
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
|
| 5292 |
-
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
| 5293 |
-
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
| 5294 |
-
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
| 5295 |
-
vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
| 5296 |
-
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
| 5297 |
-
vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
| 5298 |
-
sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0];
|
| 5299 |
-
|
| 5300 |
-
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
|
| 5301 |
-
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
|
| 5302 |
-
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
|
| 5303 |
-
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
| 5304 |
-
vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
| 5305 |
-
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
| 5306 |
-
vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
| 5307 |
-
sumi2 += vaddvq_s16(vaddq_s16(p2, p3)) * scales[2*j+1];
|
| 5308 |
-
|
| 5309 |
-
#endif
|
| 5310 |
}
|
| 5311 |
|
| 5312 |
sumf += d * (sumi1 + sumi2);
|
|
@@ -5603,12 +5415,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 5603 |
const int nb = n / QK_K;
|
| 5604 |
|
| 5605 |
#ifdef __ARM_NEON
|
| 5606 |
-
|
| 5607 |
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
| 5608 |
|
| 5609 |
-
#ifdef __ARM_FEATURE_DOTPROD
|
| 5610 |
const int32x4_t mzero = vdupq_n_s32(0);
|
| 5611 |
-
#endif
|
| 5612 |
|
| 5613 |
float sumf = 0;
|
| 5614 |
|
|
@@ -5636,7 +5445,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 5636 |
|
| 5637 |
const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
|
| 5638 |
|
| 5639 |
-
#ifdef __ARM_FEATURE_DOTPROD
|
| 5640 |
q8bytes = ggml_vld1q_s8_x4(q8);
|
| 5641 |
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
| 5642 |
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
|
@@ -5650,27 +5458,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 5650 |
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
|
| 5651 |
const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
|
| 5652 |
|
| 5653 |
-
#else
|
| 5654 |
-
q8bytes = ggml_vld1q_s8_x4(q8);
|
| 5655 |
-
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
| 5656 |
-
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
| 5657 |
-
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
| 5658 |
-
vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
| 5659 |
-
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
| 5660 |
-
vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
| 5661 |
-
int32_t sumi1 = vaddvq_s16(vaddq_s16(p0, p1)) * scales[0];
|
| 5662 |
-
|
| 5663 |
-
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
|
| 5664 |
-
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
|
| 5665 |
-
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[2])),
|
| 5666 |
-
vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[2])));
|
| 5667 |
-
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[3])),
|
| 5668 |
-
vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[3])));
|
| 5669 |
-
int32_t sumi2 = vaddvq_s16(vaddq_s16(p2, p3)) * scales[1];
|
| 5670 |
-
|
| 5671 |
-
#endif
|
| 5672 |
sumf += d * (sumi1 + sumi2);
|
| 5673 |
-
|
| 5674 |
}
|
| 5675 |
|
| 5676 |
*s = sumf - sum_mins;
|
|
@@ -5875,15 +5663,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 5875 |
|
| 5876 |
uint32_t utmp[4];
|
| 5877 |
|
| 5878 |
-
|
| 5879 |
#ifdef __ARM_NEON
|
| 5880 |
-
|
| 5881 |
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
| 5882 |
const uint8x16_t mone = vdupq_n_u8(1);
|
| 5883 |
const uint8x16_t mtwo = vdupq_n_u8(2);
|
| 5884 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 5885 |
const int32x4_t mzero = vdupq_n_s32(0);
|
| 5886 |
-
#endif
|
| 5887 |
|
| 5888 |
ggml_int8x16x4_t q5bytes;
|
| 5889 |
|
|
@@ -5938,28 +5722,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 5938 |
q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
|
| 5939 |
q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
|
| 5940 |
|
| 5941 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 5942 |
-
|
| 5943 |
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
|
| 5944 |
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
|
| 5945 |
-
#else
|
| 5946 |
-
|
| 5947 |
-
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
| 5948 |
-
vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
| 5949 |
-
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
| 5950 |
-
vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
| 5951 |
-
sumi += vaddvq_s16(vaddq_s16(p0, p1)) * *scales++;
|
| 5952 |
-
|
| 5953 |
-
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
| 5954 |
-
vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
| 5955 |
-
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
| 5956 |
-
vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
| 5957 |
-
sumi += vaddvq_s16(vaddq_s16(p2, p3)) * *scales++;
|
| 5958 |
-
#endif
|
| 5959 |
}
|
| 5960 |
|
| 5961 |
sumf += d * sumi - dmin * sumi_mins;
|
| 5962 |
-
|
| 5963 |
}
|
| 5964 |
|
| 5965 |
*s = sumf;
|
|
@@ -6311,12 +6078,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 6311 |
const int nb = n / QK_K;
|
| 6312 |
|
| 6313 |
#ifdef __ARM_NEON
|
| 6314 |
-
|
| 6315 |
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
| 6316 |
const uint8x16_t mh = vdupq_n_u8(16);
|
| 6317 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 6318 |
const int32x4_t mzero = vdupq_n_s32(0);
|
| 6319 |
-
#endif
|
| 6320 |
|
| 6321 |
ggml_int8x16x4_t q5bytes;
|
| 6322 |
ggml_uint8x16x4_t q5h;
|
|
@@ -6348,32 +6112,12 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 6348 |
q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
|
| 6349 |
q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));
|
| 6350 |
|
| 6351 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 6352 |
-
|
| 6353 |
int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
|
| 6354 |
int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
|
| 6355 |
int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
|
| 6356 |
int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
|
| 6357 |
|
| 6358 |
sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
|
| 6359 |
-
|
| 6360 |
-
#else
|
| 6361 |
-
|
| 6362 |
-
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
| 6363 |
-
vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
| 6364 |
-
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
| 6365 |
-
vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
| 6366 |
-
int32_t sumi = sc[0] * vaddvq_s16(p0) + sc[1] * vaddvq_s16(p1);
|
| 6367 |
-
|
| 6368 |
-
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
| 6369 |
-
vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
| 6370 |
-
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
| 6371 |
-
vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
| 6372 |
-
sumi += sc[2] * vaddvq_s16(p2) + sc[3] * vaddvq_s16(p3);
|
| 6373 |
-
|
| 6374 |
-
sumf += d*sumi;
|
| 6375 |
-
#endif
|
| 6376 |
-
|
| 6377 |
}
|
| 6378 |
|
| 6379 |
*s = sumf;
|
|
@@ -6600,13 +6344,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 6600 |
const int nb = n / QK_K;
|
| 6601 |
|
| 6602 |
#ifdef __ARM_NEON
|
| 6603 |
-
|
| 6604 |
float sum = 0;
|
| 6605 |
|
| 6606 |
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
| 6607 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 6608 |
const int32x4_t vzero = vdupq_n_s32(0);
|
| 6609 |
-
#endif
|
| 6610 |
//const int8x16_t m32s = vdupq_n_s8(32);
|
| 6611 |
|
| 6612 |
const uint8x16_t mone = vdupq_n_u8(3);
|
|
@@ -6658,30 +6399,12 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 6658 |
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
|
| 6659 |
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
|
| 6660 |
|
| 6661 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 6662 |
-
|
| 6663 |
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
| 6664 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
| 6665 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
| 6666 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
| 6667 |
-
scale += 4;
|
| 6668 |
|
| 6669 |
-
|
| 6670 |
-
|
| 6671 |
-
int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
| 6672 |
-
vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
| 6673 |
-
int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
| 6674 |
-
vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
| 6675 |
-
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
|
| 6676 |
-
scale += 2;
|
| 6677 |
-
|
| 6678 |
-
int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
| 6679 |
-
vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
| 6680 |
-
int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
| 6681 |
-
vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
| 6682 |
-
isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1];
|
| 6683 |
-
scale += 2;
|
| 6684 |
-
#endif
|
| 6685 |
|
| 6686 |
q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
|
| 6687 |
|
|
@@ -6703,34 +6426,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 6703 |
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
|
| 6704 |
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
|
| 6705 |
|
| 6706 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 6707 |
-
|
| 6708 |
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
| 6709 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
| 6710 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
| 6711 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
| 6712 |
scale += 4;
|
| 6713 |
-
|
| 6714 |
-
//for (int l = 0; l < 4; ++l) {
|
| 6715 |
-
// const int32x4_t p = vdotq_s32(vzero, q6bytes.val[l], q8bytes.val[l]);
|
| 6716 |
-
// isum += vaddvq_s32(p) * *scale++;
|
| 6717 |
-
//}
|
| 6718 |
-
#else
|
| 6719 |
-
p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
| 6720 |
-
vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
| 6721 |
-
p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
| 6722 |
-
vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
| 6723 |
-
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
|
| 6724 |
-
scale += 2;
|
| 6725 |
-
|
| 6726 |
-
p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
| 6727 |
-
vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
| 6728 |
-
p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
| 6729 |
-
vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
| 6730 |
-
isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1];
|
| 6731 |
-
scale += 2;
|
| 6732 |
-
#endif
|
| 6733 |
-
|
| 6734 |
}
|
| 6735 |
//sum += isum * d_all * y[i].d;
|
| 6736 |
sum += d_all * y[i].d * (isum - 32 * isum_mins);
|
|
@@ -7076,14 +6776,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 7076 |
const int nb = n / QK_K;
|
| 7077 |
|
| 7078 |
#ifdef __ARM_NEON
|
| 7079 |
-
|
| 7080 |
float sum = 0;
|
| 7081 |
|
| 7082 |
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
| 7083 |
const int8x16_t m32s = vdupq_n_s8(32);
|
| 7084 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 7085 |
const int32x4_t vzero = vdupq_n_s32(0);
|
| 7086 |
-
#endif
|
| 7087 |
|
| 7088 |
const uint8x16_t mone = vdupq_n_u8(3);
|
| 7089 |
|
|
@@ -7119,26 +6816,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|
| 7119 |
q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
|
| 7120 |
q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
|
| 7121 |
|
| 7122 |
-
#if defined(__ARM_FEATURE_DOTPROD)
|
| 7123 |
-
|
| 7124 |
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
| 7125 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
| 7126 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
| 7127 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
| 7128 |
-
#else
|
| 7129 |
-
|
| 7130 |
-
int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
| 7131 |
-
vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
| 7132 |
-
int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
| 7133 |
-
vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
| 7134 |
-
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
|
| 7135 |
-
|
| 7136 |
-
int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
| 7137 |
-
vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
| 7138 |
-
int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
| 7139 |
-
vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
| 7140 |
-
isum += vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
|
| 7141 |
-
#endif
|
| 7142 |
|
| 7143 |
sum += isum * d_all * y[i].d;
|
| 7144 |
|
|
|
|
| 407 |
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
| 408 |
|
| 409 |
#endif
|
| 410 |
+
|
| 411 |
+
#if !defined(__ARM_FEATURE_DOTPROD)
|
| 412 |
+
|
| 413 |
+
inline static int32x4_t vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
| 414 |
+
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
| 415 |
+
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
| 416 |
+
|
| 417 |
+
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
| 418 |
+
}
|
| 419 |
+
|
| 420 |
+
#endif
|
| 421 |
+
|
| 422 |
#endif
|
| 423 |
|
| 424 |
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
|
|
|
|
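The added #if !defined(__ARM_FEATURE_DOTPROD) block above defines a NEON fallback for vdotq_s32, which is what lets all of the per-kernel #else paths removed throughout ggml-quants.c in this diff go away: every k-quant kernel can now call vdotq_s32 unconditionally. As a minimal reference sketch (illustration only, not part of the patch): the kernels consume these results through a horizontal reduction, and for both the real SDOT path and the fallback that reduced value is the plain 16-element signed dot product added to the accumulator's lane sum.

#include <stdint.h>

// Scalar reference for the reduced value the kernels rely on:
// vaddvq_s32(vdotq_s32(acc, a, b)) == lane sum of acc + dot(a, b).
static int32_t vdotq_s32_reduced_ref(int32_t acc_lane_sum, const int8_t a[16], const int8_t b[16]) {
    int32_t sum = acc_lane_sum;
    for (int k = 0; k < 16; ++k) {
        sum += (int32_t) a[k] * (int32_t) b[k];  // widen to 32-bit before accumulating
    }
    return sum;
}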
| 2480 |
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
| 2481 |
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
| 2482 |
|
|
|
|
| 2483 |
// dot product into int32x4_t
|
| 2484 |
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
|
| 2485 |
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
|
| 2486 |
|
| 2487 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 2488 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 2489 |
}
|
| 2490 |
|
| 2491 |
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
|
|
|
| 2768 |
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
| 2769 |
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
| 2770 |
|
|
|
|
| 2771 |
// dot product into int32x4_t
|
| 2772 |
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
| 2773 |
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
| 2774 |
|
| 2775 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
| 2776 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
| 2777 |
}
|
| 2778 |
|
| 2779 |
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
|
|
|
|
| 2935 |
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
| 2936 |
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
| 2937 |
|
|
|
|
| 2938 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
| 2939 |
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
| 2940 |
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 2941 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
| 2942 |
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
| 2943 |
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 2944 |
}
|
| 2945 |
|
| 2946 |
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
|
|
|
| 3227 |
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
| 3228 |
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
| 3229 |
|
|
|
|
| 3230 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
| 3231 |
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
| 3232 |
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
| 3233 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
| 3234 |
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
| 3235 |
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
| 3236 |
}
|
| 3237 |
|
| 3238 |
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
|
|
|
|
| 3482 |
const int8x16_t y1_0 = vld1q_s8(y1->qs);
|
| 3483 |
const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
|
| 3484 |
|
|
|
|
| 3485 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
| 3486 |
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
| 3487 |
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
|
|
|
| 3489 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
| 3490 |
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
| 3491 |
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 3492 |
}
|
| 3493 |
|
| 3494 |
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
|
|
|
| 3561 |
const int nb = n / QK_K;
|
| 3562 |
|
| 3563 |
#ifdef __ARM_NEON
|
|
|
|
| 3564 |
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
| 3565 |
const uint8x16_t m4 = vdupq_n_u8(0xF);
|
| 3566 |
+
|
| 3567 |
+
const int32x4_t vzero = vdupq_n_s32(0);
|
|
|
|
| 3568 |
|
| 3569 |
ggml_int8x16x2_t q2bytes;
|
| 3570 |
uint8_t aux[16];
|
|
|
|
| 3572 |
float sum = 0;
|
| 3573 |
|
| 3574 |
for (int i = 0; i < nb; ++i) {
|
|
|
|
| 3575 |
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
| 3576 |
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
| 3577 |
|
|
|
|
| 3597 |
|
| 3598 |
// We use this macro instead of a function call because for some reason
|
| 3599 |
// the code runs 2-3% slower, even if the function is declared inline
|
|
|
|
| 3600 |
#define MULTIPLY_ACCUM_WITH_SCALE(index)\
|
| 3601 |
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
|
| 3602 |
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
|
| 3603 |
|
| 3604 |
#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
|
| 3605 |
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
|
|
|
|
| 3607 |
q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
|
| 3608 |
MULTIPLY_ACCUM_WITH_SCALE((index));
|
| 3609 |
|
|
|
|
| 3610 |
for (int j = 0; j < QK_K/128; ++j) {
|
|
|
|
| 3611 |
const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
|
| 3612 |
|
| 3613 |
ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
|
| 3614 |
q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
|
| 3615 |
q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
|
| 3616 |
+
|
| 3617 |
MULTIPLY_ACCUM_WITH_SCALE(0);
|
| 3618 |
|
| 3619 |
SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2);
|
|
|
|
| 3620 |
SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4);
|
|
|
|
| 3621 |
SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6);
|
| 3622 |
|
| 3623 |
is += 8;
|
| 3624 |
}
|
|
|
|
| 3625 |
|
| 3626 |
+
sum += d * isum;
|
| 3627 |
}
|
| 3628 |
|
| 3629 |
*s = sum;
|
|
|
|
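In the q2_K hunk above, MULTIPLY_ACCUM_WITH_SCALE stays a macro because, per the in-source comment, the function form measured 2-3% slower even when declared inline. For comparison only, a hypothetical function-style equivalent could look like the sketch below (the name, signature and the sc pointer are illustrative, not from the patch; it assumes AArch64 with the same vdotq_s32 definition the kernel uses, either SDOT or the fallback above):

#include <arm_neon.h>
#include <stdint.h>

// Hypothetical inline-function form of MULTIPLY_ACCUM_WITH_SCALE(index):
// two 16-byte dot products, each horizontally summed and weighted by a sub-block scale.
static inline int32_t multiply_accum_with_scale(int32x4_t vzero,
                                                const int8x16_t q2[2],
                                                const int8x16_t q8[2],
                                                const uint8_t * sc) {
    int32_t isum = 0;
    isum += vaddvq_s32(vdotq_s32(vzero, q2[0], q8[0])) * sc[0];  // aux[is + (index)]
    isum += vaddvq_s32(vdotq_s32(vzero, q2[1], q8[1])) * sc[1];  // aux[is + 1 + (index)]
    return isum;
}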
| 3937 |
const int nb = n / QK_K;
|
| 3938 |
|
| 3939 |
#ifdef __ARM_NEON
|
|
|
|
| 3940 |
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
| 3941 |
+
|
| 3942 |
+
const int32x4_t vzero = vdupq_n_s32(0);
|
|
|
|
| 3943 |
|
| 3944 |
ggml_int8x16x4_t q2bytes;
|
| 3945 |
|
|
|
|
| 3973 |
q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
|
| 3974 |
q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));
|
| 3975 |
|
|
|
|
| 3976 |
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
|
| 3977 |
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
|
| 3978 |
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
|
| 3979 |
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
|
| 3980 |
|
| 3981 |
+
sum += d * (isum1 + isum2);
|
| 3982 |
}
|
| 3983 |
|
| 3984 |
*s = sum;
|
|
|
|
| 4204 |
uint32_t utmp[4];
|
| 4205 |
|
| 4206 |
const uint8x16_t m3b = vdupq_n_u8(0x3);
|
|
|
|
| 4207 |
const int32x4_t vzero = vdupq_n_s32(0);
|
|
|
|
| 4208 |
|
| 4209 |
const uint8x16_t m0 = vdupq_n_u8(1);
|
| 4210 |
const uint8x16_t m1 = vshlq_n_u8(m0, 1);
|
|
|
|
| 4256 |
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
|
| 4257 |
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
|
| 4258 |
|
|
|
|
| 4259 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
|
| 4260 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
|
| 4261 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
|
| 4262 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
|
| 4263 |
+
|
| 4264 |
scale += 4;
|
| 4265 |
|
| 4266 |
q3h.val[0] = vbicq_u8(m2, qhbits.val[0]);
|
|
|
|
| 4273 |
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
|
| 4274 |
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
|
| 4275 |
|
|
|
|
| 4276 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
|
| 4277 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
|
| 4278 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
|
| 4279 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
|
| 4280 |
+
|
| 4281 |
scale += 4;
|
| 4282 |
|
| 4283 |
if (j == 0) {
|
|
|
|
| 4716 |
const int nb = n / QK_K;
|
| 4717 |
|
| 4718 |
#ifdef __ARM_NEON
|
| 4719 |
+
const int32x4_t vzero = vdupq_n_s32(0);
|
| 4720 |
|
| 4721 |
const uint8x16_t m3b = vdupq_n_u8(0x3);
|
| 4722 |
const uint8x16_t mh = vdupq_n_u8(4);
|
|
|
|
| 4757 |
q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
|
| 4758 |
q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));
|
| 4759 |
|
|
|
|
| 4760 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
|
| 4761 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
|
| 4762 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
|
| 4763 |
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
|
| 4764 |
|
| 4765 |
sum += d * isum;
|
| 4766 |
|
|
|
|
| 5065 |
uint32_t utmp[4];
|
| 5066 |
|
| 5067 |
#ifdef __ARM_NEON
|
|
|
|
| 5068 |
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
|
|
|
| 5069 |
const int32x4_t mzero = vdupq_n_s32(0);
|
|
|
|
| 5070 |
|
| 5071 |
ggml_int8x16x2_t q4bytes;
|
| 5072 |
ggml_int8x16x2_t q8bytes;
|
|
|
|
| 5103 |
int32_t sumi2 = 0;
|
| 5104 |
|
| 5105 |
for (int j = 0; j < QK_K/64; ++j) {
|
|
|
|
| 5106 |
const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
|
| 5107 |
|
|
|
|
| 5108 |
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
|
| 5109 |
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
| 5110 |
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
|
|
|
| 5119 |
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
| 5120 |
|
| 5121 |
sumi2 += vaddvq_s32(p2) * scales[2*j+1];
|
| 5122 |
}
|
| 5123 |
|
| 5124 |
sumf += d * (sumi1 + sumi2);
|
|
|
|
| 5415 |
const int nb = n / QK_K;
|
| 5416 |
|
| 5417 |
#ifdef __ARM_NEON
|
|
|
|
| 5418 |
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
| 5419 |
|
|
|
|
| 5420 |
const int32x4_t mzero = vdupq_n_s32(0);
|
|
|
|
| 5421 |
|
| 5422 |
float sumf = 0;
|
| 5423 |
|
|
|
|
| 5445 |
|
| 5446 |
const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
|
| 5447 |
|
|
|
|
| 5448 |
q8bytes = ggml_vld1q_s8_x4(q8);
|
| 5449 |
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
| 5450 |
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
|
|
|
| 5458 |
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
|
| 5459 |
const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
|
| 5460 |
|
| 5461 |
sumf += d * (sumi1 + sumi2);
|
|
|
|
| 5462 |
}
|
| 5463 |
|
| 5464 |
*s = sumf - sum_mins;
|
|
|
|
| 5663 |
|
| 5664 |
uint32_t utmp[4];
|
| 5665 |
|
|
|
|
| 5666 |
#ifdef __ARM_NEON
|
|
|
|
| 5667 |
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
| 5668 |
const uint8x16_t mone = vdupq_n_u8(1);
|
| 5669 |
const uint8x16_t mtwo = vdupq_n_u8(2);
|
|
|
|
| 5670 |
const int32x4_t mzero = vdupq_n_s32(0);
|
|
|
|
| 5671 |
|
| 5672 |
ggml_int8x16x4_t q5bytes;
|
| 5673 |
|
|
|
|
| 5722 |
q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
|
| 5723 |
q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
|
| 5724 |
|
|
|
|
|
|
|
| 5725 |
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
|
| 5726 |
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
|
| 5727 |
}
|
| 5728 |
|
| 5729 |
sumf += d * sumi - dmin * sumi_mins;
|
|
|
|
| 5730 |
}
|
| 5731 |
|
| 5732 |
*s = sumf;
|
|
|
|
| 6078 |
const int nb = n / QK_K;
|
| 6079 |
|
| 6080 |
#ifdef __ARM_NEON
|
|
|
|
| 6081 |
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
| 6082 |
const uint8x16_t mh = vdupq_n_u8(16);
|
|
|
|
| 6083 |
const int32x4_t mzero = vdupq_n_s32(0);
|
|
|
|
| 6084 |
|
| 6085 |
ggml_int8x16x4_t q5bytes;
|
| 6086 |
ggml_uint8x16x4_t q5h;
|
|
|
|
| 6112 |
q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
|
| 6113 |
q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));
|
| 6114 |
|
|
|
|
|
|
|
| 6115 |
int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
|
| 6116 |
int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
|
| 6117 |
int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
|
| 6118 |
int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
|
| 6119 |
|
| 6120 |
sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
|
| 6121 |
}
|
| 6122 |
|
| 6123 |
*s = sumf;
|
|
|
|
| 6344 |
const int nb = n / QK_K;
|
| 6345 |
|
| 6346 |
#ifdef __ARM_NEON
|
|
|
|
| 6347 |
float sum = 0;
|
| 6348 |
|
| 6349 |
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
|
|
|
| 6350 |
const int32x4_t vzero = vdupq_n_s32(0);
|
|
|
|
| 6351 |
//const int8x16_t m32s = vdupq_n_s8(32);
|
| 6352 |
|
| 6353 |
const uint8x16_t mone = vdupq_n_u8(3);
|
|
|
|
| 6399 |
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
|
| 6400 |
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
|
| 6401 |
|
|
|
|
|
|
|
| 6402 |
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
| 6403 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
| 6404 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
| 6405 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
|
|
|
| 6406 |
|
| 6407 |
+
scale += 4;
|
| 6408 |
|
| 6409 |
q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
|
| 6410 |
|
|
|
|
| 6426 |
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
|
| 6427 |
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
|
| 6428 |
|
|
|
|
|
|
|
| 6429 |
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
| 6430 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
| 6431 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
| 6432 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
| 6433 |
scale += 4;
|
| 6434 |
}
|
| 6435 |
//sum += isum * d_all * y[i].d;
|
| 6436 |
sum += d_all * y[i].d * (isum - 32 * isum_mins);
|
|
|
|
| 6776 |
const int nb = n / QK_K;
|
| 6777 |
|
| 6778 |
#ifdef __ARM_NEON
|
|
|
|
| 6779 |
float sum = 0;
|
| 6780 |
|
| 6781 |
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
| 6782 |
const int8x16_t m32s = vdupq_n_s8(32);
|
|
|
|
| 6783 |
const int32x4_t vzero = vdupq_n_s32(0);
|
|
|
|
| 6784 |
|
| 6785 |
const uint8x16_t mone = vdupq_n_u8(3);
|
| 6786 |
|
|
|
|
| 6816 |
q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
|
| 6817 |
q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
|
| 6818 |
|
|
|
|
|
|
|
| 6819 |
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
| 6820 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
| 6821 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
| 6822 |
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
| 6823 |
|
| 6824 |
sum += isum * d_all * y[i].d;
|
| 6825 |
|
ggml.c
CHANGED
|
@@ -4041,7 +4041,6 @@ static struct ggml_tensor * ggml_group_norm_impl(
|
|
| 4041 |
result->op = GGML_OP_GROUP_NORM;
|
| 4042 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
| 4043 |
result->src[0] = a;
|
| 4044 |
-
result->src[1] = NULL; // TODO: maybe store epsilon here?
|
| 4045 |
|
| 4046 |
return result;
|
| 4047 |
}
|
|
@@ -5541,7 +5540,6 @@ static struct ggml_tensor * ggml_upscale_impl(
|
|
| 5541 |
result->op_params[0] = scale_factor;
|
| 5542 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
| 5543 |
result->src[0] = a;
|
| 5544 |
-
result->src[1] = NULL;
|
| 5545 |
|
| 5546 |
return result;
|
| 5547 |
}
|
|
@@ -5846,7 +5844,6 @@ struct ggml_tensor * ggml_get_rel_pos(
|
|
| 5846 |
result->op = GGML_OP_GET_REL_POS;
|
| 5847 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
| 5848 |
result->src[0] = a;
|
| 5849 |
-
result->src[1] = NULL;
|
| 5850 |
|
| 5851 |
return result;
|
| 5852 |
}
|
|
@@ -9690,7 +9687,7 @@ static void ggml_compute_forward_mul_mat(
|
|
| 9690 |
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
| 9691 |
|
| 9692 |
assert(params->wsize >= ne11*ne12*ne13*row_size);
|
| 9693 |
-
|
| 9694 |
|
| 9695 |
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
| 9696 |
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
|
@@ -17456,9 +17453,9 @@ static void ggml_opt_acc_grad(int np, struct ggml_tensor * const ps[], float * g
|
|
| 17456 |
}
|
| 17457 |
|
| 17458 |
//
|
| 17459 |
-
//
|
| 17460 |
//
|
| 17461 |
-
//
|
| 17462 |
//
|
| 17463 |
|
| 17464 |
static enum ggml_opt_result ggml_opt_adam(
|
|
@@ -19351,7 +19348,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
|
|
| 19351 |
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
|
| 19352 |
}
|
| 19353 |
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
|
| 19354 |
-
free(data);
|
| 19355 |
} else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
|
| 19356 |
GGML_ASSERT(false && "nested arrays not supported");
|
| 19357 |
} else {
|
|
|
|
| 4041 |
result->op = GGML_OP_GROUP_NORM;
|
| 4042 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
| 4043 |
result->src[0] = a;
|
|
|
|
| 4044 |
|
| 4045 |
return result;
|
| 4046 |
}
|
|
|
|
| 5540 |
result->op_params[0] = scale_factor;
|
| 5541 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
| 5542 |
result->src[0] = a;
|
|
|
|
| 5543 |
|
| 5544 |
return result;
|
| 5545 |
}
|
|
|
|
| 5844 |
result->op = GGML_OP_GET_REL_POS;
|
| 5845 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
| 5846 |
result->src[0] = a;
|
|
|
|
| 5847 |
|
| 5848 |
return result;
|
| 5849 |
}
|
|
|
|
| 9687 |
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
| 9688 |
|
| 9689 |
assert(params->wsize >= ne11*ne12*ne13*row_size);
|
| 9690 |
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
| 9691 |
|
| 9692 |
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
| 9693 |
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
|
|
|
| 17453 |
}
|
| 17454 |
|
| 17455 |
//
|
| 17456 |
+
// Using AdamW - ref: https://arxiv.org/pdf/1711.05101v3.pdf
|
| 17457 |
//
|
| 17458 |
+
// (Original Adam - ref: https://arxiv.org/pdf/1412.6980.pdf)
|
| 17459 |
//
|
| 17460 |
|
| 17461 |
static enum ggml_opt_result ggml_opt_adam(
|
|
|
|
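The comment lines added above only attach the AdamW and Adam paper references to ggml_opt_adam; no optimizer logic changes in this hunk. For context, a textbook single-parameter AdamW step (illustrative C, not code from this diff) is:

#include <math.h>

// One decoupled-weight-decay (AdamW) update for a single parameter.
// m and v carry the running first/second moments; t is the 1-based step count.
static float adamw_step(float x, float grad, float * m, float * v, int t,
                        float alpha, float beta1, float beta2, float eps, float wd) {
    *m = beta1 * (*m) + (1.0f - beta1) * grad;
    *v = beta2 * (*v) + (1.0f - beta2) * grad * grad;
    const float mhat = *m / (1.0f - powf(beta1, (float) t));  // bias-corrected moments
    const float vhat = *v / (1.0f - powf(beta2, (float) t));
    // weight decay is applied to the parameter directly, not folded into the gradient
    return x - alpha * mhat / (sqrtf(vhat) + eps) - alpha * wd * x;
}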
| 19348 |
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
|
| 19349 |
}
|
| 19350 |
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
|
| 19351 |
+
free((void *)data);
|
| 19352 |
} else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
|
| 19353 |
GGML_ASSERT(false && "nested arrays not supported");
|
| 19354 |
} else {
|
ggml.h
CHANGED
|
@@ -255,6 +255,8 @@
|
|
| 255 |
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
|
| 256 |
#elif defined(__GNUC__)
|
| 257 |
#define GGML_UNREACHABLE() __builtin_unreachable()
|
|
|
|
|
|
|
| 258 |
#else
|
| 259 |
#define GGML_UNREACHABLE() ((void) 0)
|
| 260 |
#endif
|
|
|
|
| 255 |
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
|
| 256 |
#elif defined(__GNUC__)
|
| 257 |
#define GGML_UNREACHABLE() __builtin_unreachable()
|
| 258 |
+
#elif defined(_MSC_VER)
|
| 259 |
+
#define GGML_UNREACHABLE() __assume(0)
|
| 260 |
#else
|
| 261 |
#define GGML_UNREACHABLE() ((void) 0)
|
| 262 |
#endif
|
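The ggml.h change above adds an MSVC branch so GGML_UNREACHABLE() expands to __assume(0), mirroring __builtin_unreachable() on GCC/Clang. A minimal usage sketch with a self-contained stand-in macro (not from the patch) shows why this matters: the compiler can drop the impossible fall-through path after a fully handled switch instead of warning about a missing return.

// Stand-in macro in the same spirit as GGML_UNREACHABLE().
#if defined(_MSC_VER)
#define UNREACHABLE() __assume(0)
#else
#define UNREACHABLE() __builtin_unreachable()
#endif

enum sign { SIGN_NEG, SIGN_POS };

static int to_int(enum sign s) {
    switch (s) {
        case SIGN_NEG: return -1;
        case SIGN_POS: return +1;
    }
    UNREACHABLE();  // every valid value returned above; no dead-branch or return warning
}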