sync : ggml (#2001)
* sync : update scripts
* sync : ggml
* talk-llama : sync llama.cpp
* make : WHISPER_CUBLAS -> WHISPER_CUDA
* ci : try to fix sycl build
* talk-llama : fix make build
This view is limited to 50 files because it contains too many changes.
- .github/workflows/build.yml +9 -9
- CMakeLists.txt +12 -4
- Makefile +28 -8
- README.md +2 -2
- examples/common-ggml.cpp +4 -0
- examples/talk-llama/CMakeLists.txt +1 -1
- examples/talk-llama/llama.cpp +0 -0
- examples/talk-llama/llama.h +55 -20
- examples/talk-llama/unicode-data.cpp +0 -0
- examples/talk-llama/unicode-data.h +16 -0
- examples/talk-llama/unicode.cpp +0 -0
- examples/talk-llama/unicode.h +2 -0
- examples/wchess/CMakeLists.txt +0 -2
- extra/sync-ggml-am.sh +2 -0
- extra/sync-ggml.sh +1 -0
- extra/sync-llama.sh +6 -4
- ggml-alloc.c +7 -3
- ggml-backend-impl.h +5 -0
- ggml-backend.c +155 -125
- ggml-backend.h +4 -4
- ggml-common.h +22 -0
- ggml-cuda.cu +0 -0
- ggml-cuda.h +6 -15
- ggml-cuda/acc.cu +47 -0
- ggml-cuda/acc.cuh +5 -0
- ggml-cuda/alibi.cu +63 -0
- ggml-cuda/alibi.cuh +5 -0
- ggml-cuda/arange.cu +34 -0
- ggml-cuda/arange.cuh +5 -0
- ggml-cuda/argsort.cu +77 -0
- ggml-cuda/argsort.cuh +3 -0
- ggml-cuda/binbcast.cu +236 -0
- ggml-cuda/binbcast.cuh +6 -0
- ggml-cuda/clamp.cu +35 -0
- ggml-cuda/clamp.cuh +5 -0
- ggml-cuda/common.cuh +557 -0
- ggml-cuda/concat.cu +49 -0
- ggml-cuda/concat.cuh +5 -0
- ggml-cuda/convert.cu +824 -0
- ggml-cuda/convert.cuh +13 -0
- ggml-cuda/cpy.cu +461 -0
- ggml-cuda/cpy.cuh +7 -0
- ggml-cuda/dequantize.cuh +103 -0
- ggml-cuda/diagmask.cu +40 -0
- ggml-cuda/diagmask.cuh +5 -0
- ggml-cuda/dmmv.cu +817 -0
- ggml-cuda/dmmv.cuh +7 -0
- ggml-cuda/getrows.cu +178 -0
- ggml-cuda/getrows.cuh +5 -0
- ggml-cuda/im2col.cu +104 -0
.github/workflows/build.yml
CHANGED
Hunks @@ -152,13 +152,13 @@, @@ -166,7 +166,7 @@, @@ -190,7 +190,7 @@, @@ -202,13 +202,13 @@, @@ -216,7 +216,7 @@, @@ -240,7 +240,7 @@, and @@ -249,7 +249,7 @@ (+9/-9), all inside the ubuntu-22-cmake-sycl and ubuntu-22-cmake-sycl-fp16 jobs; the content of the changed lines is not recoverable from this view.

CMakeLists.txt
CHANGED
@@ -74,7 +74,8 @@ else()
 option(WHISPER_BLAS                "whisper: use BLAS libraries"  OFF)
 option(WHISPER_BLAS_VENDOR         "whisper: BLAS library vendor" Generic)
 option(WHISPER_OPENBLAS            "whisper: prefer OpenBLAS"     OFF)
-option(
+option(WHISPER_CUDA                "whisper: support for CUDA" OFF)
+option(WHISPER_CUBLAS              "whisper: support for CUDA (deprecated)" OFF)
 option(WHISPER_HIPBLAS             "whisper: support for hipBLAS" OFF)
 option(WHISPER_CLBLAST             "whisper: use CLBlast" OFF)
 option(WHISPER_SYCL                "whisper: use SYCL" OFF)
@@ -240,6 +241,11 @@ if (WHISPER_BLAS)
 endif ()
 
 if (WHISPER_CUBLAS)
+    message(WARNING "WHISPER_CUBLAS is deprecated and will be removed in the future.\nUse WHISPER_CUDA instead")
+    set(WHISPER_CUDA ON)
+endif()
+
+if (WHISPER_CUDA)
     cmake_minimum_required(VERSION 3.17)
 
     find_package(CUDAToolkit)
@@ -249,9 +255,11 @@ if (WHISPER_CUBLAS)
 
     enable_language(CUDA)
 
-
+    file(GLOB   GGML_SOURCES_CUDA "ggml-cuda/*.cu")
+    list(APPEND GGML_SOURCES_CUDA ggml-cuda.h)
+    list(APPEND GGML_SOURCES_CUDA ggml-cuda.cu)
 
-    add_compile_definitions(
+    add_compile_definitions(GGML_USE_CUDA)
 
     if (WHISPER_STATIC)
         if (WIN32)
@@ -286,7 +294,7 @@ if (WHISPER_HIPBLAS)
 
     if (${hipblas_FOUND} AND ${hip_FOUND})
         message(STATUS "HIP and hipBLAS found")
-        add_compile_definitions(GGML_USE_HIPBLAS
+        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
         add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
         set_property(TARGET ggml-rocm PROPERTY POSITION_INDEPENDENT_CODE ON)
         set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)

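Together with the option rename, ggml translation units are now compiled with -DGGML_USE_CUDA (the HIP path keeps GGML_USE_HIPBLAS and adds GGML_USE_CUDA), so downstream code that conditionally initializes the CUDA backend guards on the new macro. A minimal sketch, assuming the standard ggml-backend init functions; the init_best_backend helper itself is hypothetical:

```cpp
// Hypothetical helper: pick the CUDA backend when the build defines GGML_USE_CUDA,
// otherwise fall back to the CPU backend. Illustrative only.
#include "ggml-backend.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

static ggml_backend_t init_best_backend(void) {
#ifdef GGML_USE_CUDA
    // GGML_USE_CUDA is set via add_compile_definitions(GGML_USE_CUDA) when WHISPER_CUDA is ON
    ggml_backend_t gpu = ggml_backend_cuda_init(0); // device 0
    if (gpu) {
        return gpu;
    }
#endif
    return ggml_backend_cpu_init();
}
```
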
Makefile
CHANGED
@@ -216,20 +216,29 @@ ifdef WHISPER_OPENBLAS
 endif
 
 ifdef WHISPER_CUBLAS
+	# WHISPER_CUBLAS is deprecated and will be removed in the future
+	WHISPER_CUDA := 1
+endif
+
+ifdef WHISPER_CUDA
 	ifeq ($(shell expr $(NVCC_VERSION) \>= 11.6), 1)
 		CUDA_ARCH_FLAG ?= native
 	else
 		CUDA_ARCH_FLAG ?= all
 	endif
 
-	CFLAGS      += -
-	CXXFLAGS    += -
+	CFLAGS      += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	CXXFLAGS    += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	LDFLAGS     += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	WHISPER_OBJ += ggml-cuda.o
+	WHISPER_OBJ += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	NVCC        = nvcc
 	NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
 
-ggml-cuda
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif
 
@@ -237,14 +246,18 @@ ifdef WHISPER_HIPBLAS
 	ROCM_PATH   ?= /opt/rocm
 	HIPCC       ?= $(ROCM_PATH)/bin/hipcc
 	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
-	CFLAGS      += -DGGML_USE_HIPBLAS -
-	CXXFLAGS    += -DGGML_USE_HIPBLAS -
+	CFLAGS      += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
+	CXXFLAGS    += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
 	LDFLAGS     += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	LDFLAGS     += -lhipblas -lamdhip64 -lrocblas
 	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
 	WHISPER_OBJ += ggml-cuda.o
+	WHISPER_OBJ += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 endif
 
@@ -309,6 +322,13 @@ $(info I CC: $(CCV))
 $(info I CXX:      $(CXXV))
 $(info )
 
+ifdef WHISPER_CUBLAS
+$(info !!!!)
+$(info WHISPER_CUBLAS is deprecated and will be removed in the future. Use WHISPER_CUDA instead.)
+$(info !!!!)
+$(info )
+endif
+
 #
 # Build library
 #
@@ -410,8 +430,8 @@ lsp: examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
 
-talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
+talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
 
 #
 # Audio samples

README.md
CHANGED
@@ -414,11 +414,11 @@ For more information about the Core ML implementation please refer to PR [#1037]
 With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.
 First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads
 
-Now build `whisper.cpp` with
+Now build `whisper.cpp` with CUDA support:
 
 ```
 make clean
-
+WHISPER_CUDA=1 make -j
 ```
 
 ## OpenCL GPU support via CLBlast

examples/common-ggml.cpp
CHANGED
@@ -70,6 +70,7 @@ bool ggml_common_quantize_0(
         case GGML_FTYPE_MOSTLY_IQ1_S:
         case GGML_FTYPE_MOSTLY_IQ4_NL:
         case GGML_FTYPE_MOSTLY_IQ4_XS:
+        case GGML_FTYPE_MOSTLY_IQ1_M:
             {
                 fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                 return false;
@@ -193,6 +194,8 @@ bool ggml_common_quantize_0(
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+        case GGML_TYPE_F64:
         case GGML_TYPE_Q8_1:
         case GGML_TYPE_Q8_K:
         case GGML_TYPE_IQ2_XXS:
@@ -203,6 +206,7 @@ bool ggml_common_quantize_0(
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_COUNT:
             {
                 fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));

examples/talk-llama/CMakeLists.txt
CHANGED
@@ -1,7 +1,7 @@
 if (WHISPER_SDL2)
     # talk-llama
     set(TARGET talk-llama)
-    add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp)
+    add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp unicode-data.cpp)
     target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
 
     if (WHISPER_CLBLAST)

examples/talk-llama/llama.cpp
CHANGED
The diff for this file is too large to render.

examples/talk-llama/llama.h
CHANGED
@@ -39,7 +39,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION
+#define LLAMA_SESSION_VERSION 5
 
 #ifdef __cplusplus
 extern "C" {
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S   = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M   = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS  = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M   = 31, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -275,13 +276,16 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;
-        enum llama_ftype ftype;
-
-
-        bool
-        bool
-
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // itoken embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
+        void * kv_overrides;                 // pointer to vector containing overrides
     } llama_model_quantize_params;
 
     // grammar types
@@ -388,6 +392,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
 
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@@ -435,10 +440,24 @@ extern "C" {
     // Returns 0 on success
     LLAMA_API int32_t llama_model_apply_lora_from_file(
             const struct llama_model * model,
-
-
-
-
+                         const char * path_lora,
+                                float   scale,
+                         const char * path_base_model,
+                              int32_t   n_threads);
+
+    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    // the currently loaded vector.
+    // n_embd should be the size of a single layer's control, and data should point
+    // to an n_embd x n_layers buffer starting from layer 1.
+    // il_start and il_end are the layer range the vector should apply to (both inclusive)
+    // See llama_control_vector_load in common to load a control vector.
+    LLAMA_API int32_t llama_control_vector_apply(
+            struct llama_context * lctx,
+                     const float * data,
+                            size_t   len,
+                           int32_t   n_embd,
+                           int32_t   il_start,
+                           int32_t   il_end);
 
     //
     // KV cache
@@ -659,23 +678,29 @@ extern "C" {
     LLAMA_API void llama_synchronize(struct llama_context * ctx);
 
     // Token logits obtained from the last call to llama_decode()
-    // The logits for
-    //
-    // Rows:
+    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
     // Logits for the ith token. Equivalent to:
-    // llama_get_logits(ctx) + i*n_vocab
+    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get all output token embeddings
-    //
+    // Get all output token embeddings.
+    // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // shape: [n_outputs*n_embd]
+    // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith token
-    // llama_get_embeddings(ctx) + i*n_embd
+    // Get the embeddings for the ith token. Equivalent to:
+    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
     // shape: [n_embd] (1-dimensional)
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
     // Get the embeddings for a sequence id
@@ -945,6 +970,16 @@ extern "C" {
                int32_t   n_past,
                int32_t   n_predict);
 
+    /// @details Build a split GGUF final path for this chunk.
+    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+    //  Returns the split_path length.
+    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+
+    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+    //  Returns the split_prefix length.
+    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
 

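The two split-GGUF helpers declared at the end of the header are pure path/string utilities, so they can be tried without loading a model. A small usage sketch based on the examples given in the doc comments above; the buffer sizes and model prefix are arbitrary:

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    char split_path[512];
    char split_prefix[512];

    // chunk 2 of 4 for the given prefix:
    // per the header comment, yields "/models/ggml-model-q4_0-00002-of-00004.gguf"
    llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
    printf("%s\n", split_path);

    // recover the prefix, but only when split_no/split_count match the file name
    if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
        printf("%s\n", split_prefix); // "/models/ggml-model-q4_0"
    }
    return 0;
}
```
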
examples/talk-llama/unicode-data.cpp
ADDED
The diff for this file is too large to render.

examples/talk-llama/unicode-data.h
ADDED
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <utility>
+#include <vector>
+
+extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit;
+extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
+extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
+extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
+extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
+extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
+extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
+extern const std::multimap<uint32_t, uint32_t>          unicode_map_nfd;
+extern const std::map<char32_t, char32_t>               unicode_map_lowercase;

examples/talk-llama/unicode.cpp
CHANGED
The diff for this file is too large to render.

examples/talk-llama/unicode.h
CHANGED
@@ -24,3 +24,5 @@ int unicode_cpt_type(const std::string & utf8);
 std::string unicode_byte_to_utf8(uint8_t byte);
 uint8_t unicode_utf8_to_byte(const std::string & utf8);
 
+// simple tolower that only implements one-to-one mapping, not one-to-many
+char32_t unicode_tolower(char32_t cp);

examples/wchess/CMakeLists.txt
CHANGED
@@ -1,5 +1,3 @@
-set(CMAKE_CXX_STANDARD 11)
-
 add_subdirectory(libwchess)
 
 if (EMSCRIPTEN)

extra/sync-ggml-am.sh
CHANGED
@@ -98,6 +98,7 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then
     # src/ggml-backend-impl.h -> ggml-backend-impl.h
     # src/ggml-backend.c      -> ggml-backend.c
     # src/ggml-common.h       -> ggml-common.h
+    # src/ggml-cuda/*         -> ggml-cuda/
     # src/ggml-cuda.cu        -> ggml-cuda.cu
     # src/ggml-cuda.h         -> ggml-cuda.h
     # src/ggml-impl.h         -> ggml-impl.h
@@ -135,6 +136,7 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then
         -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
         -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
         -e 's/src\/ggml-common\.h/ggml-common.h/g' \
+        -e 's/src\/ggml-cuda\//ggml-cuda\//g' \
         -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
         -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
         -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \

extra/sync-ggml.sh
CHANGED
@@ -6,6 +6,7 @@ cp -rpv ../ggml/src/ggml-alloc.c ./ggml-alloc.c
 cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml-backend-impl.h
 cp -rpv ../ggml/src/ggml-backend.c      ./ggml-backend.c
 cp -rpv ../ggml/src/ggml-common.h       ./ggml-common.h
+cp -rpv ../ggml/src/ggml-cuda/*         ./ggml-cuda/
 cp -rpv ../ggml/src/ggml-cuda.cu        ./ggml-cuda.cu
 cp -rpv ../ggml/src/ggml-cuda.h         ./ggml-cuda.h
 cp -rpv ../ggml/src/ggml-kompute.cpp    ./ggml-kompute.cpp

extra/sync-llama.sh
CHANGED
@@ -1,6 +1,8 @@
 #!/bin/bash
 
-cp -rpv ../llama.cpp/llama.h
-cp -rpv ../llama.cpp/llama.cpp
-cp -rpv ../llama.cpp/unicode.h
-cp -rpv ../llama.cpp/unicode.cpp
+cp -rpv ../llama.cpp/llama.h          ./examples/talk-llama/llama.h
+cp -rpv ../llama.cpp/llama.cpp        ./examples/talk-llama/llama.cpp
+cp -rpv ../llama.cpp/unicode.h        ./examples/talk-llama/unicode.h
+cp -rpv ../llama.cpp/unicode.cpp      ./examples/talk-llama/unicode.cpp
+cp -rpv ../llama.cpp/unicode-data.h   ./examples/talk-llama/unicode-data.h
+cp -rpv ../llama.cpp/unicode-data.cpp ./examples/talk-llama/unicode-data.cpp

ggml-alloc.c
CHANGED
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
-
+        // TODO: better way to add external dependencies
+        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+        // itself is never used and should not be considered a dependency
+        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
             struct ggml_tensor * view_src = node->view_src;
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 
             ggml_gallocr_hash_get(galloc, src)->n_children += 1;
 
-            // allocate explicit inputs
-            if (src->flags & GGML_TENSOR_FLAG_INPUT
+            // allocate explicit inputs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                 ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
             }
         }

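The second hunk has the graph allocator allocate tensors flagged as explicit inputs as soon as they are referenced. A minimal sketch of how a tensor acquires GGML_TENSOR_FLAG_INPUT on the user side, assuming a ggml context created with no_alloc and a ggml_gallocr_t obtained from ggml_gallocr_new; the shapes and the scale op are only for illustration:

```cpp
#include "ggml.h"
#include "ggml-alloc.h"

// Build a tiny graph and let ggml_gallocr allocate it; `inp` carries
// GGML_TENSOR_FLAG_INPUT, which is the case handled by the new branch above.
// `ctx` is assumed to have been created with no_alloc = true.
static void build_and_alloc(struct ggml_context * ctx, ggml_gallocr_t galloc) {
    struct ggml_tensor * inp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 512);
    ggml_set_input(inp);  // mark as an explicit graph input

    struct ggml_tensor * out = ggml_scale(ctx, inp, 2.0f);
    ggml_set_output(out); // keep the result from being reused by the allocator

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);

    ggml_gallocr_alloc_graph(galloc, gf);
}
```
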
ggml-backend-impl.h
CHANGED
@@ -103,6 +103,11 @@ extern "C" {
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weight has to be copied from the CPU temporarily
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
         // (optional) event synchronization
         ggml_backend_event_t (*GGML_CALL event_new)  (ggml_backend_t backend);
         void                 (*GGML_CALL event_free) (ggml_backend_event_t event);

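The new offload_op hook is optional: a backend that leaves it NULL keeps the old behavior, and the ggml_backend_offload_op wrapper added in ggml-backend.c below returns false in that case. A rough sketch of the kind of callback a GPU backend might register; the op check, the threshold, and the function itself are invented for illustration:

```cpp
#include "ggml.h"
#include "ggml-backend-impl.h"

// Hypothetical offload_op callback: ask to run large matrix multiplications on
// this backend even when the weights live in a CPU buffer.
static bool example_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    (void) backend;

    const int min_batch_size = 32; // made-up cut-off for "large batch"

    // ne[1] is the batch dimension of the mat-mul result
    return op->op == GGML_OP_MUL_MAT && op->ne[1] >= min_batch_size;
}
```
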
ggml-backend.c
CHANGED
@@ -278,7 +278,7 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
     return err;
 }
 
-
+enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }
 
@@ -286,6 +286,13 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
     return backend->iface.supports_op(backend, op);
 }
 
+bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    if (backend->iface.offload_op != NULL) {
+        return backend->iface.offload_op(backend, op);
+    }
+    return false;
+}
+
 // backend copy
 
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -413,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void)
     ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
 
     // add forward decls here to avoid including the backend headers
-#ifdef
+#ifdef GGML_USE_CUDA
     extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
     ggml_backend_cuda_reg_devices();
 #endif
@@ -761,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 
     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+        if (cpu_plan->cplan.work_data == NULL) {
+            free(cpu_plan);
+            return NULL;
+        }
     }
 
     cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
@@ -834,6 +845,7 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
     /* .supports_op             = */ ggml_backend_cpu_supports_op,
+    /* .offload_op              = */ NULL,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
     /* .event_record            = */ NULL,
@@ -999,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif
 
 #ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS
+#define GGML_SCHED_MAX_SPLITS 2048
 #endif
 
 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS
+#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif
 
 #ifndef GGML_SCHED_MAX_COPIES
@@ -1043,8 +1055,9 @@ struct ggml_backend_sched {
     struct ggml_cgraph * graph;
 
     // graph splits
-    struct ggml_backend_sched_split splits
+    struct ggml_backend_sched_split * splits;
     int n_splits;
+    int splits_capacity;
 
     // pipeline parallelism support
     int n_copies;
@@ -1114,40 +1127,48 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // TODO: use supports_op to check if the backend supports the op
 
     // assign pre-allocated nodes to their backend
-
-
-    if (cur_backend != -1) {
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
-        return
+        return cur_backend_id;
     }
 
     // view_src
     if (tensor->view_src != NULL) {
-
-        if (
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        if (cur_backend_id != -1) {
             SET_CAUSE(tensor, "1.vsrc");
-            return
+            return cur_backend_id;
         }
     }
 
-    // input
+    // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
-
+        cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
         SET_CAUSE(tensor, "1.inp");
-        return
+        return cur_backend_id;
     }
 
     // assign nodes that use weights to the backend of the weights
+    // operations with weights are preferably run on the same backend as the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
        const struct ggml_tensor * src = tensor->src[i];
        if (src == NULL) {
            continue;
        }
        if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-           int
-           //
+           int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+           // check if a backend with higher prio wants to offload the op
+           if (src_backend_id == sched->n_backends - 1) {
+               for (int b = 0; b < src_backend_id; b++) {
+                   if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+                       SET_CAUSE(tensor, "1.off");
+                       return b;
+                   }
+               }
+           }
            SET_CAUSE(tensor, "1.wgt%d", i);
-           return
+           return src_backend_id;
        }
     }
 
@@ -1227,28 +1248,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
-
+        int * leaf_backend_id = &tensor_backend_id(leaf);
+        if (*leaf_backend_id != -1) {
            // do not overwrite user assignments
            continue;
        }
-
+        *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
     }
 
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-
+        int * node_backend_id = &tensor_backend_id(node);
+        if (*node_backend_id != -1) {
            // do not overwrite user assignments
            continue;
        }
-
+        *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
        // src
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
-
-
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
+                *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
            }
        }
     }
@@ -1270,21 +1294,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (ggml_is_view_op(node->op)) {
                continue;
            }
-            int
-            if (
-                if (
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                    // skip cpu (lowest prio backend)
                    cur_backend_id = -1;
                } else {
-                    cur_backend_id =
+                    cur_backend_id = *node_backend_id;
                }
            } else {
-
+                *node_backend_id = cur_backend_id;
                SET_CAUSE(node, "2.2");
            }
        }
    }
-
    // pass 2.1 expand gpu up
    {
        int cur_backend_id = -1;
@@ -1293,22 +1316,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (ggml_is_view_op(node->op)) {
                continue;
            }
-            int
-            if (
-                if (
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                    // skip cpu (lowest prio backend)
                    cur_backend_id = -1;
                } else {
-                    cur_backend_id =
+                    cur_backend_id = *node_backend_id;
                }
            } else {
-
+                *node_backend_id = cur_backend_id;
                SET_CAUSE(node, "2.1");
            }
        }
    }
-
-
    // pass 2.4 expand rest down
    {
        int cur_backend_id = -1;
@@ -1317,16 +1338,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (ggml_is_view_op(node->op)) {
                continue;
            }
-            int
-            if (
-                cur_backend_id =
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
            } else {
-
+                *node_backend_id = cur_backend_id;
                SET_CAUSE(node, "2.4");
            }
        }
    }
-
+    // pass 2.3 expand rest up
    {
        int cur_backend_id = -1;
        for (int i = graph->n_nodes - 1; i >= 0; i--) {

The remaining hunks of this file are truncated in this view: @@ -1334,11 +1355,11 @@, @@ -1351,9 +1372,9 @@, @@ -1361,14 +1382,14 @@, @@ -1380,19 +1401,20 @@, @@ -1400,18 +1422,54 @@, @@ -1421,10 +1479,10 @@, @@ -1441,7 +1499,6 @@, @@ -1450,9 +1507,9 @@, @@ -1463,76 +1520,42 @@, @@ -1541,6 +1564,7 @@, @@ -1625,13 +1649,12 @@, @@ -1701,17 +1724,21 @@, @@ -1742,6 +1769,7 @@, @@ -1762,6 +1790,8 @@, and @@ -1776,7 +1806,7 @@. They cover the rest of ggml_backend_sched_split_graph (passes 2.3, 3, and 4, including removal of the #ifndef NDEBUG sanity-check block), ggml_backend_sched_compute_splits, and ggml_backend_sched_new/_free/_reset/_reserve/_alloc_graph.
|
|
|
|
| 1355 |
if (ggml_is_view_op(node->op)) {
|
| 1356 |
continue;
|
| 1357 |
}
|
| 1358 |
+
int * node_backend_id = &tensor_backend_id(node);
|
| 1359 |
+
if (*node_backend_id != -1) {
|
| 1360 |
+
cur_backend_id = *node_backend_id;
|
| 1361 |
} else {
|
| 1362 |
+
*node_backend_id = cur_backend_id;
|
| 1363 |
SET_CAUSE(node, "2.3");
|
| 1364 |
}
|
| 1365 |
}
|
|
|
|
| 1372 |
// pass 3: assign backends to remaining src from dst and view_src
|
| 1373 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1374 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1375 |
+
int * cur_backend_id = &tensor_backend_id(node);
|
| 1376 |
+
if (node->view_src != NULL && *cur_backend_id == -1) {
|
| 1377 |
+
*cur_backend_id = tensor_backend_id(node->view_src);
|
| 1378 |
SET_CAUSE(node, "3.vsrc");
|
| 1379 |
}
|
| 1380 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
|
|
| 1382 |
if (src == NULL) {
|
| 1383 |
continue;
|
| 1384 |
}
|
| 1385 |
+
int * src_backend_id = &tensor_backend_id(src);
|
| 1386 |
+
if (*src_backend_id == -1) {
|
| 1387 |
if (src->view_src != NULL) {
|
| 1388 |
// views are always on the same backend as the source
|
| 1389 |
+
*src_backend_id = tensor_backend_id(src->view_src);
|
| 1390 |
SET_CAUSE(src, "3.vsrc");
|
| 1391 |
} else {
|
| 1392 |
+
*src_backend_id = *cur_backend_id;
|
| 1393 |
SET_CAUSE(src, "3.cur");
|
| 1394 |
}
|
| 1395 |
}
|
|
|
|
| 1401 |
|
| 1402 |
// pass 4: split graph, find tensors that need to be copied
|
| 1403 |
{
|
| 1404 |
+
int i_split = 0;
|
| 1405 |
+
struct ggml_backend_sched_split * split = &sched->splits[0];
|
| 1406 |
// find the backend of the first split, skipping view ops
|
| 1407 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1408 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1409 |
if (!ggml_is_view_op(node->op)) {
|
| 1410 |
+
split->backend_id = tensor_backend_id(node);
|
| 1411 |
break;
|
| 1412 |
}
|
| 1413 |
}
|
| 1414 |
+
split->i_start = 0;
|
| 1415 |
+
split->n_inputs = 0;
|
| 1416 |
+
memset(split->inputs, 0, sizeof(split->inputs)); //HACK
|
| 1417 |
+
int cur_backend_id = split->backend_id;
|
| 1418 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1419 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1420 |
|
|
|
|
| 1422 |
continue;
|
| 1423 |
}
|
| 1424 |
|
| 1425 |
+
const int node_backend_id = tensor_backend_id(node);
|
| 1426 |
|
| 1427 |
+
GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
|
| 1428 |
|
| 1429 |
+
// check if we should start a new split based on the sources of the current node
|
| 1430 |
+
bool need_new_split = false;
|
| 1431 |
+
if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
|
| 1432 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1433 |
+
struct ggml_tensor * src = node->src[j];
|
| 1434 |
+
if (src == NULL) {
|
| 1435 |
+
continue;
|
| 1436 |
+
}
|
| 1437 |
+
// check if a weight is on a different backend
|
| 1438 |
+
// by starting a new split, the memory of the previously offloaded weights can be reused
|
| 1439 |
+
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
| 1440 |
+
int src_backend_id = tensor_backend_id(src);
|
| 1441 |
+
if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
|
| 1442 |
+
need_new_split = true;
|
| 1443 |
+
break;
|
| 1444 |
+
}
|
| 1445 |
+
}
|
| 1446 |
+
// check if the split has too many inputs
|
| 1447 |
+
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
|
| 1448 |
+
const size_t id = hash_id(src);
|
| 1449 |
+
int src_backend_id = sched->tensor_backend_id[id];
|
| 1450 |
+
if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
|
| 1451 |
+
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
| 1452 |
+
need_new_split = true;
|
| 1453 |
+
break;
|
| 1454 |
+
}
|
| 1455 |
+
}
|
| 1456 |
+
}
|
| 1457 |
+
}
|
| 1458 |
+
|
| 1459 |
+
if (node_backend_id != cur_backend_id || need_new_split) {
|
| 1460 |
+
split->i_end = i;
|
| 1461 |
+
i_split++;
|
| 1462 |
+
if (i_split >= sched->splits_capacity) {
|
| 1463 |
+
sched->splits_capacity *= 2;
|
| 1464 |
+
sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
|
| 1465 |
+
GGML_ASSERT(sched->splits != NULL);
|
| 1466 |
+
}
|
| 1467 |
+
GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
|
| 1468 |
+
split = &sched->splits[i_split];
|
| 1469 |
+
split->backend_id = node_backend_id;
|
| 1470 |
+
split->i_start = i;
|
| 1471 |
+
split->n_inputs = 0;
|
| 1472 |
+
cur_backend_id = node_backend_id;
|
| 1473 |
}
|
| 1474 |
|
| 1475 |
// find inputs that are not on the same backend
|
|
|
|
| 1479 |
continue;
|
| 1480 |
}
|
| 1481 |
|
| 1482 |
+
const int src_backend_id = tensor_backend_id(src);
|
| 1483 |
assert(src_backend_id != -1); // all inputs should be assigned by now
|
| 1484 |
|
| 1485 |
+
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
|
| 1486 |
size_t id = hash_id(src);
|
| 1487 |
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
|
| 1488 |
ggml_backend_t backend = sched->backends[src_backend_id];
|
|
|
|
| 1499 |
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
| 1500 |
}
|
| 1501 |
sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
|
|
|
|
| 1502 |
SET_CAUSE(tensor_copy, "4.cpy");
|
| 1503 |
}
|
| 1504 |
int n_graph_inputs = sched->n_graph_inputs++;
|
|
|
|
| 1507 |
}
|
| 1508 |
}
|
| 1509 |
|
| 1510 |
+
if (src_backend_id != node_backend_id) {
|
| 1511 |
// create a copy of the input in the split's backend
|
| 1512 |
+
const size_t id = hash_id(src);
|
| 1513 |
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
|
| 1514 |
ggml_backend_t backend = sched->backends[cur_backend_id];
|
| 1515 |
for (int c = 0; c < sched->n_copies; c++) {
|
|
|
|
| 1520 |
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
| 1521 |
}
|
| 1522 |
sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
|
|
|
|
| 1523 |
SET_CAUSE(tensor_copy, "4.cpy");
|
| 1524 |
}
|
| 1525 |
+
int n_inputs = split->n_inputs++;
|
| 1526 |
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
|
| 1527 |
+
split->inputs[n_inputs] = src;
|
| 1528 |
}
|
| 1529 |
node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
|
| 1530 |
}
|
| 1531 |
}
|
| 1532 |
}
|
| 1533 |
+
split->i_end = graph->n_nodes;
|
| 1534 |
+
sched->n_splits = i_split + 1;
|
| 1535 |
}
|
| 1536 |
#ifdef DEBUG_PASS4
|
| 1537 |
fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
| 1538 |
#endif
|
| 1539 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1540 |
// create copies of the graph for each split
|
| 1541 |
// TODO: avoid this copy
|
| 1542 |
+
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
|
| 1543 |
for (int i = 0; i < sched->n_splits; i++) {
|
| 1544 |
struct ggml_backend_sched_split * split = &sched->splits[i];
|
| 1545 |
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
|
| 1546 |
|
| 1547 |
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
|
| 1548 |
for (int j = 0; j < split->n_inputs; j++) {
|
| 1549 |
+
assert(graph_copy->size > (graph_copy->n_nodes + 1));
|
| 1550 |
+
|
| 1551 |
struct ggml_tensor * input = split->inputs[j];
|
| 1552 |
+
const size_t input_id = hash_id(input);
|
| 1553 |
+
struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
|
| 1554 |
|
| 1555 |
// add a dependency to the input source so that it is not freed before the copy is done
|
| 1556 |
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
|
| 1557 |
input_dep->src[0] = input;
|
| 1558 |
+
sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
|
| 1559 |
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
|
| 1560 |
|
| 1561 |
// add a dependency to the input copy so that it is allocated at the start of the split
|
|
|
|
| 1564 |
}
|
| 1565 |
|
| 1566 |
for (int j = split->i_start; j < split->i_end; j++) {
|
| 1567 |
+
assert(graph_copy->size > graph_copy->n_nodes);
|
| 1568 |
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
|
| 1569 |
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
|
| 1570 |
}
|
|
|
|
| 1649 |
}
|
| 1650 |
ggml_backend_tensor_copy(input, input_cpy);
|
| 1651 |
} else {
|
| 1652 |
+
// wait for the split backend to finish using the input before overwriting it
|
| 1653 |
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
| 1654 |
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
| 1655 |
} else {
|
| 1656 |
ggml_backend_synchronize(split_backend);
|
|
|
|
| 1657 |
}
|
|
|
|
| 1658 |
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
|
| 1659 |
}
|
| 1660 |
}
|
|
|
|
| 1724 |
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
|
| 1725 |
|
| 1726 |
// initialize hash table
|
| 1727 |
+
sched->hash_set = ggml_hash_set_new(graph_size);
|
| 1728 |
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
|
| 1729 |
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
|
| 1730 |
+
|
| 1731 |
+
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
| 1732 |
+
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
|
| 1733 |
+
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
|
| 1734 |
|
| 1735 |
sched->n_backends = n_backends;
|
| 1736 |
|
| 1737 |
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
| 1738 |
|
| 1739 |
+
const int initial_splits_capacity = 16;
|
| 1740 |
+
sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
|
| 1741 |
+
sched->splits_capacity = initial_splits_capacity;
|
| 1742 |
|
| 1743 |
for (int b = 0; b < n_backends; b++) {
|
| 1744 |
sched->backends[b] = backends[b];
|
|
|
|
| 1769 |
}
|
| 1770 |
ggml_gallocr_free(sched->galloc);
|
| 1771 |
ggml_free(sched->ctx);
|
| 1772 |
+
free(sched->splits);
|
| 1773 |
free(sched->hash_set.keys);
|
| 1774 |
free(sched->tensor_backend_id);
|
| 1775 |
free(sched->tensor_copies);
|
|
|
|
| 1790 |
}
|
| 1791 |
|
| 1792 |
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
| 1793 |
+
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
|
| 1794 |
+
|
| 1795 |
ggml_backend_sched_split_graph(sched, measure_graph);
|
| 1796 |
|
| 1797 |
// TODO: extract this to a separate function
|
|
|
|
| 1806 |
}
|
| 1807 |
|
| 1808 |
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1809 |
+
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
|
| 1810 |
|
| 1811 |
ggml_backend_sched_split_graph(sched, graph);
|
| 1812 |
|
ggml-backend.h
CHANGED
|
@@ -70,11 +70,11 @@ extern "C" {
|
|
| 70 |
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 71 |
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 72 |
|
| 73 |
-
GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 74 |
-
GGML_API enum ggml_status ggml_backend_graph_compute
|
| 75 |
-
|
| 76 |
-
GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 77 |
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
|
|
|
| 78 |
|
| 79 |
// tensor copy between different backends
|
| 80 |
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
|
|
|
|
| 70 |
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 71 |
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 72 |
|
| 73 |
+
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 74 |
+
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 75 |
+
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
|
|
|
| 76 |
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
| 77 |
+
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
| 78 |
|
| 79 |
// tensor copy between different backends
|
| 80 |
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
|
ggml-common.h
CHANGED
|
@@ -377,6 +377,27 @@ typedef struct {
|
|
| 377 |
} block_iq1_s;
|
| 378 |
static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
| 379 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
// Non-linear quants
|
| 381 |
#define QK4_NL 32
|
| 382 |
typedef struct {
|
|
@@ -1050,6 +1071,7 @@ GGML_TABLE_END()
|
|
| 1050 |
|
| 1051 |
#define NGRID_IQ1S 2048
|
| 1052 |
#define IQ1S_DELTA 0.125f
|
|
|
|
| 1053 |
#if defined(GGML_COMMON_IMPL_C)
|
| 1054 |
GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
|
| 1055 |
0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
|
|
|
|
| 377 |
} block_iq1_s;
|
| 378 |
static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
| 379 |
|
| 380 |
+
// 1.75 bpw
|
| 381 |
+
typedef struct {
|
| 382 |
+
uint8_t qs[QK_K/8]; // grid index, low 8 bits
|
| 383 |
+
uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
|
| 384 |
+
#if QK_K == 64
|
| 385 |
+
ggml_half d;
|
| 386 |
+
#endif
|
| 387 |
+
uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
|
| 388 |
+
} block_iq1_m;
|
| 389 |
+
#if QK_K == 64
|
| 390 |
+
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
|
| 391 |
+
#else
|
| 392 |
+
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
|
| 393 |
+
#endif
|
| 394 |
+
|
| 395 |
+
// Used by IQ1_M quants
|
| 396 |
+
typedef union {
|
| 397 |
+
ggml_half f16;
|
| 398 |
+
uint16_t u16;
|
| 399 |
+
} iq1m_scale_t;
|
| 400 |
+
|
| 401 |
// Non-linear quants
|
| 402 |
#define QK4_NL 32
|
| 403 |
typedef struct {
|
|
|
|
| 1071 |
|
| 1072 |
#define NGRID_IQ1S 2048
|
| 1073 |
#define IQ1S_DELTA 0.125f
|
| 1074 |
+
#define IQ1M_DELTA 0.125f
|
| 1075 |
#if defined(GGML_COMMON_IMPL_C)
|
| 1076 |
GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
|
| 1077 |
0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
|
ggml-cuda.cu
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ggml-cuda.h
CHANGED
|
@@ -17,29 +17,17 @@ extern "C" {
|
|
| 17 |
|
| 18 |
#define GGML_CUDA_MAX_DEVICES 16
|
| 19 |
|
| 20 |
-
// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
|
| 21 |
-
GGML_API GGML_CALL void ggml_init_cublas(void);
|
| 22 |
-
|
| 23 |
-
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
|
| 24 |
-
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
|
| 25 |
-
|
| 26 |
-
GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
|
| 27 |
-
GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
|
| 28 |
-
|
| 29 |
-
GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 30 |
-
GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
| 31 |
-
|
| 32 |
-
GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
|
| 33 |
-
GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
|
| 34 |
-
|
| 35 |
// backend API
|
| 36 |
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
|
| 37 |
|
| 38 |
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
|
| 39 |
|
|
|
|
| 40 |
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
|
|
|
| 41 |
// split tensor buffer that splits matrices by rows across multiple devices
|
| 42 |
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
|
|
|
| 43 |
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
| 44 |
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
| 45 |
|
|
@@ -47,6 +35,9 @@ GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
|
|
| 47 |
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
| 48 |
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
| 49 |
|
|
|
|
|
|
|
|
|
|
| 50 |
#ifdef __cplusplus
|
| 51 |
}
|
| 52 |
#endif
|
|
|
|
| 17 |
|
| 18 |
#define GGML_CUDA_MAX_DEVICES 16
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
// backend API
|
| 21 |
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
|
| 22 |
|
| 23 |
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
|
| 24 |
|
| 25 |
+
// device buffer
|
| 26 |
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
| 27 |
+
|
| 28 |
// split tensor buffer that splits matrices by rows across multiple devices
|
| 29 |
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
| 30 |
+
|
| 31 |
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
| 32 |
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
| 33 |
|
|
|
|
| 35 |
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
| 36 |
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
| 37 |
|
| 38 |
+
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
| 39 |
+
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
| 40 |
+
|
| 41 |
#ifdef __cplusplus
|
| 42 |
}
|
| 43 |
#endif
|
ggml-cuda/acc.cu
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "acc.cuh"
|
| 2 |
+
|
| 3 |
+
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
| 4 |
+
const int ne10, const int ne11, const int ne12,
|
| 5 |
+
const int nb1, const int nb2, int offset) {
|
| 6 |
+
const int i = blockDim.x * blockIdx.x + threadIdx.x;
|
| 7 |
+
if (i >= ne) {
|
| 8 |
+
return;
|
| 9 |
+
}
|
| 10 |
+
int src1_idx = i - offset;
|
| 11 |
+
int oz = src1_idx / nb2;
|
| 12 |
+
int oy = (src1_idx - (oz * nb2)) / nb1;
|
| 13 |
+
int ox = src1_idx % nb1;
|
| 14 |
+
if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
|
| 15 |
+
dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
|
| 16 |
+
} else {
|
| 17 |
+
dst[i] = x[i];
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
|
| 22 |
+
const int ne10, const int ne11, const int ne12,
|
| 23 |
+
const int nb1, const int nb2, const int offset, cudaStream_t stream) {
|
| 24 |
+
int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
|
| 25 |
+
acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
| 29 |
+
const ggml_tensor * src0 = dst->src[0];
|
| 30 |
+
const ggml_tensor * src1 = dst->src[1];
|
| 31 |
+
const float * src0_d = (const float *)src0->data;
|
| 32 |
+
const float * src1_d = (const float *)src1->data;
|
| 33 |
+
float * dst_d = (float *)dst->data;
|
| 34 |
+
cudaStream_t stream = ctx.stream();
|
| 35 |
+
|
| 36 |
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 37 |
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
| 38 |
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
| 39 |
+
GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
|
| 40 |
+
|
| 41 |
+
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
|
| 42 |
+
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
|
| 43 |
+
// int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
|
| 44 |
+
int offset = dst->op_params[3] / 4; // offset in bytes
|
| 45 |
+
|
| 46 |
+
acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
|
| 47 |
+
}
|
ggml-cuda/acc.cuh
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "common.cuh"
|
| 2 |
+
|
| 3 |
+
#define CUDA_ACC_BLOCK_SIZE 256
|
| 4 |
+
|
| 5 |
+
void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
ggml-cuda/alibi.cu
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "alibi.cuh"
|
| 2 |
+
|
| 3 |
+
static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
|
| 4 |
+
const int n_heads_log2_floor, const float m0, const float m1) {
|
| 5 |
+
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
| 6 |
+
|
| 7 |
+
if (col >= ncols) {
|
| 8 |
+
return;
|
| 9 |
+
}
|
| 10 |
+
|
| 11 |
+
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
| 12 |
+
const int i = row*ncols + col;
|
| 13 |
+
|
| 14 |
+
const int k = row/k_rows;
|
| 15 |
+
|
| 16 |
+
float m_k;
|
| 17 |
+
if (k < n_heads_log2_floor) {
|
| 18 |
+
m_k = powf(m0, k + 1);
|
| 19 |
+
} else {
|
| 20 |
+
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
dst[i] = col * m_k + x[i];
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
|
| 27 |
+
const int k_rows, const int n_heads_log2_floor, const float m0,
|
| 28 |
+
const float m1, cudaStream_t stream) {
|
| 29 |
+
const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
|
| 30 |
+
const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
|
| 31 |
+
const dim3 block_nums(num_blocks_x, nrows, 1);
|
| 32 |
+
alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
| 36 |
+
const ggml_tensor * src0 = dst->src[0];
|
| 37 |
+
const float * src0_d = (const float *)src0->data;
|
| 38 |
+
float * dst_d = (float *)dst->data;
|
| 39 |
+
cudaStream_t stream = ctx.stream();
|
| 40 |
+
|
| 41 |
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 42 |
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
| 43 |
+
|
| 44 |
+
const int64_t ne00 = src0->ne[0];
|
| 45 |
+
const int64_t ne01 = src0->ne[1];
|
| 46 |
+
const int64_t ne02 = src0->ne[2];
|
| 47 |
+
const int64_t nrows = ggml_nrows(src0);
|
| 48 |
+
|
| 49 |
+
//const int n_past = ((int32_t *) dst->op_params)[0];
|
| 50 |
+
const int n_head = ((int32_t *) dst->op_params)[1];
|
| 51 |
+
float max_bias;
|
| 52 |
+
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
|
| 53 |
+
|
| 54 |
+
//GGML_ASSERT(ne01 + n_past == ne00);
|
| 55 |
+
GGML_ASSERT(n_head == ne02);
|
| 56 |
+
|
| 57 |
+
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
| 58 |
+
|
| 59 |
+
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
| 60 |
+
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
| 61 |
+
|
| 62 |
+
alibi_f32_cuda(src0_d, dst_d, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, stream);
|
| 63 |
+
}
|
ggml-cuda/alibi.cuh
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "common.cuh"
|
| 2 |
+
|
| 3 |
+
#define CUDA_ALIBI_BLOCK_SIZE 32
|
| 4 |
+
|
| 5 |
+
void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
ggml-cuda/arange.cu
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "arange.cuh"
|
| 2 |
+
|
| 3 |
+
static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
|
| 4 |
+
// blockIDx.x: idx of ne0 / BLOCK_SIZE
|
| 5 |
+
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
| 6 |
+
if (nidx >= ne0) {
|
| 7 |
+
return;
|
| 8 |
+
}
|
| 9 |
+
dst[nidx] = start + step * nidx;
|
| 10 |
+
}
|
| 11 |
+
|
| 12 |
+
static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
|
| 13 |
+
int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
|
| 14 |
+
arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
| 18 |
+
float * dst_d = (float *)dst->data;
|
| 19 |
+
cudaStream_t stream = ctx.stream();
|
| 20 |
+
|
| 21 |
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
| 22 |
+
|
| 23 |
+
float start;
|
| 24 |
+
float stop;
|
| 25 |
+
float step;
|
| 26 |
+
memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
|
| 27 |
+
memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
|
| 28 |
+
memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
|
| 29 |
+
|
| 30 |
+
int64_t steps = (int64_t)ceil((stop - start) / step);
|
| 31 |
+
GGML_ASSERT(ggml_nelements(dst) == steps);
|
| 32 |
+
|
| 33 |
+
arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
|
| 34 |
+
}
|
ggml-cuda/arange.cuh
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "common.cuh"
|
| 2 |
+
|
| 3 |
+
#define CUDA_ARANGE_BLOCK_SIZE 256
|
| 4 |
+
|
| 5 |
+
void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
ggml-cuda/argsort.cu
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "argsort.cuh"
|
| 2 |
+
|
| 3 |
+
template<typename T>
|
| 4 |
+
static inline __device__ void ggml_cuda_swap(T & a, T & b) {
|
| 5 |
+
T tmp = a;
|
| 6 |
+
a = b;
|
| 7 |
+
b = tmp;
|
| 8 |
+
}
|
| 9 |
+
|
| 10 |
+
template<ggml_sort_order order>
|
| 11 |
+
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
|
| 12 |
+
// bitonic sort
|
| 13 |
+
int col = threadIdx.x;
|
| 14 |
+
int row = blockIdx.y;
|
| 15 |
+
|
| 16 |
+
if (col >= ncols) return;
|
| 17 |
+
|
| 18 |
+
const float * x_row = x + row * ncols;
|
| 19 |
+
int * dst_row = dst + row * ncols;
|
| 20 |
+
|
| 21 |
+
// initialize indices
|
| 22 |
+
if (col < ncols) {
|
| 23 |
+
dst_row[col] = col;
|
| 24 |
+
}
|
| 25 |
+
__syncthreads();
|
| 26 |
+
|
| 27 |
+
for (int k = 2; k <= ncols; k *= 2) {
|
| 28 |
+
for (int j = k / 2; j > 0; j /= 2) {
|
| 29 |
+
int ixj = col ^ j;
|
| 30 |
+
if (ixj > col) {
|
| 31 |
+
if ((col & k) == 0) {
|
| 32 |
+
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
|
| 33 |
+
ggml_cuda_swap(dst_row[col], dst_row[ixj]);
|
| 34 |
+
}
|
| 35 |
+
} else {
|
| 36 |
+
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
|
| 37 |
+
ggml_cuda_swap(dst_row[col], dst_row[ixj]);
|
| 38 |
+
}
|
| 39 |
+
}
|
| 40 |
+
}
|
| 41 |
+
__syncthreads();
|
| 42 |
+
}
|
| 43 |
+
}
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
|
| 47 |
+
// bitonic sort requires ncols to be power of 2
|
| 48 |
+
GGML_ASSERT((ncols & (ncols - 1)) == 0);
|
| 49 |
+
|
| 50 |
+
const dim3 block_dims(ncols, 1, 1);
|
| 51 |
+
const dim3 block_nums(1, nrows, 1);
|
| 52 |
+
if (order == GGML_SORT_ORDER_ASC) {
|
| 53 |
+
k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
| 54 |
+
} else if (order == GGML_SORT_ORDER_DESC) {
|
| 55 |
+
k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
| 56 |
+
} else {
|
| 57 |
+
GGML_ASSERT(false);
|
| 58 |
+
}
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
| 62 |
+
const ggml_tensor * src0 = dst->src[0];
|
| 63 |
+
const float * src0_d = (const float *)src0->data;
|
| 64 |
+
float * dst_d = (float *)dst->data;
|
| 65 |
+
cudaStream_t stream = ctx.stream();
|
| 66 |
+
|
| 67 |
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 68 |
+
GGML_ASSERT( dst->type == GGML_TYPE_I32);
|
| 69 |
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
| 70 |
+
|
| 71 |
+
const int64_t ncols = src0->ne[0];
|
| 72 |
+
const int64_t nrows = ggml_nrows(src0);
|
| 73 |
+
|
| 74 |
+
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
|
| 75 |
+
|
| 76 |
+
argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
|
| 77 |
+
}
|
ggml-cuda/argsort.cuh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "common.cuh"
|
| 2 |
+
|
| 3 |
+
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
ggml-cuda/binbcast.cu
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "binbcast.cuh"
|
| 2 |
+
|
| 3 |
+
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
| 4 |
+
return b;
|
| 5 |
+
GGML_UNUSED(a);
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
+
static __device__ __forceinline__ float op_add(const float a, const float b) {
|
| 9 |
+
return a + b;
|
| 10 |
+
}
|
| 11 |
+
|
| 12 |
+
static __device__ __forceinline__ float op_mul(const float a, const float b) {
|
| 13 |
+
return a * b;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
static __device__ __forceinline__ float op_div(const float a, const float b) {
|
| 17 |
+
return a / b;
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
| 21 |
+
static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
| 22 |
+
int ne0, int ne1, int ne2, int ne3,
|
| 23 |
+
int ne10, int ne11, int ne12, int ne13,
|
| 24 |
+
/*int s0, */ int s1, int s2, int s3,
|
| 25 |
+
/*int s10,*/ int s11, int s12, int s13) {
|
| 26 |
+
const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
|
| 27 |
+
const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
|
| 28 |
+
const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
|
| 29 |
+
const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
|
| 30 |
+
|
| 31 |
+
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
| 32 |
+
return;
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
const int i11 = i1 % ne11;
|
| 36 |
+
const int i12 = i2 % ne12;
|
| 37 |
+
const int i13 = i3 % ne13;
|
| 38 |
+
|
| 39 |
+
const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
|
| 40 |
+
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
| 41 |
+
const size_t i_dst = i_src0;
|
| 42 |
+
|
| 43 |
+
const src0_t * src0_row = src0 + i_src0;
|
| 44 |
+
const src1_t * src1_row = src1 + i_src1;
|
| 45 |
+
dst_t * dst_row = dst + i_dst;
|
| 46 |
+
|
| 47 |
+
for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
|
| 48 |
+
const int i10 = i0 % ne10;
|
| 49 |
+
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
| 50 |
+
}
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
| 54 |
+
static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
| 55 |
+
int ne0, int ne1, int ne2, int ne3,
|
| 56 |
+
int ne10, int ne11, int ne12, int ne13,
|
| 57 |
+
/*int s0, */ int s1, int s2, int s3,
|
| 58 |
+
/*int s10,*/ int s11, int s12, int s13) {
|
| 59 |
+
|
| 60 |
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
| 61 |
+
|
| 62 |
+
const int i3 = i/(ne2*ne1*ne0);
|
| 63 |
+
const int i2 = (i/(ne1*ne0)) % ne2;
|
| 64 |
+
const int i1 = (i/ne0) % ne1;
|
| 65 |
+
const int i0 = i % ne0;
|
| 66 |
+
|
| 67 |
+
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
| 68 |
+
return;
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
const int i11 = i1 % ne11;
|
| 72 |
+
const int i12 = i2 % ne12;
|
| 73 |
+
const int i13 = i3 % ne13;
|
| 74 |
+
|
| 75 |
+
const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
|
| 76 |
+
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
| 77 |
+
const size_t i_dst = i_src0;
|
| 78 |
+
|
| 79 |
+
const src0_t * src0_row = src0 + i_src0;
|
| 80 |
+
const src1_t * src1_row = src1 + i_src1;
|
| 81 |
+
dst_t * dst_row = dst + i_dst;
|
| 82 |
+
|
| 83 |
+
const int i10 = i0 % ne10;
|
| 84 |
+
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
template<float (*bin_op)(const float, const float)>
|
| 88 |
+
struct bin_bcast_cuda {
|
| 89 |
+
template<typename src0_t, typename src1_t, typename dst_t>
|
| 90 |
+
void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
|
| 91 |
+
const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
|
| 92 |
+
cudaStream_t stream) {
|
| 93 |
+
|
| 94 |
+
GGML_TENSOR_BINARY_OP_LOCALS
|
| 95 |
+
|
| 96 |
+
int nr0 = ne10/ne0;
|
| 97 |
+
int nr1 = ne11/ne1;
|
| 98 |
+
int nr2 = ne12/ne2;
|
| 99 |
+
int nr3 = ne13/ne3;
|
| 100 |
+
|
| 101 |
+
int nr[4] = { nr0, nr1, nr2, nr3 };
|
| 102 |
+
|
| 103 |
+
// collapse dimensions until first broadcast dimension
|
| 104 |
+
int64_t cne0[] = {ne0, ne1, ne2, ne3};
|
| 105 |
+
int64_t cne1[] = {ne10, ne11, ne12, ne13};
|
| 106 |
+
size_t cnb0[] = {nb0, nb1, nb2, nb3};
|
| 107 |
+
size_t cnb1[] = {nb10, nb11, nb12, nb13};
|
| 108 |
+
auto collapse = [](int64_t cne[]) {
|
| 109 |
+
cne[0] *= cne[1];
|
| 110 |
+
cne[1] = cne[2];
|
| 111 |
+
cne[2] = cne[3];
|
| 112 |
+
cne[3] = 1;
|
| 113 |
+
};
|
| 114 |
+
|
| 115 |
+
auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
|
| 116 |
+
cnb[1] *= cne[1];
|
| 117 |
+
cnb[2] *= cne[2];
|
| 118 |
+
cnb[3] *= cne[3];
|
| 119 |
+
};
|
| 120 |
+
|
| 121 |
+
for (int i = 0; i < 4; i++) {
|
| 122 |
+
if (nr[i] != 1) {
|
| 123 |
+
break;
|
| 124 |
+
}
|
| 125 |
+
if (i > 0) {
|
| 126 |
+
collapse_nb(cnb0, cne0);
|
| 127 |
+
collapse_nb(cnb1, cne1);
|
| 128 |
+
collapse(cne0);
|
| 129 |
+
collapse(cne1);
|
| 130 |
+
}
|
| 131 |
+
}
|
| 132 |
+
{
|
| 133 |
+
int64_t ne0 = cne0[0];
|
| 134 |
+
int64_t ne1 = cne0[1];
|
| 135 |
+
int64_t ne2 = cne0[2];
|
| 136 |
+
int64_t ne3 = cne0[3];
|
| 137 |
+
|
| 138 |
+
int64_t ne10 = cne1[0];
|
| 139 |
+
int64_t ne11 = cne1[1];
|
| 140 |
+
int64_t ne12 = cne1[2];
|
| 141 |
+
int64_t ne13 = cne1[3];
|
| 142 |
+
|
| 143 |
+
size_t nb0 = cnb0[0];
|
| 144 |
+
size_t nb1 = cnb0[1];
|
| 145 |
+
size_t nb2 = cnb0[2];
|
| 146 |
+
size_t nb3 = cnb0[3];
|
| 147 |
+
|
| 148 |
+
size_t nb10 = cnb1[0];
|
| 149 |
+
size_t nb11 = cnb1[1];
|
| 150 |
+
size_t nb12 = cnb1[2];
|
| 151 |
+
size_t nb13 = cnb1[3];
|
| 152 |
+
|
| 153 |
+
size_t s0 = nb0 / sizeof(dst_t);
|
| 154 |
+
size_t s1 = nb1 / sizeof(dst_t);
|
| 155 |
+
size_t s2 = nb2 / sizeof(dst_t);
|
| 156 |
+
size_t s3 = nb3 / sizeof(dst_t);
|
| 157 |
+
|
| 158 |
+
size_t s10 = nb10 / sizeof(src1_t);
|
| 159 |
+
size_t s11 = nb11 / sizeof(src1_t);
|
| 160 |
+
size_t s12 = nb12 / sizeof(src1_t);
|
| 161 |
+
size_t s13 = nb13 / sizeof(src1_t);
|
| 162 |
+
|
| 163 |
+
GGML_ASSERT(s0 == 1);
|
| 164 |
+
GGML_ASSERT(s10 == 1);
|
| 165 |
+
|
| 166 |
+
const int block_size = 128;
|
| 167 |
+
|
| 168 |
+
int64_t hne0 = std::max(ne0/2LL, 1LL);
|
| 169 |
+
|
| 170 |
+
dim3 block_dims;
|
| 171 |
+
block_dims.x = std::min<unsigned int>(hne0, block_size);
|
| 172 |
+
block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
|
| 173 |
+
block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
|
| 174 |
+
|
| 175 |
+
dim3 block_nums(
|
| 176 |
+
(hne0 + block_dims.x - 1) / block_dims.x,
|
| 177 |
+
(ne1 + block_dims.y - 1) / block_dims.y,
|
| 178 |
+
(ne2*ne3 + block_dims.z - 1) / block_dims.z
|
| 179 |
+
);
|
| 180 |
+
|
| 181 |
+
if (block_nums.z > 65535) {
|
| 182 |
+
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel
|
| 183 |
+
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
|
| 184 |
+
k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
|
| 185 |
+
src0_dd, src1_dd, dst_dd,
|
| 186 |
+
ne0, ne1, ne2, ne3,
|
| 187 |
+
ne10, ne11, ne12, ne13,
|
| 188 |
+
/* s0, */ s1, s2, s3,
|
| 189 |
+
/* s10, */ s11, s12, s13);
|
| 190 |
+
} else {
|
| 191 |
+
k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
|
| 192 |
+
src0_dd, src1_dd, dst_dd,
|
| 193 |
+
ne0, ne1, ne2, ne3,
|
| 194 |
+
ne10, ne11, ne12, ne13,
|
| 195 |
+
/* s0, */ s1, s2, s3,
|
| 196 |
+
/* s10, */ s11, s12, s13);
|
| 197 |
+
}
|
| 198 |
+
}
|
| 199 |
+
}
|
| 200 |
+
};
|
| 201 |
+
|
| 202 |
+
template<class op>
|
| 203 |
+
static void ggml_cuda_op_bin_bcast(
|
| 204 |
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 205 |
+
const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
|
| 206 |
+
|
| 207 |
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
| 208 |
+
|
| 209 |
+
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
| 210 |
+
op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
|
| 211 |
+
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
| 212 |
+
op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
|
| 213 |
+
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
| 214 |
+
op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
|
| 215 |
+
} else {
|
| 216 |
+
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
|
| 217 |
+
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
|
| 218 |
+
GGML_ASSERT(false);
|
| 219 |
+
}
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
| 223 |
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream());
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
| 227 |
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
| 231 |
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
| 235 |
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
| 236 |
+
}
|
ggml-cuda/binbcast.cuh
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "common.cuh"
|
| 2 |
+
|
| 3 |
+
void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
| 4 |
+
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
| 5 |
+
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
| 6 |
+
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
ggml-cuda/clamp.cu
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "clamp.cuh"
|
| 2 |
+
|
| 3 |
+
static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
|
| 4 |
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
| 5 |
+
|
| 6 |
+
if (i >= k) {
|
| 7 |
+
return;
|
| 8 |
+
}
|
| 9 |
+
|
| 10 |
+
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
|
| 14 |
+
const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
|
| 15 |
+
clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
| 20 |
+
const ggml_tensor * src0 = dst->src[0];
|
| 21 |
+
const float * src0_d = (const float *)src0->data;
|
| 22 |
+
float * dst_d = (float *)dst->data;
|
| 23 |
+
cudaStream_t stream = ctx.stream();
|
| 24 |
+
|
| 25 |
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 26 |
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
| 27 |
+
|
| 28 |
+
float min;
|
| 29 |
+
float max;
|
| 30 |
+
memcpy(&min, dst->op_params, sizeof(float));
|
| 31 |
+
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
|
| 32 |
+
|
| 33 |
+
clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
|
| 34 |
+
CUDA_CHECK(cudaGetLastError());
|
| 35 |
+
}
|
ggml-cuda/clamp.cuh
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "common.cuh"
|
| 2 |
+
|
| 3 |
+
#define CUDA_CLAMP_BLOCK_SIZE 256
|
| 4 |
+
|
| 5 |
+
void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
ggml-cuda/common.cuh
ADDED
|
@@ -0,0 +1,557 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include "ggml.h"
|
| 4 |
+
#include "ggml-cuda.h"
|
| 5 |
+
|
| 6 |
+
#include <memory>
|
| 7 |
+
|
| 8 |
+
#if defined(GGML_USE_HIPBLAS)
|
| 9 |
+
#define GGML_COMMON_DECL_HIP
|
| 10 |
+
#define GGML_COMMON_IMPL_HIP
|
| 11 |
+
#else
|
| 12 |
+
#define GGML_COMMON_DECL_CUDA
|
| 13 |
+
#define GGML_COMMON_IMPL_CUDA
|
| 14 |
+
#endif
|
| 15 |
+
#include "ggml-common.h"
|
| 16 |
+
|
| 17 |
+
#include <cstdio>
|
| 18 |
+
#include <array>
|
| 19 |
+
#include <cassert>
|
| 20 |
+
#include <cfloat>
|
| 21 |
+
#include <string>
|
| 22 |
+
|
| 23 |
+
#if defined(GGML_USE_HIPBLAS)
|
| 24 |
+
#include <hip/hip_runtime.h>
|
| 25 |
+
#include <hipblas/hipblas.h>
|
| 26 |
+
#include <hip/hip_fp16.h>
|
| 27 |
+
#ifdef __HIP_PLATFORM_AMD__
|
| 28 |
+
// for rocblas_initialize()
|
| 29 |
+
#include "rocblas/rocblas.h"
|
| 30 |
+
#endif // __HIP_PLATFORM_AMD__
|
| 31 |
+
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
|
| 32 |
+
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
| 33 |
+
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
| 34 |
+
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
| 35 |
+
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
|
| 36 |
+
#define CUBLAS_OP_N HIPBLAS_OP_N
|
| 37 |
+
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_32F HIPBLAS_R_32F
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
#define cudaLaunchHostFunc hipLaunchHostFunc
#ifdef GGML_HIP_UMA
#define cudaMalloc hipMallocManaged
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
#else
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#endif
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define __trap abort
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
#else
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>

#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif // CUDART_VERSION < 11020

#endif // defined(GGML_USE_HIPBLAS)

#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

#define WARP_SIZE 32
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)

#define CC_PASCAL 600
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_VOLTA 700
#define CC_OFFSET_AMD 1000000
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)

// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
// - 7B quantum model: +100-200 MB
// - 13B quantum model: +200-400 MB
//
//#define GGML_CUDA_FORCE_MMQ

// TODO: improve this to be correct for more hardware
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
#if !defined(GGML_CUDA_FORCE_MMQ)
#define CUDA_USE_TENSOR_CORES
#endif

#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available

#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define GGML_CUDA_MAX_STREAMS 8

[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);

#define CUDA_CHECK_GEN(err, success, error_fn) \
    do { \
        auto err_ = (err); \
        if (err_ != (success)) { \
            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
        } \
    } while (0)

#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)

#if CUDART_VERSION >= 12000
static const char * cublas_get_error_str(const cublasStatus_t err) {
    return cublasGetStatusString(err);
}
#else
static const char * cublas_get_error_str(const cublasStatus_t err) {
    switch (err) {
        case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
        default: return "unknown error";
    }
}
#endif // CUDART_VERSION >= 12000

#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)

#if !defined(GGML_USE_HIPBLAS)
static const char * cu_get_error_str(CUresult err) {
    const char * err_str;
    cuGetErrorString(err, &err_str);
    return err_str;
}
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif

#if CUDART_VERSION >= 11100
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11100

#ifdef GGML_CUDA_F16
typedef half dfloat; // dequantize float
typedef half2 dfloat2;
#else
typedef float dfloat; // dequantize float
typedef float2 dfloat2;
#endif //GGML_CUDA_F16

// dmmv = dequantize_mul_mat_vec
// TODO: remove this?
#ifndef GGML_CUDA_DMMV_X
#define GGML_CUDA_DMMV_X 32
#endif

[[noreturn]]
static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
    GGML_UNUSED(arch_list);
#else
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    __trap();

    GGML_UNUSED(no_device_code); // suppress unused function warning
}

#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__

static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}

static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
    }
    return a;
}

#ifdef GGML_CUDA_F16
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
    }
    return a;
#else
    GGML_UNUSED(a);
    NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
#endif // GGML_CUDA_F16

static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
    }
    return x;
}

//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
//#pragma unroll
//    for (int mask = 16; mask > 0; mask >>= 1) {
//        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
//    }
//    return x;
//#else
//    GGML_UNUSED(x);
//    NO_DEVICE_CODE;
//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
//}


#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300

#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
    defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif

#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif

#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
#if __has_builtin(__builtin_elementwise_sub_sat)
    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
    return reinterpret_cast<const int &>(c);
#else
    int8x4_t c;
    int16_t tmp;
#pragma unroll
    for (int i = 0; i < 4; i++) {
        tmp = va[i] - vb[i];
        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
        c[i] = tmp;
    }
    return reinterpret_cast<int &>(c);
#endif // __has_builtin(__builtin_elementwise_sub_sat)
}

static __device__ __forceinline__ int __vsub4(const int a, const int b) {
    return __vsubss4(a, b);
}

static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
    }
    return c;
}

static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
    int tmp1;
    int tmp2;
    asm("\n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        "
        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
        : "v"(a), "v"(b)
    );
#else
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
    return c;
}
#endif // defined(GGML_USE_HIPBLAS)

// TODO: move to ggml-common.h
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);


//////////////////////

struct ggml_cuda_device_info {
    int device_count;

    struct cuda_device_info {
        int cc;                 // compute capability
        size_t smpb;            // max. shared memory per block
        bool vmm;               // virtual memory support
        size_t vmm_granularity; // granularity of virtual memory
        size_t total_vram;
    };

    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
};

const ggml_cuda_device_info & ggml_cuda_info();

void ggml_cuda_set_device(int device);
int ggml_cuda_get_device();

struct ggml_cuda_pool {
    virtual ~ggml_cuda_pool() = default;

    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void free(void * ptr, size_t size) = 0;
};

template<typename T>
struct ggml_cuda_pool_alloc {
    ggml_cuda_pool * pool = nullptr;
    T * ptr = nullptr;
    size_t actual_size = 0;

    ggml_cuda_pool_alloc() = default;

    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
    }

    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ~ggml_cuda_pool_alloc() {
        if (ptr != nullptr) {
            pool->free(ptr, actual_size);
        }
    }

    // size is in number of elements
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    T * alloc(ggml_cuda_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    T * get() {
        return ptr;
    }

    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
};


// backend interface

struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
};

struct ggml_backend_cuda_context {
    int device;
    std::string name;
    cudaEvent_t copy_event = nullptr;

    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

    explicit ggml_backend_cuda_context(int device) :
        device(device),
        name(GGML_CUDA_NAME + std::to_string(device)) {
    }

    ~ggml_backend_cuda_context() {
        if (copy_event != nullptr) {
            CUDA_CHECK(cudaEventDestroy(copy_event));
        }
        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
                if (streams[i][j] != nullptr) {
                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
                }
            }
            if (cublas_handles[i] != nullptr) {
                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
            }
        }
    }

    cudaStream_t stream(int device, int stream) {
        if (streams[device][stream] == nullptr) {
            ggml_cuda_set_device(device);
            CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
        }
        return streams[device][stream];
    }

    cudaStream_t stream() {
        return stream(device, 0);
    }

    cublasHandle_t cublas_handle(int device) {
        if (cublas_handles[device] == nullptr) {
            ggml_cuda_set_device(device);
            CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
            CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
        }
        return cublas_handles[device];
    }

    cublasHandle_t cublas_handle() {
        return cublas_handle(device);
    }

    // pool
    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);

    ggml_cuda_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(device);
        }
        return *pools[device];
    }

    ggml_cuda_pool & pool() {
        return pool(device);
    }
};
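For readers new to the XOR-shuffle idiom used by the warp_reduce_sum / warp_reduce_max helpers above: each of the five butterfly steps combines partial results from lanes whose IDs differ by the current mask, so after log2(32) = 5 steps every lane of the warp holds the full reduction. A minimal standalone sketch of the same idiom, not part of this patch; the names warp_sum and sum32 and the 1x32 launch are illustrative only:

// standalone CUDA sketch (assumed names, not from the diff): warp-wide sum via __shfl_xor_sync
#include <cstdio>
#include <cuda_runtime.h>

__device__ __forceinline__ float warp_sum(float x) {
    // XOR-shuffle butterfly: after 5 steps every lane holds the sum of all 32 lanes
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}

__global__ void sum32(const float * in, float * out) {
    float v = in[threadIdx.x]; // one element per lane, 32 lanes total
    v = warp_sum(v);
    if (threadIdx.x == 0) {
        *out = v;              // lane 0 writes the warp-wide sum
    }
}

int main() {
    float h_in[32], h_out = 0.0f;
    for (int i = 0; i < 32; ++i) h_in[i] = 1.0f; // expected sum: 32
    float *d_in, *d_out;
    cudaMalloc(&d_in,  sizeof(h_in));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
    sum32<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("warp sum = %f\n", h_out);
    cudaFree(d_in); cudaFree(d_out);
    return 0;
}

Compiled with nvcc, this should print 32.0, i.e. the sum of the 32 per-lane inputs.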
ggml-cuda/concat.cu
ADDED
@@ -0,0 +1,49 @@
#include "concat.cuh"

static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }
    // operation
    int offset_dst =
        nidx +
        blockIdx.y * ne0 +
        blockIdx.z * ne0 * gridDim.y;
    if (blockIdx.z < ne02) { // src0
        int offset_src =
            nidx +
            blockIdx.y * ne0 +
            blockIdx.z * ne0 * gridDim.y;
        dst[offset_dst] = x[offset_src];
    } else {
        int offset_src =
            nidx +
            blockIdx.y * ne0 +
            (blockIdx.z - ne02) * ne0 * gridDim.y;
        dst[offset_dst] = y[offset_src];
    }
}

static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
    dim3 gridDim(num_blocks, ne1, ne2);
    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
}

void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const float * src0_d = (const float *)src0->data;
    const float * src1_d = (const float *)src1->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
        concat_f32_cuda(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), dst_d + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], stream);
    }
}
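The index arithmetic in concat_f32 above is easier to follow as a plain host-side loop: blockIdx.z plays the role of the dim-2 index, and planes past src0's ne2 are read from src1 with that index shifted back by ne02. A hedged CPU reference, not part of the patch; concat_dim2_ref and the tiny shapes are made up for illustration:

// host-side reference (illustrative only): concatenation along dim 2, matching concat_f32's offsets
#include <cstdio>
#include <vector>

// dst has a_ne2 + b_ne2 planes of ne0*ne1 floats each
static void concat_dim2_ref(const std::vector<float> & a, const std::vector<float> & b,
                            std::vector<float> & dst, int ne0, int ne1, int a_ne2, int b_ne2) {
    for (int i2 = 0; i2 < a_ne2 + b_ne2; ++i2) {
        for (int i1 = 0; i1 < ne1; ++i1) {
            for (int i0 = 0; i0 < ne0; ++i0) {
                const int dst_idx = i0 + i1*ne0 + i2*ne0*ne1;               // same shape as offset_dst
                if (i2 < a_ne2) {
                    dst[dst_idx] = a[i0 + i1*ne0 + i2*ne0*ne1];             // src0 branch
                } else {
                    dst[dst_idx] = b[i0 + i1*ne0 + (i2 - a_ne2)*ne0*ne1];   // src1 branch, index shifted back
                }
            }
        }
    }
}

int main() {
    const int ne0 = 2, ne1 = 2, a_ne2 = 1, b_ne2 = 1;
    std::vector<float> a(ne0*ne1*a_ne2, 1.0f), b(ne0*ne1*b_ne2, 2.0f);
    std::vector<float> dst(ne0*ne1*(a_ne2 + b_ne2), 0.0f);
    concat_dim2_ref(a, b, dst, ne0, ne1, a_ne2, b_ne2);
    for (float v : dst) printf("%.1f ", v); // prints 1.0 x4 then 2.0 x4
    printf("\n");
    return 0;
}

The outer i3 loop in ggml_cuda_op_concat handles the fourth dimension by launching one such slice at a time.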
ggml-cuda/concat.cuh
ADDED
@@ -0,0 +1,5 @@
#include "common.cuh"

#define CUDA_CONCAT_BLOCK_SIZE 256

void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
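CUDA_CONCAT_BLOCK_SIZE feeds the ceiling division that concat_f32_cuda uses to size its launch grid along dim 0. A trivial sketch of that calculation, with an assumed element count that is not from the patch:

// quick illustration of the ceil-div grid sizing used by concat_f32_cuda (values are assumptions)
#include <cstdio>

int main() {
    const int block_size = 256;  // same value as CUDA_CONCAT_BLOCK_SIZE above
    const int ne0 = 1000;        // elements along dim 0 (arbitrary example)
    const int num_blocks = (ne0 + block_size - 1) / block_size; // rounds up, never under-covers
    printf("%d blocks of %d threads cover %d elements (>= %d)\n",
           num_blocks, block_size, num_blocks * block_size, ne0);
    return 0;
}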
ggml-cuda/convert.cu
ADDED
@@ -0,0 +1,824 @@
| 1 |
+
#include "convert.cuh"
|
| 2 |
+
#include "dequantize.cuh"
|
| 3 |
+
|
| 4 |
+
#define CUDA_Q8_0_NE_ALIGN 2048
|
| 5 |
+
|
| 6 |
+
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
| 7 |
+
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
|
| 8 |
+
const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
| 9 |
+
|
| 10 |
+
if (i >= k) {
|
| 11 |
+
return;
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
const int ib = i/qk; // block index
|
| 15 |
+
const int iqs = (i%qk)/qr; // quant index
|
| 16 |
+
const int iybs = i - i%qk; // y block start index
|
| 17 |
+
const int y_offset = qr == 1 ? 1 : qk/2;
|
| 18 |
+
|
| 19 |
+
// dequantize
|
| 20 |
+
dfloat2 v;
|
| 21 |
+
dequantize_kernel(vx, ib, iqs, v);
|
| 22 |
+
|
| 23 |
+
y[iybs + iqs + 0] = v.x;
|
| 24 |
+
y[iybs + iqs + y_offset] = v.y;
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
template <bool need_check>
|
| 28 |
+
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
|
| 29 |
+
#if __CUDA_ARCH__ >= CC_PASCAL
|
| 30 |
+
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
|
| 31 |
+
|
| 32 |
+
const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
|
| 33 |
+
const int * x0 = ((int *) vx) + blockIdx.x * nint;
|
| 34 |
+
half2 * y2 = (half2 *) (y + i0);
|
| 35 |
+
|
| 36 |
+
__shared__ int vals[nint];
|
| 37 |
+
|
| 38 |
+
#pragma unroll
|
| 39 |
+
for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
|
| 40 |
+
if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
|
| 41 |
+
break;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
const int ix = ix0 + threadIdx.x;
|
| 45 |
+
vals[ix] = x0[ix];
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
#pragma unroll
|
| 49 |
+
for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
|
| 50 |
+
if (need_check && i0 + iy + 2*threadIdx.x >= k) {
|
| 51 |
+
return;
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
|
| 55 |
+
const half d = *b0;
|
| 56 |
+
const char2 qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
|
| 57 |
+
|
| 58 |
+
y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
|
| 59 |
+
}
|
| 60 |
+
#else
|
| 61 |
+
GGML_UNUSED(vx);
|
| 62 |
+
GGML_UNUSED(y);
|
| 63 |
+
GGML_UNUSED(k);
|
| 64 |
+
NO_DEVICE_CODE;
|
| 65 |
+
#endif // __CUDA_ARCH__ >= CC_PASCAL
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
template<typename dst_t>
|
| 69 |
+
static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
| 70 |
+
|
| 71 |
+
const int i = blockIdx.x;
|
| 72 |
+
|
| 73 |
+
// assume 32 threads
|
| 74 |
+
const int tid = threadIdx.x;
|
| 75 |
+
const int il = tid/8;
|
| 76 |
+
const int ir = tid%8;
|
| 77 |
+
const int ib = 8*i + ir;
|
| 78 |
+
if (ib >= nb32) {
|
| 79 |
+
return;
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
dst_t * y = yy + 256*i + 32*ir + 4*il;
|
| 83 |
+
|
| 84 |
+
const block_q4_0 * x = (const block_q4_0 *)vx + ib;
|
| 85 |
+
const float d = __half2float(x->d);
|
| 86 |
+
const float dm = -8*d;
|
| 87 |
+
|
| 88 |
+
const uint8_t * q = x->qs + 4*il;
|
| 89 |
+
|
| 90 |
+
for (int l = 0; l < 4; ++l) {
|
| 91 |
+
y[l+ 0] = d * (q[l] & 0xF) + dm;
|
| 92 |
+
y[l+16] = d * (q[l] >> 4) + dm;
|
| 93 |
+
}
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
template<typename dst_t>
|
| 97 |
+
static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
| 98 |
+
|
| 99 |
+
const int i = blockIdx.x;
|
| 100 |
+
|
| 101 |
+
// assume 32 threads
|
| 102 |
+
const int tid = threadIdx.x;
|
| 103 |
+
const int il = tid/8;
|
| 104 |
+
const int ir = tid%8;
|
| 105 |
+
const int ib = 8*i + ir;
|
| 106 |
+
if (ib >= nb32) {
|
| 107 |
+
return;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
dst_t * y = yy + 256*i + 32*ir + 4*il;
|
| 111 |
+
|
| 112 |
+
const block_q4_1 * x = (const block_q4_1 *)vx + ib;
|
| 113 |
+
const float2 d = __half22float2(x->dm);
|
| 114 |
+
|
| 115 |
+
const uint8_t * q = x->qs + 4*il;
|
| 116 |
+
|
| 117 |
+
for (int l = 0; l < 4; ++l) {
|
| 118 |
+
y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
|
| 119 |
+
y[l+16] = d.x * (q[l] >> 4) + d.y;
|
| 120 |
+
}
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
//================================== k-quants
|
| 124 |
+
|
| 125 |
+
template<typename dst_t>
|
| 126 |
+
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 127 |
+
|
| 128 |
+
const int i = blockIdx.x;
|
| 129 |
+
const block_q2_K * x = (const block_q2_K *) vx;
|
| 130 |
+
|
| 131 |
+
const int tid = threadIdx.x;
|
| 132 |
+
#if QK_K == 256
|
| 133 |
+
const int n = tid/32;
|
| 134 |
+
const int l = tid - 32*n;
|
| 135 |
+
const int is = 8*n + l/16;
|
| 136 |
+
|
| 137 |
+
const uint8_t q = x[i].qs[32*n + l];
|
| 138 |
+
dst_t * y = yy + i*QK_K + 128*n;
|
| 139 |
+
|
| 140 |
+
float dall = __low2half(x[i].dm);
|
| 141 |
+
float dmin = __high2half(x[i].dm);
|
| 142 |
+
y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
| 143 |
+
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
| 144 |
+
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
| 145 |
+
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
| 146 |
+
#else
|
| 147 |
+
const int is = tid/16; // 0 or 1
|
| 148 |
+
const int il = tid%16; // 0...15
|
| 149 |
+
const uint8_t q = x[i].qs[il] >> (2*is);
|
| 150 |
+
dst_t * y = yy + i*QK_K + 16*is + il;
|
| 151 |
+
float dall = __low2half(x[i].dm);
|
| 152 |
+
float dmin = __high2half(x[i].dm);
|
| 153 |
+
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
| 154 |
+
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
| 155 |
+
#endif
|
| 156 |
+
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
template<typename dst_t>
|
| 160 |
+
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 161 |
+
|
| 162 |
+
const int i = blockIdx.x;
|
| 163 |
+
const block_q3_K * x = (const block_q3_K *) vx;
|
| 164 |
+
|
| 165 |
+
#if QK_K == 256
|
| 166 |
+
const int r = threadIdx.x/4;
|
| 167 |
+
const int tid = r/2;
|
| 168 |
+
const int is0 = r%2;
|
| 169 |
+
const int l0 = 16*is0 + 4*(threadIdx.x%4);
|
| 170 |
+
const int n = tid / 4;
|
| 171 |
+
const int j = tid - 4*n;
|
| 172 |
+
|
| 173 |
+
uint8_t m = 1 << (4*n + j);
|
| 174 |
+
int is = 8*n + 2*j + is0;
|
| 175 |
+
int shift = 2*j;
|
| 176 |
+
|
| 177 |
+
int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
|
| 178 |
+
is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
|
| 179 |
+
is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
|
| 180 |
+
(x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
|
| 181 |
+
float d_all = x[i].d;
|
| 182 |
+
float dl = d_all * (us - 32);
|
| 183 |
+
|
| 184 |
+
dst_t * y = yy + i*QK_K + 128*n + 32*j;
|
| 185 |
+
const uint8_t * q = x[i].qs + 32*n;
|
| 186 |
+
const uint8_t * hm = x[i].hmask;
|
| 187 |
+
|
| 188 |
+
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
| 189 |
+
#else
|
| 190 |
+
const int tid = threadIdx.x;
|
| 191 |
+
const int is = tid/16; // 0 or 1
|
| 192 |
+
const int il = tid%16; // 0...15
|
| 193 |
+
const int im = il/8; // 0...1
|
| 194 |
+
const int in = il%8; // 0...7
|
| 195 |
+
|
| 196 |
+
dst_t * y = yy + i*QK_K + 16*is + il;
|
| 197 |
+
|
| 198 |
+
const uint8_t q = x[i].qs[il] >> (2*is);
|
| 199 |
+
const uint8_t h = x[i].hmask[in] >> (2*is + im);
|
| 200 |
+
const float d = (float)x[i].d;
|
| 201 |
+
|
| 202 |
+
if (is == 0) {
|
| 203 |
+
y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
| 204 |
+
y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
| 205 |
+
} else {
|
| 206 |
+
y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
| 207 |
+
y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
| 208 |
+
}
|
| 209 |
+
#endif
|
| 210 |
+
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
#if QK_K == 256
|
| 214 |
+
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
| 215 |
+
if (j < 4) {
|
| 216 |
+
d = q[j] & 63; m = q[j + 4] & 63;
|
| 217 |
+
} else {
|
| 218 |
+
d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
|
| 219 |
+
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
| 220 |
+
}
|
| 221 |
+
}
|
| 222 |
+
#endif
|
| 223 |
+
|
| 224 |
+
template<typename dst_t>
|
| 225 |
+
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 226 |
+
const block_q4_K * x = (const block_q4_K *) vx;
|
| 227 |
+
|
| 228 |
+
const int i = blockIdx.x;
|
| 229 |
+
|
| 230 |
+
#if QK_K == 256
|
| 231 |
+
// assume 32 threads
|
| 232 |
+
const int tid = threadIdx.x;
|
| 233 |
+
const int il = tid/8;
|
| 234 |
+
const int ir = tid%8;
|
| 235 |
+
const int is = 2*il;
|
| 236 |
+
const int n = 4;
|
| 237 |
+
|
| 238 |
+
dst_t * y = yy + i*QK_K + 64*il + n*ir;
|
| 239 |
+
|
| 240 |
+
const float dall = __low2half(x[i].dm);
|
| 241 |
+
const float dmin = __high2half(x[i].dm);
|
| 242 |
+
|
| 243 |
+
const uint8_t * q = x[i].qs + 32*il + n*ir;
|
| 244 |
+
|
| 245 |
+
uint8_t sc, m;
|
| 246 |
+
get_scale_min_k4(is + 0, x[i].scales, sc, m);
|
| 247 |
+
const float d1 = dall * sc; const float m1 = dmin * m;
|
| 248 |
+
get_scale_min_k4(is + 1, x[i].scales, sc, m);
|
| 249 |
+
const float d2 = dall * sc; const float m2 = dmin * m;
|
| 250 |
+
for (int l = 0; l < n; ++l) {
|
| 251 |
+
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
| 252 |
+
y[l +32] = d2 * (q[l] >> 4) - m2;
|
| 253 |
+
}
|
| 254 |
+
#else
|
| 255 |
+
const int tid = threadIdx.x;
|
| 256 |
+
const uint8_t * q = x[i].qs;
|
| 257 |
+
dst_t * y = yy + i*QK_K;
|
| 258 |
+
const float d = (float)x[i].dm[0];
|
| 259 |
+
const float m = (float)x[i].dm[1];
|
| 260 |
+
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
| 261 |
+
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
| 262 |
+
#endif
|
| 263 |
+
}
|
| 264 |
+
|
| 265 |
+
template<typename dst_t>
|
| 266 |
+
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 267 |
+
const block_q5_K * x = (const block_q5_K *) vx;
|
| 268 |
+
|
| 269 |
+
const int i = blockIdx.x;
|
| 270 |
+
|
| 271 |
+
#if QK_K == 256
|
| 272 |
+
// assume 64 threads - this is very slightly better than the one below
|
| 273 |
+
const int tid = threadIdx.x;
|
| 274 |
+
const int il = tid/16; // il is in 0...3
|
| 275 |
+
const int ir = tid%16; // ir is in 0...15
|
| 276 |
+
const int is = 2*il; // is is in 0...6
|
| 277 |
+
|
| 278 |
+
dst_t * y = yy + i*QK_K + 64*il + 2*ir;
|
| 279 |
+
|
| 280 |
+
const float dall = __low2half(x[i].dm);
|
| 281 |
+
const float dmin = __high2half(x[i].dm);
|
| 282 |
+
|
| 283 |
+
const uint8_t * ql = x[i].qs + 32*il + 2*ir;
|
| 284 |
+
const uint8_t * qh = x[i].qh + 2*ir;
|
| 285 |
+
|
| 286 |
+
uint8_t sc, m;
|
| 287 |
+
get_scale_min_k4(is + 0, x[i].scales, sc, m);
|
| 288 |
+
const float d1 = dall * sc; const float m1 = dmin * m;
|
| 289 |
+
get_scale_min_k4(is + 1, x[i].scales, sc, m);
|
| 290 |
+
const float d2 = dall * sc; const float m2 = dmin * m;
|
| 291 |
+
|
| 292 |
+
uint8_t hm = 1 << (2*il);
|
| 293 |
+
y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
|
| 294 |
+
y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
|
| 295 |
+
hm <<= 1;
|
| 296 |
+
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
| 297 |
+
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
| 298 |
+
#else
|
| 299 |
+
const int tid = threadIdx.x;
|
| 300 |
+
const uint8_t q = x[i].qs[tid];
|
| 301 |
+
const int im = tid/8; // 0...3
|
| 302 |
+
const int in = tid%8; // 0...7
|
| 303 |
+
const int is = tid/16; // 0 or 1
|
| 304 |
+
const uint8_t h = x[i].qh[in] >> im;
|
| 305 |
+
const float d = x[i].d;
|
| 306 |
+
dst_t * y = yy + i*QK_K + tid;
|
| 307 |
+
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
|
| 308 |
+
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
|
| 309 |
+
#endif
|
| 310 |
+
}
|
| 311 |
+
|
| 312 |
+
template<typename dst_t>
|
| 313 |
+
static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 314 |
+
const block_q6_K * x = (const block_q6_K *) vx;
|
| 315 |
+
|
| 316 |
+
const int i = blockIdx.x;
|
| 317 |
+
#if QK_K == 256
|
| 318 |
+
|
| 319 |
+
// assume 64 threads - this is very slightly better than the one below
|
| 320 |
+
const int tid = threadIdx.x;
|
| 321 |
+
const int ip = tid/32; // ip is 0 or 1
|
| 322 |
+
const int il = tid - 32*ip; // 0...32
|
| 323 |
+
const int is = 8*ip + il/16;
|
| 324 |
+
|
| 325 |
+
dst_t * y = yy + i*QK_K + 128*ip + il;
|
| 326 |
+
|
| 327 |
+
const float d = x[i].d;
|
| 328 |
+
|
| 329 |
+
const uint8_t * ql = x[i].ql + 64*ip + il;
|
| 330 |
+
const uint8_t qh = x[i].qh[32*ip + il];
|
| 331 |
+
const int8_t * sc = x[i].scales + is;
|
| 332 |
+
|
| 333 |
+
y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
| 334 |
+
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
| 335 |
+
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
| 336 |
+
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
| 337 |
+
#else
|
| 338 |
+
|
| 339 |
+
// assume 32 threads
|
| 340 |
+
const int tid = threadIdx.x;
|
| 341 |
+
const int ip = tid/16; // 0 or 1
|
| 342 |
+
const int il = tid - 16*ip; // 0...15
|
| 343 |
+
|
| 344 |
+
dst_t * y = yy + i*QK_K + 16*ip + il;
|
| 345 |
+
|
| 346 |
+
const float d = x[i].d;
|
| 347 |
+
|
| 348 |
+
const uint8_t ql = x[i].ql[16*ip + il];
|
| 349 |
+
const uint8_t qh = x[i].qh[il] >> (2*ip);
|
| 350 |
+
const int8_t * sc = x[i].scales;
|
| 351 |
+
|
| 352 |
+
y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
| 353 |
+
y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
| 354 |
+
#endif
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
template<typename dst_t>
|
| 358 |
+
static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 359 |
+
|
| 360 |
+
const int i = blockIdx.x;
|
| 361 |
+
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
| 362 |
+
|
| 363 |
+
const int tid = threadIdx.x;
|
| 364 |
+
#if QK_K == 256
|
| 365 |
+
const int il = tid/8; // 0...3
|
| 366 |
+
const int ib = tid%8; // 0...7
|
| 367 |
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 368 |
+
const uint16_t * q2 = x[i].qs + 4*ib;
|
| 369 |
+
const uint8_t * aux8 = (const uint8_t *)q2;
|
| 370 |
+
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
|
| 371 |
+
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
| 372 |
+
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
| 373 |
+
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
| 374 |
+
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
| 375 |
+
#else
|
| 376 |
+
NO_DEVICE_CODE;
|
| 377 |
+
#endif
|
| 378 |
+
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
template<typename dst_t>
|
| 382 |
+
static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 383 |
+
|
| 384 |
+
const int i = blockIdx.x;
|
| 385 |
+
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
| 386 |
+
|
| 387 |
+
const int tid = threadIdx.x;
|
| 388 |
+
#if QK_K == 256
|
| 389 |
+
const int il = tid/8; // 0...3
|
| 390 |
+
const int ib = tid%8; // 0...7
|
| 391 |
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 392 |
+
const uint16_t * q2 = x[i].qs + 4*ib;
|
| 393 |
+
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
|
| 394 |
+
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
| 395 |
+
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
| 396 |
+
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
| 397 |
+
#else
|
| 398 |
+
NO_DEVICE_CODE;
|
| 399 |
+
#endif
|
| 400 |
+
|
| 401 |
+
}
|
| 402 |
+
|
| 403 |
+
template<typename dst_t>
|
| 404 |
+
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 405 |
+
|
| 406 |
+
const int i = blockIdx.x;
|
| 407 |
+
const block_iq2_s * x = (const block_iq2_s *) vx;
|
| 408 |
+
|
| 409 |
+
const int tid = threadIdx.x;
|
| 410 |
+
#if QK_K == 256
|
| 411 |
+
const int il = tid/8; // 0...3
|
| 412 |
+
const int ib = tid%8; // 0...7
|
| 413 |
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 414 |
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
|
| 415 |
+
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
| 416 |
+
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
| 417 |
+
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
| 418 |
+
#else
|
| 419 |
+
NO_DEVICE_CODE;
|
| 420 |
+
#endif
|
| 421 |
+
|
| 422 |
+
}
|
| 423 |
+
|
| 424 |
+
template<typename dst_t>
|
| 425 |
+
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 426 |
+
|
| 427 |
+
const int i = blockIdx.x;
|
| 428 |
+
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
| 429 |
+
|
| 430 |
+
const int tid = threadIdx.x;
|
| 431 |
+
#if QK_K == 256
|
| 432 |
+
const int il = tid/8; // 0...3
|
| 433 |
+
const int ib = tid%8; // 0...7
|
| 434 |
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 435 |
+
const uint8_t * q3 = x[i].qs + 8*ib;
|
| 436 |
+
const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
|
| 437 |
+
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
|
| 438 |
+
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
|
| 439 |
+
const uint32_t aux32 = gas[0] | (gas[1] << 16);
|
| 440 |
+
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
|
| 441 |
+
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
| 442 |
+
for (int j = 0; j < 4; ++j) {
|
| 443 |
+
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
| 444 |
+
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
| 445 |
+
}
|
| 446 |
+
#else
|
| 447 |
+
NO_DEVICE_CODE;
|
| 448 |
+
#endif
|
| 449 |
+
|
| 450 |
+
}
|
| 451 |
+
|
| 452 |
+
template<typename dst_t>
|
| 453 |
+
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 454 |
+
|
| 455 |
+
const int i = blockIdx.x;
|
| 456 |
+
const block_iq3_s * x = (const block_iq3_s *) vx;
|
| 457 |
+
|
| 458 |
+
const int tid = threadIdx.x;
|
| 459 |
+
#if QK_K == 256
|
| 460 |
+
const int il = tid/8; // 0...3
|
| 461 |
+
const int ib = tid%8; // 0...7
|
| 462 |
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 463 |
+
const uint8_t * qs = x[i].qs + 8*ib;
|
| 464 |
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
|
| 465 |
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
|
| 466 |
+
const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
|
| 467 |
+
const uint8_t signs = x[i].signs[4*ib + il];
|
| 468 |
+
for (int j = 0; j < 4; ++j) {
|
| 469 |
+
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
| 470 |
+
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
| 471 |
+
}
|
| 472 |
+
#else
|
| 473 |
+
NO_DEVICE_CODE;
|
| 474 |
+
#endif
|
| 475 |
+
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
template<typename dst_t>
|
| 479 |
+
static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 480 |
+
|
| 481 |
+
const int i = blockIdx.x;
|
| 482 |
+
const block_iq1_s * x = (const block_iq1_s *) vx;
|
| 483 |
+
|
| 484 |
+
const int tid = threadIdx.x;
|
| 485 |
+
#if QK_K == 256
|
| 486 |
+
const int il = tid/8; // 0...3
|
| 487 |
+
const int ib = tid%8; // 0...7
|
| 488 |
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 489 |
+
const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
|
| 490 |
+
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
|
| 491 |
+
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
|
| 492 |
+
grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
|
| 493 |
+
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
|
| 494 |
+
grid32[0] &= 0x0f0f0f0f;
|
| 495 |
+
for (int j = 0; j < 8; ++j) {
|
| 496 |
+
y[j] = d * (q[j] + delta);
|
| 497 |
+
}
|
| 498 |
+
#else
|
| 499 |
+
NO_DEVICE_CODE;
|
| 500 |
+
#endif
|
| 501 |
+
|
| 502 |
+
}
|
| 503 |
+
|
| 504 |
+
template<typename dst_t>
|
| 505 |
+
static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 506 |
+
|
| 507 |
+
const int i = blockIdx.x;
|
| 508 |
+
const block_iq1_m * x = (const block_iq1_m *) vx;
|
| 509 |
+
|
| 510 |
+
const int tid = threadIdx.x;
|
| 511 |
+
#if QK_K == 256
|
| 512 |
+
const int il = tid/8; // 0...3
|
| 513 |
+
const int ib = tid%8; // 0...7
|
| 514 |
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 515 |
+
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
| 516 |
+
iq1m_scale_t scale;
|
| 517 |
+
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
| 518 |
+
const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
|
| 519 |
+
const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
|
| 520 |
+
const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
|
| 521 |
+
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
|
| 522 |
+
grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
|
| 523 |
+
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
|
| 524 |
+
grid32[0] &= 0x0f0f0f0f;
|
| 525 |
+
for (int j = 0; j < 8; ++j) {
|
| 526 |
+
y[j] = d * (q[j] + delta);
|
| 527 |
+
}
|
| 528 |
+
#else
|
| 529 |
+
NO_DEVICE_CODE;
|
| 530 |
+
#endif
|
| 531 |
+
|
| 532 |
+
}
|
| 533 |
+
|
| 534 |
+
|
| 535 |
+
template<typename dst_t>
|
| 536 |
+
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 537 |
+
|
| 538 |
+
const int i = blockIdx.x;
|
| 539 |
+
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
|
| 540 |
+
|
| 541 |
+
const int tid = threadIdx.x;
|
| 542 |
+
const int il = tid/8; // 0...3
|
| 543 |
+
const int ib = tid%8; // 0...7
|
| 544 |
+
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
| 545 |
+
const uint8_t * q4 = x[ib].qs + 4*il;
|
| 546 |
+
const float d = (float)x[ib].d;
|
| 547 |
+
for (int j = 0; j < 4; ++j) {
|
| 548 |
+
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
| 549 |
+
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
| 550 |
+
}
|
| 551 |
+
|
| 552 |
+
}
|
| 553 |
+
|
| 554 |
+
#if QK_K != 64
|
| 555 |
+
template<typename dst_t>
|
| 556 |
+
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 557 |
+
const int i = blockIdx.x;
|
| 558 |
+
const block_iq4_xs * x = (const block_iq4_xs *)vx;
|
| 559 |
+
|
| 560 |
+
const int tid = threadIdx.x;
|
| 561 |
+
const int il = tid/8; // 0...3
|
| 562 |
+
const int ib = tid%8; // 0...7
|
| 563 |
+
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
| 564 |
+
const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
|
| 565 |
+
const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
|
| 566 |
+
for (int j = 0; j < 4; ++j) {
|
| 567 |
+
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
| 568 |
+
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
| 569 |
+
}
|
| 570 |
+
}
|
| 571 |
+
#endif
|
| 572 |
+
|
| 573 |
+
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
| 574 |
+
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
| 575 |
+
const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
|
| 576 |
+
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
| 577 |
+
}
|
| 578 |
+
|
| 579 |
+
static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
|
| 580 |
+
const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
|
| 581 |
+
if (k % CUDA_Q8_0_NE_ALIGN == 0) {
|
| 582 |
+
const bool need_check = false;
|
| 583 |
+
dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
|
| 584 |
+
} else {
|
| 585 |
+
const bool need_check = true;
|
| 586 |
+
        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
    }
}

template<typename dst_t>
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
#else
    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template<typename dst_t>
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
#else
    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template<typename dst_t>
static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb32 = k / 32;
    const int nb = (k + 255) / 256;
    dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
}

template<typename dst_t>
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb32 = k / 32;
    const int nb = (k + 255) / 256;
    dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
}

template<typename dst_t>
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
#else
    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template<typename dst_t>
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
#else
    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template<typename dst_t>
static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
#if QK_K == 64
    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
#else
    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template <typename src_t, typename dst_t>
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    const src_t * x = (src_t *) vx;

    y[i] = x[i];
}

template <typename src_t, typename dst_t>
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}

to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
    int id;
    switch (type) {
        case GGML_TYPE_Q4_0:
            return dequantize_row_q4_0_cuda;
        case GGML_TYPE_Q4_1:
            return dequantize_row_q4_1_cuda;
        case GGML_TYPE_Q5_0:
            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
        case GGML_TYPE_Q5_1:
            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
        case GGML_TYPE_Q8_0:
            CUDA_CHECK(cudaGetDevice(&id));
            if (ggml_cuda_info().devices[id].cc >= CC_PASCAL) {
                return dequantize_block_q8_0_f16_cuda;
            }
            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
        case GGML_TYPE_Q2_K:
            return dequantize_row_q2_K_cuda;
        case GGML_TYPE_Q3_K:
            return dequantize_row_q3_K_cuda;
        case GGML_TYPE_Q4_K:
            return dequantize_row_q4_K_cuda;
        case GGML_TYPE_Q5_K:
            return dequantize_row_q5_K_cuda;
        case GGML_TYPE_Q6_K:
            return dequantize_row_q6_K_cuda;
        case GGML_TYPE_IQ2_XXS:
            return dequantize_row_iq2_xxs_cuda;
        case GGML_TYPE_IQ2_XS:
            return dequantize_row_iq2_xs_cuda;
        case GGML_TYPE_IQ2_S:
            return dequantize_row_iq2_s_cuda;
        case GGML_TYPE_IQ3_XXS:
            return dequantize_row_iq3_xxs_cuda;
        case GGML_TYPE_IQ1_S:
            return dequantize_row_iq1_s_cuda;
        case GGML_TYPE_IQ1_M:
            return dequantize_row_iq1_m_cuda;
        case GGML_TYPE_IQ4_NL:
            return dequantize_row_iq4_nl_cuda;
        case GGML_TYPE_IQ4_XS:
            return dequantize_row_iq4_xs_cuda;
        case GGML_TYPE_IQ3_S:
            return dequantize_row_iq3_s_cuda;
        case GGML_TYPE_F32:
            return convert_unary_cuda<float>;
        default:
            return nullptr;
    }
}

to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
            return dequantize_row_q4_0_cuda;
        case GGML_TYPE_Q4_1:
            return dequantize_row_q4_1_cuda;
        case GGML_TYPE_Q5_0:
            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
        case GGML_TYPE_Q5_1:
            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
        case GGML_TYPE_Q8_0:
            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
        case GGML_TYPE_Q2_K:
            return dequantize_row_q2_K_cuda;
        case GGML_TYPE_Q3_K:
            return dequantize_row_q3_K_cuda;
        case GGML_TYPE_Q4_K:
            return dequantize_row_q4_K_cuda;
        case GGML_TYPE_Q5_K:
            return dequantize_row_q5_K_cuda;
        case GGML_TYPE_Q6_K:
            return dequantize_row_q6_K_cuda;
        case GGML_TYPE_IQ2_XXS:
            return dequantize_row_iq2_xxs_cuda;
        case GGML_TYPE_IQ2_XS:
            return dequantize_row_iq2_xs_cuda;
        case GGML_TYPE_IQ2_S:
            return dequantize_row_iq2_s_cuda;
        case GGML_TYPE_IQ3_XXS:
            return dequantize_row_iq3_xxs_cuda;
        case GGML_TYPE_IQ1_S:
            return dequantize_row_iq1_s_cuda;
        case GGML_TYPE_IQ1_M:
            return dequantize_row_iq1_m_cuda;
        case GGML_TYPE_IQ4_NL:
            return dequantize_row_iq4_nl_cuda;
        case GGML_TYPE_IQ4_XS:
            return dequantize_row_iq4_xs_cuda;
        case GGML_TYPE_IQ3_S:
            return dequantize_row_iq3_s_cuda;
        case GGML_TYPE_F16:
            return convert_unary_cuda<half>;
        default:
            return nullptr;
    }
}
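The two lookups above only select a type-erased launcher; the caller still owns the device buffers and the stream. A minimal usage sketch, not part of this diff — the function name and the buffers d_q4/d_f16 are hypothetical, only the launcher API comes from the code above:

// Hypothetical usage sketch: convert a device buffer of Q4_0 blocks to FP16.
static void convert_q4_0_to_fp16(const void * d_q4, half * d_f16, int k, cudaStream_t stream) {
    const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(GGML_TYPE_Q4_0);
    if (to_fp16 == nullptr) {
        return; // this type has no FP16 conversion kernel
    }
    to_fp16(d_q4, d_f16, k, stream); // asynchronous; synchronize the stream before reading d_f16
}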
ggml-cuda/convert.cuh
ADDED
|
@@ -0,0 +1,13 @@
#include "common.cuh"

#define CUDA_DEQUANTIZE_BLOCK_SIZE 256

template<typename T>
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);

typedef to_t_cuda_t<float> to_fp32_cuda_t;
typedef to_t_cuda_t<half>  to_fp16_cuda_t;

to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);

to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
ggml-cuda/cpy.cu
ADDED
|
@@ -0,0 +1,461 @@
#include "cpy.cuh"

typedef void (*cpy_kernel_t)(const char * cx, char * cdst);

static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    float * dsti = (float *) cdsti;

    *dsti = *xi;
}

static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    half * dsti = (half *) cdsti;

    *dsti = __float2half(*xi);
}

static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
    const half * xi = (const half *) cxi;
    half * dsti = (half *) cdsti;

    *dsti = *xi;
}

static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
    const half * xi = (const half *) cxi;
    float * dsti = (float *) cdsti;

    *dsti = *xi;
}

template <cpy_kernel_t cpy_1>
static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
    const int nb12, const int nb13) {
    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= ne) {
        return;
    }

    // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
    // then combine those indices with the corresponding byte offsets to get the total offsets
    const int64_t i03 = i/(ne00 * ne01 * ne02);
    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
    const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;

    const int64_t i13 = i/(ne10 * ne11 * ne12);
    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;

    cpy_1(cx + x_offset, cdst + dst_offset);
}

static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q8_0 * dsti = (block_q8_0 *) cdsti;

    float amax = 0.0f; // absolute max

    for (int j = 0; j < QK8_0; j++) {
        const float v = xi[j];
        amax = fmaxf(amax, fabsf(v));
    }

    const float d = amax / ((1 << 7) - 1);
    const float id = d ? 1.0f/d : 0.0f;

    dsti->d = d;

    for (int j = 0; j < QK8_0; ++j) {
        const float x0 = xi[j]*id;

        dsti->qs[j] = roundf(x0);
    }
}

static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q4_0 * dsti = (block_q4_0 *) cdsti;

    float amax = 0.0f;
    float vmax = 0.0f;

    for (int j = 0; j < QK4_0; ++j) {
        const float v = xi[j];
        if (amax < fabsf(v)) {
            amax = fabsf(v);
            vmax = v;
        }
    }

    const float d  = vmax / -8;
    const float id = d ? 1.0f/d : 0.0f;

    dsti->d = d;

    for (int j = 0; j < QK4_0/2; ++j) {
        const float x0 = xi[0       + j]*id;
        const float x1 = xi[QK4_0/2 + j]*id;

        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));

        dsti->qs[j]  = xi0;
        dsti->qs[j] |= xi1 << 4;
    }
}

static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q4_1 * dsti = (block_q4_1 *) cdsti;

    float vmin = FLT_MAX;
    float vmax = -FLT_MAX;

    for (int j = 0; j < QK4_1; ++j) {
        const float v = xi[j];

        if (v < vmin) vmin = v;
        if (v > vmax) vmax = v;
    }

    const float d  = (vmax - vmin) / ((1 << 4) - 1);
    const float id = d ? 1.0f/d : 0.0f;

    dsti->dm.x = d;
    dsti->dm.y = vmin;

    for (int j = 0; j < QK4_1/2; ++j) {
        const float x0 = (xi[0       + j] - vmin)*id;
        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;

        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));

        dsti->qs[j]  = xi0;
        dsti->qs[j] |= xi1 << 4;
    }
}

static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q5_0 * dsti = (block_q5_0 *) cdsti;

    float amax = 0.0f;
    float vmax = 0.0f;

    for (int j = 0; j < QK5_0; ++j) {
        const float v = xi[j];
        if (amax < fabsf(v)) {
            amax = fabsf(v);
            vmax = v;
        }
    }

    const float d  = vmax / -16;
    const float id = d ? 1.0f/d : 0.0f;

    dsti->d = d;

    uint32_t qh = 0;
    for (int j = 0; j < QK5_0/2; ++j) {
        const float x0 = xi[0       + j]*id;
        const float x1 = xi[QK5_0/2 + j]*id;

        const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f));
        const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f));

        dsti->qs[j]  = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
    }
    memcpy(dsti->qh, &qh, sizeof(qh));
}

static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q5_1 * dsti = (block_q5_1 *) cdsti;

    float min = xi[0];
    float max = xi[0];

    for (int j = 1; j < QK5_1; ++j) {
        const float v = xi[j];
        min = v < min ? v : min;
        max = v > max ? v : max;
    }

    const float d  = (max - min) / 31;
    const float id = d ? 1.0f/d : 0.0f;

    dsti->dm.x = d;
    dsti->dm.y = min;

    uint32_t qh = 0;
    for (int j = 0; j < QK5_1/2; ++j) {
        const float x0 = (xi[0       + j] - min)*id;
        const float x1 = (xi[QK5_1/2 + j] - min)*id;

        const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
        const uint8_t xi1 = (uint8_t)(x1 + 0.5f);

        dsti->qs[j]  = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
    }
    memcpy(dsti->qh, &qh, sizeof(qh));
}


static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
    if (x <= val[0]) return 0;
    if (x >= val[n-1]) return n-1;
    int ml = 0, mu = n-1;
    while (mu-ml > 1) {
        int mav = (ml+mu)/2;
        if (x < val[mav]) mu = mav; else ml = mav;
    }
    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
}

static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_iq4_nl * dsti = (block_iq4_nl *) cdsti;

    float amax = 0.0f;
    float vmax = 0.0f;

    for (int j = 0; j < QK4_NL; ++j) {
        const float v = xi[j];
        if (amax < fabsf(v)) {
            amax = fabsf(v);
            vmax = v;
        }
    }

    float d = vmax / kvalues_iq4nl[0];
    const float id = d ? 1.0f/d : 0.0f;

    float sumqx = 0, sumq2 = 0;
    for (int j = 0; j < QK4_NL/2; ++j) {
        const float x0 = xi[0        + j]*id;
        const float x1 = xi[QK4_NL/2 + j]*id;
        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
        dsti->qs[j] = xi0 | (xi1 << 4);
        const float v0 = kvalues_iq4nl[xi0];
        const float v1 = kvalues_iq4nl[xi1];
        const float w0 = xi[0        + j]*xi[0        + j];
        const float w1 = xi[QK4_NL/2 + j]*xi[QK4_NL/2 + j];
        sumqx += w0*v0*xi[j] + w1*v1*xi[QK4_NL/2 + j];
        sumq2 += w0*v0*v0 + w1*v1*v1;
    }

    dsti->d = sumq2 > 0 ? sumqx/sumq2 : d;
}

template <cpy_kernel_t cpy_blck, int qk>
static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
    const int nb12, const int nb13) {
    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

    if (i >= ne) {
        return;
    }

    const int i03 = i/(ne00 * ne01 * ne02);
    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
    const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;

    const int i13 = i/(ne10 * ne11 * ne12);
    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
    const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;

    cpy_blck(cx + x_offset, cdst + dst_offset);
}

static void ggml_cpy_f16_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_f16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q8_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK8_0 == 0);
    const int num_blocks = ne / QK8_0;
    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q4_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_0 == 0);
    const int num_blocks = ne / QK4_0;
    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q4_1_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_1 == 0);
    const int num_blocks = ne / QK4_1;
    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q5_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK5_0 == 0);
    const int num_blocks = ne / QK5_0;
    cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q5_1_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK5_1 == 0);
    const int num_blocks = ne / QK5_1;
    cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_iq4_nl_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_NL == 0);
    const int num_blocks = ne / QK4_NL;
    cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f16_f16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
    const int64_t ne = ggml_nelements(src0);
    GGML_ASSERT(ne == ggml_nelements(src1));

    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];

    //GGML_ASSERT(src0->ne[3] == 1);

    const int64_t nb00 = src0->nb[0];
    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2];
    const int64_t nb03 = src0->nb[3];

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];

    //GGML_ASSERT(src1->ne[3] == 1);

    const int64_t nb10 = src1->nb[0];
    const int64_t nb11 = src1->nb[1];
    const int64_t nb12 = src1->nb[2];
    const int64_t nb13 = src1->nb[3];

    cudaStream_t main_stream = ctx.stream();

    char * src0_ddc = (char *) src0->data;
    char * src1_ddc = (char *) src1->data;

    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else {
        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
        GGML_ASSERT(false);
    }
}

void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    ggml_cuda_cpy(ctx, src0, dst);
}
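For reference, the per-block math in cpy_blck_f32_q8_0 above can be checked on the host. The sketch below is an illustration only, not part of the diff; the function name quantize_block_q8_0_ref is hypothetical and it assumes QK8_0 == 32 as in ggml:

// Host-side sketch of the Q8_0 scale/round scheme used by cpy_blck_f32_q8_0 (illustration only).
#include <math.h>
#include <stdint.h>

static void quantize_block_q8_0_ref(const float * x, float * d_out, int8_t * qs_out) {
    float amax = 0.0f;
    for (int j = 0; j < 32; ++j) {
        amax = fmaxf(amax, fabsf(x[j]));
    }
    const float d  = amax / 127.0f;           // same as amax / ((1 << 7) - 1)
    const float id = d ? 1.0f/d : 0.0f;
    *d_out = d;
    for (int j = 0; j < 32; ++j) {
        qs_out[j] = (int8_t) roundf(x[j]*id); // dequantize later as qs_out[j]*d
    }
}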
ggml-cuda/cpy.cuh
ADDED
|
@@ -0,0 +1,7 @@
#include "common.cuh"

#define CUDA_CPY_BLOCK_SIZE 32

void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);

void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml-cuda/dequantize.cuh
ADDED
|
@@ -0,0 +1,103 @@
#include "common.cuh"

static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q4_0 * x = (const block_q4_0 *) vx;

    const dfloat d = x[ib].d;

    const int vui = x[ib].qs[iqs];

    v.x = vui & 0xF;
    v.y = vui >> 4;

#ifdef GGML_CUDA_F16
    v = __hsub2(v, {8.0f, 8.0f});
    v = __hmul2(v, {d, d});
#else
    v.x = (v.x - 8.0f) * d;
    v.y = (v.y - 8.0f) * d;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q4_1 * x = (const block_q4_1 *) vx;

    const dfloat d = __low2half(x[ib].dm);
    const dfloat m = __high2half(x[ib].dm);

    const int vui = x[ib].qs[iqs];

    v.x = vui & 0xF;
    v.y = vui >> 4;

#ifdef GGML_CUDA_F16
    v = __hmul2(v, {d, d});
    v = __hadd2(v, {m, m});
#else
    v.x = (v.x * d) + m;
    v.y = (v.y * d) + m;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q5_0 * x = (const block_q5_0 *) vx;

    const dfloat d = x[ib].d;

    uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));

    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;

    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);

#ifdef GGML_CUDA_F16
    v = __hsub2(v, {16.0f, 16.0f});
    v = __hmul2(v, {d, d});
#else
    v.x = (v.x - 16.0f) * d;
    v.y = (v.y - 16.0f) * d;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q5_1 * x = (const block_q5_1 *) vx;

    const dfloat d = __low2half(x[ib].dm);
    const dfloat m = __high2half(x[ib].dm);

    uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));

    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;

    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);

#ifdef GGML_CUDA_F16
    v = __hmul2(v, {d, d});
    v = __hadd2(v, {m, m});
#else
    v.x = (v.x * d) + m;
    v.y = (v.y * d) + m;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q8_0 * x = (const block_q8_0 *) vx;

    const dfloat d = x[ib].d;

    v.x = x[ib].qs[iqs + 0];
    v.y = x[ib].qs[iqs + 1];

#ifdef GGML_CUDA_F16
    v = __hmul2(v, {d, d});
#else
    v.x *= d;
    v.y *= d;
#endif // GGML_CUDA_F16
}
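As a sanity check on dequantize_q4_0 above: each byte packs two 4-bit quants, and the FP32 branch recovers each as (q - 8) * d. A worked example with made-up values (d = 0.5, packed byte 0xC3), illustration only:

// Worked example for the FP32 branch of dequantize_q4_0 (illustration only, values are made up).
const float d   = 0.5f;
const int   vui = 0xC3;                      // packed byte: low nibble 3, high nibble 12
const float v_x = ((vui & 0xF) - 8.0f) * d;  // (3  - 8) * 0.5 = -2.5
const float v_y = ((vui >>  4) - 8.0f) * d;  // (12 - 8) * 0.5 =  2.0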
ggml-cuda/diagmask.cu
ADDED
|
@@ -0,0 +1,40 @@
#include "diagmask.cuh"

static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
    const int col = blockDim.y*blockIdx.y + threadIdx.y;
    const int row = blockDim.x*blockIdx.x + threadIdx.x;

    if (col >= ncols) {
        return;
    }

    const int i = row*ncols + col;
    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
}

static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
    const dim3 block_nums(nrows_x, block_num_x, 1);
    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
}

void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int nrows0 = ggml_nrows(src0);

    const int n_past = ((int32_t *) dst->op_params)[0];

    diag_mask_inf_f32_cuda(src0_d, dst_d, ne00, nrows0, ne01, n_past, stream);
}
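The kernel above applies the causal mask by subtracting FLT_MAX rather than storing -INFINITY directly, as its own comments note. A host-side restatement of the per-element rule, illustration only; the function name masked_value is hypothetical:

// Per-element rule implemented by diag_mask_inf_f32 (host restatement, illustration only).
#include <float.h>

static float masked_value(float x, int col, int row, int rows_per_channel, int n_past) {
    const int masked = col > n_past + row % rows_per_channel;
    return x - masked * FLT_MAX; // matches dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX
}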
ggml-cuda/diagmask.cuh
ADDED
|
@@ -0,0 +1,5 @@
#include "common.cuh"

#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32

void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml-cuda/dmmv.cu
ADDED
|
@@ -0,0 +1,817 @@
#include "dmmv.cuh"
#include "dequantize.cuh"
#include "convert.cuh"

#ifndef GGML_CUDA_MMV_Y
#define GGML_CUDA_MMV_Y 1
#endif

#ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 2
#else
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif

static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
| 16 |
+
|
| 17 |
+
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
| 18 |
+
|
| 19 |
+
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
| 20 |
+
if (row > nrows) return;
|
| 21 |
+
|
| 22 |
+
const int num_blocks_per_row = ncols / QK_K;
|
| 23 |
+
const int ib0 = row*num_blocks_per_row;
|
| 24 |
+
|
| 25 |
+
const block_q2_K * x = (const block_q2_K *)vx + ib0;
|
| 26 |
+
|
| 27 |
+
float tmp = 0; // partial sum for thread in warp
|
| 28 |
+
|
| 29 |
+
#if QK_K == 256
|
| 30 |
+
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
| 31 |
+
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
| 32 |
+
|
| 33 |
+
const int step = 16/K_QUANTS_PER_ITERATION;
|
| 34 |
+
|
| 35 |
+
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
| 36 |
+
const int in = tid - step*im; // 0...15 or 0...7
|
| 37 |
+
|
| 38 |
+
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
|
| 39 |
+
const int q_offset = 32*im + l0;
|
| 40 |
+
const int s_offset = 8*im;
|
| 41 |
+
const int y_offset = 128*im + l0;
|
| 42 |
+
|
| 43 |
+
uint32_t aux[4];
|
| 44 |
+
const uint8_t * d = (const uint8_t *)aux;
|
| 45 |
+
const uint8_t * m = (const uint8_t *)(aux + 2);
|
| 46 |
+
|
| 47 |
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
| 48 |
+
|
| 49 |
+
const float * y = yy + i * QK_K + y_offset;
|
| 50 |
+
const uint8_t * q = x[i].qs + q_offset;
|
| 51 |
+
|
| 52 |
+
const float dall = __low2half(x[i].dm);
|
| 53 |
+
const float dmin = __high2half(x[i].dm);
|
| 54 |
+
|
| 55 |
+
const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
|
| 56 |
+
aux[0] = a[0] & 0x0f0f0f0f;
|
| 57 |
+
aux[1] = a[1] & 0x0f0f0f0f;
|
| 58 |
+
aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
|
| 59 |
+
aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
|
| 60 |
+
|
| 61 |
+
float sum1 = 0, sum2 = 0;
|
| 62 |
+
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
| 63 |
+
sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
|
| 64 |
+
+ y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
|
| 65 |
+
+ y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
|
| 66 |
+
+ y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
|
| 67 |
+
+ y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
|
| 68 |
+
+ y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
|
| 69 |
+
+ y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
|
| 70 |
+
+y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
|
| 71 |
+
sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
|
| 72 |
+
+ y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
|
| 73 |
+
|
| 74 |
+
}
|
| 75 |
+
tmp += dall * sum1 - dmin * sum2;
|
| 76 |
+
|
| 77 |
+
}
|
| 78 |
+
#else
|
| 79 |
+
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
| 80 |
+
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
| 81 |
+
const int offset = tid * K_QUANTS_PER_ITERATION;
|
| 82 |
+
|
| 83 |
+
uint32_t uaux[2];
|
| 84 |
+
const uint8_t * d = (const uint8_t *)uaux;
|
| 85 |
+
|
| 86 |
+
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
| 87 |
+
|
| 88 |
+
const float * y = yy + i * QK_K + offset;
|
| 89 |
+
const uint8_t * q = x[i].qs + offset;
|
| 90 |
+
const uint32_t * s = (const uint32_t *)x[i].scales;
|
| 91 |
+
|
| 92 |
+
uaux[0] = s[0] & 0x0f0f0f0f;
|
| 93 |
+
uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
|
| 94 |
+
|
| 95 |
+
const float2 dall = __half22float2(x[i].dm);
|
| 96 |
+
|
| 97 |
+
float sum1 = 0, sum2 = 0;
|
| 98 |
+
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
| 99 |
+
const uint8_t ql = q[l];
|
| 100 |
+
sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
|
| 101 |
+
+ y[l+16] * d[1] * ((ql >> 2) & 3)
|
| 102 |
+
+ y[l+32] * d[2] * ((ql >> 4) & 3)
|
| 103 |
+
+ y[l+48] * d[3] * ((ql >> 6) & 3);
|
| 104 |
+
sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
|
| 105 |
+
}
|
| 106 |
+
tmp += dall.x * sum1 - dall.y * sum2;
|
| 107 |
+
}
|
| 108 |
+
#endif
|
| 109 |
+
|
| 110 |
+
// sum up partial sums and write back result
|
| 111 |
+
tmp = warp_reduce_sum(tmp);
|
| 112 |
+
|
| 113 |
+
if (threadIdx.x == 0) {
|
| 114 |
+
dst[row] = tmp;
|
| 115 |
+
}
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
| 119 |
+
|
| 120 |
+
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
| 121 |
+
if (row > nrows) return;
|
| 122 |
+
|
| 123 |
+
const int num_blocks_per_row = ncols / QK_K;
|
| 124 |
+
const int ib0 = row*num_blocks_per_row;
|
| 125 |
+
|
| 126 |
+
const block_q3_K * x = (const block_q3_K *)vx + ib0;
|
| 127 |
+
|
| 128 |
+
float tmp = 0; // partial sum for thread in warp
|
| 129 |
+
|
| 130 |
+
#if QK_K == 256
|
| 131 |
+
|
| 132 |
+
const uint16_t kmask1 = 0x0303;
|
| 133 |
+
const uint16_t kmask2 = 0x0f0f;
|
| 134 |
+
|
| 135 |
+
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
| 136 |
+
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
| 137 |
+
|
| 138 |
+
const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
|
| 139 |
+
const int step = 16/K_QUANTS_PER_ITERATION;
|
| 140 |
+
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
| 141 |
+
const int in = tid - step*im; // 0....15 or 0...7
|
| 142 |
+
|
| 143 |
+
const uint8_t m = 1 << (4*im);
|
| 144 |
+
|
| 145 |
+
const int l0 = n*in; // 0...15 or 0...14 in steps of 2
|
| 146 |
+
const int q_offset = 32*im + l0;
|
| 147 |
+
const int y_offset = 128*im + l0;
|
| 148 |
+
|
| 149 |
+
uint16_t utmp[4];
|
| 150 |
+
const int8_t * s = (const int8_t *)utmp;
|
| 151 |
+
|
| 152 |
+
const uint16_t s_shift = 4*im;
|
| 153 |
+
|
| 154 |
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
| 155 |
+
|
| 156 |
+
const float * y = yy + i * QK_K + y_offset;
|
| 157 |
+
const uint8_t * q = x[i].qs + q_offset;
|
| 158 |
+
const uint8_t * h = x[i].hmask + l0;
|
| 159 |
+
|
| 160 |
+
const uint16_t * a = (const uint16_t *)x[i].scales;
|
| 161 |
+
utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
|
| 162 |
+
utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
|
| 163 |
+
utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
|
| 164 |
+
utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
|
| 165 |
+
|
| 166 |
+
const float d = x[i].d;
|
| 167 |
+
|
| 168 |
+
float sum = 0;
|
| 169 |
+
for (int l = 0; l < n; ++l) {
|
| 170 |
+
sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
|
| 171 |
+
+ y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
|
| 172 |
+
+ y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
|
| 173 |
+
+ y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
|
| 174 |
+
sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
|
| 175 |
+
+ y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
|
| 176 |
+
+ y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
|
| 177 |
+
+ y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
|
| 178 |
+
}
|
| 179 |
+
tmp += d * sum;
|
| 180 |
+
|
| 181 |
+
}
|
| 182 |
+
#else
|
| 183 |
+
|
| 184 |
+
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
| 185 |
+
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
| 186 |
+
const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
|
| 187 |
+
const int in = offset/8; // 0 or 1
|
| 188 |
+
const int im = offset%8; // 0...7
|
| 189 |
+
|
| 190 |
+
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
| 191 |
+
|
| 192 |
+
const float * y = yy + i * QK_K + offset;
|
| 193 |
+
const uint8_t * q = x[i].qs + offset;
|
| 194 |
+
const uint8_t * s = x[i].scales;
|
| 195 |
+
|
| 196 |
+
const float dall = (float)x[i].d;
|
| 197 |
+
|
| 198 |
+
float sum = 0;
|
| 199 |
+
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
| 200 |
+
const uint8_t hl = x[i].hmask[im+l] >> in;
|
| 201 |
+
const uint8_t ql = q[l];
|
                 sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
                      + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
                      + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
                      + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
        }
        tmp += sum;
    }
#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (threadIdx.x == 0) {
        dst[row] = tmp;
    }
}

static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row > nrows) return;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q4_K * x = (const block_q4_K *)vx + ib0;

#if QK_K == 256
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;

    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1

    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4

    const int il  = tid/step;                            // 0...3
    const int ir  = tid - step*il;                       // 0...7 or 0...3
    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4

    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
    const int in = il%2;

    const int l0 = n*(2*ir + in);
    const int q_offset = 32*im + l0;
    const int y_offset = 64*im + l0;

    uint16_t aux[4];
    const uint8_t * sc = (const uint8_t *)aux;

#if K_QUANTS_PER_ITERATION == 2
    uint32_t q32[4];
    const uint8_t * q4 = (const uint8_t *)q32;
#else
    uint16_t q16[4];
    const uint8_t * q4 = (const uint8_t *)q16;
#endif

    float tmp = 0; // partial sum for thread in warp

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        const float * y1 = yy + i*QK_K + y_offset;
        const float * y2 = y1 + 128;

        const float dall = __low2half(x[i].dm);
        const float dmin = __high2half(x[i].dm);

        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux[0] = a[im+0] & kmask1;
        aux[1] = a[im+2] & kmask1;
        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

#if K_QUANTS_PER_ITERATION == 2
        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
        const uint32_t * q2 = q1 + 16;

        q32[0] = q1[0] & 0x0f0f0f0f;
        q32[1] = q1[0] & 0xf0f0f0f0;
        q32[2] = q2[0] & 0x0f0f0f0f;
        q32[3] = q2[0] & 0xf0f0f0f0;

        float4 s = {0.f, 0.f, 0.f, 0.f};
        float smin = 0;
        for (int l = 0; l < 4; ++l) {
            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
        }
        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
#else
        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
        const uint16_t * q2 = q1 + 32;

        q16[0] = q1[0] & 0x0f0f;
        q16[1] = q1[0] & 0xf0f0;
        q16[2] = q2[0] & 0x0f0f;
        q16[3] = q2[0] & 0xf0f0;

        float4 s = {0.f, 0.f, 0.f, 0.f};
        float smin = 0;
        for (int l = 0; l < 2; ++l) {
            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
        }
        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
#endif

    }
#else
    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);

    const int step = tid * K_QUANTS_PER_ITERATION;

    uint16_t aux16[2];
    const uint8_t * s = (const uint8_t *)aux16;

    float tmp = 0;

    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
        const uint8_t * q = x[i].qs + step;
        const float   * y = yy + i*QK_K + step;
        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux16[0] = a[0] & 0x0f0f;
        aux16[1] = (a[0] >> 4) & 0x0f0f;
        const float d = (float)x[i].dm[0];
        const float m = (float)x[i].dm[1];
        float sum = 0.f;
        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
        }
        tmp += sum;
    }

#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (tid == 0) {
        dst[row] = tmp;
    }
}

static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {

    const int row = blockIdx.x;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q5_K * x = (const block_q5_K *)vx + ib0;

    float tmp = 0; // partial sum for thread in warp

#if QK_K == 256
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;

    const int tid = threadIdx.x/2;  // 0...15
    const int ix  = threadIdx.x%2;

    const int il  = tid/4;     // 0...3
    const int ir  = tid - 4*il;// 0...3
    const int n   = 2;

    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
    const int in = il%2;

    const int l0 = n*(2*ir + in);
    const int q_offset = 32*im + l0;
    const int y_offset = 64*im + l0;

    const uint8_t hm1 = 1 << (2*im);
    const uint8_t hm2 = hm1 << 4;

    uint16_t aux[4];
    const uint8_t * sc = (const uint8_t *)aux;

    uint16_t q16[8];
    const uint8_t * q4 = (const uint8_t *)q16;

    for (int i = ix; i < num_blocks_per_row; i += 2) {

        const uint8_t * ql1 = x[i].qs + q_offset;
        const uint8_t * qh  = x[i].qh + l0;
        const float   * y1  = yy + i*QK_K + y_offset;
        const float   * y2  = y1 + 128;

        const float dall = __low2half(x[i].dm);
        const float dmin = __high2half(x[i].dm);

        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux[0] = a[im+0] & kmask1;
        aux[1] = a[im+2] & kmask1;
        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

        float4 sum = {0.f, 0.f, 0.f, 0.f};
        float smin = 0;
        const uint16_t * q1 = (const uint16_t *)ql1;
        const uint16_t * q2 = q1 + 32;
        q16[0] = q1[0] & 0x0f0f;
        q16[1] = q1[8] & 0x0f0f;
        q16[2] = (q1[0] >> 4) & 0x0f0f;
        q16[3] = (q1[8] >> 4) & 0x0f0f;
        q16[4] = q2[0] & 0x0f0f;
        q16[5] = q2[8] & 0x0f0f;
        q16[6] = (q2[0] >> 4) & 0x0f0f;
        q16[7] = (q2[8] >> 4) & 0x0f0f;
        for (int l = 0; l < n; ++l) {
            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
        }
        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
    }

#else
    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
    const int step = tid * K_QUANTS_PER_ITERATION;
    const int im = step/8;
    const int in = step%8;

    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
        const uint8_t * q = x[i].qs + step;
        const int8_t  * s = x[i].scales;
        const float   * y = yy + i*QK_K + step;
        const float     d = x[i].d;
        float sum = 0.f;
        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
            const uint8_t h = x[i].qh[in+j] >> im;
            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
        }
        tmp += sum;
    }
#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (threadIdx.x == 0) {
        dst[row] = tmp;
    }
}

static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row > nrows) return;

    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q6_K * x = (const block_q6_K *)vx + ib0;

#if QK_K == 256

    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1

    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8

    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;                        // 0...15 or 0...7

#if K_QUANTS_PER_ITERATION == 1
    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
    const int is = 0;
#else
    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
    const int is = in / 4;
#endif
    const int ql_offset = 64*im + l0;
    const int qh_offset = 32*im + l0;
    const int s_offset  =  8*im + is;
    const int y_offset  = 128*im + l0;

    float tmp = 0; // partial sum for thread in warp

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        const float   * y  = yy + i * QK_K + y_offset;
        const uint8_t * ql = x[i].ql + ql_offset;
        const uint8_t * qh = x[i].qh + qh_offset;
        const int8_t  * s  = x[i].scales + s_offset;

        const float d = x[i].d;

#if K_QUANTS_PER_ITERATION == 1
        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
        tmp += sum;
#else
        float sum = 0;
        for (int l = 0; l < 4; ++l) {
            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
        }
        tmp += sum;
#endif

    }

#else

    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3

    const int step = tid * K_QUANTS_PER_ITERATION;

    float tmp = 0; // partial sum for thread in warp

    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {

        const float   * y  = yy + i * QK_K + step;
        const uint8_t * ql = x[i].ql + step;
        const uint8_t * qh = x[i].qh + step;
        const int8_t  * s  = x[i].scales;

        const float d = x[i+0].d;

        float sum = 0;
        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
        }
        tmp += sum;

    }

#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (tid == 0) {
        dst[row] = tmp;
    }
}

static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const half * x = (const half *) vx;

    // automatic half -> float type cast if dfloat == float
    v.x = x[ib + iqs + 0];
    v.y = x[ib + iqs + 1];
}

template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
    // qk = quantized weights per x block
    // qr = number of quantized weights per data value in x block
    const int row = blockIdx.x*blockDim.y + threadIdx.y;

    if (row >= nrows) {
        return;
    }

    const int tid = threadIdx.x;

    const int iter_stride = 2*GGML_CUDA_DMMV_X;
    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
    const int y_offset = qr == 1 ? 1 : qk/2;

// partial sum for each thread
#ifdef GGML_CUDA_F16
    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
#else
    float tmp = 0.0f;
#endif // GGML_CUDA_F16

    for (int i = 0; i < ncols; i += iter_stride) {
        const int col = i + vals_per_iter*tid;
        const int ib = (row*ncols + col)/qk; // x block index
        const int iqs = (col%qk)/qr; // x quant index
        const int iybs = col - col%qk; // y block start index

// processing >2 values per i iter is faster for fast GPUs
#pragma unroll
        for (int j = 0; j < vals_per_iter; j += 2) {
            // process 2 vals per j iter

            // dequantize
            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
            dfloat2 v;
            dequantize_kernel(vx, ib, iqs + j/qr, v);

            // matrix multiplication
            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
#ifdef GGML_CUDA_F16
            tmp += __hmul2(v, {
                y[iybs + iqs + j/qr + 0],
                y[iybs + iqs + j/qr + y_offset]
            });
#else
            tmp += v.x * y[iybs + iqs + j/qr + 0];
            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
#endif // GGML_CUDA_F16
        }
    }

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (tid == 0) {
#ifdef GGML_CUDA_F16
        dst[row] = tmp.x + tmp.y;
#else
        dst[row] = tmp;
#endif // GGML_CUDA_F16
    }
}

static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const dim3 block_dims(32, 1, 1);
    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
}

static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<1, 1, convert_f16>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

void ggml_cuda_op_dequantize_mul_mat_vec(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {
    GGML_UNUSED(ctx);
    const int64_t ne00 = src0->ne[0];
    const int64_t row_diff = row_high - row_low;

    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
#ifdef GGML_CUDA_F16
    ggml_cuda_pool_alloc<half> src1_dfloat_a(ctx.pool());
    half * src1_dfloat = nullptr; // dfloat == half

    bool src1_convert_f16 =
        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;

    if (src1_convert_f16) {
        src1_dfloat = src1_dfloat_a.alloc(ne00);
        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
        GGML_ASSERT(to_fp16_cuda != nullptr);
        to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
    }
#else
    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
#endif // GGML_CUDA_F16

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q4_1:
            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q5_0:
            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q5_1:
            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q8_0:
            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q2_K:
            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q3_K:
            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q4_K:
            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q5_K:
            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q6_K:
            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_F16:
            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        default:
            GGML_ASSERT(false);
            break;
    }

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddq_i);
    GGML_UNUSED(src1_ncols);
    GGML_UNUSED(src1_padded_row_size);
}
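All of the dmmv kernels above finish the same way: each thread accumulates a partial dot product in `tmp`, the warp collapses those partials with `warp_reduce_sum`, and lane 0 writes `dst[row]`. `warp_reduce_sum` itself lives in ggml-cuda/common.cuh rather than in this file; the sketch below is only an illustration of the butterfly reduction it performs, assuming a 32-lane warp, and the helper name is chosen here for clarity.

// Illustrative sketch only (not part of this diff): a shuffle-based butterfly
// reduction equivalent to what warp_reduce_sum does for a float partial sum.
static __device__ __forceinline__ float warp_reduce_sum_sketch(float x) {
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, 32); // add the partner lane's partial
    }
    return x; // every lane now holds the full warp sum; lane 0 stores dst[row]
}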
ggml-cuda/dmmv.cuh
ADDED
@@ -0,0 +1,7 @@
#include "common.cuh"

void ggml_cuda_op_dequantize_mul_mat_vec(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream);
ggml-cuda/getrows.cu
ADDED
@@ -0,0 +1,178 @@
#include "getrows.cuh"
#include "dequantize.cuh"

template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void k_get_rows(
            const void * src0, const int32_t * src1, dst_t * dst,
            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {

    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;

    if (i00 >= ne00) {
        return;
    }

    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;

    const int ib = i00/qk; // block index
    const int iqs = (i00%qk)/qr; // quant index
    const int iybs = i00 - i00%qk; // dst block start index
    const int y_offset = qr == 1 ? 1 : qk/2;

    // dequantize
    dfloat2 v;
    dequantize_kernel(src0_row, ib, iqs, v);

    dst_row[iybs + iqs + 0]        = v.x;
    dst_row[iybs + iqs + y_offset] = v.y;
}

template<typename src0_t, typename dst_t>
static __global__ void k_get_rows_float(
            const src0_t * src0, const int32_t * src1, dst_t * dst,
            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {

    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;

    if (i00 >= ne00) {
        return;
    }

    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);

    dst_row[i00] = src0_row[i00];
}

template<int qk, int qr, dequantize_kernel_t dq>
static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {

    GGML_TENSOR_BINARY_OP_LOCALS

    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
    const dim3 block_nums(block_num_x, ne10, ne11*ne12);

    // strides in elements
    //const size_t s0 = nb0 / ggml_element_size(dst);
    const size_t s1 = nb1 / ggml_element_size(dst);
    const size_t s2 = nb2 / ggml_element_size(dst);
    const size_t s3 = nb3 / ggml_element_size(dst);

    const size_t s10 = nb10 / ggml_element_size(src1);
    const size_t s11 = nb11 / ggml_element_size(src1);
    const size_t s12 = nb12 / ggml_element_size(src1);
    //const size_t s13 = nb13 / ggml_element_size(src1);

    GGML_ASSERT(ne00 % 2 == 0);

    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
            src0_dd, src1_dd, dst_dd,
            ne00, /*ne01, ne02, ne03,*/
            /*ne10, ne11,*/ ne12, /*ne13,*/
            /* s0,*/ s1, s2, s3,
            /* nb00,*/ nb01, nb02, nb03,
            s10, s11, s12/*, s13*/);

    GGML_UNUSED(dst);
}

template<typename src0_t>
static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {

    GGML_TENSOR_BINARY_OP_LOCALS

    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
    const dim3 block_nums(block_num_x, ne10, ne11*ne12);

    // strides in elements
    //const size_t s0 = nb0 / ggml_element_size(dst);
    const size_t s1 = nb1 / ggml_element_size(dst);
    const size_t s2 = nb2 / ggml_element_size(dst);
    const size_t s3 = nb3 / ggml_element_size(dst);

    const size_t s10 = nb10 / ggml_element_size(src1);
    const size_t s11 = nb11 / ggml_element_size(src1);
    const size_t s12 = nb12 / ggml_element_size(src1);
    //const size_t s13 = nb13 / ggml_element_size(src1);

    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
            src0_dd, src1_dd, dst_dd,
            ne00, /*ne01, ne02, ne03,*/
            /*ne10, ne11,*/ ne12, /*ne13,*/
            /* s0,*/ s1, s2, s3,
            /* nb00,*/ nb01, nb02, nb03,
            s10, s11, s12/*, s13*/);

    GGML_UNUSED(dst);
}

void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const float * src0_d = (const float *)src0->data;
    const float * src1_d = (const float *)src1->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();


    GGML_ASSERT(src1->type == GGML_TYPE_I32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));

    const int32_t * src1_i32 = (const int32_t *) src1_d;

    switch (src0->type) {
        case GGML_TYPE_F16:
            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_F32:
            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q4_0:
            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q4_1:
            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q5_0:
            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q5_1:
            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q8_0:
            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        default:
            // TODO: k-quants
            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
            GGML_ASSERT(false);
            break;
    }
}
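Stripped of the quantization and stride handling, ggml_cuda_op_get_rows is a row gather: src1 holds int32 row indices and every output row is a (possibly dequantized) copy of the selected src0 row. A host-side reference sketch for contiguous F32 data, with illustrative names and not taken from this diff, is:

// Illustrative reference (not part of this diff) for contiguous F32 tensors:
// dst[r, :] = src0[src1[r], :]
static void get_rows_reference_f32(const float * src0, const int32_t * src1, float * dst,
                                   int64_t n_indices, int64_t ne00) {
    for (int64_t r = 0; r < n_indices; ++r) {
        const int64_t i01 = src1[r];                  // row of src0 to gather
        for (int64_t c = 0; c < ne00; ++c) {
            dst[r*ne00 + c] = src0[i01*ne00 + c];     // quantized types dequantize here instead of copying
        }
    }
}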
ggml-cuda/getrows.cuh
ADDED
@@ -0,0 +1,5 @@
#include "common.cuh"

#define CUDA_GET_ROWS_BLOCK_SIZE 256

void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml-cuda/im2col.cu
ADDED
@@ -0,0 +1,104 @@
#include "im2col.cuh"

template <typename T>
static __global__ void im2col_kernel(
        const float * x, T * dst, int64_t batch_offset,
        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
        int s0, int s1, int p0, int p1, int d0, int d1) {
    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i >= pelements) {
        return;
    }

    const int64_t ksize = OW * (KH > 1 ? KW : 1);
    const int64_t kx = i / ksize;
    const int64_t kd = kx * ksize;
    const int64_t ky = (i - kd) / OW;
    const int64_t ix = i % OW;

    const int64_t oh = blockIdx.y;
    const int64_t batch = blockIdx.z / IC;
    const int64_t ic = blockIdx.z % IC;

    const int64_t iiw = ix * s0 + kx * d0 - p0;
    const int64_t iih = oh * s1 + ky * d1 - p1;

    const int64_t offset_dst =
        ((batch * OH + oh) * OW + ix) * CHW +
        (ic * (KW * KH) + ky * KW + kx);

    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
        dst[offset_dst] = 0.0f;
    } else {
        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
        dst[offset_dst] = x[offset_src + iih * IW + iiw];
    }
}

template <typename T>
static void im2col_cuda(const float * x, T* dst,
    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
    int64_t batch, int64_t batch_offset, int64_t offset_delta,
    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
    const int parallel_elements = OW * KW * KH;
    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
    dim3 block_nums(num_blocks, OH, batch * IC);
    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
}

static void im2col_cuda_f16(const float * x, half * dst,
    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
    int64_t batch, int64_t batch_offset, int64_t offset_delta,
    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {

    im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
}

static void im2col_cuda_f32(const float * x, float * dst,
    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
    int64_t batch, int64_t batch_offset, int64_t offset_delta,
    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {

    im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
}

void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const float * src1_d = (const float *)src1->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];

    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;

    const int64_t IC = src1->ne[is_2D ? 2 : 1];
    const int64_t IH = is_2D ? src1->ne[1] : 1;
    const int64_t IW =         src1->ne[0];

    const int64_t KH = is_2D ? src0->ne[1] : 1;
    const int64_t KW =         src0->ne[0];

    const int64_t OH = is_2D ? dst->ne[2] : 1;
    const int64_t OW =         dst->ne[1];

    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
    const int64_t batch = src1->ne[3];
    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32

    if(dst->type == GGML_TYPE_F16) {
        im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
    } else {
        im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
    }
}
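ggml_cuda_op_im2col reads OW and OH straight from dst->ne, so the output extents are fixed by the graph before the kernel runs. For reference, they correspond to the usual convolution output-size formula; the helper below is only an illustrative sketch with names chosen here, not code from this diff:

// Illustrative sketch (not part of this diff): the standard conv output size
// for input size ins, kernel size ks, stride s, padding p and dilation d.
static int64_t conv_output_size_sketch(int64_t ins, int64_t ks, int s, int p, int d) {
    return (ins + 2*p - d*(ks - 1) - 1) / s + 1;
}
// e.g. OW = conv_output_size_sketch(IW, KW, s0, p0, d0) and, in the 2D case,
//      OH = conv_output_size_sketch(IH, KH, s1, p1, d1)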