Spaces:
Running
Running
slaren
committed on
ggml : add GGML_CUDA_USE_GRAPHS option, restore GGML_CUDA_FORCE_CUBLAS (cmake) (llama/8140)
Browse files- ggml/CMakeLists.txt +2 -0
- ggml/src/CMakeLists.txt +4 -1
ggml/CMakeLists.txt
CHANGED
|
@@ -109,6 +109,7 @@ option(GGML_LLAMAFILE "ggml: use ggml SGEMM"
|
|
| 109 |
option(GGML_CUDA "ggml: use CUDA" OFF)
|
| 110 |
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
|
| 111 |
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
|
|
|
|
| 112 |
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
|
| 113 |
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
|
| 114 |
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
|
|
@@ -119,6 +120,7 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
|
| 119 |
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
|
| 120 |
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
|
| 121 |
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
|
|
|
| 122 |
|
| 123 |
option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
|
| 124 |
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
|
|
|
|
| 109 |
option(GGML_CUDA "ggml: use CUDA" OFF)
|
| 110 |
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
|
| 111 |
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
|
| 112 |
+
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
|
| 113 |
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
|
| 114 |
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
|
| 115 |
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
|
|
|
|
| 120 |
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
|
| 121 |
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
|
| 122 |
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
| 123 |
+
option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF)
|
| 124 |
|
| 125 |
option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
|
| 126 |
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
|
ggml/src/CMakeLists.txt
CHANGED
|
@@ -295,12 +295,15 @@ if (GGML_CUDA)
|
|
| 295 |
|
| 296 |
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA)
|
| 297 |
|
| 298 |
-
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
|
| 299 |
add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
|
| 300 |
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
|
| 301 |
add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
|
| 302 |
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
|
| 303 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
if (GGML_CUDA_FORCE_DMMV)
|
| 305 |
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
| 306 |
endif()
|
|
|
|
| 295 |
|
| 296 |
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA)
|
| 297 |
|
|
|
|
| 298 |
add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
|
| 299 |
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
|
| 300 |
add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
|
| 301 |
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
|
| 302 |
|
| 303 |
+
if (GGML_CUDA_USE_GRAPHS)
|
| 304 |
+
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
|
| 305 |
+
endif()
|
| 306 |
+
|
| 307 |
if (GGML_CUDA_FORCE_DMMV)
|
| 308 |
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
| 309 |
endif()
|