slaren committed on
Commit
e83fdad
·
unverified ·
1 Parent(s): 7d13d39

ggml : add GGML_CUDA_USE_GRAPHS option, restore GGML_CUDA_FORCE_CUBLAS (cmake) (llama/8140)

Browse files
Files changed (2) hide show
  1. ggml/CMakeLists.txt +2 -0
  2. ggml/src/CMakeLists.txt +4 -1
ggml/CMakeLists.txt CHANGED
@@ -109,6 +109,7 @@ option(GGML_LLAMAFILE "ggml: use ggml SGEMM"
109
  option(GGML_CUDA "ggml: use CUDA" OFF)
110
  option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
111
  option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
 
112
  set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
113
  set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
114
  option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
@@ -119,6 +120,7 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
119
  option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
120
  option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
121
  option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
 
122
 
123
  option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
124
  option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
 
109
  option(GGML_CUDA "ggml: use CUDA" OFF)
110
  option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
111
  option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
112
+ option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
113
  set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
114
  set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
115
  option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
 
120
  option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
121
  option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
122
  option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
123
+ option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF)
124
 
125
  option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
126
  option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
ggml/src/CMakeLists.txt CHANGED
@@ -295,12 +295,15 @@ if (GGML_CUDA)
295
 
296
  list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA)
297
 
298
- add_compile_definitions(GGML_CUDA_USE_GRAPHS)
299
  add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
300
  add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
301
  add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
302
  add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
303
 
 
 
 
 
304
  if (GGML_CUDA_FORCE_DMMV)
305
  add_compile_definitions(GGML_CUDA_FORCE_DMMV)
306
  endif()
 
295
 
296
  list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA)
297
 
 
298
  add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
299
  add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
300
  add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
301
  add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
302
 
303
+ if (GGML_CUDA_USE_GRAPHS)
304
+ add_compile_definitions(GGML_CUDA_USE_GRAPHS)
305
+ endif()
306
+
307
  if (GGML_CUDA_FORCE_DMMV)
308
  add_compile_definitions(GGML_CUDA_FORCE_DMMV)
309
  endif()