JohannesGaessler committed
Commit fbc5f16 · 1 Parent(s): 6662d54

CUDA: add option to compile without FlashAttention (llama/12025)

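This commit adds a GGML_CUDA_FA CMake option (default ON). Configuring with it set to OFF defines GGML_CUDA_NO_FA for the CUDA, HIP, and MUSA builds, which keeps FLASH_ATTN_AVAILABLE from being defined in common.cuh; the FlashAttention kernels then compile to NO_DEVICE_CODE stubs and ggml_backend_cuda_device_supports_op reports GGML_OP_FLASH_ATTN_EXT as unsupported.

A minimal configuration sketch for a project that embeds ggml, assuming it is vendored via add_subdirectory (the project layout, target names, and the pre-existing GGML_CUDA option are assumptions; only GGML_CUDA_FA comes from this commit). The same effect can be had by passing -DGGML_CUDA_FA=OFF on the cmake command line.

# Hedged sketch: build ggml's CUDA backend without FlashAttention kernels.
cmake_minimum_required(VERSION 3.18)
project(example LANGUAGES C CXX)

set(GGML_CUDA    ON  CACHE BOOL "" FORCE)  # pre-existing option: enable the CUDA backend
set(GGML_CUDA_FA OFF CACHE BOOL "" FORCE)  # new option: skip FlashAttention CUDA kernels

add_subdirectory(ggml)  # with GGML_CUDA_FA=OFF this defines GGML_CUDA_NO_FA for the CUDA target

add_executable(example main.cpp)
target_link_libraries(example PRIVATE ggml)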
ggml/CMakeLists.txt CHANGED
@@ -151,6 +151,7 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                 "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
 option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
+option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
 option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
 option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
 
ggml/src/ggml-cuda/CMakeLists.txt CHANGED
@@ -69,6 +69,10 @@ if (CUDAToolkit_FOUND)
         add_compile_definitions(GGML_CUDA_NO_VMM)
     endif()
 
+    if (NOT GGML_CUDA_FA)
+        add_compile_definitions(GGML_CUDA_NO_FA)
+    endif()
+
     if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_F16)
     endif()
ggml/src/ggml-cuda/common.cuh CHANGED
@@ -204,9 +204,9 @@ typedef float2 dfloat2;
 #define CP_ASYNC_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 
-#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 #define FLASH_ATTN_AVAILABLE
-#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 
 static bool fp16_available(const int cc) {
     return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
ggml/src/ggml-cuda/fattn-mma-f16.cuh CHANGED
@@ -839,10 +839,7 @@ static __global__ void flash_attn_ext_f16(
         const int ne1,
         const int ne2,
         const int ne3) {
-#ifndef NEW_MMA_AVAILABLE
-    NO_DEVICE_CODE;
-    return;
-#endif // NEW_MMA_AVAILABLE
+#if defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
 
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
@@ -933,6 +930,9 @@ static __global__ void flash_attn_ext_f16(
     flash_attn_ext_f16_process_tile<D, ncols1, ncols2, nwarps, KQ_per_iter, ntiles, use_logit_softcap, needs_fixup, is_fixup>
         (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
          ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
+#else
+    NO_DEVICE_CODE;
+#endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
 }
 
 template <int D, int ncols1, int ncols2>
ggml/src/ggml-cuda/fattn-tile-f16.cu CHANGED
@@ -44,12 +44,7 @@ static __global__ void flash_attn_tile_ext_f16(
         const int ne1,
         const int ne2,
         const int ne3) {
-#ifdef FP16_AVAILABLE
-
-#ifndef FLASH_ATTN_AVAILABLE
-    NO_DEVICE_CODE;
-    return;
-#endif // FLASH_ATTN_AVAILABLE
+#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 
     // Skip unused kernel variants for faster compilation:
 #ifdef FP16_MMA_AVAILABLE
@@ -290,7 +285,7 @@ static __global__ void flash_attn_tile_ext_f16(
     }
 #else
     NO_DEVICE_CODE;
-#endif // FP16_AVAILABLE
+#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
 
 template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
ggml/src/ggml-cuda/fattn-tile-f32.cu CHANGED
@@ -44,10 +44,7 @@ static __global__ void flash_attn_tile_ext_f32(
         const int ne1,
         const int ne2,
         const int ne3) {
-#ifndef FLASH_ATTN_AVAILABLE
-    NO_DEVICE_CODE;
-    return;
-#endif // FLASH_ATTN_AVAILABLE
+#ifdef FLASH_ATTN_AVAILABLE
 
     // Skip unused kernel variants for faster compilation:
 #ifdef FP16_MMA_AVAILABLE
@@ -285,6 +282,9 @@ static __global__ void flash_attn_tile_ext_f32(
             dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
         }
     }
+#else
+    NO_DEVICE_CODE;
+#endif // FLASH_ATTN_AVAILABLE
 }
 
 template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
ggml/src/ggml-cuda/fattn-vec-f16.cuh CHANGED
@@ -41,12 +41,7 @@ static __global__ void flash_attn_vec_ext_f16(
         const int ne1,
         const int ne2,
         const int ne3) {
-#ifdef FP16_AVAILABLE
-
-#ifndef FLASH_ATTN_AVAILABLE
-    NO_DEVICE_CODE;
-    return;
-#endif // FLASH_ATTN_AVAILABLE
+#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
@@ -300,7 +295,7 @@ static __global__ void flash_attn_vec_ext_f16(
     }
 #else
     NO_DEVICE_CODE;
-#endif // FP16_AVAILABLE
+#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
 
 template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
ggml/src/ggml-cuda/fattn-vec-f32.cuh CHANGED
@@ -41,10 +41,7 @@ static __global__ void flash_attn_vec_ext_f32(
         const int ne1,
         const int ne2,
         const int ne3) {
-#ifndef FLASH_ATTN_AVAILABLE
-    NO_DEVICE_CODE;
-    return;
-#endif // FLASH_ATTN_AVAILABLE
+#ifdef FLASH_ATTN_AVAILABLE
 
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
@@ -281,6 +278,9 @@ static __global__ void flash_attn_vec_ext_f32(
     if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
         dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
     }
+#else
+    NO_DEVICE_CODE;
+#endif // FLASH_ATTN_AVAILABLE
 }
 
 template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
ggml/src/ggml-cuda/fattn-wmma-f16.cu CHANGED
@@ -51,7 +51,7 @@ static __global__ void flash_attn_ext_f16(
         const int ne1,
         const int ne2,
         const int ne3) {
-#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+#if defined(FLASH_ATTN_AVAILABLE) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
         NO_DEVICE_CODE;
@@ -425,7 +425,7 @@ static __global__ void flash_attn_ext_f16(
     }
 #else
     NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+#endif // defined(FLASH_ATTN_AVAILABLE) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
 }
 
 constexpr int get_max_power_of_2(int x) {
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -3203,7 +3203,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_FLASH_ATTN_EXT: {
 #ifndef FLASH_ATTN_AVAILABLE
             return false;
-#endif
+#endif // FLASH_ATTN_AVAILABLE
             if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
                 return false;
             }
ggml/src/ggml-hip/CMakeLists.txt CHANGED
@@ -107,6 +107,10 @@ if (GGML_HIP_NO_VMM)
     add_compile_definitions(GGML_HIP_NO_VMM)
 endif()
 
+if (NOT GGML_CUDA_FA)
+    add_compile_definitions(GGML_CUDA_NO_FA)
+endif()
+
 if (CXX_IS_HIPCC)
     set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
     target_link_libraries(ggml-hip PRIVATE hip::device)
ggml/src/ggml-musa/CMakeLists.txt CHANGED
@@ -83,6 +83,10 @@ if (MUSAToolkit_FOUND)
         add_compile_definitions(GGML_CUDA_NO_VMM)
     endif()
 
+    if (NOT GGML_CUDA_FA)
+        add_compile_definitions(GGML_CUDA_NO_FA)
+    endif()
+
     if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_F16)
     endif()