Commit: fbc5f16
Parent(s): 6662d54
CUDA: add option to compile without FlashAttention (llama/12025)
Files changed:
- ggml/CMakeLists.txt                   +1 -0
- ggml/src/ggml-cuda/CMakeLists.txt     +4 -0
- ggml/src/ggml-cuda/common.cuh         +2 -2
- ggml/src/ggml-cuda/fattn-mma-f16.cuh  +4 -4
- ggml/src/ggml-cuda/fattn-tile-f16.cu  +2 -7
- ggml/src/ggml-cuda/fattn-tile-f32.cu  +4 -4
- ggml/src/ggml-cuda/fattn-vec-f16.cuh  +2 -7
- ggml/src/ggml-cuda/fattn-vec-f32.cuh  +4 -4
- ggml/src/ggml-cuda/fattn-wmma-f16.cu  +2 -2
- ggml/src/ggml-cuda/ggml-cuda.cu       +1 -1
- ggml/src/ggml-hip/CMakeLists.txt      +4 -0
- ggml/src/ggml-musa/CMakeLists.txt     +4 -0
ggml/CMakeLists.txt CHANGED
@@ -151,6 +151,7 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                                "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY                  "ggml: do not use peer to peer copies"            OFF)
 option(GGML_CUDA_NO_VMM                        "ggml: do not try to use CUDA VMM"                OFF)
+option(GGML_CUDA_FA                            "ggml: compile ggml FlashAttention CUDA kernels"  ON)
 option(GGML_CUDA_FA_ALL_QUANTS                 "ggml: compile all quants for FlashAttention"     OFF)
 option(GGML_CUDA_GRAPHS                        "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})
 
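Usage note (not part of the upstream diff): GGML_CUDA_FA defaults to ON, so existing builds keep FlashAttention; the kernels are only dropped when the project is configured with the option off, e.g. cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_FA=OFF. As the backend CMakeLists changes below show, disabling it simply adds the GGML_CUDA_NO_FA compile definition for the CUDA, HIP and MUSA backends.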
ggml/src/ggml-cuda/CMakeLists.txt CHANGED
@@ -69,6 +69,10 @@ if (CUDAToolkit_FOUND)
         add_compile_definitions(GGML_CUDA_NO_VMM)
     endif()
 
+    if (NOT GGML_CUDA_FA)
+        add_compile_definitions(GGML_CUDA_NO_FA)
+    endif()
+
     if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_F16)
     endif()
ggml/src/ggml-cuda/common.cuh CHANGED
@@ -204,9 +204,9 @@ typedef float2 dfloat2;
 #define CP_ASYNC_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 
-#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 #define FLASH_ATTN_AVAILABLE
-#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 
 static bool fp16_available(const int cc) {
     return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
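The macro above is what every FlashAttention source gates on: FLASH_ATTN_AVAILABLE is now only defined when GGML_CUDA_NO_FA is not set (and, as before, not on MUSA QY1). The kernel files below all switch from an early return inside the kernel to wrapping the whole body in #if ... #else NO_DEVICE_CODE; #endif. A minimal, self-contained sketch of that pattern (illustrative only, not code from the commit; the toy kernel and the stand-in fallback are assumptions):

// demo.cu -- nvcc demo.cu                   : FlashAttention path compiled in
//            nvcc -DGGML_CUDA_NO_FA demo.cu : path compiled out
#include <cstdio>

// same derivation as in common.cuh above, minus the MUSA special case
#if !defined(GGML_CUDA_NO_FA)
#define FLASH_ATTN_AVAILABLE
#endif // !defined(GGML_CUDA_NO_FA)

__global__ void toy_attention_kernel(float * dst) {
#ifdef FLASH_ATTN_AVAILABLE
    dst[threadIdx.x] = 42.0f;   // stands in for the real FlashAttention body
#else
    // stands in for NO_DEVICE_CODE: the real body is simply not compiled
    (void) dst;
#endif // FLASH_ATTN_AVAILABLE
}

int main() {
#ifdef FLASH_ATTN_AVAILABLE
    printf("FlashAttention kernels compiled in\n");
#else
    printf("FlashAttention kernels compiled out (GGML_CUDA_NO_FA)\n");
#endif
    return 0;
}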
ggml/src/ggml-cuda/fattn-mma-f16.cuh CHANGED
@@ -839,10 +839,7 @@ static __global__ void flash_attn_ext_f16(
         const int ne1,
         const int ne2,
         const int ne3) {
-#ifndef NEW_MMA_AVAILABLE
-    NO_DEVICE_CODE;
-    return;
-#endif // NEW_MMA_AVAILABLE
+#if defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
 
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
@@ -933,6 +930,9 @@ static __global__ void flash_attn_ext_f16(
             flash_attn_ext_f16_process_tile<D, ncols1, ncols2, nwarps, KQ_per_iter, ntiles, use_logit_softcap, needs_fixup, is_fixup>
                 (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
                  ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
+#else
+    NO_DEVICE_CODE;
+#endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
 }
 
 template <int D, int ncols1, int ncols2>
ggml/src/ggml-cuda/fattn-tile-f16.cu CHANGED
@@ -44,12 +44,7 @@ static __global__ void flash_attn_tile_ext_f16(
         const int ne1,
         const int ne2,
         const int ne3) {
-#ifdef FP16_AVAILABLE
-
-#ifndef FLASH_ATTN_AVAILABLE
-    NO_DEVICE_CODE;
-    return;
-#endif // FLASH_ATTN_AVAILABLE
+#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 
     // Skip unused kernel variants for faster compilation:
 #ifdef FP16_MMA_AVAILABLE
@@ -290,7 +285,7 @@ static __global__ void flash_attn_tile_ext_f16(
     }
 #else
     NO_DEVICE_CODE;
-#endif // FP16_AVAILABLE
+#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
 
 template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
ggml/src/ggml-cuda/fattn-tile-f32.cu CHANGED
@@ -44,10 +44,7 @@ static __global__ void flash_attn_tile_ext_f32(
         const int ne1,
         const int ne2,
         const int ne3) {
-#ifndef FLASH_ATTN_AVAILABLE
-    NO_DEVICE_CODE;
-    return;
-#endif // FLASH_ATTN_AVAILABLE
+#ifdef FLASH_ATTN_AVAILABLE
 
     // Skip unused kernel variants for faster compilation:
 #ifdef FP16_MMA_AVAILABLE
@@ -285,6 +282,9 @@ static __global__ void flash_attn_tile_ext_f32(
             dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
         }
     }
+#else
+    NO_DEVICE_CODE;
+#endif // FLASH_ATTN_AVAILABLE
 }
 
 template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
ggml/src/ggml-cuda/fattn-vec-f16.cuh CHANGED
@@ -41,12 +41,7 @@ static __global__ void flash_attn_vec_ext_f16(
         const int ne1,
         const int ne2,
         const int ne3) {
-#ifdef FP16_AVAILABLE
-
-#ifndef FLASH_ATTN_AVAILABLE
-    NO_DEVICE_CODE;
-    return;
-#endif // FLASH_ATTN_AVAILABLE
+#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
@@ -300,7 +295,7 @@ static __global__ void flash_attn_vec_ext_f16(
     }
 #else
     NO_DEVICE_CODE;
-#endif // FP16_AVAILABLE
+#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
 
 template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
ggml/src/ggml-cuda/fattn-vec-f32.cuh CHANGED
@@ -41,10 +41,7 @@ static __global__ void flash_attn_vec_ext_f32(
         const int ne1,
         const int ne2,
         const int ne3) {
-#ifndef FLASH_ATTN_AVAILABLE
-    NO_DEVICE_CODE;
-    return;
-#endif // FLASH_ATTN_AVAILABLE
+#ifdef FLASH_ATTN_AVAILABLE
 
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
@@ -281,6 +278,9 @@ static __global__ void flash_attn_vec_ext_f32(
     if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
         dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
     }
+#else
+    NO_DEVICE_CODE;
+#endif // FLASH_ATTN_AVAILABLE
 }
 
 template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
ggml/src/ggml-cuda/fattn-wmma-f16.cu CHANGED
@@ -51,7 +51,7 @@ static __global__ void flash_attn_ext_f16(
         const int ne1,
         const int ne2,
         const int ne3) {
-#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+#if defined(FLASH_ATTN_AVAILABLE) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
         NO_DEVICE_CODE;
@@ -425,7 +425,7 @@ static __global__ void flash_attn_ext_f16(
     }
 #else
     NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+#endif // defined(FLASH_ATTN_AVAILABLE) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
 }
 
 constexpr int get_max_power_of_2(int x) {
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -3203,7 +3203,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_FLASH_ATTN_EXT: {
 #ifndef FLASH_ATTN_AVAILABLE
             return false;
-#endif
+#endif // FLASH_ATTN_AVAILABLE
             if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
                 return false;
             }
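With the kernels compiled out, the CUDA backend reports GGML_OP_FLASH_ATTN_EXT as unsupported, so schedulers can fall back to another backend. A hedged sketch of how a downstream caller might probe this at run time through the public ggml / ggml-backend API (shapes, types and scale below are arbitrary placeholders; error handling is omitted):

// fa_probe.cpp -- link against a ggml build that includes the CUDA backend
#include "ggml.h"
#include "ggml-backend.h"
#include <cstdio>

int main() {
    // no_alloc: only tensor metadata is needed to describe the op, not its data
    struct ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ true };
    struct ggml_context * ctx = ggml_init(params);

    // a dummy GGML_OP_FLASH_ATTN_EXT node: head size 128, 8 heads, 32 queries, 64 KV entries
    struct ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 128, 32, 8, 1);
    struct ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 128, 64, 8, 1);
    struct ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 128, 64, 8, 1);
    struct ggml_tensor * fa = ggml_flash_attn_ext(ctx, q, k, v, /*mask*/ NULL,
                                                  /*scale*/ 1.0f, /*max_bias*/ 0.0f, /*logit_softcap*/ 0.0f);

    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
    if (dev) {
        // prints "no" when ggml was configured with -DGGML_CUDA_FA=OFF
        printf("%s supports FLASH_ATTN_EXT: %s\n",
               ggml_backend_dev_name(dev),
               ggml_backend_dev_supports_op(dev, fa) ? "yes" : "no");
    }

    ggml_free(ctx);
    return 0;
}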
ggml/src/ggml-hip/CMakeLists.txt CHANGED
@@ -107,6 +107,10 @@ if (GGML_HIP_NO_VMM)
     add_compile_definitions(GGML_HIP_NO_VMM)
 endif()
 
+if (NOT GGML_CUDA_FA)
+    add_compile_definitions(GGML_CUDA_NO_FA)
+endif()
+
 if (CXX_IS_HIPCC)
     set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
     target_link_libraries(ggml-hip PRIVATE hip::device)
ggml/src/ggml-musa/CMakeLists.txt CHANGED
@@ -83,6 +83,10 @@ if (MUSAToolkit_FOUND)
     add_compile_definitions(GGML_CUDA_NO_VMM)
 endif()
 
+if (NOT GGML_CUDA_FA)
+    add_compile_definitions(GGML_CUDA_NO_FA)
+endif()
+
 if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
     add_compile_definitions(GGML_CUDA_F16)
 endif()