Commit e35329b · Parent: fea8f94
musa: enable fp16 mma (all) and cublas on qy2 (llama/13842)
* musa: enable fp16 mma (all) and cublas on qy2
Signed-off-by: Xiaodong Ye <[email protected]>
* Update ggml/src/ggml-cuda/ggml-cuda.cu
Co-authored-by: Johannes Gäßler <[email protected]>
* Address review comments
Signed-off-by: Xiaodong Ye <[email protected]>
* Address review comments
Signed-off-by: Xiaodong Ye <[email protected]>
* musa: disable MUL_MAT_ID (q2_k × f32) due to precision issues
Signed-off-by: Xiaodong Ye <[email protected]>
---------
Signed-off-by: Xiaodong Ye <[email protected]>
Co-authored-by: Johannes Gäßler <[email protected]>
- ggml/src/ggml-cuda/common.cuh +13 -12
- ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- ggml/src/ggml-cuda/ggml-cuda.cu +15 -10
- ggml/src/ggml-musa/mudnn.cuh +2 -2
ggml/src/ggml-cuda/common.cuh
CHANGED
@@ -76,11 +76,9 @@
 #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
 
 // Moore Threads
-#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
-
-#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
-#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
-#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
+#define GGML_CUDA_CC_QY1     (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY2     (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+#define GGML_CUDA_CC_NG      (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
 
 #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)

@@ -203,9 +201,9 @@ typedef float2 dfloat2;
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
 #define FP16_MMA_AVAILABLE
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
 
 #if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
 #define FP16_MMA_AVAILABLE

@@ -219,9 +217,9 @@ typedef float2 dfloat2;
 #define CP_ASYNC_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 
-#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
+#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 #define FLASH_ATTN_AVAILABLE
-#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
+#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 
 static bool fp16_available(const int cc) {
     return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;

@@ -233,7 +231,8 @@ static bool fast_fp16_available(const int cc) {
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fast_fp16_hardware_available(const int cc) {
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) ||
+           (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
 }
 
 // Any FP16 tensor core instructions are available for ggml code.

@@ -242,7 +241,8 @@ static bool fp16_mma_available(const int cc) {
     return false;
 #else
     if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
-        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) {
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) ||
+        GGML_CUDA_CC_IS_MTHREADS(cc)) {
         return true;
     } else if (GGML_CUDA_CC_IS_RDNA4(cc)) {
 #if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)

@@ -259,7 +259,8 @@ static bool fp16_mma_available(const int cc) {
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fp16_mma_hardware_available(const int cc) {
     return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
-        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) ||
+        (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
 }
 
 static bool bf16_mma_hardware_available(const int cc) {
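Taken together, the header changes split responsibilities: FP16_MMA_AVAILABLE now turns on for all MUSA device code, while the host-side helpers used for external-library feature selection (fast_fp16_hardware_available, fp16_mma_hardware_available) only accept Moore Threads devices from QY2 (MTT S4000) upward. A standalone sketch of how that runtime clause classifies devices; the two OFFSET constants are assumptions taken from elsewhere in common.cuh, not from this diff:

```cpp
// Standalone sketch: how the new QY2 gate classifies Moore Threads devices.
// The two OFFSET values are assumptions based on the rest of common.cuh;
// only the comparison logic is taken from this diff.
#include <cstdio>

#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000 // assumed
#define GGML_CUDA_CC_OFFSET_AMD      0x1000000 // assumed
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)

// Mirrors the MUSA clause added to fast_fp16_hardware_available() and
// fp16_mma_hardware_available(): only QY2 (MTT S4000) and newer qualify.
static bool musa_fp16_features(int cc) {
    return GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2;
}

int main() {
    printf("QY1 (S80/S3000): %d\n", musa_fp16_features(GGML_CUDA_CC_QY1)); // 0: stays on FP32 paths
    printf("QY2 (S4000):     %d\n", musa_fp16_features(GGML_CUDA_CC_QY2)); // 1: gets FP16 cuBLAS/MMA
    return 0;
}
```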
ggml/src/ggml-cuda/fattn-wmma-f16.cu
CHANGED
@@ -9,7 +9,11 @@
 #ifdef FP16_MMA_AVAILABLE
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 #include <mma.h>
+#ifdef GGML_USE_MUSA
+namespace wmma = mtmusa::wmma;
+#else // GGML_USE_MUSA
 namespace wmma = nvcuda::wmma;
+#endif // GGML_USE_MUSA
 #elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
 #undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
 #include <rocwmma/rocwmma.hpp>
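The alias keeps the rest of fattn-wmma-f16.cu toolkit-agnostic: every fragment and mma call is written against wmma::, so the same kernel body compiles whether the alias resolves to nvcuda::wmma or mtmusa::wmma (assuming the MUSA headers mirror the CUDA WMMA fragment API, as the diff implies). An illustrative warp-level 16×16×16 GEMM in that style; it is not code from the file:

```cuda
// Illustrative only: a warp-level 16x16x16 F16 GEMM written against the
// wmma:: alias, as the surrounding file does. On CUDA the alias is
// nvcuda::wmma; on MUSA builds it would be mtmusa::wmma.
// Launch with a single warp: f16_mma_16x16x16<<<1, 32>>>(a, b, c);
#include <cuda_fp16.h>
#include <mma.h>
namespace wmma = nvcuda::wmma;

__global__ void f16_mma_16x16x16(const half * a, const half * b, float * c) {
    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> fa;
    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> fb;
    wmma::fragment<wmma::accumulator, 16, 16, 16, float> fc;

    wmma::fill_fragment(fc, 0.0f);     // C = 0
    wmma::load_matrix_sync(fa, a, 16); // load A tile (leading dim 16)
    wmma::load_matrix_sync(fb, b, 16); // load B tile (leading dim 16)
    wmma::mma_sync(fc, fa, fb, fc);    // C += A * B on the MMA units
    wmma::store_matrix_sync(c, fc, 16, wmma::mem_row_major);
}
```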
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
@@ -1227,9 +1227,12 @@ static void ggml_cuda_op_mul_mat_cublas(
 
     const int cc = ggml_cuda_info().devices[id].cc;
 
+    const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) ||
+                               (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
+
     const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
 
-    if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+    if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
         ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
         if (src1->type != GGML_TYPE_BF16) {
             const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);

@@ -1257,7 +1260,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
         to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-    } else if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) {
+    } else if (fast_fp16_hardware_available(cc) && use_fp16) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
         if (src0->type != GGML_TYPE_F16) {

@@ -3061,9 +3064,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
                 return false;
             }
 #ifdef GGML_USE_MUSA
-            if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
-                !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
-                return false;
+            const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
+            if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
+                if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT &&
+                    a->type == GGML_TYPE_F16 && b->type == GGML_TYPE_F16) {
+                    return false;
+                }
+                if (GGML_CUDA_CC_IS_QY2(cc) && op->op == GGML_OP_MUL_MAT_ID &&
+                    a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) {
+                    return false;
+                }
             }
 #endif // GGML_USE_MUSA
             switch (a->type) {

@@ -3090,11 +3100,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
                 case GGML_TYPE_IQ4_NL:
                 case GGML_TYPE_IQ4_XS:
                 case GGML_TYPE_BF16:
-#ifdef GGML_USE_MUSA
-                    if (a->type == GGML_TYPE_Q3_K) {
-                        return false;
-                    }
-#endif // GGML_USE_MUSA
                     return true;
                 default:
                     return false;
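The net effect on the cuBLAS path is a three-way precedence: BF16 GEMM only on devices that pass the new supports_bf16 check (which excludes MUSA before QY2), FP16 GEMM whenever fast_fp16_hardware_available(cc) and the existing use_fp16 heuristic agree (QY2 now qualifies), and FP32 otherwise. A condensed sketch of that precedence; choose_cublas_path and gemm_path are hypothetical names for illustration, not code from the commit:

```cpp
// Hypothetical condensation of the branch order in
// ggml_cuda_op_mul_mat_cublas() after this commit; the enum and the
// function name are invented for illustration.
#include <cstdio>

enum class gemm_path { BF16, FP16, FP32 };

static gemm_path choose_cublas_path(bool supports_bf16, bool src0_is_bf16,
                                    bool fast_fp16_hw,  bool use_fp16) {
    if (supports_bf16 && src0_is_bf16) return gemm_path::BF16; // MUSA < QY2 can no longer land here
    if (fast_fp16_hw && use_fp16)      return gemm_path::FP16; // QY2 qualifies via fast_fp16_hardware_available()
    return gemm_path::FP32;                                    // unchanged fallback
}

int main() {
    // e.g. MTT S4000 (QY2) with contiguous F16 src0: BF16 branch skipped, FP16 GEMM chosen.
    gemm_path p = choose_cublas_path(/*supports_bf16=*/true, /*src0_is_bf16=*/false,
                                     /*fast_fp16_hw=*/true,  /*use_fp16=*/true);
    printf("%s\n", p == gemm_path::FP16 ? "FP16" : "other"); // prints FP16
    return 0;
}
```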
ggml/src/ggml-musa/mudnn.cuh
CHANGED
@@ -1,7 +1,7 @@
 #pragma once
 
-#include "…"
-#include "…"
+#include "ggml-cuda/common.cuh"
+#include "ggml.h"
 
 // Asynchronously copies data from src tensor to dst tensor using the provided context.
 // Returns a musaError_t indicating success or failure.