yeahdongcn, JohannesGaessler committed
Commit e35329b · 1 Parent(s): fea8f94

musa: enable fp16 mma (all) and cublas on qy2 (llama/13842)

* musa: enable fp16 mma (all) and cublas on qy2

Signed-off-by: Xiaodong Ye <[email protected]>

* Update ggml/src/ggml-cuda/ggml-cuda.cu

Co-authored-by: Johannes Gäßler <[email protected]>

* Address review comments

Signed-off-by: Xiaodong Ye <[email protected]>

* Address review comments

Signed-off-by: Xiaodong Ye <[email protected]>

* musa: disable MUL_MAT_ID (q2_k × f32) due to precision issues

Signed-off-by: Xiaodong Ye <[email protected]>

---------

Signed-off-by: Xiaodong Ye <[email protected]>
Co-authored-by: Johannes Gäßler <[email protected]>

ggml/src/ggml-cuda/common.cuh CHANGED
@@ -76,11 +76,9 @@
 #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
 
 // Moore Threads
-#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
-
-#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
-#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
-#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
+#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+#define GGML_CUDA_CC_NG  (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
 
 #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
@@ -203,9 +201,9 @@ typedef float2 dfloat2;
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
 #define FP16_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
 
 #if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
 #define FP16_MMA_AVAILABLE
@@ -219,9 +217,9 @@ typedef float2 dfloat2;
 #define CP_ASYNC_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 
-#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
+#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 #define FLASH_ATTN_AVAILABLE
-#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
+#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 
 static bool fp16_available(const int cc) {
     return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
@@ -233,7 +231,8 @@ static bool fast_fp16_available(const int cc) {
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fast_fp16_hardware_available(const int cc) {
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) ||
+        (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
 }
 
 // Any FP16 tensor core instructions are available for ggml code.
@@ -242,7 +241,8 @@ static bool fp16_mma_available(const int cc) {
     return false;
 #else
     if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
-        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) {
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) ||
+        GGML_CUDA_CC_IS_MTHREADS(cc)) {
         return true;
     } else if (GGML_CUDA_CC_IS_RDNA4(cc)) {
 #if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
@@ -259,7 +259,8 @@ static bool fp16_mma_available(const int cc) {
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fp16_mma_hardware_available(const int cc) {
     return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
-        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) ||
+        (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
 }
 
 static bool bf16_mma_hardware_available(const int cc) {
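
Note on the capability checks above: the compile-time macros (FP16_MMA_AVAILABLE, FLASH_ATTN_AVAILABLE) gate which kernels are compiled for a given __MUSA_ARCH__, while the cc-based helpers gate runtime dispatch, e.g. for cuBLAS. As a rough illustration (not part of the commit; pick_cublas_path is a hypothetical helper and common.cuh is assumed to be on the include path), the new rules mean a QY2 device (MTT S4000) passes both FP16 checks while QY1 (MTT S80/S3000) does not:

// Illustration only: restates the runtime capability rules above.
#include "ggml-cuda/common.cuh"

static const char * pick_cublas_path(const int cc) {
    // QY2 and newer Moore Threads parts satisfy cc >= GGML_CUDA_CC_QY2,
    // so both helpers return true and the FP16 GEMM path can be taken.
    if (fast_fp16_hardware_available(cc) && fp16_mma_hardware_available(cc)) {
        return "fp16 GEMM (tensor cores / cuBLAS fp16)";
    }
    // QY1 (MTT S80, MTT S3000) fails the cc >= GGML_CUDA_CC_QY2 test
    // and stays on the fp32 path.
    return "fp32 GEMM";
}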
ggml/src/ggml-cuda/fattn-wmma-f16.cu CHANGED
@@ -9,7 +9,11 @@
 #ifdef FP16_MMA_AVAILABLE
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 #include <mma.h>
+#ifdef GGML_USE_MUSA
+namespace wmma = mtmusa::wmma;
+#else // GGML_USE_MUSA
 namespace wmma = nvcuda::wmma;
+#endif // GGML_USE_MUSA
 #elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
 #undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
 #include <rocwmma/rocwmma.hpp>
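
The hunk above only swaps the namespace the existing WMMA flash-attention kernel is written against: under GGML_USE_MUSA the `wmma` alias is expected to resolve to mtmusa::wmma with the same fragment API as nvcuda::wmma. A minimal sketch of that usage (illustration only; assumes the MUSA toolchain mirrors the CUDA WMMA API and provides equivalents of <mma.h> and <cuda_fp16.h>):

#include <cuda_fp16.h>
#include <mma.h>
#ifdef GGML_USE_MUSA
namespace wmma = mtmusa::wmma;
#else // GGML_USE_MUSA
namespace wmma = nvcuda::wmma;
#endif // GGML_USE_MUSA

// One 16x16x16 FP16 tile multiply-accumulate per warp, written against the
// `wmma` alias so the same source targets either backend.
__global__ void tile_mma_f16(const half * A, const half * B, float * C) {
    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a;
    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> b;
    wmma::fragment<wmma::accumulator, 16, 16, 16, float> c;

    wmma::fill_fragment(c, 0.0f);       // zero the accumulator tile
    wmma::load_matrix_sync(a, A, 16);   // leading dimension 16
    wmma::load_matrix_sync(b, B, 16);
    wmma::mma_sync(c, a, b, c);         // c += a * b on the MMA units
    wmma::store_matrix_sync(C, c, 16, wmma::mem_row_major);
}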
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -1227,9 +1227,12 @@ static void ggml_cuda_op_mul_mat_cublas(
 
     const int cc = ggml_cuda_info().devices[id].cc;
 
+    const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) ||
+        (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
+
     const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
 
-    if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+    if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
         ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
         if (src1->type != GGML_TYPE_BF16) {
             const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
@@ -1257,7 +1260,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
         to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-    } else if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) {
+    } else if (fast_fp16_hardware_available(cc) && use_fp16) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
         if (src0->type != GGML_TYPE_F16) {
@@ -3061,9 +3064,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                     return false;
                 }
 #ifdef GGML_USE_MUSA
-                if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
-                    !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
-                    return false;
+                const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
+                if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
+                    if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT &&
+                        a->type == GGML_TYPE_F16 && b->type == GGML_TYPE_F16) {
+                        return false;
+                    }
+                    if (GGML_CUDA_CC_IS_QY2(cc) && op->op == GGML_OP_MUL_MAT_ID &&
+                        a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) {
+                        return false;
+                    }
                 }
 #endif // GGML_USE_MUSA
                 switch (a->type) {
@@ -3090,11 +3100,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                     case GGML_TYPE_IQ4_NL:
                     case GGML_TYPE_IQ4_XS:
                     case GGML_TYPE_BF16:
-#ifdef GGML_USE_MUSA
-                        if (a->type == GGML_TYPE_Q3_K) {
-                            return false;
-                        }
-#endif // GGML_USE_MUSA
                         return true;
                     default:
                         return false;
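
The supports_op change above makes the MUSA restrictions architecture-specific instead of blanket: batched, non-transposed F16 x F16 MUL_MAT is rejected only on QY1, MUL_MAT_ID with Q2_K x F32 is rejected only on QY2 (the precision issue noted in the commit message), and the old unconditional Q3_K exclusion is dropped. A standalone restatement of that filter (illustration only; the enum names and musa_mul_mat_supported are hypothetical, not ggml API):

enum class MusaOp   { MUL_MAT, MUL_MAT_ID };
enum class TensType { F16, F32, Q2_K, OTHER };

// Returns false only for the two cases the commit singles out, and only for
// batched, non-transposed operands, mirroring the #ifdef GGML_USE_MUSA block.
static bool musa_mul_mat_supported(bool is_qy1, bool is_qy2, MusaOp op,
                                   TensType a, TensType b, bool batched_non_transposed) {
    if (!batched_non_transposed) {
        return true;
    }
    if (is_qy1 && op == MusaOp::MUL_MAT && a == TensType::F16 && b == TensType::F16) {
        return false; // QY1 (MTT S80/S3000): batched F16 x F16 GEMM not supported
    }
    if (is_qy2 && op == MusaOp::MUL_MAT_ID && a == TensType::Q2_K && b == TensType::F32) {
        return false; // QY2 (MTT S4000): disabled due to precision issues
    }
    return true;
}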
ggml/src/ggml-musa/mudnn.cuh CHANGED
@@ -1,7 +1,7 @@
 #pragma once
 
-#include "../include/ggml.h"
-#include "../ggml-cuda/common.cuh"
+#include "ggml-cuda/common.cuh"
+#include "ggml.h"
 
 // Asynchronously copies data from src tensor to dst tensor using the provided context.
 // Returns a musaError_t indicating success or failure.