Bodhi Bodhi Hu committed on
Commit
ab96dac
·
1 Parent(s): 9de6d81

MUSA: support ARM64 and enable dp4a etc. (llama/11843)

Browse files

* MUSA: support ARM64 and enable __dp4a etc.

* fix cross entropy loss op for musa

* update

* add cc info log for musa

* add comment for the MUSA .cc calculation block

---------

Co-authored-by: Bodhi Hu <[email protected]>

ggml/src/ggml-cuda/common.cuh CHANGED
@@ -411,13 +411,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
411
 
412
  #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
413
 
414
- #if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
415
  return __dp4a(a, b, c);
416
- #else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
417
  const int8_t * a8 = (const int8_t *) &a;
418
  const int8_t * b8 = (const int8_t *) &b;
419
  return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
420
- #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
421
 
422
  #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
423
  }
 
411
 
412
  #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
413
 
414
+ #if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
415
  return __dp4a(a, b, c);
416
+ #else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
417
  const int8_t * a8 = (const int8_t *) &a;
418
  const int8_t * b8 = (const int8_t *) &b;
419
  return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
420
+ #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
421
 
422
  #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
423
  }
ggml/src/ggml-cuda/cross-entropy-loss.cu CHANGED
@@ -123,13 +123,13 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
123
  ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
124
 
125
  if (nbytes_shared <= smpbo) {
126
- #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
127
  static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
128
  if (!shared_memory_limit_raised[id]) {
129
- CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
130
  shared_memory_limit_raised[id] = true;
131
  }
132
- #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
133
  cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
134
  } else {
135
  cross_entropy_loss_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
@@ -175,13 +175,13 @@ void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_ten
175
  const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
176
 
177
  if (nbytes_shared <= smpbo) {
178
- #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
179
  static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
180
  if (!shared_memory_limit_raised[id]) {
181
  CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
182
  shared_memory_limit_raised[id] = true;
183
  }
184
- #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
185
  cross_entropy_loss_back_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
186
  } else {
187
  cross_entropy_loss_back_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
 
123
  ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
124
 
125
  if (nbytes_shared <= smpbo) {
126
+ #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
127
  static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
128
  if (!shared_memory_limit_raised[id]) {
129
+ CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
130
  shared_memory_limit_raised[id] = true;
131
  }
132
+ #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
133
  cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
134
  } else {
135
  cross_entropy_loss_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
 
175
  const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
176
 
177
  if (nbytes_shared <= smpbo) {
178
+ #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
179
  static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
180
  if (!shared_memory_limit_raised[id]) {
181
  CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
182
  shared_memory_limit_raised[id] = true;
183
  }
184
+ #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
185
  cross_entropy_loss_back_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
186
  } else {
187
  cross_entropy_loss_back_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -261,6 +261,12 @@ static ggml_cuda_device_info ggml_cuda_init() {
261
  GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
262
  id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
263
  device_vmm ? "yes" : "no", prop.warpSize);
 
 
 
 
 
 
264
  #else
265
  info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
266
  info.devices[id].cc = 100*prop.major + 10*prop.minor;
@@ -1782,9 +1788,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
1782
  }
1783
  }
1784
  #else
1785
- #ifdef GGML_USE_MUSA
1786
- GGML_ASSERT(false);
1787
- #else // !GGML_USE_MUSA
1788
  if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
1789
  // there is no broadcast and src0, src1 are contiguous across dims 2, 3
1790
  // use cublasGemmStridedBatchedEx
@@ -1827,7 +1830,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
1827
  cu_compute_type,
1828
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1829
  }
1830
- #endif // GGML_USE_MUSA
1831
  #endif
1832
 
1833
  if (dst->op_params[0] == GGML_PREC_DEFAULT) {
 
261
  GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
262
  id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
263
  device_vmm ? "yes" : "no", prop.warpSize);
264
+ #elif defined(GGML_USE_MUSA)
265
+ // TODO: refine the .cc to reflect MUSA's actual CC capabilities
266
+ info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
267
+ info.devices[id].cc = 100*prop.major + 10*prop.minor;
268
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
269
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
270
  #else
271
  info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
272
  info.devices[id].cc = 100*prop.major + 10*prop.minor;
 
1788
  }
1789
  }
1790
  #else
 
 
 
1791
  if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
1792
  // there is no broadcast and src0, src1 are contiguous across dims 2, 3
1793
  // use cublasGemmStridedBatchedEx
 
1830
  cu_compute_type,
1831
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1832
  }
 
1833
  #endif
1834
 
1835
  if (dst->op_params[0] == GGML_PREC_DEFAULT) {
ggml/src/ggml-impl.h CHANGED
@@ -16,7 +16,7 @@
16
  #include <arm_sve.h>
17
  #endif // __ARM_FEATURE_SVE
18
 
19
- #if defined(__ARM_NEON) && !defined(__CUDACC__)
20
  // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
21
  //
22
  // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
 
16
  #include <arm_sve.h>
17
  #endif // __ARM_FEATURE_SVE
18
 
19
+ #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
20
  // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
21
  //
22
  // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
ggml/src/ggml-musa/CMakeLists.txt CHANGED
@@ -49,7 +49,7 @@ if (MUSAToolkit_FOUND)
49
 
50
  set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX)
51
  foreach(SOURCE ${GGML_SOURCES_MUSA})
52
- set(COMPILE_FLAGS "-x musa -mtgpu")
53
  foreach(ARCH ${MUSA_ARCHITECTURES})
54
  set(COMPILE_FLAGS "${COMPILE_FLAGS} --cuda-gpu-arch=mp_${ARCH}")
55
  endforeach()
 
49
 
50
  set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX)
51
  foreach(SOURCE ${GGML_SOURCES_MUSA})
52
+ set(COMPILE_FLAGS "-fsigned-char -x musa -mtgpu")
53
  foreach(ARCH ${MUSA_ARCHITECTURES})
54
  set(COMPILE_FLAGS "${COMPILE_FLAGS} --cuda-gpu-arch=mp_${ARCH}")
55
  endforeach()