Bodhi Hu committed
Commit: ab96dac · Parent: 9de6d81
MUSA: support ARM64 and enable dp4a etc. (llama/11843)
* MUSA: support ARM64 and enable __dp4a etc.
* fix cross entropy loss op for musa
* update
* add cc info log for musa
* add comment for the MUSA .cc calculation block
---------
Co-authored-by: Bodhi Hu <[email protected]>
ggml/src/ggml-cuda/common.cuh
CHANGED
@@ -411,13 +411,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 
 #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
-#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
     return __dp4a(a, b, c);
-#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
+#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
     const int8_t * a8 = (const int8_t *) &a;
     const int8_t * b8 = (const int8_t *) &b;
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
-#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
 
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }
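For reference, __dp4a computes a four-way dot product of packed 8-bit integers with a 32-bit accumulator, which is exactly what the scalar fallback above reproduces. A minimal host-side sketch of the semantics (illustrative only; dp4a_ref is a hypothetical helper, not part of this commit):

#include <cassert>
#include <cstdint>
#include <cstring>

// Reference semantics of __dp4a: treat a and b as four packed int8 lanes,
// multiply lane-wise, and accumulate the four products into c.
static int dp4a_ref(int a, int b, int c) {
    int8_t a8[4], b8[4];
    std::memcpy(a8, &a, sizeof(a));
    std::memcpy(b8, &b, sizeof(b));
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
}

int main() {
    // 0x01020304 packs the lanes (4, 3, 2, 1) on a little-endian host.
    assert(dp4a_ref(0x01020304, 0x01010101, 10) == 10 + 4 + 3 + 2 + 1);
    return 0;
}

With the guard extended to also accept defined(GGML_USE_MUSA), MUSA builds now take the hardware __dp4a path instead of this scalar loop.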
ggml/src/ggml-cuda/cross-entropy-loss.cu
CHANGED
@@ -123,13 +123,13 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
 
     if (nbytes_shared <= smpbo) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
         if (!shared_memory_limit_raised[id]) {
-            CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
+            CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
             shared_memory_limit_raised[id] = true;
         }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
         cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
     } else {
         cross_entropy_loss_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);

@@ -175,13 +175,13 @@ void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
 
     if (nbytes_shared <= smpbo) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
         if (!shared_memory_limit_raised[id]) {
             CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
             shared_memory_limit_raised[id] = true;
         }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
         cross_entropy_loss_back_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
     } else {
         cross_entropy_loss_back_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
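The guarded blocks above use the standard CUDA mechanism for letting a kernel opt in to more dynamic shared memory than the default limit; the guard now also excludes MUSA, which, like HIP on AMD, is skipped here (presumably because the opt-in attribute is not supported there). A minimal sketch of the pattern, assuming a hypothetical kernel my_kernel (not part of this commit):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void my_kernel(float * dst) {
    extern __shared__ float buf[];  // dynamic shared memory
    buf[threadIdx.x] = (float) threadIdx.x;
    __syncthreads();
    dst[threadIdx.x] = buf[threadIdx.x];
}

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    // sharedMemPerBlockOptin (ggml's "smpbo") is the per-block maximum a
    // kernel may opt in to; it can exceed the 48 KiB default on recent GPUs.
    size_t smpbo = prop.sharedMemPerBlockOptin;

    // Without this call, launches that request more dynamic shared memory
    // than the default limit typically fail.
    cudaFuncSetAttribute(my_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, (int) smpbo);
    printf("opt-in shared memory limit: %zu bytes\n", smpbo);
    return 0;
}

The static shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] array in the real code simply caches that this call has already been made once per device.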
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
@@ -261,6 +261,12 @@ static ggml_cuda_device_info ggml_cuda_init() {
         GGML_LOG_INFO("  Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
                       id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
                       device_vmm ? "yes" : "no", prop.warpSize);
+#elif defined(GGML_USE_MUSA)
+        // TODO: refine the .cc to reflect MUSA's actual CC capabilities
+        info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
+        info.devices[id].cc = 100*prop.major + 10*prop.minor;
+        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n",
+                      id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 #else
         info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
         info.devices[id].cc = 100*prop.major + 10*prop.minor;

@@ -1782,9 +1788,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
         }
     }
 #else
-#ifdef GGML_USE_MUSA
-    GGML_ASSERT(false);
-#else // !GGML_USE_MUSA
     if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx

@@ -1827,7 +1830,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     }
-#endif // GGML_USE_MUSA
 #endif
 
     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
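ggml packs a device's compute capability into a single integer, cc = 100*major + 10*minor, and gates features by comparing it against thresholds such as GGML_CUDA_CC_DP4A. A small sketch of the encoding (the 610 value mirrors ggml's GGML_CUDA_CC_DP4A, i.e. sm_61, the first NVIDIA architecture with __dp4a; it is defined locally here for illustration):

#include <cstdio>

#define CC_DP4A 610  // 100*6 + 10*1, i.e. compute capability 6.1

int main() {
    int major = 7, minor = 5;        // e.g. a Turing device, sm_75
    int cc = 100*major + 10*minor;   // -> 750
    printf("cc = %d, dp4a: %s\n", cc, cc >= CC_DP4A ? "yes" : "no");
    return 0;
}

The TODO added in the MUSA branch acknowledges that reusing this NVIDIA-style encoding for MUSA's major/minor numbers only approximates the device's actual capabilities.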
ggml/src/ggml-impl.h
CHANGED
@@ -16,7 +16,7 @@
 #include <arm_sve.h>
 #endif // __ARM_FEATURE_SVE
 
-#if defined(__ARM_NEON) && !defined(__CUDACC__)
+#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //
 //   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
ggml/src/ggml-musa/CMakeLists.txt
CHANGED
@@ -49,7 +49,7 @@ if (MUSAToolkit_FOUND)
 
     set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX)
     foreach(SOURCE ${GGML_SOURCES_MUSA})
-        set(COMPILE_FLAGS "-x musa -mtgpu")
+        set(COMPILE_FLAGS "-fsigned-char -x musa -mtgpu")
         foreach(ARCH ${MUSA_ARCHITECTURES})
            set(COMPILE_FLAGS "${COMPILE_FLAGS} --cuda-gpu-arch=mp_${ARCH}")
        endforeach()
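The new -fsigned-char flag matters for the ARM64 support this commit adds: plain char is unsigned by default on AArch64 but signed on x86-64, so any code that relies on plain char holding negative values would silently change behavior on ARM64 hosts; the flag pins the signed interpretation on both. A minimal illustration (not part of the commit) of the behavior being normalized:

#include <cstdio>

int main() {
    // On x86-64 plain char is signed; on AArch64 it defaults to unsigned.
    // Compiling with -fsigned-char forces the signed interpretation everywhere.
    char c = (char) 0xFF;
    printf("(int) c = %d\n", (int) c);  // -1 if char is signed, 255 if unsigned
    return 0;
}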