uvos committed · Commit 089afa0 · 1 Parent(s): 7b7c5d3
hip : Add hipGraph and VMM support to ROCM (llama/11362)
Changed files:
- ggml/CMakeLists.txt +1 -0
- ggml/src/ggml-cuda/common.cuh +1 -1
- ggml/src/ggml-cuda/ggml-cuda.cu +39 -19
- ggml/src/ggml-cuda/vendors/hip.h +43 -0
- ggml/src/ggml-hip/CMakeLists.txt +8 -0
ggml/CMakeLists.txt
CHANGED
@@ -154,6 +154,7 @@ option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashA
 option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
 
 option(GGML_HIP "ggml: use HIP" OFF)
+option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
 option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
ggml/src/ggml-cuda/common.cuh
CHANGED
@@ -588,7 +588,7 @@ struct ggml_tensor_extra_gpu {
 };
 
 
-#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
+#if ((CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)) || defined(GGML_HIP_GRAPHS)
 #define USE_CUDA_GRAPH
 #endif
 
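
Once USE_CUDA_GRAPH is defined, the backend can record a batch of kernel launches once and replay them as a single graph launch. As a minimal, hedged sketch of that capture pattern (illustrative only, not the actual ggml-cuda.cu logic; `stream` and the CUDA_CHECK macro are assumed from the surrounding backend, and on HIP builds the cuda* names resolve to hip* through vendors/hip.h further down):

    // Record work into a graph, instantiate it once, then launch the
    // executable graph instead of re-launching each kernel individually.
    cudaGraph_t     graph;
    cudaGraphExec_t instance;
    CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed));
    // ... enqueue kernels on `stream` here ...
    CUDA_CHECK(cudaStreamEndCapture(stream, &graph));
    CUDA_CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
    CUDA_CHECK(cudaGraphLaunch(instance, stream));
    CUDA_CHECK(cudaGraphExecDestroy(instance));
    CUDA_CHECK(cudaGraphDestroy(graph));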
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
@@ -62,7 +62,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
-    cudaGetDevice(&id);
+    (void)cudaGetDevice(&id);
 
     GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
     GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
@@ -152,7 +152,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
         CUdevice device;
         CU_CHECK(cuDeviceGet(&device, id));
         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -164,7 +164,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
         alloc_prop.location.id = id;
         CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
     }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
     info.devices[id].vmm = !!device_vmm;
 
     cudaDeviceProp prop;
@@ -300,7 +300,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };
 
 // pool with virtual memory
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
 
@@ -309,6 +309,9 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     size_t pool_used = 0;
     size_t pool_size = 0;
     size_t granularity;
+#if defined(GGML_USE_HIP)
+    std::vector<std::pair<CUdeviceptr, size_t>> mappings;
+#endif
 
     explicit ggml_cuda_pool_vmm(int device) :
         device(device),
@@ -317,7 +320,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
     ~ggml_cuda_pool_vmm() {
         if (pool_addr != 0) {
+#if defined(GGML_USE_HIP)
+            // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
+            for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
+                CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
+            }
+#else
             CU_CHECK(cuMemUnmap(pool_addr, pool_size));
+#endif
             CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
         }
     }
@@ -350,7 +360,11 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         }
 
         // map at the end of the pool
-        CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
+        CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
+        CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
+#if defined(GGML_USE_HIP)
+        mappings.push_back({start_ptr, reserve_size});
+#endif
 
         // the memory allocation handle is no longer needed after mapping
         CU_CHECK(cuMemRelease(handle));
@@ -360,7 +374,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
         access.location.id = device;
         access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-        CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));
+        CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
 
         // add to the pool
         pool_size += reserve_size;
@@ -372,7 +386,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
         GGML_ASSERT(pool_addr != 0);
 
-        void * ptr = (void *) (pool_addr + pool_used);
+        void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
         *actual_size = size;
         pool_used += size;
 
@@ -391,17 +405,17 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         pool_used -= size;
 
         // all deallocations must be in reverse order of the allocations
-        GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
+        GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
     }
 };
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
 
 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
     if (ggml_cuda_info().devices[device].vmm) {
         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
     }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }
 
@@ -547,7 +561,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
         // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();
         GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
@@ -962,7 +976,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
    cudaError_t err = cudaMallocHost((void **) &ptr, size);
    if (err != cudaSuccess) {
        // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();
        GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                       size / 1024.0 / 1024.0, cudaGetErrorString(err));
        return nullptr;
@@ -1209,7 +1223,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
                CUDA_CHECK(err);
            } else {
                // reset the error
-                cudaGetLastError();
+                (void)cudaGetLastError();
            }
        } else {
            cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
@@ -1217,7 +1231,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
                CUDA_CHECK(err);
            } else {
                // reset the error
-                cudaGetLastError();
+                (void)cudaGetLastError();
            }
        }
    }
@@ -2452,7 +2466,7 @@ static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vecto
            if (stat == cudaErrorInvalidDeviceFunction) {
                // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
                // We don't need to update blas nodes, so clear error and move on.
-                cudaGetLastError();
+                (void)cudaGetLastError();
            } else {
                GGML_ASSERT(stat == cudaSuccess);
            }
@@ -2507,14 +2521,20 @@ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx,
 static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 
     cudaGraphExecUpdateResultInfo result_info;
+#ifdef __HIP_PLATFORM_AMD__
+    hipGraphNode_t errorNode;
+    hipError_t stat = hipGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
+#else
     cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+#endif
     if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
 #endif
+
         // The pre-existing graph exec cannot be updated due to violated constraints
         // so instead clear error and re-instantiate
-        cudaGetLastError();
+        (void)cudaGetLastError();
         CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
         cuda_ctx->cuda_graph->instance = nullptr;
         CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
@@ -2742,7 +2762,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
     cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
     if (err != cudaSuccess) {
         // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();
 
         GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
                        size / 1024.0 / 1024.0, cudaGetErrorString(err));
@@ -2762,7 +2782,7 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
    cudaError_t err = cudaHostUnregister(buffer);
    if (err != cudaSuccess) {
        // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();
    }
 }
 
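
Two details in these hunks deserve a note. The new `(char *)` casts exist because under vendors/hip.h CUdeviceptr becomes hipDeviceptr_t, a `void *` typedef rather than CUDA's integer-typed CUdeviceptr, so expressions like `pool_addr + pool_size` stop compiling; offsetting through `char *` yields the same address on both platforms. The `__HIP_PLATFORM_AMD__` branch is needed because the graph-update entry points have diverged. To the best of my reading of the runtime headers (an assumption, not taken from this patch), the prototypes are:

    // CUDA 12 runtime: result reported through a struct
    cudaError_t cudaGraphExecUpdate(cudaGraphExec_t exec, cudaGraph_t graph,
                                    cudaGraphExecUpdateResultInfo * resultInfo);

    // HIP runtime: CUDA 11-style signature with an error-node out-parameter
    hipError_t hipGraphExecUpdate(hipGraphExec_t exec, hipGraph_t graph,
                                  hipGraphNode_t * errorNode,
                                  hipGraphExecUpdateResult * updateResult);

This is also why hip.h below maps cudaGraphExecUpdateResultInfo to hipGraphExecUpdateResult. Finally, the destructor change: because of ROCR-Runtime issue 285, unmapping the whole pool range with a single cuMemUnmap call is unreliable on ROCm, so HIP builds record every mapping in `mappings` and unmap each range individually.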
ggml/src/ggml-cuda/vendors/hip.h
CHANGED
@@ -19,6 +19,12 @@
 #define CUBLAS_TF32_TENSOR_OP_MATH 0
 #define CUDA_R_16F HIPBLAS_R_16F
 #define CUDA_R_32F HIPBLAS_R_32F
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
+#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
+#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
+#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
@@ -74,6 +80,21 @@
 #define cudaMemGetInfo hipMemGetInfo
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
+#define cuDeviceGet hipDeviceGet
+#define CUdevice hipDevice_t
+#define CUdeviceptr hipDeviceptr_t
+#define cuMemUnmap hipMemUnmap
+#define CUmemAccessDesc hipMemAccessDesc
+#define cuMemAddressFree hipMemAddressFree
+#define cuMemRelease hipMemRelease
+#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
+#define cuMemCreate hipMemCreate
+#define cuMemAddressReserve hipMemAddressReserve
+#define cuMemMap hipMemMap
+#define cuMemSetAccess hipMemSetAccess
+#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
+#define CUmemAllocationProp hipMemAllocationProp
+#define cuDeviceGetAttribute hipDeviceGetAttribute
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamDestroy hipStreamDestroy
 #define cudaStreamFireAndForget hipStreamFireAndForget
@@ -81,6 +102,28 @@
 #define cudaStreamPerThread hipStreamPerThread
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
+#define cudaGraphExec_t hipGraphExec_t
+#define cudaGraphNode_t hipGraphNode_t
+#define cudaKernelNodeParams hipKernelNodeParams
+#define cudaKernelNodeParams hipKernelNodeParams
+#define cudaGraphExecDestroy hipGraphExecDestroy
+#define cudaGraphLaunch hipGraphLaunch
+#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
+#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
+#define cudaGraphNodeType hipGraphNodeType
+#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
+#define cudaGraphInstantiate hipGraphInstantiate
+#define cudaStreamEndCapture hipStreamEndCapture
+#define cudaGraphDestroy hipGraphDestroy
+#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
+#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
+#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
+#define cudaGraphNodeGetType hipGraphNodeGetType
+#define cudaGraphGetNodes hipGraphGetNodes
+#define cudaGraphExecUpdate hipGraphExecUpdate
+#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
+#define cudaStreamBeginCapture hipStreamBeginCapture
+#define cudaGraph_t hipGraph_t
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #define __trap() do { abort(); __builtin_unreachable(); } while(0)
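
With the VMM aliases in place, driver-API-style allocation code compiles unchanged for ROCm. A minimal sketch of the reserve/create/map/set-access sequence the pool relies on (hedged: it assumes this header is included under hipcc, that `size` is already aligned to the reported granularity, and `vmm_demo` is a hypothetical name, not part of the patch):

    // Reserve virtual address space, back it with physical memory, map it,
    // and make it read/writable from `device`; the same pattern used by
    // ggml_cuda_pool_vmm above, written against the aliased names.
    static void vmm_demo(int device, size_t size) {
        CUmemAllocationProp prop = {};
        prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id   = device;

        size_t granularity = 0;
        CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));

        CUdeviceptr addr = 0;
        CU_CHECK(cuMemAddressReserve(&addr, size, 0, 0, 0));   // no fixed address

        CUmemGenericAllocationHandle handle;
        CU_CHECK(cuMemCreate(&handle, size, &prop, 0));
        CU_CHECK(cuMemMap(addr, size, 0, handle, 0));
        CU_CHECK(cuMemRelease(handle));                        // unneeded once mapped

        CUmemAccessDesc access = {};
        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        access.location.id   = device;
        access.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        CU_CHECK(cuMemSetAccess(addr, size, &access, 1));
    }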
ggml/src/ggml-hip/CMakeLists.txt
CHANGED
@@ -92,6 +92,14 @@ if (GGML_CUDA_NO_PEER_COPY)
     add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
 endif()
 
+if (GGML_HIP_GRAPHS)
+    add_compile_definitions(GGML_HIP_GRAPHS)
+endif()
+
+if (GGML_CUDA_NO_VMM)
+    add_compile_definitions(GGML_CUDA_NO_VMM)
+endif()
+
 if (CXX_IS_HIPCC)
     set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
     target_link_libraries(ggml-hip PRIVATE hip::device)