Spaces:
Running
Running
Basic Vulkan Multi-GPU implementation (llama/5321)
Browse files
* Initial Vulkan multi-gpu implementation
Move most global variables into backend context
* Add names to backend device functions
* Add further missing cleanup code
* Reduce code duplication in tensor split layer assignment
* generalize LLAMA_SPLIT_LAYER for all backends, do not expose device count and memory in llama.h
* Only do device info print in the beginning and initialize one backend for cpu assist
Add missing cleanup code
* Rework backend memory management to make sure devices and buffers get properly allocated and freed
* Rename cpu assist free function
---------
Co-authored-by: slaren <[email protected]>
ggml.c
CHANGED
|
@@ -2343,7 +2343,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
| 2343 |
#elif defined(GGML_USE_CLBLAST)
|
| 2344 |
ggml_cl_init();
|
| 2345 |
#elif defined(GGML_USE_VULKAN)
|
| 2346 |
-
|
| 2347 |
#elif defined(GGML_USE_SYCL)
|
| 2348 |
ggml_init_sycl();
|
| 2349 |
#endif
|
|
@@ -14850,10 +14850,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
| 14850 |
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
|
| 14851 |
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
| 14852 |
#elif defined(GGML_USE_VULKAN)
|
| 14853 |
-
const bool skip_cpu =
|
| 14854 |
#ifdef GGML_VULKAN_CHECK_RESULTS
|
| 14855 |
if (skip_cpu) {
|
| 14856 |
-
|
| 14857 |
}
|
| 14858 |
#endif
|
| 14859 |
if (skip_cpu) {
|
|
@@ -17269,12 +17269,12 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
| 17269 |
|
| 17270 |
#ifdef GGML_USE_VULKAN
|
| 17271 |
for (int i = 0; i < cgraph->n_nodes; i++) {
|
| 17272 |
-
|
| 17273 |
}
|
| 17274 |
-
|
| 17275 |
|
| 17276 |
for (int i = 0; i < cgraph->n_nodes; i++) {
|
| 17277 |
-
|
| 17278 |
}
|
| 17279 |
#endif
|
| 17280 |
|
|
@@ -17330,7 +17330,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
| 17330 |
}
|
| 17331 |
|
| 17332 |
#ifdef GGML_USE_VULKAN
|
| 17333 |
-
|
| 17334 |
#endif
|
| 17335 |
|
| 17336 |
// performance stats (graph)
|
|
|
|
| 2343 |
#elif defined(GGML_USE_CLBLAST)
|
| 2344 |
ggml_cl_init();
|
| 2345 |
#elif defined(GGML_USE_VULKAN)
|
| 2346 |
+
ggml_vk_init_cpu_assist();
|
| 2347 |
#elif defined(GGML_USE_SYCL)
|
| 2348 |
ggml_init_sycl();
|
| 2349 |
#endif
|
|
|
|
| 14850 |
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
|
| 14851 |
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
| 14852 |
#elif defined(GGML_USE_VULKAN)
|
| 14853 |
+
const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
|
| 14854 |
#ifdef GGML_VULKAN_CHECK_RESULTS
|
| 14855 |
if (skip_cpu) {
|
| 14856 |
+
ggml_vk_check_results_1_cpu_assist(params, tensor);
|
| 14857 |
}
|
| 14858 |
#endif
|
| 14859 |
if (skip_cpu) {
|
|
|
|
| 17269 |
|
| 17270 |
#ifdef GGML_USE_VULKAN
|
| 17271 |
for (int i = 0; i < cgraph->n_nodes; i++) {
|
| 17272 |
+
ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
|
| 17273 |
}
|
| 17274 |
+
ggml_vk_preallocate_buffers_cpu_assist();
|
| 17275 |
|
| 17276 |
for (int i = 0; i < cgraph->n_nodes; i++) {
|
| 17277 |
+
ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
|
| 17278 |
}
|
| 17279 |
#endif
|
| 17280 |
|
|
|
|
| 17330 |
}
|
| 17331 |
|
| 17332 |
#ifdef GGML_USE_VULKAN
|
| 17333 |
+
ggml_vk_graph_cleanup_cpu_assist();
|
| 17334 |
#endif
|
| 17335 |
|
| 17336 |
// performance stats (graph)
|