OccamRazor slaren committed on
Commit
5d130aa
·
unverified ·
1 Parent(s): 7aa3216

Basic Vulkan Multi-GPU implementation (llama/5321)

Browse files

* Initial Vulkan multi-gpu implementation

Move most global variables into backend context

* Add names to backend device functions

* Add further missing cleanup code

* Reduce code duplication in tensor split layer assignment

* generalize LLAMA_SPLIT_LAYER for all backends, do not expose device count and memory in llama.h

* Only do device info print in the beginning and initialize one backend for cpu assist

Add missing cleanup code

* Rework backend memory management to make sure devices and buffers get properly allocated and freed

* Rename cpu assist free function

---------

Co-authored-by: slaren <[email protected]>

Files changed (1) hide show
  1. ggml.c +7 -7
ggml.c CHANGED
@@ -2343,7 +2343,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
2343
  #elif defined(GGML_USE_CLBLAST)
2344
  ggml_cl_init();
2345
  #elif defined(GGML_USE_VULKAN)
2346
- ggml_vk_init();
2347
  #elif defined(GGML_USE_SYCL)
2348
  ggml_init_sycl();
2349
  #endif
@@ -14850,10 +14850,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14850
  GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
14851
  GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
14852
  #elif defined(GGML_USE_VULKAN)
14853
- const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
14854
  #ifdef GGML_VULKAN_CHECK_RESULTS
14855
  if (skip_cpu) {
14856
- ggml_vk_check_results_1(params, tensor);
14857
  }
14858
  #endif
14859
  if (skip_cpu) {
@@ -17269,12 +17269,12 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17269
 
17270
  #ifdef GGML_USE_VULKAN
17271
  for (int i = 0; i < cgraph->n_nodes; i++) {
17272
- ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
17273
  }
17274
- ggml_vk_preallocate_buffers();
17275
 
17276
  for (int i = 0; i < cgraph->n_nodes; i++) {
17277
- ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
17278
  }
17279
  #endif
17280
 
@@ -17330,7 +17330,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17330
  }
17331
 
17332
  #ifdef GGML_USE_VULKAN
17333
- ggml_vk_graph_cleanup();
17334
  #endif
17335
 
17336
  // performance stats (graph)
 
2343
  #elif defined(GGML_USE_CLBLAST)
2344
  ggml_cl_init();
2345
  #elif defined(GGML_USE_VULKAN)
2346
+ ggml_vk_init_cpu_assist();
2347
  #elif defined(GGML_USE_SYCL)
2348
  ggml_init_sycl();
2349
  #endif
 
14850
  GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
14851
  GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
14852
  #elif defined(GGML_USE_VULKAN)
14853
+ const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
14854
  #ifdef GGML_VULKAN_CHECK_RESULTS
14855
  if (skip_cpu) {
14856
+ ggml_vk_check_results_1_cpu_assist(params, tensor);
14857
  }
14858
  #endif
14859
  if (skip_cpu) {
 
17269
 
17270
  #ifdef GGML_USE_VULKAN
17271
  for (int i = 0; i < cgraph->n_nodes; i++) {
17272
+ ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
17273
  }
17274
+ ggml_vk_preallocate_buffers_cpu_assist();
17275
 
17276
  for (int i = 0; i < cgraph->n_nodes; i++) {
17277
+ ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
17278
  }
17279
  #endif
17280
 
 
17330
  }
17331
 
17332
  #ifdef GGML_USE_VULKAN
17333
+ ggml_vk_graph_cleanup_cpu_assist();
17334
  #endif
17335
 
17336
  // performance stats (graph)