Gilad S committed
Commit cf75979 · 1 Parent(s): c24f7b1

fix: use `vm_allocate` to allocate CPU backend buffer on macOS (llama/9875)


* fix: use `vm_allocate` to allocate CPU backend buffer on macOS

* fix: switch to `posix_memalign` to keep existing `free()` usages working

* feat: move `GGML_ALIGNED_MALLOC` to `ggml-backend-impl.h`, add support for `vm_allocate` on macOS

* style: formatting

* fix: move const outside of `#ifndef`

* style: formatting

* fix: unused var

* fix: transform `GGML_ALIGNED_MALLOC` and `GGML_ALIGNED_FREE` into functions and add them to `ggml-impl.h`

* fix: unused var

* fix: page align to `GGUF_DEFAULT_ALIGNMENT`

* fix: page align to `TENSOR_ALIGNMENT`

* fix: convert `TENSOR_ALIGNMENT` to a macro

* fix: increase page size to `32` on iOS

* fix: iOS page size

* fix: `hbw_posix_memalign` alignment
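
For context, the allocation pattern this commit adopts on macOS looks roughly like the sketch below. This is an illustrative, self-contained example, not the committed code (the actual implementation, with its `kern_return_t`-to-`errno` mapping and the `GGML_USE_CPU_HBM` / `GGML_USE_METAL` branches, is in the `ggml/src/ggml.c` diff further down); the helper names `page_alloc` and `page_free` are invented for the sketch.

```c
// Minimal sketch of the vm_allocate / vm_deallocate pattern (macOS only).
// Hypothetical helpers for illustration; error handling reduced to NULL / no-op.
#include <mach/mach.h>  // vm_allocate, vm_deallocate, mach_task_self
#include <stddef.h>
#include <stdio.h>
#include <string.h>

static void * page_alloc(size_t size) {
    vm_address_t addr = 0;
    // VM_FLAGS_ANYWHERE lets the kernel pick the address. The returned
    // region is zero-filled and page-aligned, which comfortably satisfies
    // the 32-byte alignment gguf requires for mmap'd tensors.
    kern_return_t kr = vm_allocate(mach_task_self(), &addr, size, VM_FLAGS_ANYWHERE);
    return kr == KERN_SUCCESS ? (void *) addr : NULL;
}

static void page_free(void * ptr, size_t size) {
    // Unlike free(), vm_deallocate must be told the region size --
    // the reason ggml_aligned_free() gains a size parameter in this commit.
    if (ptr != NULL) {
        vm_deallocate(mach_task_self(), (vm_address_t) ptr, size);
    }
}

int main(void) {
    const size_t size = 1u << 20; // 1 MiB
    void * buf = page_alloc(size);
    if (buf != NULL) {
        memset(buf, 0xAB, size);
        printf("allocated %zu bytes at %p\n", size, buf);
        page_free(buf, size);
    }
    return 0;
}
```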

ggml/src/ggml-backend.cpp CHANGED
@@ -682,8 +682,6 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // backend CPU
 
-static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
-
 static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
     return "CPU";
 
@@ -702,7 +700,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
+    ggml_aligned_free(buffer->context, buffer->size);
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -770,8 +768,8 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    void * data = ggml_aligned_malloc(size);
+
     if (data == NULL) {
         GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
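
The net effect of the hunks above: the CPU backend no longer over-allocates by `TENSOR_ALIGNMENT` bytes to paper over `malloc`'s weaker alignment guarantee; `ggml_aligned_malloc` returns suitably aligned memory by construction, and the matching `ggml_aligned_free` releases the buffer through the same mechanism that allocated it (e.g. `vm_deallocate` rather than `free` on macOS).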
ggml/src/ggml-impl.h CHANGED
@@ -19,6 +19,9 @@ extern "C" {
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+// required for mmap as gguf only guarantees 32-byte alignment
+#define TENSOR_ALIGNMENT 32
+
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
@@ -196,6 +199,11 @@ struct ggml_cgraph {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
+// Memory allocation
+
+void * ggml_aligned_malloc(size_t size);
+void   ggml_aligned_free(void * ptr, size_t size);
+
 #ifdef __cplusplus
 }
 #endif
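
With these declarations in place, callers get one portable allocate/free pair. A minimal usage sketch, assuming it is compiled inside the ggml source tree (the `with_scratch` helper and its error convention are invented for illustration):

```c
#include "ggml-impl.h"  // ggml_aligned_malloc, ggml_aligned_free, TENSOR_ALIGNMENT

// Illustrative helper, not part of the commit: allocate, use, and release
// a scratch buffer through the new aligned-allocation API.
static int with_scratch(size_t buf_size) {
    void * buf = ggml_aligned_malloc(buf_size);
    if (buf == NULL) {
        return -1; // allocation failure is already logged by ggml
    }
    // buf is at least TENSOR_ALIGNMENT (32) bytes aligned, and page-aligned
    // on macOS where vm_allocate backs the allocation.
    // ... use buf ...
    // The original size must be passed back so the vm_deallocate path
    // can release the whole region.
    ggml_aligned_free(buf, buf_size);
    return 0;
}
```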
ggml/src/ggml.c CHANGED
@@ -35,10 +35,6 @@
 #include <omp.h>
 #endif
 
-#ifdef GGML_USE_METAL
-#include <unistd.h>
-#endif
-
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t;
 #endif
 
 #if defined(__APPLE__)
+#include <unistd.h>
+#include <mach/mach.h>
 #include <TargetConditionals.h>
 #endif
 
@@ -387,22 +385,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
+
+void * ggml_aligned_malloc(size_t size) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
-#define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
+    return _aligned_malloc(size, TENSOR_ALIGNMENT);
 #else
-inline static void * ggml_aligned_malloc(size_t size) {
     if (size == 0) {
         GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
         return NULL;
     }
     void * aligned_memory = NULL;
 #ifdef GGML_USE_CPU_HBM
-    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+    int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
+#elif TARGET_OS_OSX
+    kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
+    int result = EFAULT;
+    switch (alloc_status) {
+        case KERN_SUCCESS:
+            result = 0;
+            break;
+        case KERN_INVALID_ADDRESS:
+            result = EINVAL;
+            break;
+        case KERN_NO_SPACE:
+            result = ENOMEM;
+            break;
+        default:
+            result = EFAULT;
+            break;
+    }
 #elif GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
+    const long page_size = sysconf(_SC_PAGESIZE);
+    int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
 #else
-    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+    int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
 #endif
     if (result != 0) {
         // Handle allocation failure
@@ -420,14 +436,26 @@ inline static void * ggml_aligned_malloc(size_t size) {
         return NULL;
     }
     return aligned_memory;
+#endif
 }
-#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
-#ifdef GGML_USE_CPU_HBM
-#define GGML_ALIGNED_FREE(ptr)    if(NULL != ptr) hbw_free(ptr)
+
+void ggml_aligned_free(void * ptr, size_t size) {
+    GGML_UNUSED(size);
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    _aligned_free(ptr);
+#elif GGML_USE_CPU_HBM
+    if (ptr != NULL) {
+        hbw_free(ptr);
+    }
+#elif TARGET_OS_OSX
+    if (ptr != NULL) {
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
+    }
 #else
-#define GGML_ALIGNED_FREE(ptr)    free(ptr)
-#endif
+    free(ptr);
 #endif
+}
+
 
 inline static void * ggml_malloc(size_t size) {
     if (size == 0) {
@@ -3847,7 +3875,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     *ctx = (struct ggml_context) {
         /*.mem_size           =*/ mem_size,
-        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
+        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
         /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
         /*.no_alloc           =*/ params.no_alloc,
         /*.no_alloc_save      =*/ params.no_alloc,
@@ -3885,7 +3913,7 @@ void ggml_free(struct ggml_context * ctx) {
     }
 
     if (ctx->mem_buffer_owned) {
-        GGML_ALIGNED_FREE(ctx->mem_buffer);
+        ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
     }
 
     GGML_FREE(ctx);
@@ -19575,9 +19603,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
+    const int n_threads = threadpool->n_threads_max;
+
 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
-    const int n_threads = threadpool->n_threads_max;
 
     ggml_mutex_lock(&threadpool->mutex);
 
@@ -19597,8 +19626,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     ggml_cond_destroy(&threadpool->cond);
 #endif // GGML_USE_OPENMP
 
-    GGML_ALIGNED_FREE(threadpool->workers);
-    GGML_ALIGNED_FREE(threadpool);
+    const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
+    ggml_aligned_free(threadpool->workers, workers_size);
+    ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
 }
 
 #ifndef GGML_USE_OPENMP
@@ -20030,7 +20060,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     struct ggml_cplan * cplan) {
 
     struct ggml_threadpool * threadpool =
-        GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
+        ggml_aligned_malloc(sizeof(struct ggml_threadpool));
     {
         threadpool->cgraph = cgraph;
         threadpool->cplan = cplan;
@@ -20051,7 +20081,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 
     // Allocate and init workers state
    const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
-    struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
+    struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
 
     memset(workers, 0, workers_size);
     for (int j = 0; j < tpp->n_threads; j++) {
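
Design note: `ggml_aligned_free` takes the allocation size alongside the pointer because `vm_deallocate`, unlike `free` or `_aligned_free`, must be told how many bytes to release; the other branches simply discard the argument via `GGML_UNUSED(size)`. This is also why `ggml_threadpool_free` now hoists `n_threads` out of the `#ifndef GGML_USE_OPENMP` block and recomputes `workers_size` before freeing, and why `ggml_free` passes `ctx->mem_size`.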