Commit cf75979 by Gilad S · Parent: c24f7b1
fix: use `vm_allocate` to allocate CPU backend buffer on macOS (llama/9875)
* fix: use `vm_allocate` to allocate CPU backend buffer on macOS
* fix: switch to `posix_memalign` to keep existing `free()` usages working
* feat: move `GGML_ALIGNED_MALLOC` to `ggml-backend-impl.h`, add support for `vm_allocate` on macOS
* style: formatting
* fix: move const outside of `#ifndef`
* style: formatting
* fix: unused var
* fix: transform `GGML_ALIGNED_MALLOC` and `GGML_ALIGNED_FREE` into functions and add them to `ggml-impl.h`
* fix: unused var
* fix: page align to `GGUF_DEFAULT_ALIGNMENT`
* fix: page align to `TENSOR_ALIGNMENT`
* fix: convert `TENSOR_ALIGNMENT` to a macro
* fix: increase page size to `32` on iOS
* fix: iOS page size
* fix: `hbw_posix_memalign` alignment
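For readers unfamiliar with the Mach VM API the first bullet refers to, here is a minimal standalone sketch (macOS-only, and not code from this commit): `vm_allocate` hands back zero-filled, page-aligned memory, and `vm_deallocate` must be told the size of the region being released, which is why the new `ggml_aligned_free` in this patch carries a `size` parameter.

```c
#include <mach/mach.h>
#include <stdio.h>

int main(void) {
    const vm_size_t size = 1 << 20; // 1 MiB; the kernel rounds up to whole pages
    vm_address_t addr = 0;

    // Zero-filled, page-aligned allocation anywhere in the task's address space.
    kern_return_t kr = vm_allocate(mach_task_self(), &addr, size, VM_FLAGS_ANYWHERE);
    if (kr != KERN_SUCCESS) {
        fprintf(stderr, "vm_allocate failed: %d\n", kr);
        return 1;
    }
    printf("allocated %lu bytes at %p\n", (unsigned long) size, (void *) addr);

    // Unlike free(), vm_deallocate() needs the length of the region.
    vm_deallocate(mach_task_self(), addr, size);
    return 0;
}
```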
Files changed:

- ggml/src/ggml-backend.cpp (+3, -5)
- ggml/src/ggml-impl.h (+8, -0)
- ggml/src/ggml.c (+52, -22)
ggml/src/ggml-backend.cpp (CHANGED)

```diff
@@ -682,8 +682,6 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // backend CPU
 
-static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
-
 static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
     return "CPU";
 
@@ -702,7 +700,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
+    ggml_aligned_free(buffer->context, buffer->size);
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -770,8 +768,8 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
+    void * data = ggml_aligned_malloc(size);
+
     if (data == NULL) {
         GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
```
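Note that the free path now forwards `buffer->size`: on macOS the memory is released with `vm_deallocate`, which, unlike `free()` or `_aligned_free()`, needs the length of the region. A usage sketch of the new pair (the helper names here are hypothetical, and it assumes a build inside the ggml tree where `ggml-impl.h` is on the include path):

```c
#include "ggml-impl.h"

// Hypothetical scratch-buffer helpers built on the new size-aware API.
static void * scratch_alloc(size_t n) {
    return ggml_aligned_malloc(n); // page-backed via vm_allocate on macOS
}

static void scratch_free(void * p, size_t n) {
    ggml_aligned_free(p, n); // n must match the allocation; ignored where unneeded
}
```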
ggml/src/ggml-impl.h (CHANGED)

```diff
@@ -19,6 +19,9 @@ extern "C" {
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+// required for mmap as gguf only guarantees 32-byte alignment
+#define TENSOR_ALIGNMENT 32
+
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
@@ -196,6 +199,11 @@ struct ggml_cgraph {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
+// Memory allocation
+
+void * ggml_aligned_malloc(size_t size);
+void ggml_aligned_free(void * ptr, size_t size);
+
 #ifdef __cplusplus
 }
 #endif
```
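Two details of this header change are worth noting. First, `ggml_aligned_free` takes a `size` argument so the macOS `vm_deallocate` path can work at all. Second, turning `TENSOR_ALIGNMENT` into a macro (the "convert `TENSOR_ALIGNMENT` to a macro" bullet) lets a single definition serve both the C and C++ translation units, and presumably avoids the unused-variable warnings behind the two "fix: unused var" bullets, which a `static const size_t` in a shared header can trigger. An illustrative assertion against the new macro (not part of the commit):

```c
#include <assert.h>
#include <stdint.h>
#include "ggml-impl.h"

// gguf only guarantees 32-byte alignment for mmap'd tensor data, so any
// pointer handed to the CPU backend should satisfy this check.
static void assert_tensor_aligned(const void * p) {
    assert(((uintptr_t) p % TENSOR_ALIGNMENT) == 0);
}
```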
ggml/src/ggml.c (CHANGED)

```diff
@@ -35,10 +35,6 @@
 #include <omp.h>
 #endif
 
-#ifdef GGML_USE_METAL
-#include <unistd.h>
-#endif
-
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t;
 #endif
 
 #if defined(__APPLE__)
+#include <unistd.h>
+#include <mach/mach.h>
 #include <TargetConditionals.h>
 #endif
 
@@ -387,22 +385,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
+
+void * ggml_aligned_malloc(size_t size) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
-#define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
+    return _aligned_malloc(size, TENSOR_ALIGNMENT);
 #else
-inline static void * ggml_aligned_malloc(size_t size) {
     if (size == 0) {
         GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
         return NULL;
     }
     void * aligned_memory = NULL;
 #ifdef GGML_USE_CPU_HBM
-    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+    int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
+#elif TARGET_OS_OSX
+    kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
+    int result = EFAULT;
+    switch (alloc_status) {
+        case KERN_SUCCESS:
+            result = 0;
+            break;
+        case KERN_INVALID_ADDRESS:
+            result = EINVAL;
+            break;
+        case KERN_NO_SPACE:
+            result = ENOMEM;
+            break;
+        default:
+            result = EFAULT;
+            break;
+    }
 #elif GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
+    const long page_size = sysconf(_SC_PAGESIZE);
+    int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
 #else
-    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+    int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
 #endif
     if (result != 0) {
         // Handle allocation failure
@@ -420,14 +436,26 @@ inline static void * ggml_aligned_malloc(size_t size) {
         return NULL;
     }
     return aligned_memory;
+#endif
 }
-#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
-#ifdef GGML_USE_CPU_HBM
-#define GGML_ALIGNED_FREE(ptr)    if(NULL != ptr) hbw_free(ptr)
+
+void ggml_aligned_free(void * ptr, size_t size) {
+    GGML_UNUSED(size);
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    _aligned_free(ptr);
+#elif GGML_USE_CPU_HBM
+    if (ptr != NULL) {
+        hbw_free(ptr);
+    }
+#elif TARGET_OS_OSX
+    if (ptr != NULL) {
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
+    }
 #else
-#define GGML_ALIGNED_FREE(ptr)    free(ptr)
-#endif
+    free(ptr);
 #endif
+}
+
 
 inline static void * ggml_malloc(size_t size) {
     if (size == 0) {
@@ -3847,7 +3875,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     *ctx = (struct ggml_context) {
         /*.mem_size         =*/ mem_size,
-        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
+        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
         /*.no_alloc         =*/ params.no_alloc,
         /*.no_alloc_save    =*/ params.no_alloc,
@@ -3885,7 +3913,7 @@ void ggml_free(struct ggml_context * ctx) {
     }
 
     if (ctx->mem_buffer_owned) {
-        GGML_ALIGNED_FREE(ctx->mem_buffer);
+        ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
     }
 
     GGML_FREE(ctx);
@@ -19575,9 +19603,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
+    const int n_threads = threadpool->n_threads_max;
+
 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
-    const int n_threads = threadpool->n_threads_max;
 
     ggml_mutex_lock(&threadpool->mutex);
 
@@ -19597,8 +19626,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     ggml_cond_destroy(&threadpool->cond);
 #endif // GGML_USE_OPENMP
 
-    GGML_ALIGNED_FREE(threadpool->workers);
-    GGML_ALIGNED_FREE(threadpool);
+    const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
+    ggml_aligned_free(threadpool->workers, workers_size);
+    ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
 }
 
 #ifndef GGML_USE_OPENMP
@@ -20030,7 +20060,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     struct ggml_cplan * cplan) {
 
     struct ggml_threadpool * threadpool =
-        GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
+        ggml_aligned_malloc(sizeof(struct ggml_threadpool));
     {
         threadpool->cgraph = cgraph;
         threadpool->cplan = cplan;
@@ -20051,7 +20081,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 
     // Allocate and init workers state
    const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
-    struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
+    struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
 
     memset(workers, 0, workers_size);
     for (int j = 0; j < tpp->n_threads; j++) {
```