Spaces:
Running
Running
ggml : alloc ggml_contexts on the heap (#2525)
Browse files
* whisper : reduce ggml_context usage
* ggml : allocate contexts on the heap (v2)
* ggml : aligned malloc -> malloc
- ggml/include/ggml.h +4 -3
- ggml/src/ggml-metal.m +1 -1
- ggml/src/ggml.c +17 -50
- src/whisper.cpp +15 -13
ggml/include/ggml.h
CHANGED
|
@@ -217,7 +217,6 @@
|
|
| 217 |
|
| 218 |
#define GGML_MAX_DIMS 4
|
| 219 |
#define GGML_MAX_PARAMS 2048
|
| 220 |
-
#define GGML_MAX_CONTEXTS 64
|
| 221 |
#define GGML_MAX_SRC 10
|
| 222 |
#define GGML_MAX_N_THREADS 512
|
| 223 |
#define GGML_MAX_OP_PARAMS 64
|
|
@@ -657,6 +656,7 @@ extern "C" {
|
|
| 657 |
};
|
| 658 |
|
| 659 |
// scratch buffer
|
|
|
|
| 660 |
struct ggml_scratch {
|
| 661 |
size_t offs;
|
| 662 |
size_t size;
|
|
@@ -760,8 +760,9 @@ extern "C" {
|
|
| 760 |
|
| 761 |
// main
|
| 762 |
|
| 763 |
-
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
| 764 |
-
GGML_API void ggml_free(struct ggml_context * ctx);
|
|
|
|
| 765 |
|
| 766 |
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
| 767 |
|
|
|
|
| 217 |
|
| 218 |
#define GGML_MAX_DIMS 4
|
| 219 |
#define GGML_MAX_PARAMS 2048
|
|
|
|
| 220 |
#define GGML_MAX_SRC 10
|
| 221 |
#define GGML_MAX_N_THREADS 512
|
| 222 |
#define GGML_MAX_OP_PARAMS 64
|
|
|
|
| 656 |
};
|
| 657 |
|
| 658 |
// scratch buffer
|
| 659 |
+
// TODO: deprecate and remove
|
| 660 |
struct ggml_scratch {
|
| 661 |
size_t offs;
|
| 662 |
size_t size;
|
|
|
|
| 760 |
|
| 761 |
// main
|
| 762 |
|
| 763 |
+
GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
|
| 764 |
+
GGML_API void ggml_reset(struct ggml_context * ctx);
|
| 765 |
+
GGML_API void ggml_free (struct ggml_context * ctx);
|
| 766 |
|
| 767 |
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
| 768 |
|
ggml/src/ggml-metal.m
CHANGED
|
@@ -3129,7 +3129,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
| 3129 |
|
| 3130 |
// default buffer
|
| 3131 |
static id<MTLDevice> g_backend_device = nil;
|
| 3132 |
-
static int g_backend_device_ref_count = 0;
|
| 3133 |
|
| 3134 |
static id<MTLDevice> ggml_backend_metal_get_device(void) {
|
| 3135 |
if (g_backend_device == nil) {
|
|
|
|
| 3129 |
|
| 3130 |
// default buffer
|
| 3131 |
static id<MTLDevice> g_backend_device = nil;
|
| 3132 |
+
static int g_backend_device_ref_count = 0; // TODO: make thread-safe
|
| 3133 |
|
| 3134 |
static id<MTLDevice> ggml_backend_metal_get_device(void) {
|
| 3135 |
if (g_backend_device == nil) {
|
ggml/src/ggml.c
CHANGED
|
@@ -308,6 +308,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
|
| 308 |
}
|
| 309 |
|
| 310 |
#define GGML_DEBUG 0
|
|
|
|
| 311 |
#define GGML_GELU_FP16
|
| 312 |
#define GGML_GELU_QUICK_FP16
|
| 313 |
|
|
@@ -1985,7 +1986,7 @@ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
|
| 1985 |
|
| 1986 |
struct ggml_context {
|
| 1987 |
size_t mem_size;
|
| 1988 |
-
void* mem_buffer;
|
| 1989 |
bool mem_buffer_owned;
|
| 1990 |
bool no_alloc;
|
| 1991 |
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
|
|
@@ -3234,7 +3235,6 @@ struct ggml_numa_nodes {
|
|
| 3234 |
//
|
| 3235 |
|
| 3236 |
struct ggml_state {
|
| 3237 |
-
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
|
| 3238 |
struct ggml_numa_nodes numa;
|
| 3239 |
};
|
| 3240 |
|
|
@@ -3816,17 +3816,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
| 3816 |
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
|
| 3817 |
|
| 3818 |
g_state = (struct ggml_state) {
|
| 3819 |
-
/*.contexts =*/ { { 0 } },
|
| 3820 |
/*.numa =*/ {
|
| 3821 |
.n_nodes = 0,
|
| 3822 |
.total_cpus = 0,
|
| 3823 |
},
|
| 3824 |
};
|
| 3825 |
|
| 3826 |
-
for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
|
| 3827 |
-
g_state.contexts[i].used = false;
|
| 3828 |
-
}
|
| 3829 |
-
|
| 3830 |
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
| 3831 |
|
| 3832 |
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
|
@@ -3839,26 +3834,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
| 3839 |
is_first_call = false;
|
| 3840 |
}
|
| 3841 |
|
| 3842 |
-
|
| 3843 |
-
struct ggml_context * ctx = NULL;
|
| 3844 |
-
|
| 3845 |
-
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
|
| 3846 |
-
if (!g_state.contexts[i].used) {
|
| 3847 |
-
g_state.contexts[i].used = true;
|
| 3848 |
-
ctx = &g_state.contexts[i].context;
|
| 3849 |
-
|
| 3850 |
-
GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i);
|
| 3851 |
-
break;
|
| 3852 |
-
}
|
| 3853 |
-
}
|
| 3854 |
-
|
| 3855 |
-
if (ctx == NULL) {
|
| 3856 |
-
GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
|
| 3857 |
-
|
| 3858 |
-
ggml_critical_section_end();
|
| 3859 |
|
| 3860 |
-
|
| 3861 |
-
}
|
| 3862 |
|
| 3863 |
// allow to call ggml_init with 0 size
|
| 3864 |
if (params.mem_size == 0) {
|
|
@@ -3886,42 +3864,31 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
| 3886 |
|
| 3887 |
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
|
| 3888 |
|
| 3889 |
-
ggml_critical_section_end();
|
| 3890 |
-
|
| 3891 |
return ctx;
|
| 3892 |
}
|
| 3893 |
|
| 3894 |
-
void ggml_free(struct ggml_context * ctx) {
|
| 3895 |
if (ctx == NULL) {
|
| 3896 |
return;
|
| 3897 |
}
|
| 3898 |
|
| 3899 |
-
|
| 3900 |
-
|
| 3901 |
-
|
| 3902 |
-
|
| 3903 |
-
|
| 3904 |
-
|
| 3905 |
-
if (&g_state.contexts[i].context == ctx) {
|
| 3906 |
-
g_state.contexts[i].used = false;
|
| 3907 |
-
|
| 3908 |
-
GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
|
| 3909 |
-
__func__, i, ggml_used_mem(ctx));
|
| 3910 |
-
|
| 3911 |
-
if (ctx->mem_buffer_owned) {
|
| 3912 |
-
GGML_ALIGNED_FREE(ctx->mem_buffer);
|
| 3913 |
-
}
|
| 3914 |
|
| 3915 |
-
|
| 3916 |
-
|
| 3917 |
-
|
| 3918 |
}
|
| 3919 |
|
| 3920 |
-
if (
|
| 3921 |
-
|
| 3922 |
}
|
| 3923 |
|
| 3924 |
-
|
| 3925 |
}
|
| 3926 |
|
| 3927 |
size_t ggml_used_mem(const struct ggml_context * ctx) {
|
|
|
|
| 308 |
}
|
| 309 |
|
| 310 |
#define GGML_DEBUG 0
|
| 311 |
+
|
| 312 |
#define GGML_GELU_FP16
|
| 313 |
#define GGML_GELU_QUICK_FP16
|
| 314 |
|
|
|
|
| 1986 |
|
| 1987 |
struct ggml_context {
|
| 1988 |
size_t mem_size;
|
| 1989 |
+
void * mem_buffer;
|
| 1990 |
bool mem_buffer_owned;
|
| 1991 |
bool no_alloc;
|
| 1992 |
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
|
|
|
|
| 3235 |
//
|
| 3236 |
|
| 3237 |
struct ggml_state {
|
|
|
|
| 3238 |
struct ggml_numa_nodes numa;
|
| 3239 |
};
|
| 3240 |
|
|
|
|
| 3816 |
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
|
| 3817 |
|
| 3818 |
g_state = (struct ggml_state) {
|
|
|
|
| 3819 |
/*.numa =*/ {
|
| 3820 |
.n_nodes = 0,
|
| 3821 |
.total_cpus = 0,
|
| 3822 |
},
|
| 3823 |
};
|
| 3824 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3825 |
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
| 3826 |
|
| 3827 |
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
|
|
|
| 3834 |
is_first_call = false;
|
| 3835 |
}
|
| 3836 |
|
| 3837 |
+
ggml_critical_section_end();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3838 |
|
| 3839 |
+
struct ggml_context * ctx = GGML_MALLOC(sizeof(struct ggml_context));
|
|
|
|
| 3840 |
|
| 3841 |
// allow to call ggml_init with 0 size
|
| 3842 |
if (params.mem_size == 0) {
|
|
|
|
| 3864 |
|
| 3865 |
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
|
| 3866 |
|
|
|
|
|
|
|
| 3867 |
return ctx;
|
| 3868 |
}
|
| 3869 |
|
| 3870 |
+
void ggml_reset(struct ggml_context * ctx) {
|
| 3871 |
if (ctx == NULL) {
|
| 3872 |
return;
|
| 3873 |
}
|
| 3874 |
|
| 3875 |
+
ctx->n_objects = 0;
|
| 3876 |
+
ctx->objects_begin = NULL;
|
| 3877 |
+
ctx->objects_end = NULL;
|
| 3878 |
+
ctx->scratch = (struct ggml_scratch) { 0, 0, NULL, };
|
| 3879 |
+
ctx->scratch_save = (struct ggml_scratch) { 0, 0, NULL, };
|
| 3880 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3881 |
|
| 3882 |
+
void ggml_free(struct ggml_context * ctx) {
|
| 3883 |
+
if (ctx == NULL) {
|
| 3884 |
+
return;
|
| 3885 |
}
|
| 3886 |
|
| 3887 |
+
if (ctx->mem_buffer_owned) {
|
| 3888 |
+
GGML_ALIGNED_FREE(ctx->mem_buffer);
|
| 3889 |
}
|
| 3890 |
|
| 3891 |
+
GGML_FREE(ctx);
|
| 3892 |
}
|
| 3893 |
|
| 3894 |
size_t ggml_used_mem(const struct ggml_context * ctx) {
|
src/whisper.cpp
CHANGED
|
@@ -699,9 +699,9 @@ struct whisper_kv_cache {
|
|
| 699 |
struct ggml_tensor * k;
|
| 700 |
struct ggml_tensor * v;
|
| 701 |
|
| 702 |
-
struct ggml_context * ctx = nullptr;
|
| 703 |
-
|
| 704 |
ggml_backend_buffer_t buffer = nullptr;
|
|
|
|
|
|
|
| 705 |
};
|
| 706 |
|
| 707 |
struct whisper_model {
|
|
@@ -941,9 +941,11 @@ static bool whisper_kv_cache_init(
|
|
| 941 |
const int64_t n_mem = n_text_layer*n_ctx;
|
| 942 |
const int64_t n_elements = n_text_state*n_mem;
|
| 943 |
|
|
|
|
|
|
|
| 944 |
struct ggml_init_params params = {
|
| 945 |
-
/*.mem_size =*/
|
| 946 |
-
/*.mem_buffer =*/
|
| 947 |
/*.no_alloc =*/ true,
|
| 948 |
};
|
| 949 |
|
|
@@ -953,17 +955,17 @@ static bool whisper_kv_cache_init(
|
|
| 953 |
cache.cells.clear();
|
| 954 |
cache.cells.resize(n_ctx);
|
| 955 |
|
| 956 |
-
|
| 957 |
|
| 958 |
-
if (!cache.ctx) {
|
| 959 |
WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache context\n", __func__);
|
| 960 |
return false;
|
| 961 |
}
|
| 962 |
|
| 963 |
-
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
| 964 |
-
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
| 965 |
|
| 966 |
-
cache.buffer = ggml_backend_alloc_ctx_tensors(cache.ctx, backend);
|
| 967 |
if (!cache.buffer) {
|
| 968 |
WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache\n", __func__);
|
| 969 |
return false;
|
|
@@ -971,13 +973,13 @@ static bool whisper_kv_cache_init(
|
|
| 971 |
|
| 972 |
ggml_backend_buffer_clear(cache.buffer, 0);
|
| 973 |
|
|
|
|
|
|
|
| 974 |
return true;
|
| 975 |
}
|
| 976 |
|
| 977 |
static void whisper_kv_cache_free(struct whisper_kv_cache & cache) {
|
| 978 |
-
ggml_free(cache.ctx);
|
| 979 |
ggml_backend_buffer_free(cache.buffer);
|
| 980 |
-
cache.ctx = nullptr;
|
| 981 |
}
|
| 982 |
|
| 983 |
static bool whisper_kv_cache_find_slot(
|
|
@@ -2002,7 +2004,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
|
|
| 2002 |
|
| 2003 |
auto & kv_pad = wstate.kv_pad;
|
| 2004 |
|
| 2005 |
-
WHISPER_ASSERT(!!kv_pad.ctx);
|
| 2006 |
|
| 2007 |
const int n_ctx_pad = GGML_PAD(n_ctx, 256);
|
| 2008 |
|
|
@@ -2416,7 +2418,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
|
|
| 2416 |
|
| 2417 |
auto & kv_self = wstate.kv_self;
|
| 2418 |
|
| 2419 |
-
WHISPER_ASSERT(!!kv_self.ctx);
|
| 2420 |
|
| 2421 |
const int n_ctx = kv_self.size;
|
| 2422 |
const int n_state = hparams.n_text_state;
|
|
|
|
| 699 |
struct ggml_tensor * k;
|
| 700 |
struct ggml_tensor * v;
|
| 701 |
|
|
|
|
|
|
|
| 702 |
ggml_backend_buffer_t buffer = nullptr;
|
| 703 |
+
|
| 704 |
+
std::vector<uint8_t> ctx_buf;
|
| 705 |
};
|
| 706 |
|
| 707 |
struct whisper_model {
|
|
|
|
| 941 |
const int64_t n_mem = n_text_layer*n_ctx;
|
| 942 |
const int64_t n_elements = n_text_state*n_mem;
|
| 943 |
|
| 944 |
+
cache.ctx_buf.resize(2*ggml_tensor_overhead());
|
| 945 |
+
|
| 946 |
struct ggml_init_params params = {
|
| 947 |
+
/*.mem_size =*/ cache.ctx_buf.size(),
|
| 948 |
+
/*.mem_buffer =*/ cache.ctx_buf.data(),
|
| 949 |
/*.no_alloc =*/ true,
|
| 950 |
};
|
| 951 |
|
|
|
|
| 955 |
cache.cells.clear();
|
| 956 |
cache.cells.resize(n_ctx);
|
| 957 |
|
| 958 |
+
struct ggml_context * ctx = ggml_init(params);
|
| 959 |
|
| 960 |
+
if (!ctx) {
|
| 961 |
WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache context\n", __func__);
|
| 962 |
return false;
|
| 963 |
}
|
| 964 |
|
| 965 |
+
cache.k = ggml_new_tensor_1d(ctx, wtype, n_elements);
|
| 966 |
+
cache.v = ggml_new_tensor_1d(ctx, wtype, n_elements);
|
| 967 |
|
| 968 |
+
cache.buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
| 969 |
if (!cache.buffer) {
|
| 970 |
WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache\n", __func__);
|
| 971 |
return false;
|
|
|
|
| 973 |
|
| 974 |
ggml_backend_buffer_clear(cache.buffer, 0);
|
| 975 |
|
| 976 |
+
ggml_free(ctx);
|
| 977 |
+
|
| 978 |
return true;
|
| 979 |
}
|
| 980 |
|
| 981 |
static void whisper_kv_cache_free(struct whisper_kv_cache & cache) {
|
|
|
|
| 982 |
ggml_backend_buffer_free(cache.buffer);
|
|
|
|
| 983 |
}
|
| 984 |
|
| 985 |
static bool whisper_kv_cache_find_slot(
|
|
|
|
| 2004 |
|
| 2005 |
auto & kv_pad = wstate.kv_pad;
|
| 2006 |
|
| 2007 |
+
WHISPER_ASSERT(!!kv_pad.buffer);
|
| 2008 |
|
| 2009 |
const int n_ctx_pad = GGML_PAD(n_ctx, 256);
|
| 2010 |
|
|
|
|
| 2418 |
|
| 2419 |
auto & kv_self = wstate.kv_self;
|
| 2420 |
|
| 2421 |
+
WHISPER_ASSERT(!!kv_self.buffer);
|
| 2422 |
|
| 2423 |
const int n_ctx = kv_self.size;
|
| 2424 |
const int n_state = hparams.n_text_state;
|