ggerganov committed
Commit 3ccf40a · unverified · parent: b116fe7

ggml : alloc ggml_contexts on the heap (#2525)


* whisper : reduce ggml_context usage

* ggml : allocate contexts on the heap (v2)

* ggml : aligned malloc -> malloc

Files changed (4)
  1. ggml/include/ggml.h +4 -3
  2. ggml/src/ggml-metal.m +1 -1
  3. ggml/src/ggml.c +17 -50
  4. src/whisper.cpp +15 -13
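
The practical effect of the change: ggml_init no longer hands out slots from a fixed global pool of GGML_MAX_CONTEXTS (64) contexts, so creating many contexts at once can no longer fail for lack of a free slot. A minimal sketch of the new behavior (the context count and sizes are illustrative, not from this commit):

    #include "ggml.h"

    int main() {
        constexpr int N_CTX = 128; // more than the old GGML_MAX_CONTEXTS cap of 64
        ggml_context * ctxs[N_CTX];

        ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead(), // tiny metadata-only context
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true,
        };

        // before this commit, the 65th live ggml_init() found no unused slot
        // in g_state and returned NULL; contexts are now malloc'd individually
        for (int i = 0; i < N_CTX; ++i) {
            ctxs[i] = ggml_init(params);
        }

        for (int i = 0; i < N_CTX; ++i) {
            ggml_free(ctxs[i]);
        }
    }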
ggml/include/ggml.h CHANGED
@@ -217,7 +217,6 @@
 
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_PARAMS 2048
-#define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_SRC 10
 #define GGML_MAX_N_THREADS 512
 #define GGML_MAX_OP_PARAMS 64
@@ -657,6 +656,7 @@ extern "C" {
 };
 
 // scratch buffer
+// TODO: deprecate and remove
 struct ggml_scratch {
     size_t offs;
     size_t size;
@@ -760,8 +760,9 @@ extern "C" {
 
 // main
 
-GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-GGML_API void ggml_free(struct ggml_context * ctx);
+GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
+GGML_API void ggml_reset(struct ggml_context * ctx);
+GGML_API void ggml_free (struct ggml_context * ctx);
 
 GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
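The header now declares a three-function lifecycle: ggml_init pairs with ggml_free as before, and the new ggml_reset clears a context's objects while keeping its memory buffer. A minimal usage sketch (the buffer size is an arbitrary example):

    #include "ggml.h"

    int main() {
        ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024, // arbitrary example size
            /*.mem_buffer =*/ nullptr,      // ggml allocates and owns the buffer
            /*.no_alloc   =*/ false,
        };

        ggml_context * ctx = ggml_init(params);

        ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        (void) a;

        // new in this commit: drop all objects (previously created tensors
        // such as a must be treated as invalid) but keep the buffer for reuse
        ggml_reset(ctx);

        // frees the owned buffer and the heap-allocated context struct
        ggml_free(ctx);
    }
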
ggml/src/ggml-metal.m CHANGED
@@ -3129,7 +3129,7 @@ static enum ggml_status ggml_metal_graph_compute(
 
 // default buffer
 static id<MTLDevice> g_backend_device = nil;
-static int g_backend_device_ref_count = 0;
+static int g_backend_device_ref_count = 0; // TODO: make thread-safe
 
 static id<MTLDevice> ggml_backend_metal_get_device(void) {
     if (g_backend_device == nil) {
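
The added comment only flags the problem; nothing in this commit synchronizes the counter. One possible direction for the TODO, sketched with std::atomic for brevity (the real file is Objective-C, where C11 stdatomic would play the same role; the helper names below are hypothetical, not from the codebase):

    #include <atomic>

    // hypothetical sketch: make the ref count itself race-free.
    // note: this only fixes the counter; the lazy creation of
    // g_backend_device would still need its own synchronization.
    static std::atomic<int> g_backend_device_ref_count{0};

    static void backend_device_acquire() { // hypothetical helper
        g_backend_device_ref_count.fetch_add(1, std::memory_order_relaxed);
    }

    static void backend_device_release() { // hypothetical helper
        if (g_backend_device_ref_count.fetch_sub(1, std::memory_order_acq_rel) == 1) {
            // last reference dropped - the device could be released here
        }
    }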
ggml/src/ggml.c CHANGED
@@ -308,6 +308,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 }
 
 #define GGML_DEBUG 0
+
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
 
@@ -1985,7 +1986,7 @@ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
 
 struct ggml_context {
     size_t mem_size;
-    void* mem_buffer;
+    void * mem_buffer;
     bool mem_buffer_owned;
     bool no_alloc;
     bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
@@ -3234,7 +3235,6 @@ struct ggml_numa_nodes {
 //
 
 struct ggml_state {
-    struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
     struct ggml_numa_nodes numa;
 };
 
@@ -3816,17 +3816,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
 
     g_state = (struct ggml_state) {
-        /*.contexts =*/ { { 0 } },
         /*.numa =*/ {
             .n_nodes = 0,
             .total_cpus = 0,
         },
     };
 
-    for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
-        g_state.contexts[i].used = false;
-    }
-
     const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
 
     GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
@@ -3839,26 +3834,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         is_first_call = false;
     }
 
-    // find non-used context in g_state
-    struct ggml_context * ctx = NULL;
-
-    for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
-        if (!g_state.contexts[i].used) {
-            g_state.contexts[i].used = true;
-            ctx = &g_state.contexts[i].context;
-
-            GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i);
-            break;
-        }
-    }
-
-    if (ctx == NULL) {
-        GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
-
-        ggml_critical_section_end();
-
-        return NULL;
-    }
+    ggml_critical_section_end();
+
+    struct ggml_context * ctx = GGML_MALLOC(sizeof(struct ggml_context));
 
     // allow to call ggml_init with 0 size
     if (params.mem_size == 0) {
@@ -3886,42 +3864,31 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
-    ggml_critical_section_end();
-
     return ctx;
 }
 
-void ggml_free(struct ggml_context * ctx) {
+void ggml_reset(struct ggml_context * ctx) {
     if (ctx == NULL) {
         return;
     }
 
-    // make this function thread safe
-    ggml_critical_section_start();
-
-    bool found = false;
-
-    for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
-        if (&g_state.contexts[i].context == ctx) {
-            g_state.contexts[i].used = false;
-
-            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
-                    __func__, i, ggml_used_mem(ctx));
-
-            if (ctx->mem_buffer_owned) {
-                GGML_ALIGNED_FREE(ctx->mem_buffer);
-            }
+    ctx->n_objects = 0;
+    ctx->objects_begin = NULL;
+    ctx->objects_end = NULL;
+    ctx->scratch = (struct ggml_scratch) { 0, 0, NULL, };
+    ctx->scratch_save = (struct ggml_scratch) { 0, 0, NULL, };
+}
 
-            found = true;
-            break;
-        }
-    }
+void ggml_free(struct ggml_context * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
 
-    if (!found) {
-        GGML_PRINT_DEBUG("%s: context not found\n", __func__);
-    }
+    if (ctx->mem_buffer_owned) {
+        GGML_ALIGNED_FREE(ctx->mem_buffer);
+    }
 
-    ggml_critical_section_end();
+    GGML_FREE(ctx);
 }
 
 size_t ggml_used_mem(const struct ggml_context * ctx) {
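
With contexts no longer stored in g_state, ggml_free takes no lock at all and ggml_init releases the critical section right after the one-time global initialization, so independent contexts stop serializing on a shared slot table. An illustrative sketch of per-thread contexts, assuming the remaining one-time init inside ggml_init stays internally synchronized (the threading here is mine, not from the commit):

    #include <thread>
    #include <vector>
    #include "ggml.h"

    // each worker owns an independent, heap-allocated context
    static void worker() {
        ggml_init_params params = {
            /*.mem_size   =*/ 1024*1024, // example size
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ false,
        };

        ggml_context * ctx = ggml_init(params); // no global slot table involved
        ggml_tensor  * t   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
        (void) t;

        ggml_free(ctx); // frees buffer and context without taking the critical section
    }

    int main() {
        std::vector<std::thread> threads;
        for (int i = 0; i < 4; ++i) threads.emplace_back(worker);
        for (auto & t : threads) t.join();
    }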
src/whisper.cpp CHANGED
@@ -699,9 +699,9 @@ struct whisper_kv_cache {
     struct ggml_tensor * k;
     struct ggml_tensor * v;
 
-    struct ggml_context * ctx = nullptr;
-
     ggml_backend_buffer_t buffer = nullptr;
+
+    std::vector<uint8_t> ctx_buf;
 };
 
 struct whisper_model {
@@ -941,9 +941,11 @@ static bool whisper_kv_cache_init(
     const int64_t n_mem = n_text_layer*n_ctx;
     const int64_t n_elements = n_text_state*n_mem;
 
+    cache.ctx_buf.resize(2*ggml_tensor_overhead());
+
     struct ggml_init_params params = {
-        /*.mem_size   =*/ 2*ggml_tensor_overhead(),
-        /*.mem_buffer =*/ nullptr,
+        /*.mem_size   =*/ cache.ctx_buf.size(),
+        /*.mem_buffer =*/ cache.ctx_buf.data(),
         /*.no_alloc   =*/ true,
     };
 
@@ -953,17 +955,17 @@ static bool whisper_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
-    cache.ctx = ggml_init(params);
+    struct ggml_context * ctx = ggml_init(params);
 
-    if (!cache.ctx) {
+    if (!ctx) {
         WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache context\n", __func__);
         return false;
     }
 
-    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    cache.k = ggml_new_tensor_1d(ctx, wtype, n_elements);
+    cache.v = ggml_new_tensor_1d(ctx, wtype, n_elements);
 
-    cache.buffer = ggml_backend_alloc_ctx_tensors(cache.ctx, backend);
+    cache.buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
     if (!cache.buffer) {
         WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache\n", __func__);
         return false;
@@ -971,13 +973,13 @@ static bool whisper_kv_cache_init(
 
     ggml_backend_buffer_clear(cache.buffer, 0);
 
+    ggml_free(ctx);
+
     return true;
 }
 
 static void whisper_kv_cache_free(struct whisper_kv_cache & cache) {
-    ggml_free(cache.ctx);
     ggml_backend_buffer_free(cache.buffer);
-    cache.ctx = nullptr;
 }
 
 static bool whisper_kv_cache_find_slot(
@@ -2002,7 +2004,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
 
     auto & kv_pad = wstate.kv_pad;
 
-    WHISPER_ASSERT(!!kv_pad.ctx);
+    WHISPER_ASSERT(!!kv_pad.buffer);
 
     const int n_ctx_pad = GGML_PAD(n_ctx, 256);
 
@@ -2416,7 +2418,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
 
    auto & kv_self = wstate.kv_self;
 
-    WHISPER_ASSERT(!!kv_self.ctx);
+    WHISPER_ASSERT(!!kv_self.buffer);
 
    const int n_ctx = kv_self.size;
    const int n_state = hparams.n_text_state;
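
whisper.cpp now treats the ggml context as a short-lived, metadata-only helper: the tensor headers for k and v live in the caller-owned ctx_buf (which is why ctx_buf became a member of whisper_kv_cache and must outlive the tensors), the tensor data lives in the backend buffer, and the context itself is freed before whisper_kv_cache_init returns. The same pattern in isolation (a hedged sketch; kv_tensors and kv_tensors_init are hypothetical stand-ins for whisper_kv_cache and whisper_kv_cache_init):

    #include <cstdint>
    #include <vector>
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // hypothetical stand-in, trimmed to the allocation pattern
    struct kv_tensors {
        ggml_tensor * k = nullptr;
        ggml_tensor * v = nullptr;
        ggml_backend_buffer_t buffer = nullptr;
        std::vector<uint8_t> ctx_buf; // backs the tensor headers; must outlive k/v
    };

    static bool kv_tensors_init(kv_tensors & kv, ggml_backend_t backend,
                                ggml_type wtype, int64_t n_elements) {
        // enough room in the caller-owned buffer for two tensor headers
        kv.ctx_buf.resize(2*ggml_tensor_overhead());

        ggml_init_params params = {
            /*.mem_size   =*/ kv.ctx_buf.size(),
            /*.mem_buffer =*/ kv.ctx_buf.data(), // context does not own this memory
            /*.no_alloc   =*/ true,              // tensor data goes to the backend buffer
        };

        ggml_context * ctx = ggml_init(params);
        if (!ctx) {
            return false;
        }

        kv.k = ggml_new_tensor_1d(ctx, wtype, n_elements);
        kv.v = ggml_new_tensor_1d(ctx, wtype, n_elements);

        kv.buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);

        // freeing the heap-allocated context does not touch the caller-owned
        // ctx_buf, so kv.k and kv.v remain valid afterwards
        ggml_free(ctx);

        return kv.buffer != nullptr;
    }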