ggerganov committed
Commit 3cc64d6 (unverified)
Parent: 9027276

backend : add eval callback (llama/4935)


* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when the user doesn't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* simple : no need for ggml_is_contiguous + fix bool parse

* llama : fix callback placement in llama_context_params

* backend : avoid double-ask callback calls

* simple : restore examples, imatrix will serve as a demo

Files changed (2)
  1. ggml-backend.c +40 -2
  2. ggml-backend.h +11 -0
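
Before the diffs, a minimal sketch of how the new API fits together, assuming a ggml_backend_sched_t has already been created with ggml_backend_sched_new (observe_mul_mat is a hypothetical callback name, not part of this commit):

    // hypothetical callback: only GGML_OP_MUL_MAT results are observed, so the
    // scheduler can batch all other nodes into a single graph compute call
    static bool observe_mul_mat(struct ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            // ask == true: tell the scheduler whether this node's data is needed
            return t->op == GGML_OP_MUL_MAT;
        }
        // ask == false: the node has been computed and can be inspected here;
        // returning false cancels the rest of the graph compute
        return true;
    }

    // registration (sched comes from ggml_backend_sched_new):
    ggml_backend_sched_set_eval_callback(sched, observe_mul_mat, NULL);
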
ggml-backend.c CHANGED
@@ -802,6 +802,9 @@ struct ggml_backend_sched {
     __attribute__((aligned(GGML_MEM_ALIGN)))
 #endif
     char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+
+    ggml_backend_sched_eval_callback callback_eval;
+    void * callback_eval_user_data;
 };
 
 #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
@@ -1324,9 +1327,38 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         ggml_graph_dump_dot(split->graph, NULL, split_filename);
 #endif
 
+
         uint64_t compute_start_us = ggml_time_us();
-        ggml_backend_graph_compute(split_backend, &split->graph);
-        //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+        if (!sched->callback_eval) {
+            ggml_backend_graph_compute(split_backend, &split->graph);
+            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+        } else {
+            // similar to ggml_backend_compare_graph_backend
+            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
+                struct ggml_tensor * t = split->graph.nodes[j0];
+
+                // check if the user needs data from this node
+                bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+
+                int j1 = j0;
+
+                // determine the range [j0, j1] of nodes that can be computed together
+                while (!need && j1 < split->graph.n_nodes - 1) {
+                    t = split->graph.nodes[++j1];
+                    need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+                }
+
+                struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
+
+                ggml_backend_graph_compute(split_backend, &gv);
+
+                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
+                    break;
+                }
+
+                j0 = j1;
+            }
+        }
         uint64_t compute_end_us = ggml_time_us();
         compute_us[split_backend_id] += compute_end_us - compute_start_us;
     }
@@ -1431,6 +1463,12 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched_reset(sched);
 }
 
+
+void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
+    sched->callback_eval = callback;
+    sched->callback_eval_user_data = user_data;
+}
+
 int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }
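
To illustrate the node-grouping logic added to sched_compute_splits above, here is a standalone sketch (not part of the commit) that applies the same [j0, j1] range merging to a plain array; need_node stands in for callback_eval(t, true, ...):

    #include <stdbool.h>
    #include <stdio.h>

    // stand-in for callback_eval(t, /*ask=*/true, user_data): pretend the user
    // only wants to observe nodes 2 and 5
    static bool need_node(int i) {
        return i == 2 || i == 5;
    }

    int main(void) {
        const int n_nodes = 7;
        for (int j0 = 0; j0 < n_nodes; j0++) {
            bool need = need_node(j0);
            int j1 = j0;
            // extend the range while the user does not need the intermediate results
            while (!need && j1 < n_nodes - 1) {
                need = need_node(++j1);
            }
            // one ggml_backend_graph_compute call would cover nodes [j0, j1]
            printf("compute [%d, %d]%s\n", j0, j1, need ? " then observe last node" : "");
            j0 = j1;
        }
        return 0;
    }

With 7 nodes this prints compute [0, 2], [3, 5] and [6, 6], i.e. three compute calls instead of seven, which is the "group nodes in a single compute when the user doesn't need them" behavior described in the commit message.
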
ggml-backend.h CHANGED
@@ -148,6 +148,14 @@ extern "C" {
     struct ggml_backend_sched;
     typedef struct ggml_backend_sched * ggml_backend_sched_t;
 
+    // when ask == true, the scheduler wants to know if the user wants to observe this node
+    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
+    //
+    // when ask == false, the scheduler is passing the node tensor to the user for observation
+    // if the user returns false, the scheduler will cancel the graph compute
+    //
+    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
+
     // Initialize a backend scheduler
     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
@@ -168,6 +176,9 @@ extern "C" {
     // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
     GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
+    // Set a callback to be called for each resulting node during graph compute
+    GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+
     //
     // Utils
     //
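
A hedged sketch of a callback body that uses the observation phase declared above; ggml_op_name, ggml_nbytes and ggml_backend_tensor_get are existing ggml/ggml-backend APIs, while print_tensor_cb and the host-buffer handling are illustrative:

    #include <stdio.h>
    #include <stdlib.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    static bool print_tensor_cb(struct ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            return true; // observe every node (simplest, but disables node batching)
        }
        // ask == false: the node has been computed on the backend
        printf("%-32s %-16s %zu bytes\n", t->name, ggml_op_name(t->op), ggml_nbytes(t));
        // copy the data to the host if it needs to be inspected (e.g. for imatrix-style statistics)
        void * buf = malloc(ggml_nbytes(t));
        if (buf) {
            ggml_backend_tensor_get(t, buf, 0, ggml_nbytes(t));
            // ... inspect buf here ...
            free(buf);
        }
        return true; // returning false would cancel the remaining graph compute
    }

As noted in the commit message, the imatrix example serves as the in-tree demo of this callback.
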