backend : add eval callback (llama/4935)
* backend : add eval callback
ggml-ci
* backend : group nodes in a single compute when the user doesn't need them
* backend : clean-up the implementation
ggml-ci
* simple : do not perform tensor data copy if not needed
* simple : fix
* simple : no need for ggml_is_contiguous + fix bool parse
* llama : fix callback placement in llama_context_params
* backend : avoid double-ask callback calls
* simple : restore examples, imatrix will serve as a demo
- ggml-backend.c +40 -2
- ggml-backend.h +11 -0
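Before the diffs, a minimal sketch of how the new eval callback is intended to be wired up from user code. The callback body, the `sched` handle, and the includes are illustrative assumptions, not part of this change:

#include <stdio.h>
#include "ggml.h"
#include "ggml-backend.h"

// hypothetical callback that asks to observe every node of the graph
static bool observe_all(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        return true;  // ask phase: request the data of every node
    }
    // observe phase: the node has been computed and its data can be inspected
    fprintf(stderr, "computed %-16s op=%s\n", t->name, ggml_op_name(t->op));
    return true;      // returning false would cancel the rest of the compute
}

// after creating the scheduler (a ggml_backend_sched_t named `sched` is assumed):
//     ggml_backend_sched_set_eval_callback(sched, observe_all, NULL);

With a callback installed, the scheduler no longer evaluates each split in one call: it computes up to each observed node and hands that node to the callback, which is the mechanism the imatrix demo mentioned above relies on.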
ggml-backend.c
CHANGED
@@ -802,6 +802,9 @@ struct ggml_backend_sched {
     __attribute__((aligned(GGML_MEM_ALIGN)))
 #endif
     char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+
+    ggml_backend_sched_eval_callback callback_eval;
+    void * callback_eval_user_data;
 };
 
 #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)

@@ -1324,9 +1327,38 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         ggml_graph_dump_dot(split->graph, NULL, split_filename);
 #endif
 
+
         uint64_t compute_start_us = ggml_time_us();
-        ggml_backend_graph_compute(split_backend, &split->graph);
-        //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+        if (!sched->callback_eval) {
+            ggml_backend_graph_compute(split_backend, &split->graph);
+            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+        } else {
+            // similar to ggml_backend_compare_graph_backend
+            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
+                struct ggml_tensor * t = split->graph.nodes[j0];
+
+                // check if the user needs data from this node
+                bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+
+                int j1 = j0;
+
+                // determine the range [j0, j1] of nodes that can be computed together
+                while (!need && j1 < split->graph.n_nodes - 1) {
+                    t = split->graph.nodes[++j1];
+                    need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+                }
+
+                struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
+
+                ggml_backend_graph_compute(split_backend, &gv);
+
+                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
+                    break;
+                }
+
+                j0 = j1;
+            }
+        }
         uint64_t compute_end_us = ggml_time_us();
         compute_us[split_backend_id] += compute_end_us - compute_start_us;
     }

@@ -1431,6 +1463,12 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched_reset(sched);
 }
 
+
+void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
+    sched->callback_eval = callback;
+    sched->callback_eval_user_data = user_data;
+}
+
 int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }
ggml-backend.h
CHANGED
@@ -148,6 +148,14 @@ extern "C" {
     struct ggml_backend_sched;
     typedef struct ggml_backend_sched * ggml_backend_sched_t;
 
+    // when ask == true, the scheduler wants to know if the user wants to observe this node
+    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
+    //
+    // when ask == false, the scheduler is passing the node tensor to the user for observation
+    // if the user returns false, the scheduler will cancel the graph compute
+    //
+    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
+
     // Initialize a backend scheduler
     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

@@ -168,6 +176,9 @@ extern "C" {
     // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
     GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
+    // Set a callback to be called for each resulting node during graph compute
+    GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+
     //
     // Utils
     //
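As a complement to the header comments above, a hedged sketch of a more selective callback that exercises both sides of the ask/observe contract. The struct, the function name, and the statistics step are illustrative assumptions, not part of the API:

#include <stdlib.h>
#include "ggml.h"
#include "ggml-backend.h"

// illustrative user state passed through `user_data`
struct eval_ctx {
    int n_seen;
    int n_max;  // cancel the graph compute after observing this many nodes
};

static bool collect_mul_mat(struct ggml_tensor * t, bool ask, void * user_data) {
    struct eval_ctx * ctx = (struct eval_ctx *) user_data;

    if (ask) {
        // claim only mat-mul results; the nodes we decline are grouped by the
        // scheduler into a single ggml_backend_graph_compute call
        return t->op == GGML_OP_MUL_MAT;
    }

    // observe phase: copy the computed data to host memory before reading it
    const size_t nbytes = ggml_nbytes(t);
    char * data = malloc(nbytes);
    if (data != NULL) {
        ggml_backend_tensor_get(t, data, 0, nbytes);
        // ... accumulate statistics over `data` here ...
        free(data);
    }

    // returning false cancels the remaining graph compute
    return ++ctx->n_seen < ctx->n_max;
}

Because every node the callback declines during the ask phase is batched into a single compute call, a selective callback like this keeps the overhead close to the no-callback path.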