ggml : introduce ggml_status (ggml/750)
* using enum as an exit code instead of macros
* update return type from enum to unsigned int
* indentation fix
* compound update:
  - ggml_compute_exit_code -> ggml_status
  - changed ggml_status from a bit-field type to simple codes
  - ggml_status to string cast
* ggml_status to string cast
* GGML_CALL was removed
Co-authored-by: slaren <[email protected]>
---------
Co-authored-by: slaren <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
- ggml-backend-impl.h +4 -3
- ggml-backend.c +18 -21
- ggml-backend.h +16 -15
- ggml-cuda.cu +2 -2
- ggml-kompute.cpp +2 -2
- ggml-metal.m +4 -4
- ggml-opencl.cpp +2 -2
- ggml-sycl.cpp +2 -2
- ggml-vulkan.cpp +2 -2
- ggml.c +23 -6
- ggml.h +13 -4
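
The practical effect for callers is that every graph-compute entry point now reports an enum ggml_status instead of a bare bool or int. A minimal caller-side sketch (the helper name, and the assumption that a backend and graph have already been built elsewhere, are illustrative and not part of this commit):

#include <stdio.h>
#include <stdbool.h>
#include "ggml-backend.h"

// Illustrative helper, not part of this commit: run an already-built graph on an
// already-initialized backend and log any non-success status.
static bool compute_or_log(ggml_backend_t backend, struct ggml_cgraph * graph) {
    enum ggml_status status = ggml_backend_graph_compute(backend, graph);
    if (status != GGML_STATUS_SUCCESS) {
        // GGML_STATUS_ABORTED (1) means the cplan abort callback stopped the run;
        // GGML_STATUS_FAILED (-1) and GGML_STATUS_ALLOC_FAILED (-2) are hard errors.
        fprintf(stderr, "graph compute: %s\n", ggml_status_to_string(status));
        return false;
    }
    return true;
}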
ggml-backend-impl.h
CHANGED

@@ -91,13 +91,14 @@ extern "C" {
         // (optional) complete all pending operations
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
-        // compute graph with a plan
+        // create a plan for ggml_cgraph and free it
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
+        // compute graph with a plan
+        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // compute graph without a plan (async)
-        bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
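
For out-of-tree backends, the interface change above means the graph_plan_compute and graph_compute callbacks must switch their return type in the same way. A sketch of an adapted callback, assuming a hypothetical my_backend_exec_node() that runs one node and reports success (both names are illustrative; the pattern mirrors the kompute/opencl changes below):

// Illustrative only: 'my_backend_exec_node' is a stand-in for a backend's real
// per-node dispatch; only the return type and return values change here.
GGML_CALL static enum ggml_status my_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    for (int i = 0; i < cgraph->n_nodes; i++) {
        if (!my_backend_exec_node(backend->context, cgraph->nodes[i])) {
            return GGML_STATUS_FAILED;   // previously: return false;
        }
    }
    return GGML_STATUS_SUCCESS;          // previously: return true;
}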
ggml-backend.c
CHANGED

@@ -262,11 +262,11 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
     backend->iface.graph_plan_free(backend, plan);
 }
 
-void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    backend->iface.graph_plan_compute(backend, plan);
+enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    return backend->iface.graph_plan_compute(backend, plan);
 }
 
-bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }
 
@@ -732,15 +732,15 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
     GGML_UNUSED(backend);
 }
 
-GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
-    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
 
     GGML_UNUSED(backend);
 }
 
-GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -755,8 +755,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
     cplan.abort_callback = cpu_ctx->abort_callback;
     cplan.abort_callback_data = cpu_ctx->abort_callback_data;
 
-    ggml_graph_compute(cgraph, &cplan);
-    return true;
+    return ggml_graph_compute(cgraph, &cplan);
 }
 
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -1437,7 +1436,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     return true;
 }
 
-static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
+static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
 
@@ -1472,8 +1471,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
 
        uint64_t compute_start_us = ggml_time_us();
        if (!sched->callback_eval) {
-            if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
-                return false;
+            enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph);
+            if (ec != GGML_STATUS_SUCCESS) {
+                return ec;
            }
            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
        } else {
@@ -1494,8 +1494,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
 
                struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
 
-                if (!ggml_backend_graph_compute(split_backend, &gv)) {
-                    return false;
+                enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv);
+                if (ec != GGML_STATUS_SUCCESS) {
+                    return ec;
                }
 
                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
@@ -1519,7 +1520,7 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
    }
#endif
 
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
 ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
@@ -1581,7 +1582,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
     return true;
 }
 
-bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
 
     if (!sched->is_reset) {
@@ -1590,14 +1591,10 @@ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg
 
     ggml_backend_sched_split_graph(sched, graph);
     if (!ggml_backend_sched_alloc_splits(sched)) {
-        return false;
+        return GGML_STATUS_ALLOC_FAILED;
     }
 
-    if (!ggml_backend_sched_compute_splits(sched)) {
-        return false;
-    }
-
-    return true;
+    return ggml_backend_sched_compute_splits(sched);
 }
 
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
ggml-backend.h
CHANGED

@@ -66,12 +66,13 @@ extern "C" {
 
     GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
 
-    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
-    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+    GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
 
     // tensor copy between different backends
     GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
@@ -157,26 +158,26 @@ extern "C" {
     typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
 
     // Initialize a backend scheduler
-    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
-    GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
+    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
     // Initialize backend buffers from a measure graph
-    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
     // Get the number of splits of the last graph
-    GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+    GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
 
-    GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
-    GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
-    GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
+    GGML_API void                 ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+    GGML_API ggml_backend_t       ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
     // Allocate and compute graph on the backend scheduler
-    GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
 
     // Reset all assignments and allocators - must be called before changing the node backends
-    GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
+    GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
     // Set a callback to be called for each resulting node during graph compute
-    GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+    GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
 
     //
     // Utils
ggml-cuda.cu
CHANGED

@@ -12241,7 +12241,7 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     UNUSED(backend);
 }
 
-GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     ggml_cuda_set_main_device(cuda_ctx->device);
@@ -12277,7 +12277,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
         GGML_ASSERT(ok);
     }
 
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
 GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
ggml-kompute.cpp
CHANGED

@@ -1927,10 +1927,10 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g
     return ggml_backend_kompute_buffer_type(ctx->device);
 }
 
-static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
     ggml_vk_graph_compute(ctx, cgraph);
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
 static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
ggml-metal.m
CHANGED

@@ -748,7 +748,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
    }
 }
 
-static bool ggml_metal_graph_compute(
+static enum ggml_status ggml_metal_graph_compute(
        struct ggml_metal_context * ctx,
               struct ggml_cgraph * gf) {
 
@@ -2484,7 +2484,7 @@ static bool ggml_metal_graph_compute(
        MTLCommandBufferStatus status = [command_buffer status];
        if (status != MTLCommandBufferStatusCompleted) {
            GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
-            return false;
+            return GGML_STATUS_FAILED;
        }
    }
 
@@ -2493,7 +2493,7 @@ static bool ggml_metal_graph_compute(
        }
 
    }
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -2795,7 +2795,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
     UNUSED(backend);
 }
 
-GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
 
     return ggml_metal_graph_compute(metal_ctx, cgraph);
ggml-opencl.cpp
CHANGED

@@ -2231,7 +2231,7 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg
     GGML_UNUSED(backend);
 }
 
-static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
+static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
     for (int i = 0; i < graph->n_nodes; ++i) {
         ggml_tensor * node = graph->nodes[i];
         switch (node->op) {
@@ -2246,7 +2246,7 @@ static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgrap
        }
    }
 
-    return true;
+    return GGML_STATUS_SUCCESS;
 
    GGML_UNUSED(backend);
 }
ggml-sycl.cpp
CHANGED

@@ -15581,7 +15581,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }
 
-GGML_CALL static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
     ggml_sycl_set_main_device(sycl_ctx->device);
 
@@ -15613,7 +15613,7 @@ GGML_CALL static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, gg
         GGML_ASSERT(ok);
     }
 
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
 GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
ggml-vulkan.cpp
CHANGED

@@ -5092,7 +5092,7 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
     ctx->transfer_ctx = nullptr;
 }
 
-GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -5135,7 +5135,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
 
     ggml_vk_graph_cleanup(ctx);
 
-    return true;
+    return GGML_STATUS_SUCCESS;
 
     UNUSED(backend);
 }
ggml.c
CHANGED

@@ -320,6 +320,16 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
 
+const char * ggml_status_to_string(enum ggml_status status) {
+    switch (status) {
+        case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
+        case GGML_STATUS_FAILED:       return "GGML status: error (operation failed)";
+        case GGML_STATUS_SUCCESS:      return "GGML status: success";
+        case GGML_STATUS_ABORTED:      return "GGML status: warning (operation aborted)";
+        default:                       GGML_ASSERT(false);
+    }
+}
+
 // note: do not use these inside ggml.c
 // these are meant to be used via the ggml.h API
 float ggml_fp16_to_fp32(ggml_fp16_t x) {
@@ -17400,6 +17410,7 @@ struct ggml_compute_state {
     ggml_thread_t thrd;
     int ith;
     struct ggml_compute_state_shared * shared;
+    enum ggml_status ec;
 };
 
 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@@ -17693,7 +17704,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     while (true) {
         if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
             state->shared->node_n += 1;
-            return GGML_EXIT_ABORTED;
+            state->ec = GGML_STATUS_ABORTED;
+            return 0;
         }
 
         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@@ -17815,7 +17827,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        }
    }
 
-    return GGML_EXIT_SUCCESS;
+    return 0;
 }
 
 struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
@@ -18011,7 +18023,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     return cplan;
 }
 
-int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
+enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     {
         GGML_ASSERT(cplan);
         GGML_ASSERT(cplan->n_threads > 0);
@@ -18055,6 +18067,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
                .thrd = 0,
                .ith = j,
                .shared = &state_shared,
+                .ec = GGML_STATUS_SUCCESS,
            };
 
            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
@@ -18065,12 +18078,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
 
     workers[0].ith = 0;
     workers[0].shared = &state_shared;
+    workers[0].ec = GGML_STATUS_SUCCESS;
 
     const int64_t perf_start_cycles = ggml_perf_cycles();
     const int64_t perf_start_time_us = ggml_perf_time_us();
 
     // this is a work thread too
-    int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);
+    ggml_graph_compute_thread(&workers[0]);
+    enum ggml_status compute_status = workers[0].ec;
 
     // don't leave affinity set on the main thread
     clear_numa_thread_affinity();
@@ -18080,6 +18095,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
        for (int j = 1; j < n_threads; j++) {
            const int rc = ggml_thread_join(workers[j].thrd, NULL);
            GGML_ASSERT(rc == 0);
+            if (workers[j].ec != GGML_STATUS_SUCCESS)
+                compute_status = workers[j].ec;
        }
    }
 
@@ -18107,14 +18124,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     return compute_status;
 }
 
-void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
+enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
 
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
 
     cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
 
-    ggml_graph_compute(cgraph, &cplan);
+    return ggml_graph_compute(cgraph, &cplan);
 }
 
 struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
ggml.h
CHANGED

@@ -315,6 +315,16 @@
 extern "C" {
 #endif
 
+    enum ggml_status {
+        GGML_STATUS_ALLOC_FAILED = -2,
+        GGML_STATUS_FAILED       = -1,
+        GGML_STATUS_SUCCESS      =  0,
+        GGML_STATUS_ABORTED      =  1,
+    };
+
+    // get ggml_status name string
+    GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
+
     typedef uint16_t ggml_fp16_t;
 
     // convert FP16 <-> FP32
@@ -1940,12 +1950,11 @@ extern "C" {
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API int               ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-
+    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API enum ggml_status  ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+    GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
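
At the ggml.h level the same pattern applies to the low-level compute calls. A hedged usage sketch, assuming a context ctx that already holds the graph gf and enough memory for the work buffer (the wrapper name and thread count are illustrative, not part of the patch):

#include <stdio.h>
#include "ggml.h"

// Illustrative only: run a prepared graph with the new status-returning API;
// ctx and gf are assumed to have been created elsewhere (ggml_init + graph building).
static enum ggml_status run_graph(struct ggml_context * ctx, struct ggml_cgraph * gf) {
    enum ggml_status st = ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);
    if (st != GGML_STATUS_SUCCESS) {
        // includes GGML_STATUS_ABORTED when the cplan abort callback fires
        fprintf(stderr, "%s\n", ggml_status_to_string(st));
    }
    return st;
}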