Commit 1bdb50a · 1 parent: 7d7ac98
ggml-backend : add device and backend reg interfaces (llama/9707)
Files changed:
- ggml/include/ggml-backend.h +144 -59
- ggml/include/ggml-blas.h +3 -3
- ggml/include/ggml-cann.h +9 -9
- ggml/include/ggml-cuda.h +17 -15
- ggml/include/ggml-metal.h +4 -2
- ggml/include/ggml-rpc.h +5 -5
- ggml/include/ggml-sycl.h +8 -8
- ggml/include/ggml-vulkan.h +7 -7
- ggml/include/ggml.h +29 -39
- ggml/src/CMakeLists.txt +1 -1
- ggml/src/ggml-backend-impl.h +155 -71
- ggml/src/ggml-backend.cpp +2496 -0
- ggml/src/ggml-blas.cpp +8 -10
- ggml/src/ggml-cann.cpp +54 -101
- ggml/src/ggml-cuda.cu +353 -189
- ggml/src/ggml-kompute.cpp +2 -23
- ggml/src/ggml-metal.m +24 -25
- ggml/src/ggml-rpc.cpp +26 -27
- ggml/src/ggml-sycl.cpp +45 -62
- ggml/src/ggml-vulkan.cpp +48 -69
- ggml/src/ggml.c +25 -21
ggml/include/ggml-backend.h
CHANGED
@@ -12,43 +12,52 @@ extern "C" {
     typedef struct ggml_backend_event * ggml_backend_event_t;
     typedef struct ggml_backend * ggml_backend_t;
     typedef void * ggml_backend_graph_plan_t;
+    typedef struct ggml_backend_reg * ggml_backend_reg_t;
+    typedef struct ggml_backend_device * ggml_backend_dev_t;

     //
+    // Backend buffer type
     //

+    GGML_API const char *          ggml_backend_buft_name          (ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+    GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
+    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);
+
+    //
+    // Backend buffer
+    //

     enum ggml_backend_buffer_usage {
         GGML_BACKEND_BUFFER_USAGE_ANY = 0,
         GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
         GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
     };

+    GGML_API const char *                   ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API void *                         ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void                           ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool                           ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage     (ggml_backend_buffer_t buffer);
+    GGML_API ggml_backend_buffer_type_t     ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
+
+    // tensor copy between different backends
+    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

     //
+    // Backend (stream)
     //

     GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);

@@ -64,9 +73,9 @@ extern "C" {
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);

     // "offset" refers to the offset of the tensor data for setting/getting data
+    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);

     GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

@@ -76,65 +85,121 @@ extern "C" {
     GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
     GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+    // NOTE: will be removed, use device version instead
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
     GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
     GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

     // asynchronous copy
     // the copy is performed after all the currently queued operations in backend_src
     // backend_dst will wait for the copy to complete before performing other operations
     // automatic fallback to sync copy if async is not supported
     GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);

+    GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);

     //
+    // Events
     //

+    GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
+    GGML_API void                 ggml_backend_event_free(ggml_backend_event_t event);
+    GGML_API void                 ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
+    GGML_API void                 ggml_backend_event_synchronize(ggml_backend_event_t event);
+    GGML_API void                 ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);

+    //
+    // Backend device
+    //

+    enum ggml_backend_dev_type {
+        GGML_BACKEND_DEVICE_TYPE_CPU,
+        GGML_BACKEND_DEVICE_TYPE_GPU,
+        // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
+        GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
+        GGML_BACKEND_DEVICE_TYPE_GPU_FULL
+    };

+    // functionality supported by the device
+    struct ggml_backend_dev_caps {
+        // asynchronous operations
+        bool async;
+        // pinned host buffer
+        bool host_buffer;
+        // event synchronization
+        bool events;
+    };

+    // all the device properties
+    struct ggml_backend_dev_props {
+        const char * name;
+        const char * description;
+        size_t memory_free;
+        size_t memory_total;
+        enum ggml_backend_dev_type type;
+        struct ggml_backend_dev_caps caps;
+    };

+    GGML_API const char *               ggml_backend_dev_name(ggml_backend_dev_t device);
+    GGML_API const char *               ggml_backend_dev_description(ggml_backend_dev_t device);
+    GGML_API void                       ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
+    GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
+    GGML_API void                       ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
+    GGML_API ggml_backend_reg_t         ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
+    GGML_API ggml_backend_t             ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
+    GGML_API ggml_backend_buffer_t      ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);

+    GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
+    GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);

+    //
+    // Backend (reg)
+    //

+    GGML_API const char *       ggml_backend_reg_name(ggml_backend_reg_t reg);
+    GGML_API size_t             ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
+    GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
+    GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
+    GGML_API void               ggml_backend_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data);

+    // Functions that may be obtained using ggml_backend_reg_get_proc_address
+    typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);

     //
     // Backend registry
     //

+    // Backend (reg) enumeration
+    GGML_API size_t             ggml_backend_reg_count(void);
+    GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
+    GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);

+    // Device enumeration
+    GGML_API size_t             ggml_backend_dev_count(void);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);

+    // Set the log callback for all registered backends
+    GGML_API void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data);

+    // Direct backend (stream) initialization
+    // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
+    GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
+    GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
+    GGML_API ggml_backend_t ggml_backend_init_best(void);

     //
     // Backend scheduler
     //

+    // The backend scheduler allows for multiple backend devices to be used together
     // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
     // The backends are selected based on:
     // - the backend that supports the operation

@@ -169,9 +234,9 @@ extern "C" {
     }
     */

-    struct ggml_backend_sched;
     typedef struct ggml_backend_sched * ggml_backend_sched_t;

+    // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
     // when ask == true, the scheduler wants to know if the user wants to observe this node
     // this allows the scheduler to batch nodes together in order to evaluate them in a single call
     //

@@ -226,7 +291,7 @@ extern "C" {
     GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
     GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);

+    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

     // Compare the output of two backends
     GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);

@@ -235,6 +300,26 @@ extern "C" {
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
     GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);

+    //
+    // CPU backend
+    //

+    GGML_API ggml_backend_t ggml_backend_cpu_init(void);

+    GGML_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
+    GGML_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
+    GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

+    // Create a backend buffer from an existing pointer
+    GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

+    GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

+#ifdef GGML_USE_CPU_HBM
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif

 #ifdef __cplusplus
 }
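The registry and device entry points above replace the old per-backend initialization pattern. Below is a minimal usage sketch, not part of this commit, that assumes only the declarations listed above: it enumerates the registered devices and then opens a backend (stream) on the best available one. The resulting handle is used exactly as before (ggml_backend_graph_compute, ggml_backend_free).

    #include <stdio.h>
    #include "ggml-backend.h"

    // enumerate all registered devices and create a backend (stream) on the best one
    static ggml_backend_t init_any_backend(void) {
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);

            size_t free, total;
            ggml_backend_dev_memory(dev, &free, &total);
            printf("device %zu: %s (%s), %zu/%zu MiB free\n", i,
                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                free/1024/1024, total/1024/1024);
        }

        // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
        return ggml_backend_init_best();
    }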
ggml/include/ggml-blas.h
CHANGED
@@ -9,13 +9,13 @@ extern "C" {
 #endif

 // backend API
+GGML_API ggml_backend_t ggml_backend_blas_init(void);

+GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);

 // number of threads used for conversion to float
 // for openblas and blis, this will also set the number of threads used for blas operations
+GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);

 #ifdef __cplusplus
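For reference, a small sketch (not from the commit) of how these three calls compose; ggml_backend_free from ggml-backend.h is the usual teardown:

    #include "ggml-blas.h"

    // create the BLAS backend and set the thread count used for float conversion / BLAS ops
    static ggml_backend_t make_blas_backend(int n_threads) {
        ggml_backend_t backend = ggml_backend_blas_init();
        if (backend != NULL && ggml_backend_is_blas(backend)) {
            ggml_backend_blas_set_n_threads(backend, n_threads);
        }
        return backend;
    }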
ggml/include/ggml-cann.h
CHANGED
@@ -44,7 +44,7 @@ extern "C" {
  * @param device The index of the device to initialize.
  * @return A pointer to the initialized backend instance, or nullptr on failure.
  */
+GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);

@@ -55,7 +55,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device);
  * @param backend The backend instance to check.
  * @return True if the backend is a CANN backend, false otherwise.
  */
+GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);

@@ -67,7 +67,7 @@ GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend);
  * @return A pointer to the buffer type interface for the specified device, or
  *         nullptr if the device index is out of range.
  */
+GGML_API ggml_backend_buffer_type_t
 ggml_backend_cann_buffer_type(int32_t device);

@@ -78,14 +78,14 @@ ggml_backend_cann_buffer_type(int32_t device);
  *
  * @return The number of CANN devices available.
  */
+GGML_API int32_t ggml_backend_cann_get_device_count(void);

 /**
  * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
  *
  * @return A pointer to the host buffer type interface.
  */
+GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);

@@ -97,7 +97,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type
  * @param description Pointer to a buffer where the description will be written.
  * @param description_size Size of the description buffer.
  */
+GGML_API void ggml_backend_cann_get_device_description(
     int32_t device, char* description, size_t description_size);

@@ -112,9 +112,9 @@ GGML_API GGML_CALL void ggml_backend_cann_get_device_description(
  * @param total Pointer to a variable where the total memory size will be
  *        stored.
  */
+GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
+                                                   size_t* free,
+                                                   size_t* total);

 /**
  * @brief Set the logging callback for GGML.
ggml/include/ggml-cuda.h
CHANGED
@@ -3,6 +3,10 @@
 #include "ggml.h"
 #include "ggml-backend.h"

+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #ifdef GGML_USE_HIPBLAS
 #define GGML_CUDA_NAME "ROCm"
 #define GGML_CUBLAS_NAME "hipBLAS"

@@ -13,35 +17,33 @@
 #define GGML_CUDA_NAME "CUDA"
 #define GGML_CUBLAS_NAME "cuBLAS"
 #endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 #define GGML_CUDA_MAX_DEVICES 16

 // backend API
+GGML_API ggml_backend_t ggml_backend_cuda_init(int device);

+GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);

 // device buffer
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

 // split tensor buffer that splits matrices by rows across multiple devices
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);

 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

+GGML_API int  ggml_backend_cuda_get_device_count(void);
+GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

+GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);

 GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
+GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
+
 #ifdef __cplusplus
 }
 #endif
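A hedged usage sketch (not part of the commit) showing how the device-query functions above can be used to inspect the CUDA devices and initialize a backend directly, without going through the new registry:

    #include <stdio.h>
    #include "ggml-cuda.h"

    // print every CUDA device and initialize a backend on device 0, if any
    static ggml_backend_t init_cuda_or_null(void) {
        int n_devices = ggml_backend_cuda_get_device_count();
        for (int i = 0; i < n_devices; i++) {
            char desc[128];
            size_t free, total;
            ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
            ggml_backend_cuda_get_device_memory(i, &free, &total);
            printf("CUDA%d: %s, %zu/%zu MiB free\n", i, desc, free/1024/1024, total/1024/1024);
        }
        return n_devices > 0 ? ggml_backend_cuda_init(0) : NULL;
    }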
ggml/include/ggml-metal.h
CHANGED
@@ -1,3 +1,5 @@
+// Note: this description is outdated
+//
 // An interface allowing to compute ggml_cgraph with Metal
 //
 // This is a fully functional interface that extends ggml with GPU support for Apple devices.

@@ -43,11 +45,11 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);

 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

+GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);

 GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);

+GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
ggml/include/ggml-rpc.h
CHANGED
@@ -10,14 +10,14 @@ extern "C" {
 #define GGML_RPC_MAX_SERVERS 16

 // backend API
+GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);

+GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);

+GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);

+GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);

 #ifdef __cplusplus
 }
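start_rpc_server serves a local backend to remote clients on the given endpoint. A minimal sketch, assuming a CPU backend and placeholder memory figures (a real server should advertise the actual memory of the backing device):

    #include "ggml-backend.h"
    #include "ggml-rpc.h"

    // expose a local CPU backend to remote clients on the given endpoint, e.g. "0.0.0.0:50052"
    static void serve_cpu_backend(const char * endpoint) {
        ggml_backend_t backend = ggml_backend_cpu_init();

        size_t free_mem  = 16ull*1024*1024*1024; // advertised free memory (placeholder value)
        size_t total_mem = 16ull*1024*1024*1024; // advertised total memory (placeholder value)

        // serves requests for this backend until the server is stopped
        start_rpc_server(backend, endpoint, free_mem, total_mem);

        ggml_backend_free(backend);
    }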
ggml/include/ggml-sycl.h
CHANGED
@@ -23,20 +23,20 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);

 // split tensor buffer that splits matrices by rows across multiple devices
+GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);

 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

+GGML_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
+GGML_API int  ggml_backend_sycl_get_device_count();
+GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);

 // SYCL doesn't support registering host memory, keep here for reference
+// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
+// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
 #ifdef __cplusplus
 }
 #endif
ggml/include/ggml-vulkan.h
CHANGED
@@ -13,16 +13,16 @@ extern "C" {
 GGML_API void ggml_vk_instance_init(void);

 // backend API
+GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);

+GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
+GGML_API int  ggml_backend_vk_get_device_count(void);
+GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
+GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);

+GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);

 #ifdef __cplusplus
 }
ggml/include/ggml.h
CHANGED
@@ -187,16 +187,6 @@
 # define GGML_API
 #endif

-#ifdef GGML_MULTIPLATFORM
-#    if defined(_WIN32)
-#        define GGML_CALL
-#    else
-#        define GGML_CALL __attribute__((__ms_abi__))
-#    endif
-#else
-#    define GGML_CALL
-#endif
-
 // TODO: support for clang
 #ifdef __GNUC__
 #    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))

@@ -340,7 +330,7 @@ extern "C" {
     };

     // get ggml_status name string
+    GGML_API const char * ggml_status_to_string(enum ggml_status status);

     // ieee 754-2008 half-precision float16
     // todo: make this not an integral type

@@ -717,46 +707,46 @@ extern "C" {
     GGML_API void ggml_print_object (const struct ggml_object * obj);
     GGML_API void ggml_print_objects(const struct ggml_context * ctx);

+    GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nrows     (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes    (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN

+    GGML_API int64_t ggml_blck_size(enum ggml_type type);
+    GGML_API size_t  ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API size_t  ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row

     GGML_DEPRECATED(
     GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
     "use ggml_row_size() instead");

+    GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);
+    GGML_API const char * ggml_op_symbol(enum ggml_op   op);

+    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name

+    GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

+    GGML_API bool ggml_is_quantized(enum ggml_type type);

     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

+    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_empty     (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars

+    GGML_API bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+    GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+    GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2

     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

@@ -848,7 +838,7 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

+    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);

     GGML_API const char *         ggml_get_name(const struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_name(      struct ggml_tensor * tensor, const char * name);

@@ -1568,7 +1558,7 @@ extern "C" {
         "use ggml_rope_ext_inplace instead");

     // compute correction dims for YaRN RoPE scaling
+    void ggml_rope_yarn_corr_dims(
         int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);

     // rotary position embedding backward, i.e compute dx from dy
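The size helpers above are related: for a row of ne elements, ggml_row_size(type, ne) equals ne / ggml_blck_size(type) * ggml_type_size(type), since quantized types store data in fixed-size blocks. A small illustrative check (not from the commit):

    #include <assert.h>
    #include "ggml.h"

    // size in bytes of one row of `ne` elements of the given type
    static size_t row_bytes(enum ggml_type type, int64_t ne) {
        assert(ne % ggml_blck_size(type) == 0); // quantized types store whole blocks only
        size_t sz = ggml_row_size(type, ne);
        assert(sz == (size_t)(ne / ggml_blck_size(type)) * ggml_type_size(type));
        return sz;
    }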
ggml/src/CMakeLists.txt
CHANGED
@@ -1325,7 +1325,7 @@ add_library(ggml
     ../include/ggml-backend.h
     ggml.c
     ggml-alloc.c
-    ggml-backend.c
+    ggml-backend.cpp
     ggml-quants.c
     ggml-quants.h
     ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
ggml/src/ggml-backend-impl.h
CHANGED
|
@@ -9,145 +9,229 @@ extern "C" {
|
|
| 9 |
#endif
|
| 10 |
|
| 11 |
//
|
| 12 |
-
// Backend buffer
|
| 13 |
//
|
| 14 |
|
| 15 |
-
// buffer type
|
| 16 |
-
typedef void * ggml_backend_buffer_type_context_t;
|
| 17 |
-
|
| 18 |
struct ggml_backend_buffer_type_i {
|
| 19 |
-
const char * (*
|
| 20 |
// allocate a buffer of this type
|
| 21 |
-
ggml_backend_buffer_t (*
|
| 22 |
// tensor alignment
|
| 23 |
-
size_t (*
|
| 24 |
-
// max buffer size that can be allocated
|
| 25 |
-
size_t (*
|
| 26 |
-
// data size needed to allocate the tensor, including padding
|
| 27 |
-
size_t (*
|
| 28 |
-
// check if tensor data is in host memory
|
| 29 |
-
bool (*
|
| 30 |
};
|
| 31 |
|
| 32 |
struct ggml_backend_buffer_type {
|
| 33 |
struct ggml_backend_buffer_type_i iface;
|
| 34 |
-
|
|
|
|
| 35 |
};
|
| 36 |
|
| 37 |
-
//
|
| 38 |
-
|
|
|
|
| 39 |
|
| 40 |
struct ggml_backend_buffer_i {
|
| 41 |
-
const char * (*
|
| 42 |
-
|
| 43 |
-
void
|
| 44 |
-
|
| 45 |
-
void
|
| 46 |
-
|
| 47 |
-
void (*
|
| 48 |
-
|
| 49 |
-
void (*
|
| 50 |
-
void (*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
};
|
| 52 |
|
| 53 |
struct ggml_backend_buffer {
|
| 54 |
struct ggml_backend_buffer_i iface;
|
| 55 |
ggml_backend_buffer_type_t buft;
|
| 56 |
-
|
| 57 |
size_t size;
|
| 58 |
enum ggml_backend_buffer_usage usage;
|
| 59 |
};
|
| 60 |
|
| 61 |
-
|
| 62 |
-
ggml_backend_buffer_type_t
|
| 63 |
-
struct ggml_backend_buffer_i
|
| 64 |
-
|
| 65 |
-
size_t
|
| 66 |
|
| 67 |
// do not use directly, use ggml_backend_tensor_copy instead
|
| 68 |
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
|
| 69 |
|
|
|
|
| 70 |
// buffer that contains a collection of buffers
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
|
| 75 |
//
|
| 76 |
-
// Backend
|
| 77 |
//
|
| 78 |
|
| 79 |
-
typedef void * ggml_backend_context_t;
|
| 80 |
-
|
| 81 |
struct ggml_backend_i {
|
| 82 |
-
const char * (*
|
| 83 |
|
| 84 |
-
void (*
|
| 85 |
|
| 86 |
// buffer allocation
|
| 87 |
-
ggml_backend_buffer_type_t (*
|
| 88 |
|
| 89 |
// (optional) asynchronous tensor data access
|
| 90 |
-
void (*
|
| 91 |
-
void (*
|
| 92 |
-
bool (*
|
| 93 |
|
| 94 |
// (optional) complete all pending operations
|
| 95 |
-
void (*
|
| 96 |
|
| 97 |
-
// compute graph with a plan (not used currently)
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 101 |
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
|
| 102 |
-
void (*
|
| 103 |
// compute the graph with the plan
|
| 104 |
-
enum ggml_status (*
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
-
//
|
| 107 |
-
|
| 108 |
|
|
|
|
| 109 |
// check if the backend can compute an operation
|
| 110 |
-
bool (*
|
| 111 |
|
| 112 |
// check if the backend can use tensors allocated in a buffer type
|
| 113 |
-
bool (*
|
| 114 |
|
| 115 |
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
|
| 116 |
// these should be expensive operations with large batch sizes that may benefit from running on this backend
|
| 117 |
// even if the weight has to be copied from the CPU temporarily
|
| 118 |
-
bool (*
|
| 119 |
|
| 120 |
// (optional) event synchronization
|
| 121 |
-
//
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
void (*GGML_CALL event_record) (ggml_backend_event_t event);
|
| 126 |
-
// wait for an event on on a different backend instance
|
| 127 |
-
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
| 128 |
-
// block until an event is recorded
|
| 129 |
-
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
|
| 130 |
};
|
| 131 |
|
| 132 |
struct ggml_backend {
|
| 133 |
ggml_guid_t guid;
|
| 134 |
-
|
| 135 |
struct ggml_backend_i iface;
|
| 136 |
-
|
|
|
|
| 137 |
};
|
| 138 |
|
| 139 |
struct ggml_backend_event {
|
| 140 |
-
|
| 141 |
void * context;
|
| 142 |
};
|
| 143 |
|
| 144 |
//
|
| 145 |
-
// Backend
|
| 146 |
//
|
| 147 |
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
#ifdef __cplusplus
|
| 153 |
}
|
|
|
|
| 9 |
#endif
|
| 10 |
|
| 11 |
//
|
| 12 |
+
// Backend buffer type
|
| 13 |
//
|
| 14 |
|
|
|
|
|
|
|
|
|
|
| 15 |
struct ggml_backend_buffer_type_i {
|
| 16 |
+
const char * (*get_name) (ggml_backend_buffer_type_t buft);
|
| 17 |
// allocate a buffer of this type
|
| 18 |
+
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
| 19 |
// tensor alignment
|
| 20 |
+
size_t (*get_alignment) (ggml_backend_buffer_type_t buft);
|
| 21 |
+
// (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
|
| 22 |
+
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
|
| 23 |
+
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
|
| 24 |
+
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
| 25 |
+
// (optional) check if tensor data is in host memory (defaults to false)
|
| 26 |
+
bool (*is_host) (ggml_backend_buffer_type_t buft);
|
| 27 |
};
|
| 28 |
|
| 29 |
struct ggml_backend_buffer_type {
|
| 30 |
struct ggml_backend_buffer_type_i iface;
|
| 31 |
+
ggml_backend_dev_t device;
|
| 32 |
+
void * context;
|
| 33 |
};
|
| 34 |
|
| 35 |
+
//
|
| 36 |
+
// Backend buffer
|
| 37 |
+
//
|
| 38 |
|
| 39 |
struct ggml_backend_buffer_i {
|
| 40 |
+
const char * (*get_name) (ggml_backend_buffer_t buffer);
|
| 41 |
+
// (optional) free the buffer
|
| 42 |
+
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
| 43 |
+
// base address of the buffer
|
| 44 |
+
void * (*get_base) (ggml_backend_buffer_t buffer);
|
| 45 |
+
// (optional) initialize a tensor in the buffer (eg. add tensor extras)
|
| 46 |
+
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
| 47 |
+
// tensor data access
|
| 48 |
+
void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
| 49 |
+
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
| 50 |
+
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
| 51 |
+
// (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
|
| 52 |
+
bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
| 53 |
+
// clear the entire buffer
|
| 54 |
+
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
| 55 |
+
// (optional) reset any internal state due to tensor initialization, such as tensor extras
|
| 56 |
+
void (*reset) (ggml_backend_buffer_t buffer);
|
| 57 |
};
|
| 58 |
|
| 59 |
struct ggml_backend_buffer {
|
| 60 |
struct ggml_backend_buffer_i iface;
|
| 61 |
ggml_backend_buffer_type_t buft;
|
| 62 |
+
void * context;
|
| 63 |
size_t size;
|
| 64 |
enum ggml_backend_buffer_usage usage;
|
| 65 |
};
|
| 66 |
|
| 67 |
+
ggml_backend_buffer_t ggml_backend_buffer_init(
|
| 68 |
+
ggml_backend_buffer_type_t buft,
|
| 69 |
+
struct ggml_backend_buffer_i iface,
|
| 70 |
+
void * context,
|
| 71 |
+
size_t size);
|
| 72 |
|
| 73 |
// do not use directly, use ggml_backend_tensor_copy instead
|
| 74 |
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
|
| 75 |
|
| 76 |
+
// multi-buffer
|
| 77 |
// buffer that contains a collection of buffers
|
| 78 |
+
ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
|
| 79 |
+
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
|
| 80 |
+
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
| 81 |
|
| 82 |
//
|
| 83 |
+
// Backend (stream)
|
| 84 |
//
|
| 85 |
|
|
|
|
|
|
|
| 86 |
struct ggml_backend_i {
|
| 87 |
+
const char * (*get_name)(ggml_backend_t backend);
|
| 88 |
|
| 89 |
+
void (*free)(ggml_backend_t backend);
|
| 90 |
|
| 91 |
// buffer allocation
|
| 92 |
+
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
|
| 93 |
|
| 94 |
// (optional) asynchronous tensor data access
|
| 95 |
+
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
| 96 |
+
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
| 97 |
+
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
| 98 |
|
| 99 |
// (optional) complete all pending operations
|
| 100 |
+
void (*synchronize)(ggml_backend_t backend);
|
| 101 |
|
| 102 |
+
// (optional) compute graph with a plan (not used currently)
|
| 103 |
+
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
| 104 |
+
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
|
|
|
| 105 |
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
|
| 106 |
+
void (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
|
| 107 |
// compute the graph with the plan
|
| 108 |
+
enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 109 |
+
|
| 110 |
+
// compute graph (always async if supported by the backend)
|
| 111 |
+
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 112 |
|
| 113 |
+
// IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
|
| 114 |
+
// new backends should implement the device interface instead
|
| 115 |
|
| 116 |
+
// These functions are being moved to the device interface
|
| 117 |
// check if the backend can compute an operation
|
| 118 |
+
bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
|
| 119 |
|
| 120 |
// check if the backend can use tensors allocated in a buffer type
|
| 121 |
+
bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
|
| 122 |
|
| 123 |
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
|
| 124 |
// these should be expensive operations with large batch sizes that may benefit from running on this backend
|
| 125 |
// even if the weight has to be copied from the CPU temporarily
|
| 126 |
+
bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
|
| 127 |
|
| 128 |
// (optional) event synchronization
|
| 129 |
+
// record an event on this stream
|
| 130 |
+
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
|
| 131 |
+
// wait for an event on on a different stream
|
| 132 |
+
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
};
|
| 134 |
|
| 135 |
struct ggml_backend {
|
| 136 |
ggml_guid_t guid;
|
|
|
|
| 137 |
struct ggml_backend_i iface;
|
| 138 |
+
ggml_backend_dev_t device;
|
| 139 |
+
void * context;
|
| 140 |
};
|
| 141 |
|
| 142 |
struct ggml_backend_event {
|
| 143 |
+
struct ggml_backend_device * device;
|
| 144 |
void * context;
|
| 145 |
};
|
| 146 |
|
| 147 |
//
|
| 148 |
+
// Backend device
|
| 149 |
//
|
| 150 |
|
| 151 |
+
// Note: if additional properties are needed, we should add a struct with all of them
|
| 152 |
+
// the current functions to obtain the properties can remain, since they are more convenient for often used properties
|
| 153 |
+
struct ggml_backend_device_i {
|
| 154 |
+
// device name: short identifier for this device, such as "CPU" or "CUDA0"
|
| 155 |
+
const char * (*get_name)(ggml_backend_dev_t dev);
|
| 156 |
+
|
| 157 |
+
// device description: short informative description of the device, could be the model name
|
| 158 |
+
const char * (*get_description)(ggml_backend_dev_t dev);
|
| 159 |
+
|
| 160 |
+
// device memory in bytes
|
| 161 |
+
void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
|
| 162 |
+
|
| 163 |
+
// device type
|
| 164 |
+
enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
|
| 165 |
+
|
| 166 |
+
// device properties
|
| 167 |
+
void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
|
| 168 |
+
|
| 169 |
+
// backend (stream) initialization
|
| 170 |
+
ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params);
|
| 171 |
+
|
| 172 |
+
// preferred buffer type
|
| 173 |
+
ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev);
|
| 174 |
+
|
| 175 |
+
// (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
|
| 176 |
+
ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev);
|
| 177 |
+
|
| 178 |
+
// (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
|
| 179 |
+
ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);
|
| 180 |
+
|
| 181 |
+
// check if the backend can compute an operation
|
| 182 |
+
bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
|
| 183 |
+
|
| 184 |
+
// check if the backend can use tensors allocated in a buffer type
|
| 185 |
+
bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
|
| 186 |
+
|
| 187 |
+
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
|
| 188 |
+
// these should be expensive operations with large batch sizes that may benefit from running on this backend
|
| 189 |
+
// even if the weight has to be copied from the CPU temporarily
|
| 190 |
+
bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
|
| 191 |
+
|
| 192 |
+
// (optional) event synchronization
|
| 193 |
+
ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
|
| 194 |
+
void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
|
| 195 |
+
void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
|
| 196 |
+
};
|
| 197 |
+
|
| 198 |
+
struct ggml_backend_device {
|
| 199 |
+
struct ggml_backend_device_i iface;
|
| 200 |
+
ggml_backend_reg_t reg;
|
| 201 |
+
void * context;
|
| 202 |
+
};
|
| 203 |
+
|
| 204 |
+
//
|
| 205 |
+
// Backend (reg)
|
| 206 |
+
//
|
| 207 |
+
|
| 208 |
+
struct ggml_backend_reg_i {
|
| 209 |
+
const char * (*get_name)(ggml_backend_reg_t reg);
|
| 210 |
+
|
| 211 |
+
// enumerate available devices
|
| 212 |
+
size_t (*get_device_count)(ggml_backend_reg_t reg);
|
| 213 |
+
ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index);
|
| 214 |
+
|
| 215 |
+
// (optional) get a pointer to a function in the backend
|
| 216 |
+
// backends can add custom functions that are not part of the standard ggml-backend interface
|
| 217 |
+
void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
|
| 218 |
+
|
| 219 |
+
// (optional) set the log callback for the backend
|
| 220 |
+
void (*set_log_callback)(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data);
|
| 221 |
+
};
|
| 222 |
+
|
| 223 |
+
struct ggml_backend_reg {
|
| 224 |
+
// int api_version; // TODO: for dynamic loading
|
| 225 |
+
struct ggml_backend_reg_i iface;
|
| 226 |
+
void * context;
|
| 227 |
+
};
|
| 228 |
+
|
| 229 |
|
| 230 |
+
// Internal backend registry API
|
| 231 |
+
void ggml_backend_register(ggml_backend_reg_t reg);
|
| 232 |
+
void ggml_backend_device_register(ggml_backend_dev_t device);
|
| 233 |
+
// TODO: backends can be loaded as a dynamic library, in which case they need to export this function
|
| 234 |
+
// typedef ggml_backend_register_t * (*ggml_backend_init)(void);
|
| 235 |
|
| 236 |
#ifdef __cplusplus
|
| 237 |
}
|
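The structs above split a backend into three layers: a registration object (ggml_backend_reg) that enumerates devices, one ggml_backend_device per physical device, and the existing per-stream ggml_backend created from a device. A minimal consumer-side sketch, assuming only the public ggml_backend_dev_* wrappers that this commit defines in ggml-backend.cpp below; the helper name init_any_backend is illustrative and not part of ggml:

    #include "ggml-backend.h"

    #include <stdio.h>

    // enumerate every registered device and create a backend (stream) on the
    // first one that initializes successfully
    static ggml_backend_t init_any_backend(void) {
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);

            size_t free_mem  = 0;
            size_t total_mem = 0;
            ggml_backend_dev_memory(dev, &free_mem, &total_mem);

            printf("device %zu: %s (%s), %zu/%zu MB free\n",
                i, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                free_mem / 1024 / 1024, total_mem / 1024 / 1024);

            ggml_backend_t backend = ggml_backend_dev_init(dev, /* params = */ NULL);
            if (backend != NULL) {
                return backend;
            }
        }
        return NULL;
    }
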
ggml/src/ggml-backend.cpp
ADDED
|
@@ -0,0 +1,2496 @@
|
| 1 |
+
// Note: porting this file to C++ is a work in progress
|
| 2 |
+
|
| 3 |
+
#include "ggml-backend-impl.h"
|
| 4 |
+
#include "ggml-alloc.h"
|
| 5 |
+
#include "ggml-impl.h"
|
| 6 |
+
|
| 7 |
+
#include <assert.h>
|
| 8 |
+
#include <limits.h>
|
| 9 |
+
#include <stdarg.h>
|
| 10 |
+
#include <stdio.h>
|
| 11 |
+
#include <stdlib.h>
|
| 12 |
+
#include <string.h>
|
| 13 |
+
|
| 14 |
+
#include <vector>
|
| 15 |
+
|
| 16 |
+
// backend buffer type
|
| 17 |
+
|
| 18 |
+
const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
| 19 |
+
return buft->iface.get_name(buft);
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 23 |
+
return buft->iface.alloc_buffer(buft, size);
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 27 |
+
return buft->iface.get_alignment(buft);
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
| 31 |
+
// get_max_size is optional, defaults to SIZE_MAX
|
| 32 |
+
if (buft->iface.get_max_size) {
|
| 33 |
+
return buft->iface.get_max_size(buft);
|
| 34 |
+
}
|
| 35 |
+
return SIZE_MAX;
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
| 39 |
+
// get_alloc_size is optional, defaults to ggml_nbytes
|
| 40 |
+
if (buft->iface.get_alloc_size) {
|
| 41 |
+
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
| 42 |
+
assert(size >= ggml_nbytes(tensor));
|
| 43 |
+
return size;
|
| 44 |
+
}
|
| 45 |
+
return ggml_nbytes(tensor);
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
| 49 |
+
if (buft->iface.is_host) {
|
| 50 |
+
return buft->iface.is_host(buft);
|
| 51 |
+
}
|
| 52 |
+
return false;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
|
| 56 |
+
return buft->device;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
// backend buffer
|
| 60 |
+
|
| 61 |
+
ggml_backend_buffer_t ggml_backend_buffer_init(
|
| 62 |
+
ggml_backend_buffer_type_t buft,
|
| 63 |
+
struct ggml_backend_buffer_i iface,
|
| 64 |
+
void * context,
|
| 65 |
+
size_t size) {
|
| 66 |
+
ggml_backend_buffer_t buffer = new ggml_backend_buffer {
|
| 67 |
+
/* .interface = */ iface,
|
| 68 |
+
/* .buft = */ buft,
|
| 69 |
+
/* .context = */ context,
|
| 70 |
+
/* .size = */ size,
|
| 71 |
+
/* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
|
| 72 |
+
};
|
| 73 |
+
|
| 74 |
+
return buffer;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
|
| 78 |
+
return buffer->iface.get_name(buffer);
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
| 82 |
+
if (buffer == NULL) {
|
| 83 |
+
return;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
if (buffer->iface.free_buffer != NULL) {
|
| 87 |
+
buffer->iface.free_buffer(buffer);
|
| 88 |
+
}
|
| 89 |
+
delete buffer;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
| 93 |
+
return buffer->size;
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 97 |
+
void * base = buffer->iface.get_base(buffer);
|
| 98 |
+
|
| 99 |
+
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
|
| 100 |
+
|
| 101 |
+
return base;
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
| 105 |
+
// init_tensor is optional
|
| 106 |
+
if (buffer->iface.init_tensor) {
|
| 107 |
+
buffer->iface.init_tensor(buffer, tensor);
|
| 108 |
+
}
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
|
| 112 |
+
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
|
| 116 |
+
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
| 120 |
+
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 124 |
+
buffer->iface.clear(buffer, value);
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
| 128 |
+
return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
| 132 |
+
buffer->usage = usage;
|
| 133 |
+
|
| 134 |
+
// FIXME: add a generic callback to the buffer interface
|
| 135 |
+
if (ggml_backend_buffer_is_multi_buffer(buffer)) {
|
| 136 |
+
ggml_backend_multi_buffer_set_usage(buffer, usage);
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
|
| 141 |
+
return buffer->usage;
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
| 145 |
+
return buffer->buft;
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
| 149 |
+
if (buffer->iface.reset) {
|
| 150 |
+
buffer->iface.reset(buffer);
|
| 151 |
+
}
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 155 |
+
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
| 156 |
+
if (dst_buf->iface.cpy_tensor) {
|
| 157 |
+
return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
|
| 158 |
+
}
|
| 159 |
+
return false;
|
| 160 |
+
}
|
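The wrappers in this section mostly forward to the buffer and buffer-type iface, filling in defaults for the optional callbacks (get_max_size, get_alloc_size, is_host, init_tensor, reset). A small sketch that exercises them against the CPU buffer type defined later in this file; the 16 MB size is an arbitrary example value:

    #include "ggml-backend.h"

    #include <stdio.h>

    static void buffer_demo(void) {
        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

        printf("buft %s: alignment %zu, max size %zu\n",
            ggml_backend_buft_name(buft),
            ggml_backend_buft_get_alignment(buft),
            ggml_backend_buft_get_max_size(buft)); // SIZE_MAX for the CPU type

        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 16*1024*1024);
        if (buf == NULL) {
            return;
        }

        ggml_backend_buffer_clear(buf, 0); // forwards to iface.clear
        printf("buffer %s: %zu bytes, host: %d\n",
            ggml_backend_buffer_name(buf),
            ggml_backend_buffer_get_size(buf),
            (int) ggml_backend_buffer_is_host(buf));

        ggml_backend_buffer_free(buf);
    }
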
| 161 |
+
|
| 162 |
+
// backend
|
| 163 |
+
|
| 164 |
+
ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
|
| 165 |
+
if (backend == NULL) {
|
| 166 |
+
return NULL;
|
| 167 |
+
}
|
| 168 |
+
return backend->guid;
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
const char * ggml_backend_name(ggml_backend_t backend) {
|
| 172 |
+
if (backend == NULL) {
|
| 173 |
+
return "NULL";
|
| 174 |
+
}
|
| 175 |
+
return backend->iface.get_name(backend);
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
void ggml_backend_free(ggml_backend_t backend) {
|
| 179 |
+
if (backend == NULL) {
|
| 180 |
+
return;
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
backend->iface.free(backend);
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
|
| 187 |
+
return backend->iface.get_default_buffer_type(backend);
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
|
| 191 |
+
return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
size_t ggml_backend_get_alignment(ggml_backend_t backend) {
|
| 195 |
+
return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
size_t ggml_backend_get_max_size(ggml_backend_t backend) {
|
| 199 |
+
return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 203 |
+
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 204 |
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
| 205 |
+
|
| 206 |
+
if (backend->iface.set_tensor_async == NULL) {
|
| 207 |
+
ggml_backend_tensor_set(tensor, data, offset, size);
|
| 208 |
+
} else {
|
| 209 |
+
backend->iface.set_tensor_async(backend, tensor, data, offset, size);
|
| 210 |
+
}
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 214 |
+
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 215 |
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
| 216 |
+
|
| 217 |
+
if (backend->iface.get_tensor_async == NULL) {
|
| 218 |
+
ggml_backend_tensor_get(tensor, data, offset, size);
|
| 219 |
+
} else {
|
| 220 |
+
backend->iface.get_tensor_async(backend, tensor, data, offset, size);
|
| 221 |
+
}
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 225 |
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 226 |
+
|
| 227 |
+
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
| 228 |
+
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 229 |
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
| 230 |
+
|
| 231 |
+
if (!size) {
|
| 232 |
+
return;
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
buf->iface.set_tensor(buf, tensor, data, offset, size);
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 239 |
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 240 |
+
|
| 241 |
+
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
| 242 |
+
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 243 |
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
| 244 |
+
|
| 245 |
+
if (!size) {
|
| 246 |
+
return;
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
buf->iface.get_tensor(buf, tensor, data, offset, size);
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
| 253 |
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 254 |
+
|
| 255 |
+
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
| 256 |
+
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 257 |
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
| 258 |
+
|
| 259 |
+
if (!size) {
|
| 260 |
+
return;
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
+
GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
|
| 264 |
+
|
| 265 |
+
buf->iface.memset_tensor(buf, tensor, value, offset, size);
|
| 266 |
+
}
|
| 267 |
+
|
| 268 |
+
void ggml_backend_synchronize(ggml_backend_t backend) {
|
| 269 |
+
if (backend->iface.synchronize == NULL) {
|
| 270 |
+
return;
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
backend->iface.synchronize(backend);
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 277 |
+
GGML_ASSERT(backend->iface.graph_plan_create != NULL);
|
| 278 |
+
|
| 279 |
+
return backend->iface.graph_plan_create(backend, cgraph);
|
| 280 |
+
}
|
| 281 |
+
|
| 282 |
+
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
| 283 |
+
GGML_ASSERT(backend->iface.graph_plan_free != NULL);
|
| 284 |
+
|
| 285 |
+
backend->iface.graph_plan_free(backend, plan);
|
| 286 |
+
}
|
| 287 |
+
|
| 288 |
+
enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
| 289 |
+
GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
|
| 290 |
+
|
| 291 |
+
return backend->iface.graph_plan_compute(backend, plan);
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 295 |
+
enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
|
| 296 |
+
ggml_backend_synchronize(backend);
|
| 297 |
+
return err;
|
| 298 |
+
}
|
| 299 |
+
|
| 300 |
+
enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 301 |
+
return backend->iface.graph_compute(backend, cgraph);
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
| 305 |
+
// helper to ease transition to device interface
|
| 306 |
+
if (backend->device) {
|
| 307 |
+
return ggml_backend_dev_supports_op(backend->device, op);
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
return backend->iface.supports_op(backend, op);
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
| 314 |
+
// helper to ease transition to device interface
|
| 315 |
+
if (backend->device) {
|
| 316 |
+
return ggml_backend_dev_supports_buft(backend->device, buft);
|
| 317 |
+
}
|
| 318 |
+
|
| 319 |
+
return backend->iface.supports_buft(backend, buft);
|
| 320 |
+
}
|
| 321 |
+
|
| 322 |
+
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
| 323 |
+
// helper to ease transition to device interface
|
| 324 |
+
if (backend->device) {
|
| 325 |
+
return ggml_backend_dev_offload_op(backend->device, op);
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
if (backend->iface.offload_op != NULL) {
|
| 329 |
+
return backend->iface.offload_op(backend, op);
|
| 330 |
+
}
|
| 331 |
+
return false;
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
|
| 335 |
+
return backend->device;
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
// backend copy
|
| 339 |
+
|
| 340 |
+
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
| 341 |
+
if (a->type != b->type) {
|
| 342 |
+
return false;
|
| 343 |
+
}
|
| 344 |
+
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
| 345 |
+
if (a->ne[i] != b->ne[i]) {
|
| 346 |
+
return false;
|
| 347 |
+
}
|
| 348 |
+
if (a->nb[i] != b->nb[i]) {
|
| 349 |
+
return false;
|
| 350 |
+
}
|
| 351 |
+
}
|
| 352 |
+
return true;
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 356 |
+
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
| 357 |
+
|
| 358 |
+
if (src == dst) {
|
| 359 |
+
return;
|
| 360 |
+
}
|
| 361 |
+
|
| 362 |
+
if (ggml_backend_buffer_is_host(src->buffer)) {
|
| 363 |
+
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
|
| 364 |
+
} else if (ggml_backend_buffer_is_host(dst->buffer)) {
|
| 365 |
+
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
| 366 |
+
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
|
| 367 |
+
#ifndef NDEBUG
|
| 368 |
+
fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
|
| 369 |
+
#endif
|
| 370 |
+
size_t nbytes = ggml_nbytes(src);
|
| 371 |
+
void * data = malloc(nbytes);
|
| 372 |
+
ggml_backend_tensor_get(src, data, 0, nbytes);
|
| 373 |
+
ggml_backend_tensor_set(dst, data, 0, nbytes);
|
| 374 |
+
free(data);
|
| 375 |
+
}
|
| 376 |
+
}
|
| 377 |
+
|
| 378 |
+
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 379 |
+
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
| 380 |
+
|
| 381 |
+
if (src == dst) {
|
| 382 |
+
return;
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
+
if (backend_dst->iface.cpy_tensor_async != NULL) {
|
| 386 |
+
if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
|
| 387 |
+
return;
|
| 388 |
+
}
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
// an async copy would normally happen after all the queued operations on both backends are completed
|
| 392 |
+
// to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
|
| 393 |
+
ggml_backend_synchronize(backend_src);
|
| 394 |
+
ggml_backend_synchronize(backend_dst);
|
| 395 |
+
ggml_backend_tensor_copy(src, dst);
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
// events
|
| 399 |
+
|
| 400 |
+
ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
|
| 401 |
+
// null device is allowed for the transition period to the device interface
|
| 402 |
+
if (device == NULL || device->iface.event_new == NULL) {
|
| 403 |
+
return NULL;
|
| 404 |
+
}
|
| 405 |
+
return device->iface.event_new(device);
|
| 406 |
+
}
|
| 407 |
+
|
| 408 |
+
void ggml_backend_event_free(ggml_backend_event_t event) {
|
| 409 |
+
if (event == NULL) {
|
| 410 |
+
return;
|
| 411 |
+
}
|
| 412 |
+
event->device->iface.event_free(event->device, event);
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
|
| 416 |
+
GGML_ASSERT(backend->iface.event_record != NULL);
|
| 417 |
+
|
| 418 |
+
backend->iface.event_record(backend, event);
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
+
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
|
| 422 |
+
GGML_ASSERT(event->device->iface.event_synchronize);
|
| 423 |
+
|
| 424 |
+
event->device->iface.event_synchronize(event->device, event);
|
| 425 |
+
}
|
| 426 |
+
|
| 427 |
+
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
| 428 |
+
GGML_ASSERT(backend->iface.event_wait != NULL);
|
| 429 |
+
|
| 430 |
+
backend->iface.event_wait(backend, event);
|
| 431 |
+
}
|
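Events are now created from a device rather than from a backend, while recording and waiting remain stream (backend) operations. A sketch of the intended cross-stream ordering pattern; it assumes backend_src belongs to a device that implements the optional event interface, and falls back to a blocking synchronize when it does not (ggml_backend_event_new returns NULL in that case):

    #include "ggml-backend.h"

    // make backend_dst wait for work already submitted to backend_src;
    // graph_src and graph_dst are assumed to be pre-built, pre-allocated cgraphs
    static void compute_in_order(ggml_backend_t backend_src, struct ggml_cgraph * graph_src,
                                 ggml_backend_t backend_dst, struct ggml_cgraph * graph_dst) {
        ggml_backend_event_t event = ggml_backend_event_new(ggml_backend_get_device(backend_src));

        ggml_backend_graph_compute_async(backend_src, graph_src);

        if (event != NULL) {
            ggml_backend_event_record(event, backend_src); // record after graph_src's work
            ggml_backend_event_wait(backend_dst, event);   // dst stream waits for the event
        } else {
            ggml_backend_synchronize(backend_src);         // fallback: blocking sync
        }

        ggml_backend_graph_compute_async(backend_dst, graph_dst);

        ggml_backend_event_free(event); // safe to call with NULL
    }
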
| 432 |
+
|
| 433 |
+
// Backend device
|
| 434 |
+
|
| 435 |
+
const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
|
| 436 |
+
return device->iface.get_name(device);
|
| 437 |
+
}
|
| 438 |
+
|
| 439 |
+
const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
|
| 440 |
+
return device->iface.get_description(device);
|
| 441 |
+
}
|
| 442 |
+
|
| 443 |
+
void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
|
| 444 |
+
device->iface.get_memory(device, free, total);
|
| 445 |
+
}
|
| 446 |
+
|
| 447 |
+
enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
|
| 448 |
+
return device->iface.get_type(device);
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
|
| 452 |
+
device->iface.get_props(device, props);
|
| 453 |
+
}
|
| 454 |
+
|
| 455 |
+
ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
|
| 456 |
+
return device->reg;
|
| 457 |
+
}
|
| 458 |
+
|
| 459 |
+
ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
|
| 460 |
+
return device->iface.init_backend(device, params);
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
|
| 464 |
+
return device->iface.get_buffer_type(device);
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
|
| 468 |
+
return device->iface.get_host_buffer_type(device);
|
| 469 |
+
}
|
| 470 |
+
|
| 471 |
+
ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
|
| 472 |
+
return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
|
| 473 |
+
}
|
| 474 |
+
|
| 475 |
+
bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
|
| 476 |
+
return device->iface.supports_op(device, op);
|
| 477 |
+
}
|
| 478 |
+
|
| 479 |
+
bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
|
| 480 |
+
return device->iface.supports_buft(device, buft);
|
| 481 |
+
}
|
| 482 |
+
|
| 483 |
+
bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
|
| 484 |
+
return device->iface.offload_op(device, op);
|
| 485 |
+
}
|
| 486 |
+
|
| 487 |
+
// Backend (reg)
|
| 488 |
+
|
| 489 |
+
const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
|
| 490 |
+
return reg->iface.get_name(reg);
|
| 491 |
+
}
|
| 492 |
+
|
| 493 |
+
size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
|
| 494 |
+
return reg->iface.get_device_count(reg);
|
| 495 |
+
}
|
| 496 |
+
|
| 497 |
+
ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
|
| 498 |
+
return reg->iface.get_device(reg, index);
|
| 499 |
+
}
|
| 500 |
+
|
| 501 |
+
void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
| 502 |
+
if (!reg->iface.get_proc_address) {
|
| 503 |
+
return NULL;
|
| 504 |
+
}
|
| 505 |
+
return reg->iface.get_proc_address(reg, name);
|
| 506 |
+
}
|
| 507 |
+
|
| 508 |
+
void ggml_backend_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) {
|
| 509 |
+
if (reg->iface.set_log_callback) {
|
| 510 |
+
reg->iface.set_log_callback(reg, log_callback, user_data);
|
| 511 |
+
}
|
| 512 |
+
}
|
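get_proc_address gives a backend a way to expose extensions that are not part of the common ggml-backend interface without adding them to the public header. The backend name "MYBE" and the function my_backend_set_tuning below are made up for illustration; a real backend would document whatever symbols it actually exports:

    #include "ggml-backend.h"

    // hypothetical extension exported by a backend registered as "MYBE"
    typedef void (*my_backend_set_tuning_t)(int level);

    static void set_backend_tuning(int level) {
        ggml_backend_reg_t reg = ggml_backend_reg_by_name("MYBE");
        if (reg == NULL) {
            return;
        }
        // NULL if the backend does not implement get_proc_address or does not know the symbol
        my_backend_set_tuning_t fn =
            (my_backend_set_tuning_t) ggml_backend_reg_get_proc_address(reg, "my_backend_set_tuning");
        if (fn != NULL) {
            fn(level);
        }
    }
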
| 513 |
+
|
| 514 |
+
// Backend registry
|
| 515 |
+
|
| 516 |
+
#ifdef GGML_USE_CUDA
|
| 517 |
+
#include "ggml-cuda.h"
|
| 518 |
+
#endif
|
| 519 |
+
|
| 520 |
+
struct ggml_backend_registry {
|
| 521 |
+
std::vector<ggml_backend_reg_t> backends;
|
| 522 |
+
std::vector<ggml_backend_dev_t> devices;
|
| 523 |
+
|
| 524 |
+
ggml_backend_registry() {
|
| 525 |
+
#ifdef GGML_USE_CUDA
|
| 526 |
+
register_backend(ggml_backend_cuda_reg());
|
| 527 |
+
#endif
|
| 528 |
+
|
| 529 |
+
register_backend(ggml_backend_cpu_reg());
|
| 530 |
+
|
| 531 |
+
// TODO: sycl, metal, vulkan, kompute, cann
|
| 532 |
+
}
|
| 533 |
+
|
| 534 |
+
void register_backend(ggml_backend_reg_t reg) {
|
| 535 |
+
#ifndef NDEBUG
|
| 536 |
+
fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
|
| 537 |
+
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
|
| 538 |
+
#endif
|
| 539 |
+
backends.push_back(reg);
|
| 540 |
+
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
| 541 |
+
register_device(ggml_backend_reg_dev_get(reg, i));
|
| 542 |
+
}
|
| 543 |
+
}
|
| 544 |
+
|
| 545 |
+
void register_device(ggml_backend_dev_t device) {
|
| 546 |
+
#ifndef NDEBUG
|
| 547 |
+
fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
|
| 548 |
+
#endif
|
| 549 |
+
devices.push_back(device);
|
| 550 |
+
}
|
| 551 |
+
};
|
| 552 |
+
|
| 553 |
+
static ggml_backend_registry & get_reg() {
|
| 554 |
+
static ggml_backend_registry reg;
|
| 555 |
+
return reg;
|
| 556 |
+
}
|
| 557 |
+
|
| 558 |
+
// Internal API
|
| 559 |
+
void ggml_backend_register(ggml_backend_reg_t reg) {
|
| 560 |
+
get_reg().register_backend(reg);
|
| 561 |
+
}
|
| 562 |
+
|
| 563 |
+
void ggml_backend_device_register(ggml_backend_dev_t device) {
|
| 564 |
+
get_reg().register_device(device);
|
| 565 |
+
}
|
| 566 |
+
|
| 567 |
+
// Backend (reg) enumeration
|
| 568 |
+
size_t ggml_backend_reg_count() {
|
| 569 |
+
return get_reg().backends.size();
|
| 570 |
+
}
|
| 571 |
+
|
| 572 |
+
ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
|
| 573 |
+
GGML_ASSERT(index < ggml_backend_reg_count());
|
| 574 |
+
return get_reg().backends[index];
|
| 575 |
+
}
|
| 576 |
+
|
| 577 |
+
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
|
| 578 |
+
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
| 579 |
+
ggml_backend_reg_t reg = ggml_backend_reg_get(i);
|
| 580 |
+
if (strcmp(ggml_backend_reg_name(reg), name) == 0) {
|
| 581 |
+
return reg;
|
| 582 |
+
}
|
| 583 |
+
}
|
| 584 |
+
return NULL;
|
| 585 |
+
}
|
| 586 |
+
|
| 587 |
+
// Device enumeration
|
| 588 |
+
size_t ggml_backend_dev_count() {
|
| 589 |
+
return get_reg().devices.size();
|
| 590 |
+
}
|
| 591 |
+
|
| 592 |
+
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
| 593 |
+
GGML_ASSERT(index < ggml_backend_dev_count());
|
| 594 |
+
return get_reg().devices[index];
|
| 595 |
+
}
|
| 596 |
+
|
| 597 |
+
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
|
| 598 |
+
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
| 599 |
+
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
| 600 |
+
if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
|
| 601 |
+
return dev;
|
| 602 |
+
}
|
| 603 |
+
}
|
| 604 |
+
return NULL;
|
| 605 |
+
}
|
| 606 |
+
|
| 607 |
+
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
|
| 608 |
+
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
| 609 |
+
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
| 610 |
+
if (ggml_backend_dev_type(dev) == type) {
|
| 611 |
+
return dev;
|
| 612 |
+
}
|
| 613 |
+
}
|
| 614 |
+
return NULL;
|
| 615 |
+
}
|
| 616 |
+
|
| 617 |
+
void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data) {
|
| 618 |
+
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
| 619 |
+
ggml_backend_reg_t reg = ggml_backend_reg_get(i);
|
| 620 |
+
ggml_backend_reg_set_log_callback(reg, log_callback, user_data);
|
| 621 |
+
}
|
| 622 |
+
}
|
| 623 |
+
|
| 624 |
+
// Convenience functions
|
| 625 |
+
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
|
| 626 |
+
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
|
| 627 |
+
if (!dev) {
|
| 628 |
+
return NULL;
|
| 629 |
+
}
|
| 630 |
+
return ggml_backend_dev_init(dev, params);
|
| 631 |
+
}
|
| 632 |
+
|
| 633 |
+
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
|
| 634 |
+
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
|
| 635 |
+
if (!dev) {
|
| 636 |
+
return NULL;
|
| 637 |
+
}
|
| 638 |
+
return ggml_backend_dev_init(dev, params);
|
| 639 |
+
}
|
| 640 |
+
|
| 641 |
+
ggml_backend_t ggml_backend_init_best(void) {
|
| 642 |
+
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
|
| 643 |
+
if (!dev) {
|
| 644 |
+
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
|
| 645 |
+
}
|
| 646 |
+
if (!dev) {
|
| 647 |
+
return NULL;
|
| 648 |
+
}
|
| 649 |
+
return ggml_backend_dev_init(dev, NULL);
|
| 650 |
+
}
|
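The convenience initializers make the common setups one-liners: init_by_name targets a specific device, init_by_type the first device of a kind, and init_best prefers a full GPU device with a CPU fallback. A short usage sketch; the device name "CUDA0" is only an example and exists only in builds where the CUDA backend is registered:

    #include "ggml-backend.h"

    static ggml_backend_t pick_backend(void) {
        // try a specific device by name first, then let ggml pick the best available one
        ggml_backend_t backend = ggml_backend_init_by_name("CUDA0", /* params = */ NULL);
        if (backend == NULL) {
            backend = ggml_backend_init_best();
        }
        return backend; // the caller releases it with ggml_backend_free()
    }
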
| 651 |
+
|
| 652 |
+
// backend CPU
|
| 653 |
+
|
| 654 |
+
static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
|
| 655 |
+
|
| 656 |
+
static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 657 |
+
return "CPU";
|
| 658 |
+
|
| 659 |
+
GGML_UNUSED(buffer);
|
| 660 |
+
}
|
| 661 |
+
|
| 662 |
+
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 663 |
+
uintptr_t data = (uintptr_t)buffer->context;
|
| 664 |
+
|
| 665 |
+
// align the buffer
|
| 666 |
+
if (data % TENSOR_ALIGNMENT != 0) {
|
| 667 |
+
data = GGML_PAD(data, TENSOR_ALIGNMENT);
|
| 668 |
+
}
|
| 669 |
+
|
| 670 |
+
return (void *)data;
|
| 671 |
+
}
|
| 672 |
+
|
| 673 |
+
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 674 |
+
free(buffer->context);
|
| 675 |
+
}
|
| 676 |
+
|
| 677 |
+
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
| 678 |
+
memset((char *)tensor->data + offset, value, size);
|
| 679 |
+
|
| 680 |
+
GGML_UNUSED(buffer);
|
| 681 |
+
}
|
| 682 |
+
|
| 683 |
+
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 684 |
+
memcpy((char *)tensor->data + offset, data, size);
|
| 685 |
+
|
| 686 |
+
GGML_UNUSED(buffer);
|
| 687 |
+
}
|
| 688 |
+
|
| 689 |
+
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 690 |
+
memcpy(data, (const char *)tensor->data + offset, size);
|
| 691 |
+
|
| 692 |
+
GGML_UNUSED(buffer);
|
| 693 |
+
}
|
| 694 |
+
|
| 695 |
+
static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 696 |
+
if (ggml_backend_buffer_is_host(src->buffer)) {
|
| 697 |
+
memcpy(dst->data, src->data, ggml_nbytes(src));
|
| 698 |
+
return true;
|
| 699 |
+
}
|
| 700 |
+
return false;
|
| 701 |
+
|
| 702 |
+
GGML_UNUSED(buffer);
|
| 703 |
+
}
|
| 704 |
+
|
| 705 |
+
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 706 |
+
memset(buffer->context, value, buffer->size);
|
| 707 |
+
}
|
| 708 |
+
|
| 709 |
+
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
| 710 |
+
/* .get_name = */ ggml_backend_cpu_buffer_get_name,
|
| 711 |
+
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
| 712 |
+
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
| 713 |
+
/* .init_tensor = */ NULL, // no initialization required
|
| 714 |
+
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
|
| 715 |
+
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
| 716 |
+
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
| 717 |
+
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
| 718 |
+
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
| 719 |
+
/* .reset = */ NULL,
|
| 720 |
+
};
|
| 721 |
+
|
| 722 |
+
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
| 723 |
+
/* .get_name = */ ggml_backend_cpu_buffer_get_name,
|
| 724 |
+
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
| 725 |
+
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
| 726 |
+
/* .init_tensor = */ NULL, // no initialization required
|
| 727 |
+
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
|
| 728 |
+
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
| 729 |
+
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
| 730 |
+
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
| 731 |
+
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
| 732 |
+
/* .reset = */ NULL,
|
| 733 |
+
};
|
| 734 |
+
|
| 735 |
+
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
| 736 |
+
return "CPU";
|
| 737 |
+
|
| 738 |
+
GGML_UNUSED(buft);
|
| 739 |
+
}
|
| 740 |
+
|
| 741 |
+
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 742 |
+
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
| 743 |
+
void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
|
| 744 |
+
if (data == NULL) {
|
| 745 |
+
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
|
| 746 |
+
return NULL;
|
| 747 |
+
}
|
| 748 |
+
|
| 749 |
+
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
|
| 750 |
+
}
|
| 751 |
+
|
| 752 |
+
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 753 |
+
return TENSOR_ALIGNMENT;
|
| 754 |
+
|
| 755 |
+
GGML_UNUSED(buft);
|
| 756 |
+
}
|
| 757 |
+
|
| 758 |
+
static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
| 759 |
+
return true;
|
| 760 |
+
|
| 761 |
+
GGML_UNUSED(buft);
|
| 762 |
+
}
|
| 763 |
+
|
| 764 |
+
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
| 765 |
+
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
| 766 |
+
/* .iface = */ {
|
| 767 |
+
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
| 768 |
+
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
| 769 |
+
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
| 770 |
+
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
| 771 |
+
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
| 772 |
+
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
| 773 |
+
},
|
| 774 |
+
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
| 775 |
+
/* .context = */ NULL,
|
| 776 |
+
};
|
| 777 |
+
|
| 778 |
+
return &ggml_backend_cpu_buffer_type;
|
| 779 |
+
}
|
| 780 |
+
|
| 781 |
+
#ifdef GGML_USE_CPU_HBM
|
| 782 |
+
|
| 783 |
+
// buffer type HBM
|
| 784 |
+
|
| 785 |
+
#include <hbwmalloc.h>
|
| 786 |
+
|
| 787 |
+
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
| 788 |
+
return "CPU_HBM";
|
| 789 |
+
|
| 790 |
+
GGML_UNUSED(buft);
|
| 791 |
+
}
|
| 792 |
+
|
| 793 |
+
static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
|
| 794 |
+
return "CPU_HBM";
|
| 795 |
+
|
| 796 |
+
GGML_UNUSED(buf);
|
| 797 |
+
}
|
| 798 |
+
|
| 799 |
+
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 800 |
+
hbw_free(buffer->context);
|
| 801 |
+
}
|
| 802 |
+
|
| 803 |
+
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 804 |
+
//void * ptr = hbw_malloc(size);
|
| 805 |
+
void * ptr;
|
| 806 |
+
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
| 807 |
+
if (result != 0) {
|
| 808 |
+
fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
|
| 809 |
+
return NULL;
|
| 810 |
+
}
|
| 811 |
+
|
| 812 |
+
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
| 813 |
+
buffer->buft = buft;
|
| 814 |
+
buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
|
| 815 |
+
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
| 816 |
+
|
| 817 |
+
return buffer;
|
| 818 |
+
}
|
| 819 |
+
|
| 820 |
+
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
| 821 |
+
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
| 822 |
+
/* .iface = */ {
|
| 823 |
+
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
|
| 824 |
+
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
| 825 |
+
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
| 826 |
+
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
| 827 |
+
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
| 828 |
+
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
| 829 |
+
},
|
| 830 |
+
/* .context = */ NULL,
|
| 831 |
+
};
|
| 832 |
+
|
| 833 |
+
return &ggml_backend_cpu_buffer_type_hbm;
|
| 834 |
+
}
|
| 835 |
+
#endif
|
| 836 |
+
|
| 837 |
+
struct ggml_backend_cpu_context {
|
| 838 |
+
int n_threads;
|
| 839 |
+
ggml_threadpool_t threadpool;
|
| 840 |
+
|
| 841 |
+
uint8_t * work_data;
|
| 842 |
+
size_t work_size;
|
| 843 |
+
|
| 844 |
+
ggml_abort_callback abort_callback;
|
| 845 |
+
void * abort_callback_data;
|
| 846 |
+
};
|
| 847 |
+
|
| 848 |
+
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
|
| 849 |
+
return "CPU";
|
| 850 |
+
|
| 851 |
+
GGML_UNUSED(backend);
|
| 852 |
+
}
|
| 853 |
+
|
| 854 |
+
static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
| 855 |
+
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
| 856 |
+
delete[] cpu_ctx->work_data;
|
| 857 |
+
delete cpu_ctx;
|
| 858 |
+
delete backend;
|
| 859 |
+
}
|
| 860 |
+
|
| 861 |
+
static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
|
| 862 |
+
return ggml_backend_cpu_buffer_type();
|
| 863 |
+
|
| 864 |
+
GGML_UNUSED(backend);
|
| 865 |
+
}
|
| 866 |
+
|
| 867 |
+
struct ggml_backend_plan_cpu {
|
| 868 |
+
struct ggml_cplan cplan;
|
| 869 |
+
struct ggml_cgraph cgraph;
|
| 870 |
+
};
|
| 871 |
+
|
| 872 |
+
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
| 873 |
+
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
| 874 |
+
|
| 875 |
+
struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
|
| 876 |
+
|
| 877 |
+
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
| 878 |
+
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
| 879 |
+
|
| 880 |
+
if (cpu_plan->cplan.work_size > 0) {
|
| 881 |
+
cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
|
| 882 |
+
if (cpu_plan->cplan.work_data == NULL) {
|
| 883 |
+
delete cpu_plan;
|
| 884 |
+
return NULL;
|
| 885 |
+
}
|
| 886 |
+
}
|
| 887 |
+
|
| 888 |
+
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
|
| 889 |
+
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
| 890 |
+
|
| 891 |
+
return cpu_plan;
|
| 892 |
+
}
|
| 893 |
+
|
| 894 |
+
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
| 895 |
+
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
| 896 |
+
|
| 897 |
+
delete[] cpu_plan->cplan.work_data;
|
| 898 |
+
delete cpu_plan;
|
| 899 |
+
|
| 900 |
+
GGML_UNUSED(backend);
|
| 901 |
+
}
|
| 902 |
+
|
| 903 |
+
static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
| 904 |
+
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
| 905 |
+
|
| 906 |
+
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
| 907 |
+
|
| 908 |
+
GGML_UNUSED(backend);
|
| 909 |
+
}
|
| 910 |
+
|
| 911 |
+
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 912 |
+
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
| 913 |
+
|
| 914 |
+
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
| 915 |
+
|
| 916 |
+
if (cpu_ctx->work_size < cplan.work_size) {
|
| 917 |
+
delete[] cpu_ctx->work_data;
|
| 918 |
+
cpu_ctx->work_data = new uint8_t[cplan.work_size];
|
| 919 |
+
if (cpu_ctx->work_data == NULL) {
|
| 920 |
+
cpu_ctx->work_size = 0;
|
| 921 |
+
return GGML_STATUS_ALLOC_FAILED;
|
| 922 |
+
}
|
| 923 |
+
cpu_ctx->work_size = cplan.work_size;
|
| 924 |
+
}
|
| 925 |
+
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
|
| 926 |
+
|
| 927 |
+
cplan.abort_callback = cpu_ctx->abort_callback;
|
| 928 |
+
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
| 929 |
+
|
| 930 |
+
return ggml_graph_compute(cgraph, &cplan);
|
| 931 |
+
}
|
| 932 |
+
|
| 933 |
+
static const struct ggml_backend_i ggml_backend_cpu_i = {
|
| 934 |
+
/* .get_name = */ ggml_backend_cpu_get_name,
|
| 935 |
+
/* .free = */ ggml_backend_cpu_free,
|
| 936 |
+
/* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
|
| 937 |
+
/* .set_tensor_async = */ NULL,
|
| 938 |
+
/* .get_tensor_async = */ NULL,
|
| 939 |
+
/* .cpy_tensor_async = */ NULL,
|
| 940 |
+
/* .synchronize = */ NULL,
|
| 941 |
+
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
| 942 |
+
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
| 943 |
+
/* .graph_plan_update = */ NULL,
|
| 944 |
+
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
| 945 |
+
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
| 946 |
+
/* .supports_op = */ NULL,
|
| 947 |
+
/* .supports_buft = */ NULL,
|
| 948 |
+
/* .offload_op = */ NULL,
|
| 949 |
+
/* .event_record = */ NULL,
|
| 950 |
+
/* .event_wait = */ NULL,
|
| 951 |
+
};
|
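// NOTE (descriptive, added for clarity): the NULL entries above are optional hooks that the
// CPU backend does not implement (async tensor access, async copies, events, explicit
// synchronization); supports_op, supports_buft and offload_op are provided instead by the
// device interface (ggml_backend_cpu_device_i) further below.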
| 952 |
+
|
| 953 |
+
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
| 954 |
+
static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
|
| 955 |
+
return &guid;
|
| 956 |
+
}
|
| 957 |
+
|
| 958 |
+
ggml_backend_t ggml_backend_cpu_init(void) {
|
| 959 |
+
struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
|
| 960 |
+
if (ctx == NULL) {
|
| 961 |
+
return NULL;
|
| 962 |
+
}
|
| 963 |
+
|
| 964 |
+
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
| 965 |
+
ctx->threadpool = NULL;
|
| 966 |
+
ctx->work_data = NULL;
|
| 967 |
+
ctx->work_size = 0;
|
| 968 |
+
ctx->abort_callback = NULL;
|
| 969 |
+
ctx->abort_callback_data = NULL;
|
| 970 |
+
|
| 971 |
+
ggml_backend_t cpu_backend = new ggml_backend {
|
| 972 |
+
/* .guid = */ ggml_backend_cpu_guid(),
|
| 973 |
+
/* .interface = */ ggml_backend_cpu_i,
|
| 974 |
+
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
| 975 |
+
/* .context = */ ctx,
|
| 976 |
+
};
|
| 977 |
+
|
| 978 |
+
if (cpu_backend == NULL) {
|
| 979 |
+
delete ctx;
|
| 980 |
+
return NULL;
|
| 981 |
+
}
|
| 982 |
+
|
| 983 |
+
return cpu_backend;
|
| 984 |
+
}
|
| 985 |
+
|
| 986 |
+
bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
| 987 |
+
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
|
| 988 |
+
}
|
| 989 |
+
|
| 990 |
+
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
| 991 |
+
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
| 992 |
+
|
| 993 |
+
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
| 994 |
+
ctx->n_threads = n_threads;
|
| 995 |
+
}
|
| 996 |
+
|
| 997 |
+
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
|
| 998 |
+
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
| 999 |
+
|
| 1000 |
+
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
| 1001 |
+
|
| 1002 |
+
if (ctx->threadpool && ctx->threadpool != threadpool) {
|
| 1003 |
+
// a different threadpool was already set, pause it before switching

|
| 1004 |
+
ggml_threadpool_pause(ctx->threadpool);
|
| 1005 |
+
}
|
| 1006 |
+
ctx->threadpool = threadpool;
|
| 1007 |
+
}
|
| 1008 |
+
|
| 1009 |
+
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
|
| 1010 |
+
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
| 1011 |
+
|
| 1012 |
+
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
| 1013 |
+
ctx->abort_callback = abort_callback;
|
| 1014 |
+
ctx->abort_callback_data = abort_callback_data;
|
| 1015 |
+
}
|
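// Illustrative usage sketch (editor's addition, not part of this commit): create a CPU
// backend, configure the thread count and an abort callback, and release it. All functions
// used here are declared in ggml-backend.h; the example_* function name is a placeholder.
#if 0
static void example_cpu_backend_usage(void) {
    ggml_backend_t backend = ggml_backend_cpu_init();
    if (backend == NULL) {
        return;
    }
    ggml_backend_cpu_set_n_threads(backend, 8);
    // the abort callback may be NULL to disable it; a non-NULL callback can stop a long
    // graph computation early by returning true
    ggml_backend_cpu_set_abort_callback(backend, /* abort_callback */ NULL, /* user_data */ NULL);
    // ... build a graph and run it with ggml_backend_graph_compute(backend, graph) ...
    ggml_backend_free(backend);
}
#endif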
| 1016 |
+
|
| 1017 |
+
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
| 1018 |
+
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
|
| 1019 |
+
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
|
| 1020 |
+
}
|
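// Illustrative sketch (editor's addition, not part of this commit): wrap user-managed memory
// in a backend buffer. The pointer must satisfy the internal TENSOR_ALIGNMENT requirement and
// the buffer does not take ownership of the memory, so the caller keeps it alive.
#if 0
static void example_cpu_buffer_from_ptr(void) {
    // a 64-byte aligned static pool comfortably satisfies the alignment assert above
    alignas(64) static uint8_t pool[1024 * 1024];
    ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr(pool, sizeof(pool));
    // ... place tensors in `buf` (e.g. with ggml-alloc) ...
    ggml_backend_buffer_free(buf); // the pool itself is not freed by the buffer
}
#endif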
| 1021 |
+
|
| 1022 |
+
////////////////////////
|
| 1023 |
+
|
| 1024 |
+
static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
|
| 1025 |
+
return "CPU";
|
| 1026 |
+
|
| 1027 |
+
GGML_UNUSED(dev);
|
| 1028 |
+
}
|
| 1029 |
+
|
| 1030 |
+
static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
|
| 1031 |
+
// TODO
|
| 1032 |
+
return "CPU";
|
| 1033 |
+
|
| 1034 |
+
GGML_UNUSED(dev);
|
| 1035 |
+
}
|
| 1036 |
+
|
| 1037 |
+
static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
| 1038 |
+
// TODO
|
| 1039 |
+
*free = 0;
|
| 1040 |
+
*total = 0;
|
| 1041 |
+
|
| 1042 |
+
GGML_UNUSED(dev);
|
| 1043 |
+
}
|
| 1044 |
+
|
| 1045 |
+
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
|
| 1046 |
+
return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
|
| 1047 |
+
|
| 1048 |
+
GGML_UNUSED(dev);
|
| 1049 |
+
}
|
| 1050 |
+
|
| 1051 |
+
static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
| 1052 |
+
props->name = ggml_backend_cpu_device_get_name(dev);
|
| 1053 |
+
props->description = ggml_backend_cpu_device_get_description(dev);
|
| 1054 |
+
props->type = ggml_backend_cpu_device_get_type(dev);
|
| 1055 |
+
ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
| 1056 |
+
props->caps = {
|
| 1057 |
+
/* async */ false,
|
| 1058 |
+
/* host_buffer */ false,
|
| 1059 |
+
/* events */ false,
|
| 1060 |
+
};
|
| 1061 |
+
}
|
| 1062 |
+
|
| 1063 |
+
static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) {
|
| 1064 |
+
return ggml_backend_cpu_init();
|
| 1065 |
+
|
| 1066 |
+
GGML_UNUSED(dev);
|
| 1067 |
+
GGML_UNUSED(params);
|
| 1068 |
+
}
|
| 1069 |
+
|
| 1070 |
+
static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
|
| 1071 |
+
return ggml_backend_cpu_buffer_type();
|
| 1072 |
+
|
| 1073 |
+
GGML_UNUSED(dev);
|
| 1074 |
+
}
|
| 1075 |
+
|
| 1076 |
+
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
| 1077 |
+
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
| 1078 |
+
|
| 1079 |
+
GGML_UNUSED(dev);
|
| 1080 |
+
GGML_UNUSED(max_tensor_size);
|
| 1081 |
+
}
|
| 1082 |
+
|
| 1083 |
+
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
| 1084 |
+
switch (op->op) {
|
| 1085 |
+
case GGML_OP_CPY:
|
| 1086 |
+
return
|
| 1087 |
+
op->type != GGML_TYPE_IQ2_XXS &&
|
| 1088 |
+
op->type != GGML_TYPE_IQ2_XS &&
|
| 1089 |
+
op->type != GGML_TYPE_IQ1_S &&
|
| 1090 |
+
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
| 1091 |
+
case GGML_OP_MUL_MAT:
|
| 1092 |
+
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
| 1093 |
+
case GGML_OP_ROPE_BACK:
|
| 1094 |
+
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
|
| 1095 |
+
case GGML_OP_IM2COL_BACK:
|
| 1096 |
+
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
| 1097 |
+
case GGML_OP_OUT_PROD:
|
| 1098 |
+
return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
|
| 1099 |
+
default:
|
| 1100 |
+
return true;
|
| 1101 |
+
}
|
| 1102 |
+
|
| 1103 |
+
GGML_UNUSED(dev);
|
| 1104 |
+
}
|
| 1105 |
+
|
| 1106 |
+
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
| 1107 |
+
return ggml_backend_buft_is_host(buft);
|
| 1108 |
+
|
| 1109 |
+
GGML_UNUSED(dev);
|
| 1110 |
+
}
|
| 1111 |
+
|
| 1112 |
+
static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
|
| 1113 |
+
/* .get_name = */ ggml_backend_cpu_device_get_name,
|
| 1114 |
+
/* .get_description = */ ggml_backend_cpu_device_get_description,
|
| 1115 |
+
/* .get_memory = */ ggml_backend_cpu_device_get_memory,
|
| 1116 |
+
/* .get_type = */ ggml_backend_cpu_device_get_type,
|
| 1117 |
+
/* .get_props = */ ggml_backend_cpu_device_get_props,
|
| 1118 |
+
/* .init_backend = */ ggml_backend_cpu_device_init,
|
| 1119 |
+
/* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
|
| 1120 |
+
/* .get_host_buffer_type = */ NULL,
|
| 1121 |
+
/* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr,
|
| 1122 |
+
/* .supports_op = */ ggml_backend_cpu_device_supports_op,
|
| 1123 |
+
/* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
|
| 1124 |
+
/* .offload_op = */ NULL,
|
| 1125 |
+
/* .event_new = */ NULL,
|
| 1126 |
+
/* .event_free = */ NULL,
|
| 1127 |
+
/* .event_synchronize = */ NULL,
|
| 1128 |
+
};
|
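// NOTE (descriptive, added for clarity): as with the backend interface above, the NULL entries
// are optional device features that the CPU device does not provide: it has no separate host
// buffer type, no offload heuristic and no events.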
| 1129 |
+
|
| 1130 |
+
////////////////////////
|
| 1131 |
+
|
| 1132 |
+
static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
|
| 1133 |
+
return "CPU";
|
| 1134 |
+
|
| 1135 |
+
GGML_UNUSED(reg);
|
| 1136 |
+
}
|
| 1137 |
+
|
| 1138 |
+
static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
|
| 1139 |
+
return 1;
|
| 1140 |
+
|
| 1141 |
+
GGML_UNUSED(reg);
|
| 1142 |
+
}
|
| 1143 |
+
|
| 1144 |
+
static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
| 1145 |
+
GGML_ASSERT(index == 0);
|
| 1146 |
+
|
| 1147 |
+
static ggml_backend_device ggml_backend_cpu_device = {
|
| 1148 |
+
/* .iface = */ ggml_backend_cpu_device_i,
|
| 1149 |
+
/* .reg = */ reg,
|
| 1150 |
+
/* .context = */ NULL,
|
| 1151 |
+
};
|
| 1152 |
+
|
| 1153 |
+
return &ggml_backend_cpu_device;
|
| 1154 |
+
|
| 1155 |
+
GGML_UNUSED(reg);
|
| 1156 |
+
GGML_UNUSED(index);
|
| 1157 |
+
}
|
| 1158 |
+
|
| 1159 |
+
static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
|
| 1160 |
+
/* .get_name = */ ggml_backend_cpu_reg_get_name,
|
| 1161 |
+
/* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
|
| 1162 |
+
/* .get_device = */ ggml_backend_cpu_reg_get_device,
|
| 1163 |
+
/* .get_proc_address = */ NULL,
|
| 1164 |
+
/* .set_log_callback = */ NULL,
|
| 1165 |
+
};
|
| 1166 |
+
|
| 1167 |
+
ggml_backend_reg_t ggml_backend_cpu_reg(void) {
|
| 1168 |
+
static struct ggml_backend_reg ggml_backend_cpu_reg = {
|
| 1169 |
+
/* .iface = */ ggml_backend_cpu_reg_i,
|
| 1170 |
+
/* .context = */ NULL,
|
| 1171 |
+
};
|
| 1172 |
+
|
| 1173 |
+
return &ggml_backend_cpu_reg;
|
| 1174 |
+
}
|
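// Illustrative sketch (editor's addition, not part of this commit): enumerate the devices
// exposed by a registry. ggml_backend_reg_dev_count/ggml_backend_reg_dev_get and
// ggml_backend_dev_name/ggml_backend_dev_description are assumed to be the public wrappers
// over the reg/device interfaces introduced by this commit.
#if 0
static void example_enumerate_cpu_reg(void) {
    ggml_backend_reg_t reg = ggml_backend_cpu_reg();
    for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
        ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);
        printf("device %zu: %s (%s)\n", i, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
    }
}
#endif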
| 1175 |
+
|
| 1176 |
+
// multi-buffer: a single buffer handle backed by multiple buffers
|
| 1177 |
+
|
| 1178 |
+
struct ggml_backend_multi_buffer_context {
|
| 1179 |
+
ggml_backend_buffer_t * buffers;
|
| 1180 |
+
size_t n_buffers;
|
| 1181 |
+
};
|
| 1182 |
+
|
| 1183 |
+
static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 1184 |
+
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
| 1185 |
+
|
| 1186 |
+
return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
|
| 1187 |
+
}
|
| 1188 |
+
|
| 1189 |
+
static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 1190 |
+
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
| 1191 |
+
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
| 1192 |
+
ggml_backend_buffer_free(ctx->buffers[i]);
|
| 1193 |
+
}
|
| 1194 |
+
|
| 1195 |
+
free(ctx->buffers);
|
| 1196 |
+
free(ctx);
|
| 1197 |
+
}
|
| 1198 |
+
|
| 1199 |
+
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 1200 |
+
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
| 1201 |
+
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
| 1202 |
+
ggml_backend_buffer_clear(ctx->buffers[i], value);
|
| 1203 |
+
}
|
| 1204 |
+
}
|
| 1205 |
+
|
| 1206 |
+
static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
|
| 1207 |
+
/* .get_name = */ ggml_backend_multi_buffer_get_name,
|
| 1208 |
+
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
|
| 1209 |
+
/* .get_base = */ NULL,
|
| 1210 |
+
/* .init_tensor = */ NULL,
|
| 1211 |
+
/* .memset_tensor = */ NULL,
|
| 1212 |
+
/* .set_tensor = */ NULL,
|
| 1213 |
+
/* .get_tensor = */ NULL,
|
| 1214 |
+
/* .cpy_tensor = */ NULL,
|
| 1215 |
+
/* .clear = */ ggml_backend_multi_buffer_clear,
|
| 1216 |
+
/* .reset = */ NULL,
|
| 1217 |
+
};
|
| 1218 |
+
|
| 1219 |
+
ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
|
| 1220 |
+
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context));
|
| 1221 |
+
ctx->n_buffers = n_buffers;
|
| 1222 |
+
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
|
| 1223 |
+
|
| 1224 |
+
GGML_ASSERT(ctx->buffers != NULL);
|
| 1225 |
+
|
| 1226 |
+
size_t total_size = 0;
|
| 1227 |
+
for (size_t i = 0; i < n_buffers; i++) {
|
| 1228 |
+
ctx->buffers[i] = buffers[i];
|
| 1229 |
+
total_size += ggml_backend_buffer_get_size(buffers[i]);
|
| 1230 |
+
}
|
| 1231 |
+
|
| 1232 |
+
return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size);
|
| 1233 |
+
}
|
| 1234 |
+
|
| 1235 |
+
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
|
| 1236 |
+
return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
|
| 1237 |
+
}
|
| 1238 |
+
|
| 1239 |
+
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
| 1240 |
+
GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
|
| 1241 |
+
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
| 1242 |
+
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
| 1243 |
+
ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
|
| 1244 |
+
}
|
| 1245 |
+
}
|
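// Illustrative sketch (editor's addition, not part of this commit): a multi-buffer presents
// several already-allocated buffers as one handle, so they can be freed and have their usage
// set with a single call. `buf_a` and `buf_b` are hypothetical buffers allocated elsewhere.
#if 0
static ggml_backend_buffer_t example_make_multi_buffer(ggml_backend_buffer_t buf_a, ggml_backend_buffer_t buf_b) {
    ggml_backend_buffer_t parts[2] = { buf_a, buf_b };
    ggml_backend_buffer_t multi = ggml_backend_multi_buffer_alloc_buffer(parts, 2);
    // mark every underlying buffer as holding weights
    ggml_backend_multi_buffer_set_usage(multi, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
    return multi; // freeing `multi` also frees buf_a and buf_b
}
#endif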
| 1246 |
+
|
| 1247 |
+
// creates a copy of the tensor with the same memory layout
|
| 1248 |
+
static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
|
| 1249 |
+
struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
|
| 1250 |
+
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
| 1251 |
+
dup->nb[i] = tensor->nb[i];
|
| 1252 |
+
}
|
| 1253 |
+
return dup;
|
| 1254 |
+
}
|
| 1255 |
+
|
| 1256 |
+
static bool ggml_is_view_op(enum ggml_op op) {
|
| 1257 |
+
return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
|
| 1258 |
+
}
|
| 1259 |
+
|
| 1260 |
+
// scheduler
|
| 1261 |
+
|
| 1262 |
+
#ifndef GGML_SCHED_MAX_BACKENDS
|
| 1263 |
+
#define GGML_SCHED_MAX_BACKENDS 16
|
| 1264 |
+
#endif
|
| 1265 |
+
|
| 1266 |
+
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
|
| 1267 |
+
#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
|
| 1268 |
+
#endif
|
| 1269 |
+
|
| 1270 |
+
#ifndef GGML_SCHED_MAX_COPIES
|
| 1271 |
+
#define GGML_SCHED_MAX_COPIES 4
|
| 1272 |
+
#endif
|
| 1273 |
+
|
| 1274 |
+
struct ggml_backend_sched_split {
|
| 1275 |
+
int backend_id;
|
| 1276 |
+
int i_start;
|
| 1277 |
+
int i_end;
|
| 1278 |
+
struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
|
| 1279 |
+
int n_inputs;
|
| 1280 |
+
// graph view of this split
|
| 1281 |
+
struct ggml_cgraph graph;
|
| 1282 |
+
};
|
| 1283 |
+
|
| 1284 |
+
struct ggml_backend_sched {
|
| 1285 |
+
bool is_reset; // true if the scheduler has been reset since the last graph split
|
| 1286 |
+
bool is_alloc;
|
| 1287 |
+
|
| 1288 |
+
int n_backends;
|
| 1289 |
+
|
| 1290 |
+
ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
|
| 1291 |
+
ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
|
| 1292 |
+
ggml_gallocr_t galloc;
|
| 1293 |
+
|
| 1294 |
+
// hash map of the nodes in the graph
|
| 1295 |
+
struct ggml_hash_set hash_set;
|
| 1296 |
+
int * hv_tensor_backend_ids; // [hash_set.size]
|
| 1297 |
+
struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
|
| 1298 |
+
|
| 1299 |
+
int * node_backend_ids; // [graph_size]
|
| 1300 |
+
int * leaf_backend_ids; // [graph_size]
|
| 1301 |
+
|
| 1302 |
+
int * prev_node_backend_ids; // [graph_size]
|
| 1303 |
+
int * prev_leaf_backend_ids; // [graph_size]
|
| 1304 |
+
|
| 1305 |
+
// copy of the graph with modified inputs
|
| 1306 |
+
struct ggml_cgraph graph;
|
| 1307 |
+
|
| 1308 |
+
// graph splits
|
| 1309 |
+
struct ggml_backend_sched_split * splits;
|
| 1310 |
+
int n_splits;
|
| 1311 |
+
int splits_capacity;
|
| 1312 |
+
|
| 1313 |
+
// pipeline parallelism support
|
| 1314 |
+
int n_copies;
|
| 1315 |
+
int cur_copy;
|
| 1316 |
+
ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
|
| 1317 |
+
struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
|
| 1318 |
+
int n_graph_inputs;
|
| 1319 |
+
|
| 1320 |
+
struct ggml_context * ctx;
|
| 1321 |
+
|
| 1322 |
+
ggml_backend_sched_eval_callback callback_eval;
|
| 1323 |
+
void * callback_eval_user_data;
|
| 1324 |
+
|
| 1325 |
+
char * context_buffer;
|
| 1326 |
+
size_t context_buffer_size;
|
| 1327 |
+
|
| 1328 |
+
bool debug;
|
| 1329 |
+
};
|
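// NOTE (descriptive, added for clarity): n_copies > 1 enables pipeline parallelism: each split
// input gets n_copies staging tensors that are cycled via cur_copy, and the per-backend
// events[][] are used to overlap the input copies of one iteration with the computation of the
// previous one.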
| 1330 |
+
|
| 1331 |
+
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
| 1332 |
+
#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
|
| 1333 |
+
#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
|
| 1334 |
+
#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
|
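// NOTE (descriptive, added for clarity): hv_tensor_copies is a flattened
// [hash_set.size][n_backends][n_copies] array; tensor_id_copy() computes the linear index
// id*n_backends*n_copies + backend_id*n_copies + copy_id into it, and
// tensor_backend_id()/tensor_copy() first map the tensor to its hash table slot.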
| 1335 |
+
|
| 1336 |
+
// returns the priority (index) of the backend in the scheduler; a lower id means a higher priority
|
| 1337 |
+
static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
| 1338 |
+
for (int i = 0; i < sched->n_backends; i++) {
|
| 1339 |
+
if (sched->backends[i] == backend) {
|
| 1340 |
+
return i;
|
| 1341 |
+
}
|
| 1342 |
+
}
|
| 1343 |
+
return -1;
|
| 1344 |
+
}
|
| 1345 |
+
|
| 1346 |
+
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
|
| 1347 |
+
ggml_backend_buffer_t buffer = tensor->buffer;
|
| 1348 |
+
if (buffer == NULL) {
|
| 1349 |
+
return -1;
|
| 1350 |
+
}
|
| 1351 |
+
|
| 1352 |
+
// find highest prio backend that supports the buffer type and the op
|
| 1353 |
+
for (int i = 0; i < sched->n_backends; i++) {
|
| 1354 |
+
if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
|
| 1355 |
+
ggml_backend_supports_op(sched->backends[i], op)) {
|
| 1356 |
+
return i;
|
| 1357 |
+
}
|
| 1358 |
+
}
|
| 1359 |
+
|
| 1360 |
+
#ifndef NDEBUG
|
| 1361 |
+
fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
|
| 1362 |
+
__func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
|
| 1363 |
+
#endif
|
| 1364 |
+
|
| 1365 |
+
return -1;
|
| 1366 |
+
}
|
| 1367 |
+
|
| 1368 |
+
#if 0
|
| 1369 |
+
#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
|
| 1370 |
+
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
|
| 1371 |
+
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
| 1372 |
+
#define GET_CAUSE(node) causes[hash_id(node)]
|
| 1373 |
+
#else
|
| 1374 |
+
#define SET_CAUSE(node, ...)
|
| 1375 |
+
#define GET_CAUSE(node) ""
|
| 1376 |
+
#endif
|
| 1377 |
+
|
| 1378 |
+
// returns the backend that should be used for the node based on the current locations of its sources
|
| 1379 |
+
static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
|
| 1380 |
+
// TODO: use supports_op to check if the backend supports the op
|
| 1381 |
+
|
| 1382 |
+
// assign pre-allocated nodes to their backend
|
| 1383 |
+
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
|
| 1384 |
+
if (cur_backend_id != -1) {
|
| 1385 |
+
SET_CAUSE(tensor, "1.dst");
|
| 1386 |
+
return cur_backend_id;
|
| 1387 |
+
}
|
| 1388 |
+
|
| 1389 |
+
// view_src
|
| 1390 |
+
if (tensor->view_src != NULL) {
|
| 1391 |
+
cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
|
| 1392 |
+
if (cur_backend_id != -1) {
|
| 1393 |
+
SET_CAUSE(tensor, "1.vsrc");
|
| 1394 |
+
return cur_backend_id;
|
| 1395 |
+
}
|
| 1396 |
+
}
|
| 1397 |
+
|
| 1398 |
+
if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
|
| 1399 |
+
// since the tensor is pre-allocated, it cannot be moved to another backend
|
| 1400 |
+
GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
|
| 1401 |
+
}
|
| 1402 |
+
|
| 1403 |
+
// graph input
|
| 1404 |
+
if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
|
| 1405 |
+
cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
|
| 1406 |
+
SET_CAUSE(tensor, "1.inp");
|
| 1407 |
+
return cur_backend_id;
|
| 1408 |
+
}
|
| 1409 |
+
|
| 1410 |
+
// operations with weights are preferably run on the same backend as the weights
|
| 1411 |
+
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
| 1412 |
+
const struct ggml_tensor * src = tensor->src[i];
|
| 1413 |
+
if (src == NULL) {
|
| 1414 |
+
continue;
|
| 1415 |
+
}
|
| 1416 |
+
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
| 1417 |
+
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
| 1418 |
+
// check if a backend with higher prio wants to offload the op
|
| 1419 |
+
if (src_backend_id == sched->n_backends - 1) {
|
| 1420 |
+
for (int b = 0; b < src_backend_id; b++) {
|
| 1421 |
+
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
|
| 1422 |
+
SET_CAUSE(tensor, "1.off");
|
| 1423 |
+
return b;
|
| 1424 |
+
}
|
| 1425 |
+
}
|
| 1426 |
+
}
|
| 1427 |
+
SET_CAUSE(tensor, "1.wgt%d", i);
|
| 1428 |
+
return src_backend_id;
|
| 1429 |
+
}
|
| 1430 |
+
}
|
| 1431 |
+
|
| 1432 |
+
return -1;
|
| 1433 |
+
}
|
| 1434 |
+
|
| 1435 |
+
static char * fmt_size(size_t size) {
|
| 1436 |
+
static char buffer[128];
|
| 1437 |
+
if (size >= 1024*1024) {
|
| 1438 |
+
snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
|
| 1439 |
+
} else {
|
| 1440 |
+
snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
|
| 1441 |
+
}
|
| 1442 |
+
return buffer;
|
| 1443 |
+
}
|
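// NOTE (descriptive, added for clarity): fmt_size() returns a pointer to a static buffer, so
// the result is only valid until the next call; the debug printer below formats at most one
// size per fprintf call, which keeps this safe.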
| 1444 |
+
|
| 1445 |
+
static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1446 |
+
int cur_split = 0;
|
| 1447 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1448 |
+
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
|
| 1449 |
+
ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
|
| 1450 |
+
fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
|
| 1451 |
+
sched->splits[cur_split].n_inputs);
|
| 1452 |
+
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
|
| 1453 |
+
fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
|
| 1454 |
+
fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
|
| 1455 |
+
}
|
| 1456 |
+
fprintf(stderr, "\n");
|
| 1457 |
+
cur_split++;
|
| 1458 |
+
}
|
| 1459 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 1460 |
+
if (ggml_is_view_op(node->op)) {
|
| 1461 |
+
continue;
|
| 1462 |
+
}
|
| 1463 |
+
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
|
| 1464 |
+
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
| 1465 |
+
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
| 1466 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1467 |
+
struct ggml_tensor * src = node->src[j];
|
| 1468 |
+
if (src == NULL) {
|
| 1469 |
+
continue;
|
| 1470 |
+
}
|
| 1471 |
+
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
|
| 1472 |
+
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
| 1473 |
+
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
| 1474 |
+
}
|
| 1475 |
+
fprintf(stderr, "\n");
|
| 1476 |
+
}
|
| 1477 |
+
}
|
| 1478 |
+
|
| 1479 |
+
static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
|
| 1480 |
+
ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
|
| 1481 |
+
ggml_backend_buffer_type_t buft = NULL;
|
| 1482 |
+
|
| 1483 |
+
if (buf) {
|
| 1484 |
+
// the tensor is already allocated
|
| 1485 |
+
buft = buf->buft;
|
| 1486 |
+
} else {
|
| 1487 |
+
// see if the tensor already has a backend assigned, and use the buffer type of that backend
|
| 1488 |
+
int tensor_backend_id = tensor_backend_id(t);
|
| 1489 |
+
if (tensor_backend_id == -1 && t->view_src) {
|
| 1490 |
+
tensor_backend_id = tensor_backend_id(t->view_src);
|
| 1491 |
+
}
|
| 1492 |
+
if (tensor_backend_id != -1) {
|
| 1493 |
+
buft = sched->bufts[tensor_backend_id];
|
| 1494 |
+
}
|
| 1495 |
+
}
|
| 1496 |
+
|
| 1497 |
+
return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
|
| 1498 |
+
}
|
| 1499 |
+
|
| 1500 |
+
static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
|
| 1501 |
+
if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
|
| 1502 |
+
*node_backend_id = cur_backend_id;
|
| 1503 |
+
SET_CAUSE(node, "2.sup");
|
| 1504 |
+
}
|
| 1505 |
+
}
|
| 1506 |
+
|
| 1507 |
+
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
| 1508 |
+
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1509 |
+
// reset splits
|
| 1510 |
+
sched->n_splits = 0;
|
| 1511 |
+
sched->n_graph_inputs = 0;
|
| 1512 |
+
sched->is_reset = false;
|
| 1513 |
+
|
| 1514 |
+
struct ggml_init_params params = {
|
| 1515 |
+
/* .mem_size = */ sched->context_buffer_size,
|
| 1516 |
+
/* .mem_buffer = */ sched->context_buffer,
|
| 1517 |
+
/* .no_alloc = */ true
|
| 1518 |
+
};
|
| 1519 |
+
|
| 1520 |
+
ggml_free(sched->ctx);
|
| 1521 |
+
|
| 1522 |
+
sched->ctx = ggml_init(params);
|
| 1523 |
+
if (sched->ctx == NULL) {
|
| 1524 |
+
GGML_ABORT("%s: failed to initialize context\n", __func__);
|
| 1525 |
+
}
|
| 1526 |
+
|
| 1527 |
+
// pass 1: assign backends to ops with pre-allocated inputs
|
| 1528 |
+
for (int i = 0; i < graph->n_leafs; i++) {
|
| 1529 |
+
struct ggml_tensor * leaf = graph->leafs[i];
|
| 1530 |
+
int * leaf_backend_id = &tensor_backend_id(leaf);
|
| 1531 |
+
// do not overwrite user assignments
|
| 1532 |
+
if (*leaf_backend_id == -1) {
|
| 1533 |
+
*leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
|
| 1534 |
+
}
|
| 1535 |
+
}
|
| 1536 |
+
|
| 1537 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1538 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 1539 |
+
int * node_backend_id = &tensor_backend_id(node);
|
| 1540 |
+
// do not overwrite user assignments
|
| 1541 |
+
if (*node_backend_id == -1) {
|
| 1542 |
+
*node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
|
| 1543 |
+
|
| 1544 |
+
#if 0
|
| 1545 |
+
// src
|
| 1546 |
+
if (node->op == GGML_OP_NONE) {
|
| 1547 |
+
continue;
|
| 1548 |
+
}
|
| 1549 |
+
|
| 1550 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1551 |
+
struct ggml_tensor * src = node->src[j];
|
| 1552 |
+
if (src == NULL) {
|
| 1553 |
+
continue;
|
| 1554 |
+
}
|
| 1555 |
+
int * src_backend_id = &tensor_backend_id(src);
|
| 1556 |
+
if (*src_backend_id == -1) {
|
| 1557 |
+
*src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
|
| 1558 |
+
}
|
| 1559 |
+
}
|
| 1560 |
+
#endif
|
| 1561 |
+
}
|
| 1562 |
+
}
|
| 1563 |
+
|
| 1564 |
+
// pass 2: expand current backend assignments
|
| 1565 |
+
// assign the same backend to adjacent nodes
|
| 1566 |
+
// expand gpu backends (i.e. the non-last-priority backends) up and down, ignoring cpu (the lowest priority backend)
|
| 1567 |
+
// as a result, cpu will never be used unless the weights are on cpu, or there are no gpu ops between cpu ops
|
| 1568 |
+
// ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later, when the locations of their inputs are known
|
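// Example (editor's addition, illustrative): with backends [CUDA0, CUDA1, CPU] and pass 1
// assignments [CUDA0, -, -, CUDA1, -, CPU], the gpu-down pass fills the gaps after each gpu
// node (giving [CUDA0, CUDA0, CUDA0, CUDA1, CUDA1, CPU], subject to supports_op) and the
// gpu-up pass fills any remaining gaps that precede a gpu node; the node pinned to CPU by its
// weights stays on CPU.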
| 1569 |
+
// expand gpu down
|
| 1570 |
+
{
|
| 1571 |
+
int cur_backend_id = -1;
|
| 1572 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1573 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 1574 |
+
if (ggml_is_view_op(node->op)) {
|
| 1575 |
+
continue;
|
| 1576 |
+
}
|
| 1577 |
+
int * node_backend_id = &tensor_backend_id(node);
|
| 1578 |
+
if (*node_backend_id != -1) {
|
| 1579 |
+
if (*node_backend_id == sched->n_backends - 1) {
|
| 1580 |
+
// skip cpu (lowest prio backend)
|
| 1581 |
+
cur_backend_id = -1;
|
| 1582 |
+
} else {
|
| 1583 |
+
cur_backend_id = *node_backend_id;
|
| 1584 |
+
}
|
| 1585 |
+
} else if (cur_backend_id != -1) {
|
| 1586 |
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
| 1587 |
+
}
|
| 1588 |
+
}
|
| 1589 |
+
}
|
| 1590 |
+
// expand gpu up
|
| 1591 |
+
{
|
| 1592 |
+
int cur_backend_id = -1;
|
| 1593 |
+
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
| 1594 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 1595 |
+
if (ggml_is_view_op(node->op)) {
|
| 1596 |
+
continue;
|
| 1597 |
+
}
|
| 1598 |
+
int * node_backend_id = &tensor_backend_id(node);
|
| 1599 |
+
if (*node_backend_id != -1) {
|
| 1600 |
+
if (*node_backend_id == sched->n_backends - 1) {
|
| 1601 |
+
// skip cpu (lowest prio backend)
|
| 1602 |
+
cur_backend_id = -1;
|
| 1603 |
+
} else {
|
| 1604 |
+
cur_backend_id = *node_backend_id;
|
| 1605 |
+
}
|
| 1606 |
+
} else if (cur_backend_id != -1) {
|
| 1607 |
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
| 1608 |
+
}
|
| 1609 |
+
}
|
| 1610 |
+
}
|
| 1611 |
+
// expand rest down
|
| 1612 |
+
{
|
| 1613 |
+
int cur_backend_id = -1;
|
| 1614 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1615 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 1616 |
+
if (ggml_is_view_op(node->op)) {
|
| 1617 |
+
continue;
|
| 1618 |
+
}
|
| 1619 |
+
int * node_backend_id = &tensor_backend_id(node);
|
| 1620 |
+
if (*node_backend_id != -1) {
|
| 1621 |
+
cur_backend_id = *node_backend_id;
|
| 1622 |
+
} else if (cur_backend_id != -1) {
|
| 1623 |
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
| 1624 |
+
}
|
| 1625 |
+
}
|
| 1626 |
+
}
|
| 1627 |
+
// expand rest up
|
| 1628 |
+
{
|
| 1629 |
+
int cur_backend_id = -1;
|
| 1630 |
+
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
| 1631 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 1632 |
+
if (ggml_is_view_op(node->op)) {
|
| 1633 |
+
continue;
|
| 1634 |
+
}
|
| 1635 |
+
int * node_backend_id = &tensor_backend_id(node);
|
| 1636 |
+
if (*node_backend_id != -1) {
|
| 1637 |
+
cur_backend_id = *node_backend_id;
|
| 1638 |
+
} else if (cur_backend_id != -1) {
|
| 1639 |
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
| 1640 |
+
}
|
| 1641 |
+
}
|
| 1642 |
+
}
|
| 1643 |
+
|
| 1644 |
+
// pass 3: upgrade nodes to higher prio backends with compatible buffer types
|
| 1645 |
+
// if the tensor is already in a buffer type (*) that a higher priority backend also uses, it should be moved to that backend
|
| 1646 |
+
// however, we also need to verify that the sources are in compatible buffer types
|
| 1647 |
+
// (*) the actual requirement is more relaxed: the buffer type of the backend only needs to be supported by all the users of this tensor further down the graph
|
| 1648 |
+
// however, this is slow to verify, so we use the stricter requirement that the buffer type is the same
|
| 1649 |
+
// this is not uncommon, since multiple backends can use host memory with the same buffer type (e.g. BLAS and CPU)
|
| 1650 |
+
// additionally, set remaining unassigned nodes to the backend with the most supported inputs
|
| 1651 |
+
// only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
|
| 1652 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1653 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 1654 |
+
if (ggml_is_view_op(node->op)) {
|
| 1655 |
+
continue;
|
| 1656 |
+
}
|
| 1657 |
+
int * node_backend_id = &tensor_backend_id(node);
|
| 1658 |
+
if (*node_backend_id == -1) {
|
| 1659 |
+
// unassigned node: find the backend with the most supported inputs
|
| 1660 |
+
int n_supported_best = -1;
|
| 1661 |
+
for (int b = 0; b < sched->n_backends; b++) {
|
| 1662 |
+
if (ggml_backend_supports_op(sched->backends[b], node)) {
|
| 1663 |
+
int n_supported = 0;
|
| 1664 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1665 |
+
struct ggml_tensor * src = node->src[j];
|
| 1666 |
+
if (src == NULL) {
|
| 1667 |
+
continue;
|
| 1668 |
+
}
|
| 1669 |
+
if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
|
| 1670 |
+
n_supported++;
|
| 1671 |
+
}
|
| 1672 |
+
}
|
| 1673 |
+
if (n_supported > n_supported_best) {
|
| 1674 |
+
n_supported_best = n_supported;
|
| 1675 |
+
*node_backend_id = b;
|
| 1676 |
+
SET_CAUSE(node, "3.best");
|
| 1677 |
+
}
|
| 1678 |
+
}
|
| 1679 |
+
}
|
| 1680 |
+
} else {
|
| 1681 |
+
// assigned node: upgrade to higher prio backend if possible
|
| 1682 |
+
for (int b = 0; b < *node_backend_id; b++) {
|
| 1683 |
+
if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
|
| 1684 |
+
bool supported = true;
|
| 1685 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1686 |
+
struct ggml_tensor * src = node->src[j];
|
| 1687 |
+
if (src == NULL) {
|
| 1688 |
+
continue;
|
| 1689 |
+
}
|
| 1690 |
+
if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
|
| 1691 |
+
supported = false;
|
| 1692 |
+
break;
|
| 1693 |
+
}
|
| 1694 |
+
}
|
| 1695 |
+
if (supported) {
|
| 1696 |
+
*node_backend_id = b;
|
| 1697 |
+
SET_CAUSE(node, "3.upg");
|
| 1698 |
+
break;
|
| 1699 |
+
}
|
| 1700 |
+
}
|
| 1701 |
+
}
|
| 1702 |
+
}
|
| 1703 |
+
}
|
| 1704 |
+
|
| 1705 |
+
// pass 4: assign backends to remaining src from dst and view_src
|
| 1706 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1707 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 1708 |
+
int * cur_backend_id = &tensor_backend_id(node);
|
| 1709 |
+
if (node->view_src != NULL && *cur_backend_id == -1) {
|
| 1710 |
+
*cur_backend_id = tensor_backend_id(node->view_src);
|
| 1711 |
+
SET_CAUSE(node, "4.vsrc");
|
| 1712 |
+
}
|
| 1713 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1714 |
+
struct ggml_tensor * src = node->src[j];
|
| 1715 |
+
if (src == NULL) {
|
| 1716 |
+
continue;
|
| 1717 |
+
}
|
| 1718 |
+
int * src_backend_id = &tensor_backend_id(src);
|
| 1719 |
+
if (*src_backend_id == -1) {
|
| 1720 |
+
if (src->view_src != NULL) {
|
| 1721 |
+
// views are always on the same backend as the source
|
| 1722 |
+
*src_backend_id = tensor_backend_id(src->view_src);
|
| 1723 |
+
SET_CAUSE(src, "4.vsrc");
|
| 1724 |
+
} else {
|
| 1725 |
+
*src_backend_id = *cur_backend_id;
|
| 1726 |
+
SET_CAUSE(src, "4.cur");
|
| 1727 |
+
}
|
| 1728 |
+
}
|
| 1729 |
+
}
|
| 1730 |
+
}
|
| 1731 |
+
|
| 1732 |
+
// pass 5: split graph, find tensors that need to be copied
|
| 1733 |
+
{
|
| 1734 |
+
int i_split = 0;
|
| 1735 |
+
struct ggml_backend_sched_split * split = &sched->splits[0];
|
| 1736 |
+
// find the backend of the first split, skipping view ops
|
| 1737 |
+
int i = 0;
|
| 1738 |
+
for (; i < graph->n_nodes; i++) {
|
| 1739 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 1740 |
+
if (!ggml_is_view_op(node->op)) {
|
| 1741 |
+
split->backend_id = tensor_backend_id(node);
|
| 1742 |
+
break;
|
| 1743 |
+
}
|
| 1744 |
+
}
|
| 1745 |
+
split->i_start = 0;
|
| 1746 |
+
split->n_inputs = 0;
|
| 1747 |
+
int cur_backend_id = split->backend_id;
|
| 1748 |
+
for (; i < graph->n_nodes; i++) {
|
| 1749 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 1750 |
+
|
| 1751 |
+
if (ggml_is_view_op(node->op)) {
|
| 1752 |
+
continue;
|
| 1753 |
+
}
|
| 1754 |
+
|
| 1755 |
+
const int node_backend_id = tensor_backend_id(node);
|
| 1756 |
+
|
| 1757 |
+
assert(node_backend_id != -1); // all nodes should be assigned by now
|
| 1758 |
+
|
| 1759 |
+
// check if we should start a new split based on the sources of the current node
|
| 1760 |
+
bool need_new_split = false;
|
| 1761 |
+
if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
|
| 1762 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1763 |
+
struct ggml_tensor * src = node->src[j];
|
| 1764 |
+
if (src == NULL) {
|
| 1765 |
+
continue;
|
| 1766 |
+
}
|
| 1767 |
+
// check if a weight is on a different backend
|
| 1768 |
+
// by starting a new split, the memory of the previously offloaded weights can be reused
|
| 1769 |
+
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
| 1770 |
+
int src_backend_id = tensor_backend_id(src);
|
| 1771 |
+
if (src_backend_id != cur_backend_id) {
|
| 1772 |
+
need_new_split = true;
|
| 1773 |
+
break;
|
| 1774 |
+
}
|
| 1775 |
+
}
|
| 1776 |
+
// check if the split has too many inputs
|
| 1777 |
+
// FIXME: count the number of inputs instead of only checking when full
|
| 1778 |
+
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
|
| 1779 |
+
const size_t id = hash_id(src);
|
| 1780 |
+
int src_backend_id = sched->hv_tensor_backend_ids[id];
|
| 1781 |
+
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
| 1782 |
+
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
|
| 1783 |
+
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
| 1784 |
+
need_new_split = true;
|
| 1785 |
+
break;
|
| 1786 |
+
}
|
| 1787 |
+
}
|
| 1788 |
+
}
|
| 1789 |
+
}
|
| 1790 |
+
|
| 1791 |
+
if (node_backend_id != cur_backend_id || need_new_split) {
|
| 1792 |
+
split->i_end = i;
|
| 1793 |
+
i_split++;
|
| 1794 |
+
if (i_split >= sched->splits_capacity) {
|
| 1795 |
+
sched->splits_capacity *= 2;
|
| 1796 |
+
sched->splits = (ggml_backend_sched_split *)
|
| 1797 |
+
realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
|
| 1798 |
+
GGML_ASSERT(sched->splits != NULL);
|
| 1799 |
+
}
|
| 1800 |
+
split = &sched->splits[i_split];
|
| 1801 |
+
split->backend_id = node_backend_id;
|
| 1802 |
+
split->i_start = i;
|
| 1803 |
+
split->n_inputs = 0;
|
| 1804 |
+
cur_backend_id = node_backend_id;
|
| 1805 |
+
}
|
| 1806 |
+
|
| 1807 |
+
// find inputs that are not on the same backend
|
| 1808 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1809 |
+
struct ggml_tensor * src = node->src[j];
|
| 1810 |
+
if (src == NULL) {
|
| 1811 |
+
continue;
|
| 1812 |
+
}
|
| 1813 |
+
|
| 1814 |
+
size_t src_id = hash_id(src);
|
| 1815 |
+
const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
|
| 1816 |
+
assert(src_backend_id != -1); // all inputs should be assigned by now
|
| 1817 |
+
|
| 1818 |
+
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
|
| 1819 |
+
if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
|
| 1820 |
+
ggml_backend_t backend = sched->backends[src_backend_id];
|
| 1821 |
+
for (int c = 0; c < sched->n_copies; c++) {
|
| 1822 |
+
struct ggml_tensor * tensor_copy;
|
| 1823 |
+
if (c == sched->cur_copy) {
|
| 1824 |
+
tensor_copy = src; // use the original tensor as the current copy
|
| 1825 |
+
} else {
|
| 1826 |
+
tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
| 1827 |
+
ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
|
| 1828 |
+
}
|
| 1829 |
+
if (sched->n_copies > 1) {
|
| 1830 |
+
ggml_set_input(tensor_copy);
|
| 1831 |
+
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
| 1832 |
+
}
|
| 1833 |
+
tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
|
| 1834 |
+
SET_CAUSE(tensor_copy, "4.cpy");
|
| 1835 |
+
}
|
| 1836 |
+
int n_graph_inputs = sched->n_graph_inputs++;
|
| 1837 |
+
GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
|
| 1838 |
+
sched->graph_inputs[n_graph_inputs] = src;
|
| 1839 |
+
}
|
| 1840 |
+
}
|
| 1841 |
+
|
| 1842 |
+
if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
|
| 1843 |
+
// create a copy of the input in the split's backend
|
| 1844 |
+
if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
|
| 1845 |
+
ggml_backend_t backend = sched->backends[cur_backend_id];
|
| 1846 |
+
for (int c = 0; c < sched->n_copies; c++) {
|
| 1847 |
+
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
| 1848 |
+
ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
|
| 1849 |
+
if (sched->n_copies > 1) {
|
| 1850 |
+
ggml_set_input(tensor_copy);
|
| 1851 |
+
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
| 1852 |
+
}
|
| 1853 |
+
tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
|
| 1854 |
+
SET_CAUSE(tensor_copy, "4.cpy");
|
| 1855 |
+
}
|
| 1856 |
+
int n_inputs = split->n_inputs++;
|
| 1857 |
+
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
|
| 1858 |
+
split->inputs[n_inputs] = src;
|
| 1859 |
+
}
|
| 1860 |
+
node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
|
| 1861 |
+
}
|
| 1862 |
+
}
|
| 1863 |
+
}
|
| 1864 |
+
split->i_end = graph->n_nodes;
|
| 1865 |
+
sched->n_splits = i_split + 1;
|
| 1866 |
+
}
|
| 1867 |
+
|
| 1868 |
+
if (sched->debug) {
|
| 1869 |
+
ggml_backend_sched_print_assignments(sched, graph);
|
| 1870 |
+
}
|
| 1871 |
+
|
| 1872 |
+
// swap node_backend_ids and leaf_backend_ids with the previous ones
|
| 1873 |
+
{
|
| 1874 |
+
int * tmp = sched->node_backend_ids;
|
| 1875 |
+
sched->node_backend_ids = sched->prev_node_backend_ids;
|
| 1876 |
+
sched->prev_node_backend_ids = tmp;
|
| 1877 |
+
|
| 1878 |
+
tmp = sched->leaf_backend_ids;
|
| 1879 |
+
sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
|
| 1880 |
+
sched->prev_leaf_backend_ids = tmp;
|
| 1881 |
+
}
|
| 1882 |
+
|
| 1883 |
+
int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
|
| 1884 |
+
if (sched->graph.size < graph_size) {
|
| 1885 |
+
sched->graph.size = graph_size;
|
| 1886 |
+
sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
|
| 1887 |
+
sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
|
| 1888 |
+
GGML_ASSERT(sched->graph.nodes != NULL);
|
| 1889 |
+
GGML_ASSERT(sched->graph.leafs != NULL);
|
| 1890 |
+
}
|
| 1891 |
+
sched->graph.n_nodes = 0;
|
| 1892 |
+
sched->graph.n_leafs = 0;
|
| 1893 |
+
|
| 1894 |
+
struct ggml_cgraph * graph_copy = &sched->graph;
|
| 1895 |
+
|
| 1896 |
+
for (int i = 0; i < sched->n_splits; i++) {
|
| 1897 |
+
struct ggml_backend_sched_split * split = &sched->splits[i];
|
| 1898 |
+
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
|
| 1899 |
+
|
| 1900 |
+
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
|
| 1901 |
+
for (int j = 0; j < split->n_inputs; j++) {
|
| 1902 |
+
assert(graph_copy->size > (graph_copy->n_nodes + 1));
|
| 1903 |
+
|
| 1904 |
+
struct ggml_tensor * input = split->inputs[j];
|
| 1905 |
+
const size_t input_id = hash_id(input);
|
| 1906 |
+
struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
|
| 1907 |
+
|
| 1908 |
+
// add a dependency to the input source so that it is not freed before the copy is done
|
| 1909 |
+
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
|
| 1910 |
+
input_dep->src[0] = input;
|
| 1911 |
+
sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
|
| 1912 |
+
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
|
| 1913 |
+
|
| 1914 |
+
// add a dependency to the input copy so that it is allocated at the start of the split
|
| 1915 |
+
sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
|
| 1916 |
+
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
|
| 1917 |
+
}
|
| 1918 |
+
|
| 1919 |
+
for (int j = split->i_start; j < split->i_end; j++) {
|
| 1920 |
+
assert(graph_copy->size > graph_copy->n_nodes);
|
| 1921 |
+
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
|
| 1922 |
+
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
|
| 1923 |
+
}
|
| 1924 |
+
}
|
| 1925 |
+
|
| 1926 |
+
if (sched->n_copies > 1) {
|
| 1927 |
+
// add input copies as leafs so that they are allocated first
|
| 1928 |
+
for (int i = 0; i < sched->n_graph_inputs; i++) {
|
| 1929 |
+
struct ggml_tensor * input = sched->graph_inputs[i];
|
| 1930 |
+
size_t id = hash_id(input);
|
| 1931 |
+
int backend_id = tensor_backend_id(input);
|
| 1932 |
+
for (int c = 0; c < sched->n_copies; c++) {
|
| 1933 |
+
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
| 1934 |
+
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
| 1935 |
+
assert(graph_copy->size > graph_copy->n_leafs);
|
| 1936 |
+
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
| 1937 |
+
}
|
| 1938 |
+
}
|
| 1939 |
+
|
| 1940 |
+
for (int i = 0; i < sched->n_splits; i++) {
|
| 1941 |
+
struct ggml_backend_sched_split * split = &sched->splits[i];
|
| 1942 |
+
int backend_id = split->backend_id;
|
| 1943 |
+
for (int j = 0; j < split->n_inputs; j++) {
|
| 1944 |
+
struct ggml_tensor * input = split->inputs[j];
|
| 1945 |
+
size_t id = hash_id(input);
|
| 1946 |
+
for (int c = 0; c < sched->n_copies; c++) {
|
| 1947 |
+
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
| 1948 |
+
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
| 1949 |
+
assert(graph_copy->size > graph_copy->n_leafs);
|
| 1950 |
+
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
| 1951 |
+
}
|
| 1952 |
+
}
|
| 1953 |
+
}
|
| 1954 |
+
}
|
| 1955 |
+
|
| 1956 |
+
// add leafs from the original graph
|
| 1957 |
+
for (int i = 0; i < graph->n_leafs; i++) {
|
| 1958 |
+
struct ggml_tensor * leaf = graph->leafs[i];
|
| 1959 |
+
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
| 1960 |
+
assert(graph_copy->size > graph_copy->n_leafs);
|
| 1961 |
+
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
| 1962 |
+
}
|
| 1963 |
+
}
|
| 1964 |
+
|
| 1965 |
+
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
| 1966 |
+
bool backend_ids_changed = false;
|
| 1967 |
+
for (int i = 0; i < sched->graph.n_nodes; i++) {
|
| 1968 |
+
if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
|
| 1969 |
+
sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
|
| 1970 |
+
backend_ids_changed = true;
|
| 1971 |
+
break;
|
| 1972 |
+
}
|
| 1973 |
+
}
|
| 1974 |
+
if (!backend_ids_changed) {
|
| 1975 |
+
for (int i = 0; i < sched->graph.n_leafs; i++) {
|
| 1976 |
+
if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
|
| 1977 |
+
sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
|
| 1978 |
+
backend_ids_changed = true;
|
| 1979 |
+
break;
|
| 1980 |
+
}
|
| 1981 |
+
}
|
| 1982 |
+
}
|
| 1983 |
+
|
| 1984 |
+
// allocate graph
|
| 1985 |
+
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
| 1986 |
+
// the re-allocation may cause the split inputs to be moved to a different address
|
| 1987 |
+
ggml_backend_sched_synchronize(sched);
|
| 1988 |
+
#ifndef NDEBUG
|
| 1989 |
+
fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
|
| 1990 |
+
#endif
|
| 1991 |
+
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
|
| 1992 |
+
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
| 1993 |
+
fprintf(stderr, "%s: failed to allocate graph\n", __func__);
|
| 1994 |
+
return false;
|
| 1995 |
+
}
|
| 1996 |
+
}
|
| 1997 |
+
|
| 1998 |
+
return true;
|
| 1999 |
+
}
|
| 2000 |
+
|
| 2001 |
+
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
| 2002 |
+
struct ggml_backend_sched_split * splits = sched->splits;
|
| 2003 |
+
|
| 2004 |
+
for (int i = 0; i < sched->n_splits; i++) {
|
| 2005 |
+
struct ggml_backend_sched_split * split = &splits[i];
|
| 2006 |
+
int split_backend_id = split->backend_id;
|
| 2007 |
+
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
| 2008 |
+
|
| 2009 |
+
// copy the input tensors to the split backend
|
| 2010 |
+
for (int j = 0; j < split->n_inputs; j++) {
|
| 2011 |
+
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
|
| 2012 |
+
struct ggml_tensor * input = split->inputs[j];
|
| 2013 |
+
struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
|
| 2014 |
+
|
| 2015 |
+
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
|
| 2016 |
+
// inputs from the user must be copied immediately to prevent the user from overwriting the data before the copy is done
|
| 2017 |
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
| 2018 |
+
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
| 2019 |
+
} else {
|
| 2020 |
+
ggml_backend_synchronize(split_backend);
|
| 2021 |
+
}
|
| 2022 |
+
ggml_backend_tensor_copy(input, input_cpy);
|
| 2023 |
+
} else {
|
| 2024 |
+
// wait for the split backend to finish using the input before overwriting it
|
| 2025 |
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
| 2026 |
+
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
| 2027 |
+
} else {
|
| 2028 |
+
ggml_backend_synchronize(split_backend);
|
| 2029 |
+
}
|
| 2030 |
+
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
|
| 2031 |
+
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
|
| 2032 |
+
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
|
| 2033 |
+
ggml_backend_synchronize(input_backend);
|
| 2034 |
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
| 2035 |
+
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
| 2036 |
+
} else {
|
| 2037 |
+
ggml_backend_synchronize(split_backend);
|
| 2038 |
+
}
|
| 2039 |
+
ggml_backend_tensor_copy(input, input_cpy);
|
| 2040 |
+
}
|
| 2041 |
+
}
|
| 2042 |
+
}
|
| 2043 |
+
|
| 2044 |
+
if (!sched->callback_eval) {
|
| 2045 |
+
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
|
| 2046 |
+
if (ec != GGML_STATUS_SUCCESS) {
|
| 2047 |
+
return ec;
|
| 2048 |
+
}
|
| 2049 |
+
} else {
|
| 2050 |
+
// similar to ggml_backend_compare_graph_backend
|
| 2051 |
+
for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
|
| 2052 |
+
struct ggml_tensor * t = split->graph.nodes[j0];
|
| 2053 |
+
|
| 2054 |
+
// check if the user needs data from this node
|
| 2055 |
+
bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
| 2056 |
+
|
| 2057 |
+
int j1 = j0;
|
| 2058 |
+
|
| 2059 |
+
// determine the range [j0, j1] of nodes that can be computed together
|
| 2060 |
+
while (!need && j1 < split->graph.n_nodes - 1) {
|
| 2061 |
+
t = split->graph.nodes[++j1];
|
| 2062 |
+
need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
| 2063 |
+
}
|
| 2064 |
+
|
| 2065 |
+
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
| 2066 |
+
|
| 2067 |
+
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
|
| 2068 |
+
if (ec != GGML_STATUS_SUCCESS) {
|
| 2069 |
+
return ec;
|
| 2070 |
+
}
|
| 2071 |
+
|
| 2072 |
+
// TODO: pass backend to the callback, then the user can decide if they want to synchronize
|
| 2073 |
+
ggml_backend_synchronize(split_backend);
|
| 2074 |
+
|
| 2075 |
+
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
| 2076 |
+
break;
|
| 2077 |
+
}
|
| 2078 |
+
|
| 2079 |
+
j0 = j1;
|
| 2080 |
+
}
|
| 2081 |
+
}
|
| 2082 |
+
|
| 2083 |
+
// record the event of this copy
|
| 2084 |
+
if (split->n_inputs > 0) {
|
| 2085 |
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
| 2086 |
+
ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
|
| 2087 |
+
}
|
| 2088 |
+
}
|
| 2089 |
+
}
|
| 2090 |
+
|
| 2091 |
+
sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
|
| 2092 |
+
|
| 2093 |
+
return GGML_STATUS_SUCCESS;
|
| 2094 |
+
}
|
| 2095 |
+
|
| 2096 |
+
ggml_backend_sched_t ggml_backend_sched_new(
|
| 2097 |
+
ggml_backend_t * backends,
|
| 2098 |
+
ggml_backend_buffer_type_t * bufts,
|
| 2099 |
+
int n_backends,
|
| 2100 |
+
size_t graph_size,
|
| 2101 |
+
bool parallel) {
|
| 2102 |
+
GGML_ASSERT(n_backends > 0);
|
| 2103 |
+
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
| 2104 |
+
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
| 2105 |
+
|
| 2106 |
+
struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
|
| 2107 |
+
|
| 2108 |
+
sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
|
| 2109 |
+
sched->n_backends = n_backends;
|
| 2110 |
+
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
| 2111 |
+
|
| 2112 |
+
// initialize hash table
|
| 2113 |
+
// FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
|
| 2114 |
+
sched->hash_set = ggml_hash_set_new(graph_size);
|
| 2115 |
+
sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
|
| 2116 |
+
sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
|
| 2117 |
+
|
| 2118 |
+
const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
|
| 2119 |
+
const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
| 2120 |
+
sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
| 2121 |
+
sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
| 2122 |
+
sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
|
| 2123 |
+
sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
|
| 2124 |
+
|
| 2125 |
+
sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
|
| 2126 |
+
sched->context_buffer = (char *) malloc(sched->context_buffer_size);
|
| 2127 |
+
|
| 2128 |
+
const int initial_splits_capacity = 16;
|
| 2129 |
+
sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
|
| 2130 |
+
sched->splits_capacity = initial_splits_capacity;
|
| 2131 |
+
|
| 2132 |
+
for (int b = 0; b < n_backends; b++) {
|
| 2133 |
+
sched->backends[b] = backends[b];
|
| 2134 |
+
sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
|
| 2135 |
+
GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
|
| 2136 |
+
if (sched->n_copies > 1) {
|
| 2137 |
+
for (int c = 0; c < sched->n_copies; c++) {
|
| 2138 |
+
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
|
| 2139 |
+
}
|
| 2140 |
+
}
|
| 2141 |
+
}
|
| 2142 |
+
|
| 2143 |
+
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
| 2144 |
+
|
| 2145 |
+
ggml_backend_sched_reset(sched);
|
| 2146 |
+
|
| 2147 |
+
return sched;
|
| 2148 |
+
}
|
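// Illustrative usage sketch (editor's addition, not part of this commit): a typical scheduler
// setup with a single CPU backend; with more than one backend, the CPU backend must still be
// passed last. The graph is assumed to have been built by the caller, and the example_*
// function name is a placeholder.
#if 0
static void example_sched_usage(struct ggml_cgraph * graph) {
    ggml_backend_t cpu = ggml_backend_cpu_init();

    ggml_backend_t backends[] = { cpu };
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, /* bufts */ NULL, /* n_backends */ 1,
                                                        /* graph_size */ GGML_DEFAULT_GRAPH_SIZE, /* parallel */ false);

    // optional: reserve buffers once with a worst-case graph to avoid reallocations later
    // ggml_backend_sched_reserve(sched, measure_graph);

    // split the graph, allocate it and compute it on the assigned backends
    enum ggml_status status = ggml_backend_sched_graph_compute(sched, graph);
    GGML_ASSERT(status == GGML_STATUS_SUCCESS);

    ggml_backend_sched_free(sched);
    ggml_backend_free(cpu);
}
#endif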
| 2149 |
+
|
| 2150 |
+
void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
| 2151 |
+
if (sched == NULL) {
|
| 2152 |
+
return;
|
| 2153 |
+
}
|
| 2154 |
+
for (int b = 0; b < sched->n_backends; b++) {
|
| 2155 |
+
for (int c = 0; c < sched->n_copies; c++) {
|
| 2156 |
+
ggml_backend_event_free(sched->events[b][c]);
|
| 2157 |
+
}
|
| 2158 |
+
}
|
| 2159 |
+
ggml_gallocr_free(sched->galloc);
|
| 2160 |
+
ggml_free(sched->ctx);
|
| 2161 |
+
ggml_hash_set_free(&sched->hash_set);
|
| 2162 |
+
free(sched->splits);
|
| 2163 |
+
free(sched->hv_tensor_backend_ids);
|
| 2164 |
+
free(sched->hv_tensor_copies);
|
| 2165 |
+
free(sched->node_backend_ids);
|
| 2166 |
+
free(sched->leaf_backend_ids);
|
| 2167 |
+
free(sched->prev_node_backend_ids);
|
| 2168 |
+
free(sched->prev_leaf_backend_ids);
|
| 2169 |
+
free(sched->context_buffer);
|
| 2170 |
+
free(sched->graph.nodes);
|
| 2171 |
+
free(sched->graph.leafs);
|
| 2172 |
+
free(sched);
|
| 2173 |
+
}
|
| 2174 |
+
|
| 2175 |
+
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
| 2176 |
+
// reset state for the next run
|
| 2177 |
+
if (!sched->is_reset) {
|
| 2178 |
+
ggml_hash_set_reset(&sched->hash_set);
|
| 2179 |
+
memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
|
| 2180 |
+
memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
|
| 2181 |
+
sched->is_reset = true;
|
| 2182 |
+
}
|
| 2183 |
+
sched->is_alloc = false;
|
| 2184 |
+
}
|
| 2185 |
+
|
| 2186 |
+
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
| 2187 |
+
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
| 2188 |
+
|
| 2189 |
+
ggml_backend_sched_split_graph(sched, measure_graph);
|
| 2190 |
+
|
| 2191 |
+
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
| 2192 |
+
return false;
|
| 2193 |
+
}
|
| 2194 |
+
|
| 2195 |
+
ggml_backend_sched_reset(sched);
|
| 2196 |
+
ggml_backend_sched_synchronize(sched);
|
| 2197 |
+
|
| 2198 |
+
return true;
|
| 2199 |
+
}
|
| 2200 |
+
|
| 2201 |
+
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 2202 |
+
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
|
| 2203 |
+
|
| 2204 |
+
ggml_backend_sched_split_graph(sched, graph);
|
| 2205 |
+
|
| 2206 |
+
|
| 2207 |
+
if (!ggml_backend_sched_alloc_splits(sched)) {
|
| 2208 |
+
return false;
|
| 2209 |
+
}
|
| 2210 |
+
|
| 2211 |
+
sched->is_alloc = true;
|
| 2212 |
+
|
| 2213 |
+
return true;
|
| 2214 |
+
}
|
| 2215 |
+
|
| 2216 |
+
enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 2217 |
+
enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
|
| 2218 |
+
ggml_backend_sched_synchronize(sched);
|
| 2219 |
+
return err;
|
| 2220 |
+
}
|
| 2221 |
+
|
| 2222 |
+
enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 2223 |
+
if (!sched->is_reset && !sched->is_alloc) {
|
| 2224 |
+
ggml_backend_sched_reset(sched);
|
| 2225 |
+
}
|
| 2226 |
+
|
| 2227 |
+
if (!sched->is_alloc) {
|
| 2228 |
+
if (!ggml_backend_sched_alloc_graph(sched, graph)) {
|
| 2229 |
+
return GGML_STATUS_ALLOC_FAILED;
|
| 2230 |
+
}
|
| 2231 |
+
}
|
| 2232 |
+
|
| 2233 |
+
return ggml_backend_sched_compute_splits(sched);
|
| 2234 |
+
}
|
| 2235 |
+
|
| 2236 |
+
void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
|
| 2237 |
+
for (int i = 0; i < sched->n_backends; i++) {
|
| 2238 |
+
ggml_backend_synchronize(sched->backends[i]);
|
| 2239 |
+
}
|
| 2240 |
+
}
|
| 2241 |
+
|
| 2242 |
+
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
| 2243 |
+
sched->callback_eval = callback;
|
| 2244 |
+
sched->callback_eval_user_data = user_data;
|
| 2245 |
+
}
|
| 2246 |
+
|
| 2247 |
+
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
|
| 2248 |
+
return sched->n_splits;
|
| 2249 |
+
}
|
| 2250 |
+
|
| 2251 |
+
int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
|
| 2252 |
+
return sched->n_copies;
|
| 2253 |
+
}
|
| 2254 |
+
|
| 2255 |
+
int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
|
| 2256 |
+
return sched->n_backends;
|
| 2257 |
+
}
|
| 2258 |
+
|
| 2259 |
+
ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
|
| 2260 |
+
GGML_ASSERT(i >= 0 && i < sched->n_backends);
|
| 2261 |
+
return sched->backends[i];
|
| 2262 |
+
}
|
| 2263 |
+
|
| 2264 |
+
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
| 2265 |
+
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
| 2266 |
+
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
| 2267 |
+
|
| 2268 |
+
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
| 2269 |
+
}
|
| 2270 |
+
|
| 2271 |
+
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
| 2272 |
+
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
| 2273 |
+
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
| 2274 |
+
tensor_backend_id(node) = backend_index;
|
| 2275 |
+
SET_CAUSE(node, "usr");
|
| 2276 |
+
sched->is_reset = false;
|
| 2277 |
+
}
|
| 2278 |
+
|
| 2279 |
+
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
| 2280 |
+
int backend_index = tensor_backend_id(node);
|
| 2281 |
+
if (backend_index == -1) {
|
| 2282 |
+
return NULL;
|
| 2283 |
+
}
|
| 2284 |
+
return sched->backends[backend_index];
|
| 2285 |
+
}
|
| 2286 |
+
|
| 2287 |
+
// utils
|
| 2288 |
+
|
| 2289 |
+
void ggml_backend_view_init(struct ggml_tensor * tensor) {
|
| 2290 |
+
GGML_ASSERT(tensor->buffer == NULL);
|
| 2291 |
+
GGML_ASSERT(tensor->view_src != NULL);
|
| 2292 |
+
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
| 2293 |
+
GGML_ASSERT(tensor->view_src->data != NULL);
|
| 2294 |
+
|
| 2295 |
+
tensor->buffer = tensor->view_src->buffer;
|
| 2296 |
+
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
| 2297 |
+
ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
|
| 2298 |
+
}
|
| 2299 |
+
|
| 2300 |
+
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
| 2301 |
+
GGML_ASSERT(tensor->buffer == NULL);
|
| 2302 |
+
GGML_ASSERT(tensor->data == NULL);
|
| 2303 |
+
GGML_ASSERT(tensor->view_src == NULL);
|
| 2304 |
+
GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
|
| 2305 |
+
GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
|
| 2306 |
+
(char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
|
| 2307 |
+
|
| 2308 |
+
tensor->buffer = buffer;
|
| 2309 |
+
tensor->data = addr;
|
| 2310 |
+
ggml_backend_buffer_init_tensor(buffer, tensor);
|
| 2311 |
+
}
|
| 2312 |
+
|
| 2313 |
+
static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
|
| 2314 |
+
struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
|
| 2315 |
+
|
| 2316 |
+
GGML_ASSERT(src != NULL);
|
| 2317 |
+
GGML_ASSERT(src->data && "graph must be allocated");
|
| 2318 |
+
|
| 2319 |
+
size_t id = ggml_hash_insert(&hash_set, src);
|
| 2320 |
+
if (id == GGML_HASHSET_ALREADY_EXISTS) {
|
| 2321 |
+
return node_copies[ggml_hash_find(&hash_set, src)];
|
| 2322 |
+
}
|
| 2323 |
+
|
| 2324 |
+
struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
|
| 2325 |
+
if (src->view_src != NULL) {
|
| 2326 |
+
dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
|
| 2327 |
+
dst->view_offs = src->view_offs;
|
| 2328 |
+
}
|
| 2329 |
+
dst->op = src->op;
|
| 2330 |
+
memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
|
| 2331 |
+
ggml_set_name(dst, src->name);
|
| 2332 |
+
|
| 2333 |
+
// copy src
|
| 2334 |
+
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
| 2335 |
+
struct ggml_tensor * s = src->src[i];
|
| 2336 |
+
if (s == NULL) {
|
| 2337 |
+
continue;
|
| 2338 |
+
}
|
| 2339 |
+
dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
|
| 2340 |
+
}
|
| 2341 |
+
|
| 2342 |
+
node_copies[id] = dst;
|
| 2343 |
+
return dst;
|
| 2344 |
+
}
|
| 2345 |
+
|
| 2346 |
+
static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
|
| 2347 |
+
size_t id = ggml_hash_find(hash_set, src);
|
| 2348 |
+
if (node_init[id]) {
|
| 2349 |
+
return;
|
| 2350 |
+
}
|
| 2351 |
+
node_init[id] = true;
|
| 2352 |
+
|
| 2353 |
+
struct ggml_tensor * dst = node_copies[id];
|
| 2354 |
+
if (dst->view_src != NULL) {
|
| 2355 |
+
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
| 2356 |
+
ggml_backend_view_init(dst);
|
| 2357 |
+
}
|
| 2358 |
+
else {
|
| 2359 |
+
ggml_backend_tensor_copy(src, dst);
|
| 2360 |
+
}
|
| 2361 |
+
|
| 2362 |
+
// init src
|
| 2363 |
+
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
| 2364 |
+
struct ggml_tensor * s = src->src[i];
|
| 2365 |
+
if (s == NULL) {
|
| 2366 |
+
continue;
|
| 2367 |
+
}
|
| 2368 |
+
graph_copy_init_tensor(hash_set, node_copies, node_init, s);
|
| 2369 |
+
}
|
| 2370 |
+
}
|
| 2371 |
+
|
| 2372 |
+
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
| 2373 |
+
struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
|
| 2374 |
+
struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
|
| 2375 |
+
bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
|
| 2376 |
+
|
| 2377 |
+
struct ggml_init_params params = {
|
| 2378 |
+
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
|
| 2379 |
+
/* .mem_buffer = */ NULL,
|
| 2380 |
+
/* .no_alloc = */ true
|
| 2381 |
+
};
|
| 2382 |
+
|
| 2383 |
+
struct ggml_context * ctx_allocated = ggml_init(params);
|
| 2384 |
+
struct ggml_context * ctx_unallocated = ggml_init(params);
|
| 2385 |
+
|
| 2386 |
+
if (ctx_allocated == NULL || ctx_unallocated == NULL) {
|
| 2387 |
+
fprintf(stderr, "failed to allocate context for graph copy\n");
|
| 2388 |
+
ggml_hash_set_free(&hash_set);
|
| 2389 |
+
free(node_copies);
|
| 2390 |
+
free(node_init);
|
| 2391 |
+
ggml_free(ctx_allocated);
|
| 2392 |
+
ggml_free(ctx_unallocated);
|
| 2393 |
+
return {
|
| 2394 |
+
/* .buffer = */ NULL,
|
| 2395 |
+
/* .ctx_allocated = */ NULL,
|
| 2396 |
+
/* .ctx_unallocated = */ NULL,
|
| 2397 |
+
/* .graph = */ NULL,
|
| 2398 |
+
};
|
| 2399 |
+
}
|
| 2400 |
+
|
| 2401 |
+
// dup nodes
|
| 2402 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 2403 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 2404 |
+
graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
|
| 2405 |
+
}
|
| 2406 |
+
|
| 2407 |
+
// allocate nodes
|
| 2408 |
+
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
|
| 2409 |
+
if (buffer == NULL) {
|
| 2410 |
+
fprintf(stderr, "failed to allocate buffer for graph copy\n");
|
| 2411 |
+
ggml_hash_set_free(&hash_set);
|
| 2412 |
+
free(node_copies);
|
| 2413 |
+
free(node_init);
|
| 2414 |
+
ggml_free(ctx_allocated);
|
| 2415 |
+
ggml_free(ctx_unallocated);
|
| 2416 |
+
return {
|
| 2417 |
+
/* .buffer = */ NULL,
|
| 2418 |
+
/* .ctx_allocated = */ NULL,
|
| 2419 |
+
/* .ctx_unallocated = */ NULL,
|
| 2420 |
+
/* .graph = */ NULL,
|
| 2421 |
+
};
|
| 2422 |
+
}
|
| 2423 |
+
|
| 2424 |
+
//printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
|
| 2425 |
+
|
| 2426 |
+
// copy data and init views
|
| 2427 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 2428 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 2429 |
+
graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
|
| 2430 |
+
}
|
| 2431 |
+
|
| 2432 |
+
// build graph copy
|
| 2433 |
+
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
|
| 2434 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 2435 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 2436 |
+
struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
|
| 2437 |
+
graph_copy->nodes[i] = node_copy;
|
| 2438 |
+
}
|
| 2439 |
+
graph_copy->n_nodes = graph->n_nodes;
|
| 2440 |
+
|
| 2441 |
+
ggml_hash_set_free(&hash_set);
|
| 2442 |
+
free(node_copies);
|
| 2443 |
+
free(node_init);
|
| 2444 |
+
|
| 2445 |
+
return {
|
| 2446 |
+
/* .buffer = */ buffer,
|
| 2447 |
+
/* .ctx_allocated = */ ctx_allocated,
|
| 2448 |
+
/* .ctx_unallocated = */ ctx_unallocated,
|
| 2449 |
+
/* .graph = */ graph_copy,
|
| 2450 |
+
};
|
| 2451 |
+
}
|
| 2452 |
+
|
| 2453 |
+
void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
|
| 2454 |
+
ggml_backend_buffer_free(copy.buffer);
|
| 2455 |
+
ggml_free(copy.ctx_allocated);
|
| 2456 |
+
ggml_free(copy.ctx_unallocated);
|
| 2457 |
+
}
|
| 2458 |
+
|
| 2459 |
+
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
|
| 2460 |
+
struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
|
| 2461 |
+
if (copy.buffer == NULL) {
|
| 2462 |
+
return false;
|
| 2463 |
+
}
|
| 2464 |
+
|
| 2465 |
+
struct ggml_cgraph * g1 = graph;
|
| 2466 |
+
struct ggml_cgraph * g2 = copy.graph;
|
| 2467 |
+
|
| 2468 |
+
assert(g1->n_nodes == g2->n_nodes);
|
| 2469 |
+
|
| 2470 |
+
for (int i = 0; i < g1->n_nodes; i++) {
|
| 2471 |
+
//printf("eval %d/%d\n", i, g1->n_nodes);
|
| 2472 |
+
struct ggml_tensor * t1 = g1->nodes[i];
|
| 2473 |
+
struct ggml_tensor * t2 = g2->nodes[i];
|
| 2474 |
+
|
| 2475 |
+
assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
|
| 2476 |
+
|
| 2477 |
+
struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
|
| 2478 |
+
struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
|
| 2479 |
+
|
| 2480 |
+
ggml_backend_graph_compute(backend1, &g1v);
|
| 2481 |
+
ggml_backend_graph_compute(backend2, &g2v);
|
| 2482 |
+
|
| 2483 |
+
if (ggml_is_view_op(t1->op)) {
|
| 2484 |
+
continue;
|
| 2485 |
+
}
|
| 2486 |
+
|
| 2487 |
+
// compare results, calculate rms etc
|
| 2488 |
+
if (!callback(i, t1, t2, user_data)) {
|
| 2489 |
+
break;
|
| 2490 |
+
}
|
| 2491 |
+
}
|
| 2492 |
+
|
| 2493 |
+
ggml_backend_graph_copy_free(copy);
|
| 2494 |
+
|
| 2495 |
+
return true;
|
| 2496 |
+
}
|
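A minimal usage sketch of the scheduler API implemented above. This is not part of the commit; it assumes `gpu` and `cpu` are already-initialized ggml_backend_t handles and `graph` is a ggml_cgraph built elsewhere, and the variable names are illustrative only.

    // sketch only: error handling trimmed, names are placeholders
    // #include "ggml.h" and "ggml-backend.h"
    ggml_backend_t backends[2] = { gpu, cpu }; // the CPU backend must come last
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, /* bufts */ NULL, /* n_backends */ 2,
                                                        GGML_DEFAULT_GRAPH_SIZE, /* parallel */ false);

    ggml_backend_sched_reserve(sched, graph); // size the buffers from a worst-case graph
    if (ggml_backend_sched_graph_compute(sched, graph) != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "scheduler compute failed\n");
    }
    ggml_backend_sched_free(sched);
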
ggml/src/ggml-blas.cpp
CHANGED

@@ -235,25 +235,25 @@ static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct g
 
 // backend interface
 
-GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) {
+static const char * ggml_backend_blas_name(ggml_backend_t backend) {
     return "BLAS";
 
     GGML_UNUSED(backend);
 }
 
-GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) {
+static void ggml_backend_blas_free(ggml_backend_t backend) {
     ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
     delete ctx;
     delete backend;
 }
 
-GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
+static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
    return ggml_backend_cpu_buffer_type();
 
    GGML_UNUSED(backend);
 }
 
-GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -285,7 +285,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t
     GGML_UNUSED(backend);
 }
 
-GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     const struct ggml_tensor * src0 = op->src[0];
     const struct ggml_tensor * src1 = op->src[1];
 
@@ -300,7 +300,7 @@ GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, cons
     GGML_UNUSED(backend);
 }
 
-GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
     return ggml_backend_buft_is_host(buft);
 
     GGML_UNUSED(backend);
@@ -322,11 +322,8 @@ static struct ggml_backend_i blas_backend_i = {
     /* .supports_op         = */ ggml_backend_blas_supports_op,
     /* .supports_buft       = */ ggml_backend_blas_supports_buft,
     /* .offload_op          = */ NULL,
-    /* .event_new           = */ NULL,
-    /* .event_free          = */ NULL,
     /* .event_record        = */ NULL,
     /* .event_wait          = */ NULL,
-    /* .event_synchronize   = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_blas_guid(void) {
@@ -340,6 +337,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
     ggml_backend_t backend = new ggml_backend {
         /* .guid      = */ ggml_backend_blas_guid(),
         /* .interface = */ blas_backend_i,
+        /* .device    = */ nullptr,
         /* .context   = */ ctx,
     };
 
@@ -356,7 +354,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
     return backend;
 }
 
-GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) {
+bool ggml_backend_is_blas(ggml_backend_t backend) {
     return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
 }
 
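For reference, a hedged sketch of how the reworked BLAS backend is typically created and torn down; the thread count below is illustrative, not part of this change.

    ggml_backend_t blas = ggml_backend_blas_init();
    if (blas != NULL && ggml_backend_is_blas(blas)) {
        ggml_backend_blas_set_n_threads(blas, 8); // optional: threads used for BLAS ops
    }
    // ... use the backend (for example through ggml_backend_sched) ...
    ggml_backend_free(blas);
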
ggml/src/ggml-cann.cpp
CHANGED

@@ -560,7 +560,7 @@ struct ggml_backend_cann_buffer_context {
  * @return A pointer to a C-string containing the name of the buffer.
  */
 
-GGML_CALL static const char* ggml_backend_cann_buffer_get_name(
+static const char* ggml_backend_cann_buffer_get_name(
     ggml_backend_buffer_t buffer) {
     return "CANN";
@@ -576,7 +576,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_get_name(
  * @param buffer The buffer to check.
  * @return true if the buffer is a CANN buffer, false otherwise.
  */
-GGML_CALL static bool ggml_backend_buffer_is_cann(
+static bool ggml_backend_buffer_is_cann(
     ggml_backend_buffer_t buffer) {
     return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
 }
@@ -589,7 +589,7 @@ GGML_CALL static bool ggml_backend_buffer_is_cann(
  *
  * @param buffer The CANN buffer to free.
  */
-GGML_CALL static void ggml_backend_cann_buffer_free_buffer(
+static void ggml_backend_cann_buffer_free_buffer(
     ggml_backend_buffer_t buffer) {
     ggml_backend_cann_buffer_context* ctx =
         (ggml_backend_cann_buffer_context*)buffer->context;
@@ -605,7 +605,7 @@ GGML_CALL static void ggml_backend_cann_buffer_free_buffer(
  * @param buffer The CANN buffer whose base pointer is to be retrieved.
  * @return A pointer to the base of the device memory allocated for the buffer.
  */
-GGML_CALL static void* ggml_backend_cann_buffer_get_base(
+static void* ggml_backend_cann_buffer_get_base(
     ggml_backend_buffer_t buffer) {
     ggml_backend_cann_buffer_context* ctx =
         (ggml_backend_cann_buffer_context*)buffer->context;
@@ -625,9 +625,9 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
  * @param dst Pointer to the destination buffer where transformed data will be
  * stored.
  */
-GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
-                                                       const void* src,
-                                                       void* dst) {
+static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
+                                             const void* src,
+                                             void* dst) {
 
     int64_t n_elems = ggml_nelements(tensor);
     int64_t groups = n_elems / QK4_0;
@@ -677,7 +677,7 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
  * @param dst Pointer to the destination buffer where the Q4.0 formatted data
  * will be stored.
  */
-GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
+static void ggml_backend_cann_transform_back_q4_0(
     const ggml_tensor* tensor, void* src, void* dst) {
 
     int64_t n_elems = ggml_nelements(tensor);
@@ -726,9 +726,9 @@ GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
  * @param dst Pointer to the destination buffer where transformed data will be
  * stored.
  */
-GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
-                                                       const void* src,
-                                                       void* dst) {
+static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
+                                             const void* src,
+                                             void* dst) {
     int64_t n_elems = ggml_nelements(tensor);
     int64_t groups = n_elems / QK8_0;
     size_t quant_bytes = n_elems * sizeof(uint8_t);
@@ -760,7 +760,7 @@ GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
  * @param dst Pointer to the destination buffer where the Q8.0 formatted data
  * will be stored.
  */
-GGML_CALL static void ggml_backend_cann_transform_back_q8_0(
+static void ggml_backend_cann_transform_back_q8_0(
     const ggml_tensor* tensor, const void* src, void* dst) {
     int64_t n_elems = ggml_nelements(tensor);
     int64_t groups = n_elems / QK8_0;
@@ -792,8 +792,8 @@ GGML_CALL static void ggml_backend_cann_transform_back_q8_0(
  * @param dst Pointer to the destination buffer where transformed data will be
  * stored.
  */
-GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor,
-                                                  const void* src, void* dst) {
+static void ggml_backend_cann_transform(ggml_tensor* tensor,
+                                        const void* src, void* dst) {
     switch (tensor->type) {
         case GGML_TYPE_Q4_0:
             ggml_backend_cann_transform_q4_0(tensor, src, dst);
@@ -818,7 +818,7 @@ GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor,
  * @param dst Pointer to the destination buffer where transformed tensor data
  * will be stored.
  */
-GGML_CALL static void ggml_backend_cann_transform_back(
+static void ggml_backend_cann_transform_back(
     const ggml_tensor* tensor, void* src, void* dst) {
     switch (tensor->type) {
         case GGML_TYPE_Q4_0:
@@ -841,7 +841,7 @@ GGML_CALL static void ggml_backend_cann_transform_back(
  * @param type The tensor type to check.
  * @return true if transformation is needed, false otherwise.
  */
-GGML_CALL static bool need_transform(ggml_type type) {
+static bool need_transform(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q8_0:
@@ -860,7 +860,7 @@ GGML_CALL static bool need_transform(ggml_type type) {
  * @param buffer The CANN buffer from which to initialize the tensor.
  * @param tensor Pointer to the tensor to be initialized.
  */
-GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
+static void ggml_backend_cann_buffer_init_tensor(
     ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
@@ -896,7 +896,7 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
  * @param offset Offset in the source data from where to start copying.
  * @param size Size of the data to be copied, in bytes.
  */
-GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
+static void ggml_backend_cann_buffer_set_tensor(
     ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
     size_t offset, size_t size) {
     ggml_backend_cann_buffer_context *ctx =
@@ -941,7 +941,7 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
  * @param offset Offset in the destination buffer where to start copying.
  * @param size Size of the data to be copied, in bytes.
  */
-GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
+static void ggml_backend_cann_buffer_get_tensor(
     ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
     size_t offset, size_t size) {
     ggml_backend_cann_buffer_context* ctx =
@@ -975,7 +975,7 @@ GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
  * @param dst Pointer to the destination tensor where the data will be copied.
  * @return true if the copy operation succeeded, false otherwise.
  */
-GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor(
+static bool ggml_backend_cann_buffer_cpy_tensor(
     ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
     if (ggml_backend_buffer_is_cann(src->buffer)) {
         ggml_backend_cann_buffer_context* src_ctx =
@@ -1017,7 +1017,7 @@ GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor(
  * @param buffer The CANN buffer to be cleared.
  * @param value The value to which each byte in the buffer will be set.
  */
-GGML_CALL static void ggml_backend_cann_buffer_clear(
+static void ggml_backend_cann_buffer_clear(
     ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_cann_buffer_context* ctx =
         (ggml_backend_cann_buffer_context*)buffer->context;
@@ -1065,7 +1065,7 @@ struct ggml_backend_cann_buffer_type_context {
  * @param buft Pointer to the buffer type context.
  * @return Const pointer to the C-style string containing the name.
  */
-GGML_CALL static const char* ggml_backend_cann_buffer_type_name(
+static const char* ggml_backend_cann_buffer_type_name(
     ggml_backend_buffer_type_t buft) {
     return "CANN";
@@ -1082,7 +1082,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_type_name(
  * @param size Size in bytes of the buffer to allocate.
  * @return Pointer to the allocated buffer, or nullptr if allocation fails.
  */
-GGML_CALL static ggml_backend_buffer_t
+static ggml_backend_buffer_t
 ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
                                            size_t size) {
     ggml_backend_cann_buffer_type_context* buft_ctx =
@@ -1121,7 +1121,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
  * @return The alignment requirement in bytes (fixed at 128 bytes for CANN
  * buffers).
  */
-GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment(
+static size_t ggml_backend_cann_buffer_type_get_alignment(
     ggml_backend_buffer_type_t buft) {
     return 128;
@@ -1142,7 +1142,7 @@ GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment(
  * @return The total allocation size in bytes required for the tensor in the
  * CANN buffer.
  */
-GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alloc_size(
+static size_t ggml_backend_cann_buffer_type_get_alloc_size(
     ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
     size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];
@@ -1193,7 +1193,7 @@ static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
  * @return A pointer to the buffer type interface for the specified device, or
  * nullptr if the device index is out of range.
  */
-GGML_CALL ggml_backend_buffer_type_t
+ggml_backend_buffer_type_t
 ggml_backend_cann_buffer_type(int32_t device) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
@@ -1231,7 +1231,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
  * @param buft Pointer to the host buffer type context.
  * @return Const pointer to the C-style string containing the name.
  */
-GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
     return "CANN_Host";
 
     GGML_UNUSED(buft);
@@ -1246,7 +1246,7 @@ GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backe
  * @param buft Pointer to the host buffer context.
  * @return Const pointer to the C-style string containing the name.
  */
-GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
+static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
     return "CANN_Host";
 
     GGML_UNUSED(buffer);
@@ -1260,7 +1260,7 @@ GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_bu
  *
  * @param buffer The CANN host buffer to free.
  */
-GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
+static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
     ACL_CHECK(aclrtFreeHost(buffer->context));
 }
 
@@ -1294,7 +1294,7 @@ static void * ggml_cann_host_malloc(size_t size) {
  * @param size Size in bytes of the host buffer to allocate.
  * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
  */
-GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     void * hostPtr = ggml_cann_host_malloc(size);
 
     if (hostPtr == nullptr) {
@@ -1316,7 +1316,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_
  * Provides function pointers for allocating, querying properties, and managing
  * memory for CANN buffer types in the GGML backend.
  */
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
+ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
         /* .iface    = */ {
             /* .get_name         = */ ggml_backend_cann_host_buffer_type_name,
@@ -1326,6 +1326,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
             /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
             /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
         },
+        /* .device  = */ nullptr,
         /* .context = */ nullptr,
     };
 
@@ -1495,7 +1496,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
  * @param backend Pointer to the CANN backend structure.
  * @return A pointer to a constant string representing the backend name.
  */
-GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
+static const char* ggml_backend_cann_name(ggml_backend_t backend) {
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
 
@@ -1510,7 +1511,7 @@ GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
  *
  * @param backend Pointer to the CANN backend structure to be freed.
  */
-GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
+static void ggml_backend_cann_free(ggml_backend_t backend) {
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
     ACL_CHECK(aclrtSynchronizeDevice());
@@ -1535,7 +1536,7 @@ GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
  * @param backend Pointer to the CANN backend structure.
  * @return Pointer to the buffer type structure for the CANN backend.
  */
-GGML_CALL static ggml_backend_buffer_type_t
+static ggml_backend_buffer_type_t
 ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
@@ -1556,11 +1557,11 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
  * @param offset Offset in bytes within the host data.
  * @param size Size of the data to copy in bytes.
  */
-GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
-                                                         ggml_tensor *tensor,
-                                                         const void *data,
-                                                         size_t offset,
-                                                         size_t size) {
+static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
+                                               ggml_tensor *tensor,
+                                               const void *data,
+                                               size_t offset,
+                                               size_t size) {
     ggml_backend_cann_context *cann_ctx =
         (ggml_backend_cann_context *)backend->context;
 
@@ -1587,7 +1588,7 @@ GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
     }
 }
 
-GGML_CALL static void ggml_backend_cann_get_tensor_async(
+static void ggml_backend_cann_get_tensor_async(
     ggml_backend_t backend, const ggml_tensor *tensor, void *data,
     size_t offset, size_t size) {
     ggml_backend_cann_context *cann_ctx =
@@ -1626,7 +1627,7 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async(
  * @param dst Pointer to the destination tensor to copy data to.
  * @return true if the copy operation succeeds, false otherwise.
  */
-GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
+static bool ggml_backend_cann_cpy_tensor_async(
     ggml_backend_t backend_src, ggml_backend_t backend_dst,
     const ggml_tensor* src, ggml_tensor* dst) {
     GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
@@ -1694,7 +1695,7 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
  *
  * @param backend Pointer to the CANN backend structure to synchronize.
  */
-GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
+static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
 
@@ -1715,7 +1716,7 @@ GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
  * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
  * completes successfully, otherwise an appropriate error status.
  */
-GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
+static enum ggml_status ggml_backend_cann_graph_compute(
     ggml_backend_t backend, ggml_cgraph* cgraph) {
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
@@ -1753,7 +1754,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
  * @return bool Returns true if the operation is supported by the backend,
  * otherwise false.
  */
-GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
+static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
                                           const ggml_tensor* op) {
     switch (op->op) {
         case GGML_OP_UNARY:
@@ -1875,7 +1876,7 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
  * @return bool Returns true if the CANN backend supports the buffer type,
  * otherwise false.
  */
-GGML_CALL static bool ggml_backend_cann_supports_buft(
+static bool ggml_backend_cann_supports_buft(
     ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
     if (ggml_backend_buft_is_cann(buft)) {
         ggml_backend_cann_context * cann_ctx =
@@ -1901,7 +1902,7 @@ GGML_CALL static bool ggml_backend_cann_supports_buft(
  * @return bool Returns true if the operation should be offloaded, otherwise
  * false.
  */
-GGML_CALL static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
+static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
                                          const ggml_tensor* op) {
     const int min_batch_size = 32;
     GGML_UNUSED(backend);
@@ -2021,11 +2022,8 @@ static ggml_backend_i ggml_backend_cann_interface = {
     /* .supports_op          = */ ggml_backend_cann_supports_op,
     /* .supports_buft        = */ ggml_backend_cann_supports_buft,
     /* .offload_op           = */ ggml_backend_cann_offload_op,
-    /* .event_new            = */ ggml_backend_cann_event_new,
-    /* .event_free           = */ ggml_backend_cann_event_free,
     /* .event_record         = */ ggml_backend_cann_event_record,
     /* .event_wait           = */ ggml_backend_cann_event_wait,
-    /* .event_synchronize    = */ ggml_backend_cann_event_synchronize,
 };
 
 /**
@@ -2042,7 +2040,7 @@ static ggml_guid_t ggml_backend_cann_guid() {
     return &guid;
 }
 
-GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
+ggml_backend_t ggml_backend_cann_init(int32_t device) {
     aclInit(nullptr);
     if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
         GGML_CANN_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
@@ -2058,75 +2056,30 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
     ggml_backend_t cann_backend =
         new ggml_backend{/* .guid      = */ ggml_backend_cann_guid(),
                          /* .interface = */ ggml_backend_cann_interface,
+                         /* .device    = */ nullptr,
                          /* .context   = */ ctx};
 
     return cann_backend;
 }
 
-GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend) {
+bool ggml_backend_is_cann(ggml_backend_t backend) {
     return backend != NULL &&
            ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
 }
 
-GGML_CALL int32_t ggml_backend_cann_get_device_count() {
+int32_t ggml_backend_cann_get_device_count() {
     return ggml_cann_info().device_count;
 }
 
-GGML_CALL void ggml_backend_cann_get_device_description(
+void ggml_backend_cann_get_device_description(
     int32_t device, char* description, size_t description_size) {
     ggml_cann_set_device(device);
     const char* soc_name = aclrtGetSocName();
     snprintf(description, description_size, "%s", soc_name);
 }
 
-GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
-                                                   size_t* total) {
+void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
+                                         size_t* total) {
     ggml_cann_set_device(device);
     ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
 }
-
-// backend registry
-/**
- * @brief Initializes a CANN backend based on the provided parameters.
- *
- * This function initializes a CANN backend using the device index and then
- * initializes the backend using `ggml_backend_cann_init`.
- *
- * @param params Parameters for initialization (unused in this implementation).
- * @param user_data User data containing the device index to initialize the
- * backend.
- * @return ggml_backend_t The initialized CANN backend.
- */
-GGML_CALL static ggml_backend_t ggml_backend_reg_cann_init(const char* params,
-                                                           void* user_data) {
-    ggml_backend_t cann_backend =
-        ggml_backend_cann_init((int)(intptr_t)user_data);
-    return cann_backend;
-
-    GGML_UNUSED(params);
-}
-
-extern "C" GGML_CALL int ggml_backend_cann_reg_devices();
-
-/**
- * @brief Registers CANN (Ascend) devices as backend options.
- *
- * This function initializes ACL, retrieves the number of available CANN
- * devices, and registers each device as a backend option using
- * `ggml_backend_register`. Each device is given a unique name based on
- * `GGML_CANN_NAME` followed by its index.
- *
- * @return int The number of CANN devices registered.
- */
-GGML_CALL int ggml_backend_cann_reg_devices() {
-    uint32_t device_count = ggml_backend_cann_get_device_count();
-    // initialization
-    for (uint32_t i = 0; i < device_count; i++) {
-        char name[128];
-        snprintf(name, sizeof(name), "CANN%d", i);
-        ggml_backend_register(name, ggml_backend_reg_cann_init,
-                              ggml_backend_cann_buffer_type(i),
-                              (void*)(intptr_t)i);
-    }
-    return device_count;
-}
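A short usage sketch for the CANN device helpers kept above; it assumes at least one Ascend device is visible and is illustrative only.

    int32_t n_dev = ggml_backend_cann_get_device_count();
    for (int32_t i = 0; i < n_dev; i++) {
        char   desc[128];
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_cann_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cann_get_device_memory(i, &free_mem, &total_mem);
        printf("CANN%d: %s (%zu of %zu bytes free)\n", i, desc, free_mem, total_mem);
    }
    ggml_backend_t cann = (n_dev > 0) ? ggml_backend_cann_init(0) : NULL;
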
ggml/src/ggml-cuda.cu
CHANGED

@@ -101,11 +101,11 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);
 
-    GGML_CUDA_LOG_ERROR("
     GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
     GGML_CUDA_LOG_ERROR("  %s\n", stmt);
-    // abort with
-    GGML_ABORT("
 }
 
 // this is faster on Windows
@@ -329,7 +329,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
             return;
         }
     }
-    GGML_CUDA_LOG_WARN("
     ggml_cuda_set_device(device);
     CUDA_CHECK(cudaFree(ptr));
     pool_size -= size;
@@ -459,26 +459,26 @@ struct ggml_backend_cuda_buffer_context {
     }
 };
 
-
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     return ctx->name.c_str();
 }
 
-
     return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
 }
 
-
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     delete ctx;
 }
 
-
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     return ctx->dev_ptr;
 }
 
-
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     if (tensor->view_src != NULL) {
@@ -498,7 +498,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
     }
 }
 
-
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
@@ -506,7 +506,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
-
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
@@ -514,7 +514,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
-
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
@@ -522,7 +522,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
-
     if (ggml_backend_buffer_is_cuda(src->buffer)) {
         ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
         ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
@@ -543,7 +543,7 @@ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t
     GGML_UNUSED(buffer);
 }
 
-
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
@@ -552,7 +552,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffe
     CUDA_CHECK(cudaDeviceSynchronize());
 }
 
-static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
     /* .get_name        = */ ggml_backend_cuda_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
@@ -571,17 +571,17 @@ struct ggml_backend_cuda_buffer_type_context {
     std::string name;
 };
 
-
     ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 
     return ctx->name.c_str();
 }
 
 static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name ==
 }
 
-
     ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 
     ggml_cuda_set_device(buft_ctx->device);
@@ -602,13 +602,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
     return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
 }
 
-
     return 128;
 
     GGML_UNUSED(buft);
 }
 
-
     size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];
 
@@ -623,8 +623,8 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
     GGML_UNUSED(buft);
 }
 
-static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
-    /* .get_name         = */
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
@@ -632,7 +632,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .is_host          = */ NULL,
 };
 
-
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
 
@@ -645,9 +645,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
     static bool ggml_backend_cuda_buffer_type_initialized = false;
 
     if (!ggml_backend_cuda_buffer_type_initialized) {
-        for (int i = 0; i <
            ggml_backend_cuda_buffer_types[i] = {
                /* .iface   = */ ggml_backend_cuda_buffer_type_interface,
                /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
            };
        }
@@ -717,7 +718,7 @@ struct ggml_backend_cuda_split_buffer_context {
     std::vector<ggml_tensor_extra_gpu *> tensor_extras;
 };
 
-
     return GGML_CUDA_NAME "_Split";
 
     GGML_UNUSED(buffer);
@@ -728,19 +729,19 @@ static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
     GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
 }
 
-
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     delete ctx;
 }
 
-
     // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
return (void *)0x1000;
|
| 739 |
|
| 740 |
GGML_UNUSED(buffer);
|
| 741 |
}
|
| 742 |
|
| 743 |
-
|
| 744 |
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
| 745 |
|
| 746 |
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
|
@@ -788,7 +789,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
|
|
| 788 |
tensor->extra = extra;
|
| 789 |
}
|
| 790 |
|
| 791 |
-
|
| 792 |
// split tensors must always be set in their entirety at once
|
| 793 |
GGML_ASSERT(offset == 0);
|
| 794 |
GGML_ASSERT(size == ggml_nbytes(tensor));
|
|
@@ -826,7 +827,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf
|
|
| 826 |
}
|
| 827 |
}
|
| 828 |
|
| 829 |
-
|
| 830 |
// split tensors must always be set in their entirety at once
|
| 831 |
GGML_ASSERT(offset == 0);
|
| 832 |
GGML_ASSERT(size == ggml_nbytes(tensor));
|
|
@@ -864,12 +865,12 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf
|
|
| 864 |
}
|
| 865 |
}
|
| 866 |
|
| 867 |
-
|
| 868 |
GGML_UNUSED(buffer);
|
| 869 |
GGML_UNUSED(value);
|
| 870 |
}
|
| 871 |
|
| 872 |
-
static
|
| 873 |
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
|
| 874 |
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
|
| 875 |
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
|
|
@@ -884,17 +885,17 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
|
| 884 |
|
| 885 |
// cuda split buffer type
|
| 886 |
|
| 887 |
-
|
| 888 |
return GGML_CUDA_NAME "_Split";
|
| 889 |
|
| 890 |
GGML_UNUSED(buft);
|
| 891 |
}
|
| 892 |
|
| 893 |
static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
|
| 894 |
-
return buft->iface.get_name ==
|
| 895 |
}
|
| 896 |
|
| 897 |
-
|
| 898 |
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
| 899 |
// instead, we allocate them for each tensor separately in init_tensor
|
| 900 |
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
|
|
@@ -904,13 +905,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc
|
|
| 904 |
return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
|
| 905 |
}
|
| 906 |
|
| 907 |
-
|
| 908 |
return 128;
|
| 909 |
|
| 910 |
GGML_UNUSED(buft);
|
| 911 |
}
|
| 912 |
|
| 913 |
-
|
| 914 |
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
| 915 |
|
| 916 |
size_t total_size = 0;
|
|
@@ -937,14 +938,14 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
|
|
| 937 |
return total_size;
|
| 938 |
}
|
| 939 |
|
| 940 |
-
|
| 941 |
return false;
|
| 942 |
|
| 943 |
GGML_UNUSED(buft);
|
| 944 |
}
|
| 945 |
|
| 946 |
-
static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
|
| 947 |
-
/* .get_name = */
|
| 948 |
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
|
| 949 |
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
|
| 950 |
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
@@ -952,7 +953,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
|
|
| 952 |
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
| 953 |
};
|
| 954 |
|
| 955 |
-
|
| 956 |
static std::mutex mutex;
|
| 957 |
std::lock_guard<std::mutex> lock(mutex);
|
| 958 |
|
|
@@ -981,6 +982,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const f
|
|
| 981 |
|
| 982 |
struct ggml_backend_buffer_type buft {
|
| 983 |
/* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
|
|
|
|
| 984 |
/* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
|
| 985 |
};
|
| 986 |
|
|
@@ -990,19 +992,19 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const f
|
|
| 990 |
|
| 991 |
// host buffer type
|
| 992 |
|
| 993 |
-
|
| 994 |
return GGML_CUDA_NAME "_Host";
|
| 995 |
|
| 996 |
GGML_UNUSED(buft);
|
| 997 |
}
|
| 998 |
|
| 999 |
-
|
| 1000 |
return GGML_CUDA_NAME "_Host";
|
| 1001 |
|
| 1002 |
GGML_UNUSED(buffer);
|
| 1003 |
}
|
| 1004 |
|
| 1005 |
-
|
| 1006 |
CUDA_CHECK(cudaFreeHost(buffer->context));
|
| 1007 |
}
|
| 1008 |
|
|
@@ -1024,7 +1026,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
|
|
| 1024 |
return ptr;
|
| 1025 |
}
|
| 1026 |
|
| 1027 |
-
|
| 1028 |
void * ptr = ggml_cuda_host_malloc(size);
|
| 1029 |
|
| 1030 |
if (ptr == nullptr) {
|
|
@@ -1040,7 +1042,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_
|
|
| 1040 |
return buffer;
|
| 1041 |
}
|
| 1042 |
|
| 1043 |
-
|
| 1044 |
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
| 1045 |
/* .iface = */ {
|
| 1046 |
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
|
|
@@ -1050,6 +1052,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|
| 1050 |
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
| 1051 |
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
| 1052 |
},
|
|
|
|
| 1053 |
/* .context = */ nullptr,
|
| 1054 |
};
|
| 1055 |
|
|
@@ -2383,26 +2386,26 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|
| 2383 |
|
| 2384 |
// backend
|
| 2385 |
|
| 2386 |
-
|
| 2387 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2388 |
|
| 2389 |
return cuda_ctx->name.c_str();
|
| 2390 |
}
|
| 2391 |
|
| 2392 |
-
|
| 2393 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2394 |
|
| 2395 |
delete cuda_ctx;
|
| 2396 |
delete backend;
|
| 2397 |
}
|
| 2398 |
|
| 2399 |
-
|
| 2400 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2401 |
|
| 2402 |
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
| 2403 |
}
|
| 2404 |
|
| 2405 |
-
|
| 2406 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2407 |
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 2408 |
|
|
@@ -2411,7 +2414,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
|
|
| 2411 |
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
|
| 2412 |
}
|
| 2413 |
|
| 2414 |
-
|
| 2415 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2416 |
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 2417 |
|
|
@@ -2420,7 +2423,7 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
|
|
| 2420 |
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
|
| 2421 |
}
|
| 2422 |
|
| 2423 |
-
|
| 2424 |
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
| 2425 |
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
| 2426 |
|
|
@@ -2475,7 +2478,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
|
| 2475 |
return true;
|
| 2476 |
}
|
| 2477 |
|
| 2478 |
-
|
| 2479 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2480 |
|
| 2481 |
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
|
|
@@ -2534,7 +2537,7 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
|
|
| 2534 |
return true;
|
| 2535 |
}
|
| 2536 |
|
| 2537 |
-
|
| 2538 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2539 |
|
| 2540 |
ggml_cuda_set_device(cuda_ctx->device);
|
|
@@ -2806,8 +2809,187 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|
| 2806 |
return GGML_STATUS_SUCCESS;
|
| 2807 |
}
|
| 2808 |
|
| 2809 |
-
|
| 2810 |
-
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2811 |
switch (op->op) {
|
| 2812 |
case GGML_OP_UNARY:
|
| 2813 |
switch (ggml_get_unary_op(op)) {
|
|
@@ -3021,7 +3203,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
| 3021 |
if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
|
| 3022 |
return true;
|
| 3023 |
}
|
| 3024 |
-
const int cc = ggml_cuda_info().devices[
|
| 3025 |
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
| 3026 |
}
|
| 3027 |
case GGML_OP_CROSS_ENTROPY_LOSS:
|
|
@@ -3031,115 +3213,170 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
| 3031 |
default:
|
| 3032 |
return false;
|
| 3033 |
}
|
| 3034 |
-
|
| 3035 |
-
GGML_UNUSED(backend);
|
| 3036 |
}
|
| 3037 |
|
| 3038 |
-
|
| 3039 |
if (ggml_backend_buft_is_cuda_split(buft)) {
|
| 3040 |
return true;
|
| 3041 |
}
|
| 3042 |
|
| 3043 |
if (ggml_backend_buft_is_cuda(buft)) {
|
| 3044 |
-
|
| 3045 |
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
| 3046 |
-
return buft_ctx->device ==
|
| 3047 |
}
|
| 3048 |
|
| 3049 |
return false;
|
| 3050 |
}
|
| 3051 |
|
| 3052 |
-
|
| 3053 |
const int min_batch_size = 32;
|
| 3054 |
|
| 3055 |
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
| 3056 |
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
| 3057 |
|
| 3058 |
-
GGML_UNUSED(
|
| 3059 |
}
|
| 3060 |
|
| 3061 |
-
static ggml_backend_event_t
|
| 3062 |
#ifdef GGML_CUDA_NO_PEER_COPY
|
| 3063 |
return nullptr;
|
| 3064 |
#else
|
| 3065 |
-
|
| 3066 |
|
| 3067 |
-
ggml_cuda_set_device(
|
| 3068 |
|
| 3069 |
cudaEvent_t event;
|
| 3070 |
CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
|
| 3071 |
|
| 3072 |
return new ggml_backend_event {
|
| 3073 |
-
/* .
|
| 3074 |
/* .context = */ event,
|
| 3075 |
};
|
| 3076 |
#endif
|
| 3077 |
}
|
| 3078 |
|
| 3079 |
-
static void
|
| 3080 |
-
|
| 3081 |
|
|
|
|
| 3082 |
delete event;
|
| 3083 |
}
|
| 3084 |
|
| 3085 |
-
static void
|
| 3086 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3087 |
|
| 3088 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3089 |
}
|
| 3090 |
|
| 3091 |
-
static
|
| 3092 |
-
|
|
|
|
|
|
|
| 3093 |
|
| 3094 |
-
|
| 3095 |
-
|
| 3096 |
-
|
| 3097 |
-
|
| 3098 |
-
|
| 3099 |
-
auto wait_fn = [](void * user_data) {
|
| 3100 |
-
ggml_backend_event_t event = (ggml_backend_event_t)user_data;
|
| 3101 |
-
ggml_backend_event_synchronize(event);
|
| 3102 |
-
};
|
| 3103 |
|
| 3104 |
-
|
| 3105 |
-
|
| 3106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3107 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3108 |
}
|
| 3109 |
|
| 3110 |
-
static void
|
| 3111 |
-
|
|
|
|
| 3112 |
}
|
| 3113 |
|
| 3114 |
-
static
|
| 3115 |
-
/* .get_name
|
| 3116 |
-
/* .
|
| 3117 |
-
/* .
|
| 3118 |
-
/* .
|
| 3119 |
-
/* .
|
| 3120 |
-
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
|
| 3121 |
-
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
| 3122 |
-
/* .graph_plan_create = */ NULL,
|
| 3123 |
-
/* .graph_plan_free = */ NULL,
|
| 3124 |
-
/* .graph_plan_update = */ NULL,
|
| 3125 |
-
/* .graph_plan_compute = */ NULL,
|
| 3126 |
-
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
| 3127 |
-
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
| 3128 |
-
/* .supports_buft = */ ggml_backend_cuda_supports_buft,
|
| 3129 |
-
/* .offload_op = */ ggml_backend_cuda_offload_op,
|
| 3130 |
-
/* .event_new = */ ggml_backend_cuda_event_new,
|
| 3131 |
-
/* .event_free = */ ggml_backend_cuda_event_free,
|
| 3132 |
-
/* .event_record = */ ggml_backend_cuda_event_record,
|
| 3133 |
-
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
| 3134 |
-
/* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
|
| 3135 |
};
|
| 3136 |
|
| 3137 |
-
|
| 3138 |
-
|
| 3139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3140 |
}
|
| 3141 |
|
| 3142 |
-
|
| 3143 |
if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
|
| 3144 |
GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
|
| 3145 |
return nullptr;
|
|
@@ -3154,82 +3391,9 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
|
|
| 3154 |
ggml_backend_t cuda_backend = new ggml_backend {
|
| 3155 |
/* .guid = */ ggml_backend_cuda_guid(),
|
| 3156 |
/* .interface = */ ggml_backend_cuda_interface,
|
| 3157 |
-
/* .
|
|
|
|
| 3158 |
};
|
| 3159 |
|
| 3160 |
return cuda_backend;
|
| 3161 |
}
|
| 3162 |
-
|
| 3163 |
-
GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
| 3164 |
-
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
|
| 3165 |
-
}
|
| 3166 |
-
|
| 3167 |
-
GGML_CALL int ggml_backend_cuda_get_device_count() {
|
| 3168 |
-
return ggml_cuda_info().device_count;
|
| 3169 |
-
}
|
| 3170 |
-
|
| 3171 |
-
GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
| 3172 |
-
cudaDeviceProp prop;
|
| 3173 |
-
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
| 3174 |
-
snprintf(description, description_size, "%s", prop.name);
|
| 3175 |
-
}
|
| 3176 |
-
|
| 3177 |
-
GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
| 3178 |
-
ggml_cuda_set_device(device);
|
| 3179 |
-
|
| 3180 |
-
CUDA_CHECK(cudaMemGetInfo(free, total));
|
| 3181 |
-
}
|
| 3182 |
-
|
| 3183 |
-
GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
|
| 3184 |
-
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
|
| 3185 |
-
return false;
|
| 3186 |
-
}
|
| 3187 |
-
|
| 3188 |
-
#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
|
| 3189 |
-
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
|
| 3190 |
-
if (err != cudaSuccess) {
|
| 3191 |
-
// clear the error
|
| 3192 |
-
cudaGetLastError();
|
| 3193 |
-
|
| 3194 |
-
GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
|
| 3195 |
-
size / 1024.0 / 1024.0, cudaGetErrorString(err));
|
| 3196 |
-
return false;
|
| 3197 |
-
}
|
| 3198 |
-
return true;
|
| 3199 |
-
#else
|
| 3200 |
-
return false;
|
| 3201 |
-
#endif
|
| 3202 |
-
}
|
| 3203 |
-
|
| 3204 |
-
GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
|
| 3205 |
-
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
|
| 3206 |
-
return;
|
| 3207 |
-
}
|
| 3208 |
-
|
| 3209 |
-
cudaError_t err = cudaHostUnregister(buffer);
|
| 3210 |
-
if (err != cudaSuccess) {
|
| 3211 |
-
// clear the error
|
| 3212 |
-
cudaGetLastError();
|
| 3213 |
-
}
|
| 3214 |
-
}
|
| 3215 |
-
|
| 3216 |
-
// backend registry
|
| 3217 |
-
GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
| 3218 |
-
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
|
| 3219 |
-
return cuda_backend;
|
| 3220 |
-
|
| 3221 |
-
GGML_UNUSED(params);
|
| 3222 |
-
}
|
| 3223 |
-
|
| 3224 |
-
extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
|
| 3225 |
-
|
| 3226 |
-
GGML_CALL int ggml_backend_cuda_reg_devices() {
|
| 3227 |
-
int device_count = ggml_backend_cuda_get_device_count();
|
| 3228 |
-
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
|
| 3229 |
-
for (int i = 0; i < device_count; i++) {
|
| 3230 |
-
char name[128];
|
| 3231 |
-
snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
|
| 3232 |
-
ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
|
| 3233 |
-
}
|
| 3234 |
-
return device_count;
|
| 3235 |
-
}
|
|
|
|
| 101 |
int id = -1; // in case cudaGetDevice fails
|
| 102 |
cudaGetDevice(&id);
|
| 103 |
|
| 104 |
+
GGML_CUDA_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
|
| 105 |
GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
|
| 106 |
GGML_CUDA_LOG_ERROR(" %s\n", stmt);
|
| 107 |
+
// abort with GGML_ABORT to get a stack trace
|
| 108 |
+
GGML_ABORT(GGML_CUDA_NAME " error");
|
| 109 |
}
|
| 110 |
|
| 111 |
// this is faster on Windows
|
|
|
|
| 329 |
return;
|
| 330 |
}
|
| 331 |
}
|
| 332 |
+
GGML_CUDA_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
| 333 |
ggml_cuda_set_device(device);
|
| 334 |
CUDA_CHECK(cudaFree(ptr));
|
| 335 |
pool_size -= size;
|
|
|
|
| 459 |
}
|
| 460 |
};
|
| 461 |
|
| 462 |
+
static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 463 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 464 |
return ctx->name.c_str();
|
| 465 |
}
|
| 466 |
|
| 467 |
+
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
| 468 |
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
|
| 469 |
}
|
| 470 |
|
| 471 |
+
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 472 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 473 |
delete ctx;
|
| 474 |
}
|
| 475 |
|
| 476 |
+
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 477 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 478 |
return ctx->dev_ptr;
|
| 479 |
}
|
| 480 |
|
| 481 |
+
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
| 482 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 483 |
|
| 484 |
if (tensor->view_src != NULL) {
|
|
|
|
| 498 |
}
|
| 499 |
}
|
| 500 |
|
| 501 |
+
static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
| 502 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 503 |
|
| 504 |
ggml_cuda_set_device(ctx->device);
|
|
|
|
| 506 |
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
| 507 |
}
|
| 508 |
|
| 509 |
+
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 510 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 511 |
|
| 512 |
ggml_cuda_set_device(ctx->device);
|
|
|
|
| 514 |
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
| 515 |
}
|
| 516 |
|
| 517 |
+
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 518 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 519 |
|
| 520 |
ggml_cuda_set_device(ctx->device);
|
|
|
|
| 522 |
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
| 523 |
}
|
| 524 |
|
| 525 |
+
static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
| 526 |
if (ggml_backend_buffer_is_cuda(src->buffer)) {
|
| 527 |
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
|
| 528 |
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
|
|
|
|
| 543 |
GGML_UNUSED(buffer);
|
| 544 |
}
|
| 545 |
|
| 546 |
+
static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 547 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 548 |
|
| 549 |
ggml_cuda_set_device(ctx->device);
|
|
|
|
| 552 |
CUDA_CHECK(cudaDeviceSynchronize());
|
| 553 |
}
|
| 554 |
|
| 555 |
+
static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
| 556 |
/* .get_name = */ ggml_backend_cuda_buffer_get_name,
|
| 557 |
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
| 558 |
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
|
|
|
| 571 |
std::string name;
|
| 572 |
};
|
| 573 |
|
| 574 |
+
static const char * ggml_backend_cuda_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
| 575 |
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
| 576 |
|
| 577 |
return ctx->name.c_str();
|
| 578 |
}
|
| 579 |
|
| 580 |
static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
|
| 581 |
+
return buft->iface.get_name == ggml_backend_cuda_buffer_type_get_name;
|
| 582 |
}
|
| 583 |
|
| 584 |
+
static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 585 |
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
| 586 |
|
| 587 |
ggml_cuda_set_device(buft_ctx->device);
|
|
|
|
| 602 |
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
| 603 |
}
|
| 604 |
|
| 605 |
+
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 606 |
return 128;
|
| 607 |
|
| 608 |
GGML_UNUSED(buft);
|
| 609 |
}
|
| 610 |
|
| 611 |
+
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
| 612 |
size_t size = ggml_nbytes(tensor);
|
| 613 |
int64_t ne0 = tensor->ne[0];
|
| 614 |
|
|
|
|
| 623 |
GGML_UNUSED(buft);
|
| 624 |
}
|
| 625 |
|
| 626 |
+
static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
| 627 |
+
/* .get_name = */ ggml_backend_cuda_buffer_type_get_name,
|
| 628 |
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
| 629 |
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
| 630 |
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
|
|
| 632 |
/* .is_host = */ NULL,
|
| 633 |
};
|
| 634 |
|
| 635 |
+
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
| 636 |
static std::mutex mutex;
|
| 637 |
std::lock_guard<std::mutex> lock(mutex);
|
| 638 |
|
|
|
|
| 645 |
static bool ggml_backend_cuda_buffer_type_initialized = false;
|
| 646 |
|
| 647 |
if (!ggml_backend_cuda_buffer_type_initialized) {
|
| 648 |
+
for (int i = 0; i < ggml_backend_cuda_get_device_count(); i++) {
|
| 649 |
ggml_backend_cuda_buffer_types[i] = {
|
| 650 |
/* .iface = */ ggml_backend_cuda_buffer_type_interface,
|
| 651 |
+
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), i),
|
| 652 |
/* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
|
| 653 |
};
|
| 654 |
}
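
Usage note for the hunk above: ggml_backend_cuda_buffer_type() now fills in the new .device field through the registry, but the way callers allocate from it is unchanged. A minimal sketch, not part of this commit, using only the public helpers from ggml-backend.h and ggml-cuda.h:

    #include "ggml-backend.h"
    #include "ggml-cuda.h"
    #include <cstdio>

    int main() {
        // each CUDA device exposes its own buffer type, created on demand
        ggml_backend_buffer_type_t buft = ggml_backend_cuda_buffer_type(0);

        // allocate 16 MiB of device memory through the generic buffer-type API
        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 16u * 1024 * 1024);
        if (buf == NULL) {
            fprintf(stderr, "allocation failed\n");
            return 1;
        }
        printf("%s: allocated %zu bytes\n", ggml_backend_buft_name(buft), ggml_backend_buffer_get_size(buf));

        ggml_backend_buffer_free(buf);
        return 0;
    }
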
|
|
|
|
| 718 |
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
| 719 |
};
|
| 720 |
|
| 721 |
+
static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 722 |
return GGML_CUDA_NAME "_Split";
|
| 723 |
|
| 724 |
GGML_UNUSED(buffer);
|
|
|
|
| 729 |
GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
|
| 730 |
}
|
| 731 |
|
| 732 |
+
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 733 |
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
| 734 |
delete ctx;
|
| 735 |
}
|
| 736 |
|
| 737 |
+
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 738 |
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
|
| 739 |
return (void *)0x1000;
|
| 740 |
|
| 741 |
GGML_UNUSED(buffer);
|
| 742 |
}
|
| 743 |
|
| 744 |
+
static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
| 745 |
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
| 746 |
|
| 747 |
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
|
|
|
| 789 |
tensor->extra = extra;
|
| 790 |
}
|
| 791 |
|
| 792 |
+
static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 793 |
// split tensors must always be set in their entirety at once
|
| 794 |
GGML_ASSERT(offset == 0);
|
| 795 |
GGML_ASSERT(size == ggml_nbytes(tensor));
|
|
|
|
| 827 |
}
|
| 828 |
}
|
| 829 |
|
| 830 |
+
static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 831 |
// split tensors must always be set in their entirety at once
|
| 832 |
GGML_ASSERT(offset == 0);
|
| 833 |
GGML_ASSERT(size == ggml_nbytes(tensor));
|
|
|
|
| 865 |
}
|
| 866 |
}
|
| 867 |
|
| 868 |
+
static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 869 |
GGML_UNUSED(buffer);
|
| 870 |
GGML_UNUSED(value);
|
| 871 |
}
|
| 872 |
|
| 873 |
+
static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
| 874 |
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
|
| 875 |
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
|
| 876 |
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
|
|
|
|
| 885 |
|
| 886 |
// cuda split buffer type
|
| 887 |
|
| 888 |
+
static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
| 889 |
return GGML_CUDA_NAME "_Split";
|
| 890 |
|
| 891 |
GGML_UNUSED(buft);
|
| 892 |
}
|
| 893 |
|
| 894 |
static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
|
| 895 |
+
return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_get_name;
|
| 896 |
}
|
| 897 |
|
| 898 |
+
static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 899 |
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
| 900 |
// instead, we allocate them for each tensor separately in init_tensor
|
| 901 |
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
|
|
|
|
| 905 |
return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
|
| 906 |
}
|
| 907 |
|
| 908 |
+
static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 909 |
return 128;
|
| 910 |
|
| 911 |
GGML_UNUSED(buft);
|
| 912 |
}
|
| 913 |
|
| 914 |
+
static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
| 915 |
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
| 916 |
|
| 917 |
size_t total_size = 0;
|
|
|
|
| 938 |
return total_size;
|
| 939 |
}
|
| 940 |
|
| 941 |
+
static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
| 942 |
return false;
|
| 943 |
|
| 944 |
GGML_UNUSED(buft);
|
| 945 |
}
|
| 946 |
|
| 947 |
+
static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
|
| 948 |
+
/* .get_name = */ ggml_backend_cuda_split_buffer_type_get_name,
|
| 949 |
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
|
| 950 |
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
|
| 951 |
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
|
|
| 953 |
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
| 954 |
};
|
| 955 |
|
| 956 |
+
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
|
| 957 |
static std::mutex mutex;
|
| 958 |
std::lock_guard<std::mutex> lock(mutex);
|
| 959 |
|
|
|
|
| 982 |
|
| 983 |
struct ggml_backend_buffer_type buft {
|
| 984 |
/* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
|
| 985 |
+
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
|
| 986 |
/* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
|
| 987 |
};
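
The split buffer type above is likewise tied to registry device 0. A short sketch of requesting it from application code; the all-zero tensor_split (sized for up to 16 devices here, an assumption of this sketch) lets the backend pick its default split:

    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    int main() {
        // one entry per device; all zeros = let the CUDA backend choose the split
        float tensor_split[16] = {0.0f};

        ggml_backend_buffer_type_t split_buft = ggml_backend_cuda_split_buffer_type(tensor_split);
        (void) split_buft; // typically handed to the model loader / scheduler for weight tensors
        return 0;
    }
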
|
| 988 |
|
|
|
|
| 992 |
|
| 993 |
// host buffer type
|
| 994 |
|
| 995 |
+
static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
| 996 |
return GGML_CUDA_NAME "_Host";
|
| 997 |
|
| 998 |
GGML_UNUSED(buft);
|
| 999 |
}
|
| 1000 |
|
| 1001 |
+
static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
| 1002 |
return GGML_CUDA_NAME "_Host";
|
| 1003 |
|
| 1004 |
GGML_UNUSED(buffer);
|
| 1005 |
}
|
| 1006 |
|
| 1007 |
+
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 1008 |
CUDA_CHECK(cudaFreeHost(buffer->context));
|
| 1009 |
}
|
| 1010 |
|
|
|
|
| 1026 |
return ptr;
|
| 1027 |
}
|
| 1028 |
|
| 1029 |
+
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 1030 |
void * ptr = ggml_cuda_host_malloc(size);
|
| 1031 |
|
| 1032 |
if (ptr == nullptr) {
|
|
|
|
| 1042 |
return buffer;
|
| 1043 |
}
|
| 1044 |
|
| 1045 |
+
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
| 1046 |
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
| 1047 |
/* .iface = */ {
|
| 1048 |
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
|
|
|
|
| 1052 |
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
| 1053 |
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
| 1054 |
},
|
| 1055 |
+
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
|
| 1056 |
/* .context = */ nullptr,
|
| 1057 |
};
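
The pinned host buffer type also gains a .device entry pointing at registry device 0. A sketch of allocating pinned staging memory through it, not part of the patch and using public API only:

    #include "ggml-backend.h"
    #include "ggml-cuda.h"
    #include <cstring>

    int main() {
        // pinned (page-locked) host memory; async copies to/from it can overlap with compute
        ggml_backend_buffer_t staging =
            ggml_backend_buft_alloc_buffer(ggml_backend_cuda_host_buffer_type(), 1 << 20);

        void * base = ggml_backend_buffer_get_base(staging);
        memset(base, 0, 1 << 20);

        ggml_backend_buffer_free(staging);
        return 0;
    }
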
|
| 1058 |
|
|
|
|
| 2386 |
|
| 2387 |
// backend
|
| 2388 |
|
| 2389 |
+
static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
|
| 2390 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2391 |
|
| 2392 |
return cuda_ctx->name.c_str();
|
| 2393 |
}
|
| 2394 |
|
| 2395 |
+
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
| 2396 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2397 |
|
| 2398 |
delete cuda_ctx;
|
| 2399 |
delete backend;
|
| 2400 |
}
|
| 2401 |
|
| 2402 |
+
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
| 2403 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2404 |
|
| 2405 |
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
| 2406 |
}
|
| 2407 |
|
| 2408 |
+
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 2409 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2410 |
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 2411 |
|
|
|
|
| 2414 |
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
|
| 2415 |
}
|
| 2416 |
|
| 2417 |
+
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 2418 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2419 |
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 2420 |
|
|
|
|
| 2423 |
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
|
| 2424 |
}
|
| 2425 |
|
| 2426 |
+
static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
|
| 2427 |
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
| 2428 |
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
| 2429 |
|
|
|
|
| 2478 |
return true;
|
| 2479 |
}
|
| 2480 |
|
| 2481 |
+
static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
| 2482 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2483 |
|
| 2484 |
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
|
|
|
|
| 2537 |
return true;
|
| 2538 |
}
|
| 2539 |
|
| 2540 |
+
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
| 2541 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2542 |
|
| 2543 |
ggml_cuda_set_device(cuda_ctx->device);
|
|
|
|
| 2809 |
return GGML_STATUS_SUCCESS;
|
| 2810 |
}
|
| 2811 |
|
| 2812 |
+
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
|
| 2813 |
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2814 |
+
|
| 2815 |
+
CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
|
| 2816 |
+
}
|
| 2817 |
+
|
| 2818 |
+
static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
| 2819 |
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 2820 |
+
|
| 2821 |
+
if (ggml_backend_is_cuda(backend)) {
|
| 2822 |
+
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
|
| 2823 |
+
} else {
|
| 2824 |
+
#if 0
|
| 2825 |
+
// untested
|
| 2826 |
+
auto wait_fn = [](void * user_data) {
|
| 2827 |
+
ggml_backend_event_t event = (ggml_backend_event_t)user_data;
|
| 2828 |
+
ggml_backend_event_synchronize(event);
|
| 2829 |
+
};
|
| 2830 |
+
|
| 2831 |
+
CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
|
| 2832 |
+
#endif
|
| 2833 |
+
GGML_ABORT("fatal error");
|
| 2834 |
+
}
|
| 2835 |
+
}
|
| 2836 |
+
|
| 2837 |
+
static const ggml_backend_i ggml_backend_cuda_interface = {
|
| 2838 |
+
/* .get_name = */ ggml_backend_cuda_get_name,
|
| 2839 |
+
/* .free = */ ggml_backend_cuda_free,
|
| 2840 |
+
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
|
| 2841 |
+
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
|
| 2842 |
+
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
|
| 2843 |
+
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
|
| 2844 |
+
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
| 2845 |
+
/* .graph_plan_create = */ NULL,
|
| 2846 |
+
/* .graph_plan_free = */ NULL,
|
| 2847 |
+
/* .graph_plan_update = */ NULL,
|
| 2848 |
+
/* .graph_plan_compute = */ NULL,
|
| 2849 |
+
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
| 2850 |
+
/* .supports_op = */ NULL, // moved to device
|
| 2851 |
+
/* .supports_buft = */ NULL, // moved to device
|
| 2852 |
+
/* .offload_op = */ NULL, // moved to device
|
| 2853 |
+
/* .event_record = */ ggml_backend_cuda_event_record,
|
| 2854 |
+
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
| 2855 |
+
};
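
Note that .supports_op, .supports_buft and .offload_op are now NULL in this vtable: those queries move to the device object defined further down, and the generic wrappers are expected to forward there. A sketch of the corresponding call-site change, assuming the ggml_backend_dev_* wrappers declared in the updated ggml-backend.h:

    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    // before: ggml_backend_supports_op(backend, op)
    // after:  ask the device the backend was created from
    static bool cuda0_supports(const struct ggml_tensor * op) {
        ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0);
        return ggml_backend_dev_supports_op(dev, op);
    }
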
|
| 2856 |
+
|
| 2857 |
+
static ggml_guid_t ggml_backend_cuda_guid() {
|
| 2858 |
+
static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
|
| 2859 |
+
return &guid;
|
| 2860 |
+
}
|
| 2861 |
+
|
| 2862 |
+
bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
| 2863 |
+
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
|
| 2864 |
+
}
|
| 2865 |
+
|
| 2866 |
+
int ggml_backend_cuda_get_device_count() {
|
| 2867 |
+
return ggml_cuda_info().device_count;
|
| 2868 |
+
}
|
| 2869 |
+
|
| 2870 |
+
void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
| 2871 |
+
cudaDeviceProp prop;
|
| 2872 |
+
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
| 2873 |
+
snprintf(description, description_size, "%s", prop.name);
|
| 2874 |
+
}
|
| 2875 |
+
|
| 2876 |
+
void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
| 2877 |
+
ggml_cuda_set_device(device);
|
| 2878 |
+
|
| 2879 |
+
CUDA_CHECK(cudaMemGetInfo(free, total));
|
| 2880 |
+
}
|
| 2881 |
+
|
| 2882 |
+
bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
|
| 2883 |
+
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
|
| 2884 |
+
return false;
|
| 2885 |
+
}
|
| 2886 |
+
|
| 2887 |
+
#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
|
| 2888 |
+
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
|
| 2889 |
+
if (err != cudaSuccess) {
|
| 2890 |
+
// clear the error
|
| 2891 |
+
cudaGetLastError();
|
| 2892 |
+
|
| 2893 |
+
GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
|
| 2894 |
+
size / 1024.0 / 1024.0, cudaGetErrorString(err));
|
| 2895 |
+
return false;
|
| 2896 |
+
}
|
| 2897 |
+
return true;
|
| 2898 |
+
#else
|
| 2899 |
+
return false;
|
| 2900 |
+
#endif
|
| 2901 |
+
}
|
| 2902 |
+
|
| 2903 |
+
void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
|
| 2904 |
+
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
|
| 2905 |
+
return;
|
| 2906 |
+
}
|
| 2907 |
+
|
| 2908 |
+
cudaError_t err = cudaHostUnregister(buffer);
|
| 2909 |
+
if (err != cudaSuccess) {
|
| 2910 |
+
// clear the error
|
| 2911 |
+
cudaGetLastError();
|
| 2912 |
+
}
|
| 2913 |
+
}
|
| 2914 |
+
|
| 2915 |
+
|
| 2916 |
+
// backend device
|
| 2917 |
+
|
| 2918 |
+
struct ggml_backend_cuda_device_context {
|
| 2919 |
+
int device;
|
| 2920 |
+
std::string name;
|
| 2921 |
+
std::string description;
|
| 2922 |
+
};
|
| 2923 |
+
|
| 2924 |
+
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
| 2925 |
+
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
| 2926 |
+
return ctx->name.c_str();
|
| 2927 |
+
}
|
| 2928 |
+
|
| 2929 |
+
static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t dev) {
|
| 2930 |
+
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
| 2931 |
+
return ctx->description.c_str();
|
| 2932 |
+
}
|
| 2933 |
+
|
| 2934 |
+
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
| 2935 |
+
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
| 2936 |
+
ggml_cuda_set_device(ctx->device);
|
| 2937 |
+
CUDA_CHECK(cudaMemGetInfo(free, total));
|
| 2938 |
+
}
|
| 2939 |
+
|
| 2940 |
+
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
|
| 2941 |
+
GGML_UNUSED(dev);
|
| 2942 |
+
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
| 2943 |
+
}
|
| 2944 |
+
|
| 2945 |
+
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
| 2946 |
+
props->name = ggml_backend_cuda_device_get_name(dev);
|
| 2947 |
+
props->description = ggml_backend_cuda_device_get_description(dev);
|
| 2948 |
+
props->type = ggml_backend_cuda_device_get_type(dev);
|
| 2949 |
+
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
| 2950 |
+
|
| 2951 |
+
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
| 2952 |
+
#ifdef GGML_CUDA_NO_PEER_COPY
|
| 2953 |
+
bool events = false;
|
| 2954 |
+
#else
|
| 2955 |
+
bool events = true;
|
| 2956 |
+
#endif
|
| 2957 |
+
|
| 2958 |
+
props->caps = {
|
| 2959 |
+
/* async */ true,
|
| 2960 |
+
/* host_buffer */ host_buffer,
|
| 2961 |
+
/* events */ events,
|
| 2962 |
+
};
|
| 2963 |
+
}
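
A sketch of how an application reads these properties back, assuming the public wrapper ggml_backend_dev_get_props() from the updated ggml-backend.h; the field names follow the initializers above:

    #include "ggml-backend.h"
    #include "ggml-cuda.h"
    #include <cstdio>

    int main() {
        ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0);

        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);

        printf("%s (%s): %zu / %zu bytes free, async=%d host_buffer=%d events=%d\n",
               props.name, props.description, props.memory_free, props.memory_total,
               props.caps.async, props.caps.host_buffer, props.caps.events);
        return 0;
    }
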
|
| 2964 |
+
|
| 2965 |
+
static ggml_backend_t ggml_backend_cuda_device_init(ggml_backend_dev_t dev, const char * params) {
|
| 2966 |
+
GGML_UNUSED(params);
|
| 2967 |
+
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
| 2968 |
+
return ggml_backend_cuda_init(ctx->device);
|
| 2969 |
+
}
|
| 2970 |
+
|
| 2971 |
+
static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_buffer_type(ggml_backend_dev_t dev) {
|
| 2972 |
+
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
| 2973 |
+
return ggml_backend_cuda_buffer_type(ctx->device);
|
| 2974 |
+
}
|
| 2975 |
+
|
| 2976 |
+
static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(ggml_backend_dev_t dev) {
|
| 2977 |
+
GGML_UNUSED(dev);
|
| 2978 |
+
return ggml_backend_cuda_host_buffer_type();
|
| 2979 |
+
}
|
| 2980 |
+
|
| 2981 |
+
static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
| 2982 |
+
GGML_UNUSED(dev);
|
| 2983 |
+
GGML_UNUSED(ptr);
|
| 2984 |
+
GGML_UNUSED(size);
|
| 2985 |
+
GGML_UNUSED(max_tensor_size);
|
| 2986 |
+
return nullptr;
|
| 2987 |
+
}
|
| 2988 |
+
|
| 2989 |
+
// TODO: move these functions here
|
| 2990 |
+
static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
| 2991 |
+
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
|
| 2992 |
+
|
| 2993 |
switch (op->op) {
|
| 2994 |
case GGML_OP_UNARY:
|
| 2995 |
switch (ggml_get_unary_op(op)) {
|
|
|
|
| 3203 |
if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
|
| 3204 |
return true;
|
| 3205 |
}
|
| 3206 |
+
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
|
| 3207 |
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
| 3208 |
}
|
| 3209 |
case GGML_OP_CROSS_ENTROPY_LOSS:
|
|
|
|
| 3213 |
default:
|
| 3214 |
return false;
|
| 3215 |
}
|
|
|
|
|
|
|
| 3216 |
}
|
| 3217 |
|
| 3218 |
+
static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
| 3219 |
if (ggml_backend_buft_is_cuda_split(buft)) {
|
| 3220 |
return true;
|
| 3221 |
}
|
| 3222 |
|
| 3223 |
if (ggml_backend_buft_is_cuda(buft)) {
|
| 3224 |
+
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
|
| 3225 |
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
| 3226 |
+
return buft_ctx->device == dev_ctx->device;
|
| 3227 |
}
|
| 3228 |
|
| 3229 |
return false;
|
| 3230 |
}
|
| 3231 |
|
| 3232 |
+
static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
| 3233 |
const int min_batch_size = 32;
|
| 3234 |
|
| 3235 |
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
| 3236 |
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
| 3237 |
|
| 3238 |
+
GGML_UNUSED(dev);
|
| 3239 |
}
|
| 3240 |
|
| 3241 |
+
static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
|
| 3242 |
#ifdef GGML_CUDA_NO_PEER_COPY
|
| 3243 |
return nullptr;
|
| 3244 |
#else
|
| 3245 |
+
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
|
| 3246 |
|
| 3247 |
+
ggml_cuda_set_device(dev_ctx->device);
|
| 3248 |
|
| 3249 |
cudaEvent_t event;
|
| 3250 |
CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
|
| 3251 |
|
| 3252 |
return new ggml_backend_event {
|
| 3253 |
+
/* .device = */ dev,
|
| 3254 |
/* .context = */ event,
|
| 3255 |
};
|
| 3256 |
#endif
|
| 3257 |
}
|
| 3258 |
|
| 3259 |
+
static void ggml_backend_cuda_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
|
| 3260 |
+
GGML_UNUSED(dev);
|
| 3261 |
|
| 3262 |
+
CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
|
| 3263 |
delete event;
|
| 3264 |
}
|
| 3265 |
|
| 3266 |
+
static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
|
| 3267 |
+
GGML_UNUSED(dev);
|
| 3268 |
+
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
|
| 3269 |
+
}
|
| 3270 |
+
|
| 3271 |
+
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
| 3272 |
+
/* .get_name = */ ggml_backend_cuda_device_get_name,
|
| 3273 |
+
/* .get_description = */ ggml_backend_cuda_device_get_description,
|
| 3274 |
+
/* .get_memory = */ ggml_backend_cuda_device_get_memory,
|
| 3275 |
+
/* .get_type = */ ggml_backend_cuda_device_get_type,
|
| 3276 |
+
/* .get_props = */ ggml_backend_cuda_device_get_props,
|
| 3277 |
+
/* .init_backend = */ ggml_backend_cuda_device_init,
|
| 3278 |
+
/* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
|
| 3279 |
+
/* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
|
| 3280 |
+
/* .buffer_from_host_ptr = */ ggml_backend_cuda_device_buffer_from_host_ptr,
|
| 3281 |
+
/* .supports_op = */ ggml_backend_cuda_device_supports_op,
|
| 3282 |
+
/* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
|
| 3283 |
+
/* .offload_op = */ ggml_backend_cuda_device_offload_op,
|
| 3284 |
+
/* .event_new = */ ggml_backend_cuda_device_event_new,
|
| 3285 |
+
/* .event_free = */ ggml_backend_cuda_device_event_free,
|
| 3286 |
+
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
|
| 3287 |
+
};
|
| 3288 |
|
| 3289 |
+
// backend reg
|
| 3290 |
+
|
| 3291 |
+
struct ggml_backend_cuda_reg_context {
|
| 3292 |
+
std::vector<ggml_backend_dev_t> devices;
|
| 3293 |
+
};
|
| 3294 |
+
|
| 3295 |
+
static const char * ggml_backend_cuda_reg_get_name(ggml_backend_reg_t reg) {
|
| 3296 |
+
GGML_UNUSED(reg);
|
| 3297 |
+
return GGML_CUDA_NAME;
|
| 3298 |
}
|
| 3299 |
|
| 3300 |
+
static size_t ggml_backend_cuda_reg_get_device_count(ggml_backend_reg_t reg) {
|
| 3301 |
+
ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
|
| 3302 |
+
return ctx->devices.size();
|
| 3303 |
+
}
|
| 3304 |
|
| 3305 |
+
static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
| 3306 |
+
ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
|
| 3307 |
+
GGML_ASSERT(index < ctx->devices.size());
|
| 3308 |
+
return ctx->devices[index];
|
| 3309 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3310 |
|
| 3311 |
+
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
| 3312 |
+
GGML_UNUSED(reg);
|
| 3313 |
+
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
| 3314 |
+
return (void *)ggml_backend_cuda_split_buffer_type;
|
| 3315 |
+
}
|
| 3316 |
+
if (strcmp(name, "ggml_backend_register_host_buffer") == 0) {
|
| 3317 |
+
return (void *)ggml_backend_cuda_register_host_buffer;
|
| 3318 |
}
|
| 3319 |
+
if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
|
| 3320 |
+
return (void *)ggml_backend_cuda_unregister_host_buffer;
|
| 3321 |
+
}
|
| 3322 |
+
return nullptr;
|
| 3323 |
}
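
The proc-address table above is how optional, CUDA-specific entry points are now discovered at run time instead of being linked directly. A sketch, assuming the wrapper ggml_backend_reg_get_proc_address(); as in the code above, host registration is a no-op unless GGML_CUDA_REGISTER_HOST is set:

    #include "ggml-backend.h"
    #include "ggml-cuda.h"
    #include <vector>

    typedef bool (*register_host_fn)(void * buffer, size_t size);

    int main() {
        ggml_backend_reg_t reg = ggml_backend_cuda_reg();

        // look up the optional entry point by name
        register_host_fn reg_host =
            (register_host_fn) ggml_backend_reg_get_proc_address(reg, "ggml_backend_register_host_buffer");

        std::vector<char> buf(1 << 20);
        bool pinned = reg_host != NULL && reg_host(buf.data(), buf.size());
        (void) pinned; // if pinned, look up "ggml_backend_unregister_host_buffer" and call it on shutdown
        return 0;
    }
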
|
| 3324 |
|
| 3325 |
+
static void ggml_backend_cuda_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) {
|
| 3326 |
+
GGML_UNUSED(reg);
|
| 3327 |
+
ggml_backend_cuda_log_set_callback(log_callback, user_data);
|
| 3328 |
}
|
| 3329 |
|
| 3330 |
+
static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
|
| 3331 |
+
/* .get_name = */ ggml_backend_cuda_reg_get_name,
|
| 3332 |
+
/* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
|
| 3333 |
+
/* .get_device_get = */ ggml_backend_cuda_reg_get_device,
|
| 3334 |
+
/* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
|
| 3335 |
+
/* .set_log_callback = */ ggml_backend_cuda_reg_set_log_callback,
|
|
|
|
|
|
|
|
|
| 3336 |
};
|
| 3337 |
|
| 3338 |
+
// backend registry
|
| 3339 |
+
ggml_backend_reg_t ggml_backend_cuda_reg() {
|
| 3340 |
+
static ggml_backend_reg reg;
|
| 3341 |
+
static bool initialized = false;
|
| 3342 |
+
|
| 3343 |
+
{
|
| 3344 |
+
static std::mutex mutex;
|
| 3345 |
+
std::lock_guard<std::mutex> lock(mutex);
|
| 3346 |
+
if (!initialized) {
|
| 3347 |
+
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
|
| 3348 |
+
|
| 3349 |
+
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
|
| 3350 |
+
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
|
| 3351 |
+
dev_ctx->device = i;
|
| 3352 |
+
dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
|
| 3353 |
+
|
| 3354 |
+
ggml_cuda_set_device(i);
|
| 3355 |
+
cudaDeviceProp prop;
|
| 3356 |
+
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
| 3357 |
+
dev_ctx->description = prop.name;
|
| 3358 |
+
|
| 3359 |
+
ggml_backend_dev_t dev = new ggml_backend_device {
|
| 3360 |
+
/* .interface = */ ggml_backend_cuda_device_interface,
|
| 3361 |
+
/* .reg = */ &reg,
|
| 3362 |
+
/* .context = */ dev_ctx
|
| 3363 |
+
};
|
| 3364 |
+
ctx->devices.push_back(dev);
|
| 3365 |
+
}
|
| 3366 |
+
|
| 3367 |
+
reg = ggml_backend_reg {
|
| 3368 |
+
/* .interface = */ ggml_backend_cuda_reg_interface,
|
| 3369 |
+
/* .context = */ ctx
|
| 3370 |
+
};
|
| 3371 |
+
}
|
| 3372 |
+
|
| 3373 |
+
initialized = true;
|
| 3374 |
+
}
|
| 3375 |
+
|
| 3376 |
+
return &reg;
|
| 3377 |
}
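
With ggml_backend_cuda_reg() in place, devices are enumerated through the registry rather than only via ggml_backend_cuda_get_device_count(). A sketch, assuming the ggml_backend_reg_dev_count() and ggml_backend_dev_* wrappers from the updated header:

    #include "ggml-backend.h"
    #include "ggml-cuda.h"
    #include <cstdio>

    int main() {
        ggml_backend_reg_t reg = ggml_backend_cuda_reg();
        for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
            ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);

            size_t mem_free, mem_total;
            ggml_backend_dev_memory(dev, &mem_free, &mem_total);

            printf("%s: %s, %zu MiB free\n",
                   ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                   mem_free / (1024 * 1024));
        }
        return 0;
    }
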
|
| 3378 |
|
| 3379 |
+
ggml_backend_t ggml_backend_cuda_init(int device) {
|
| 3380 |
if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
|
| 3381 |
GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
|
| 3382 |
return nullptr;
|
|
|
|
| 3391 |
ggml_backend_t cuda_backend = new ggml_backend {
|
| 3392 |
/* .guid = */ ggml_backend_cuda_guid(),
|
| 3393 |
/* .interface = */ ggml_backend_cuda_interface,
|
| 3394 |
+
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
|
| 3395 |
+
/* .context = */ ctx,
|
| 3396 |
};
|
| 3397 |
|
| 3398 |
return cuda_backend;
|
| 3399 |
}
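
After this change there are two equivalent ways to obtain a CUDA backend: the existing ggml_backend_cuda_init() and the new device path through the registry. A sketch; the ggml_backend_dev_init() wrapper name is assumed from the updated ggml-backend.h:

    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    int main() {
        // direct, as before
        ggml_backend_t b0 = ggml_backend_cuda_init(0);

        // through the device object exposed by the registry
        ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0);
        ggml_backend_t b1 = ggml_backend_dev_init(dev, /*params=*/NULL);

        ggml_backend_free(b0);
        ggml_backend_free(b1);
        return 0;
    }
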
|
|
|
|
|
|
|
|
ggml/src/ggml-kompute.cpp
CHANGED
@@ -1921,6 +1921,7 @@ ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
     for (const auto & dev : devices) {
         vec.push_back({
             /* .iface   = */ ggml_backend_kompute_buffer_type_interface,
+            /* .device  = */ nullptr,
             /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment, dev.maxAlloc)
         });
     }

@@ -1989,11 +1990,8 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .supports_op       = */ ggml_backend_kompute_supports_op,
     /* .supports_buft     = */ ggml_backend_kompute_supports_buft,
     /* .offload_op        = */ NULL,
-    /* .event_new         = */ NULL,
-    /* .event_free        = */ NULL,
     /* .event_record      = */ NULL,
     /* .event_wait        = */ NULL,
-    /* .event_synchronize = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_kompute_guid() {

@@ -2008,6 +2006,7 @@ ggml_backend_t ggml_backend_kompute_init(int device) {
     ggml_backend_t kompute_backend = new ggml_backend {
         /* .guid      = */ ggml_backend_kompute_guid(),
         /* .interface = */ kompute_backend_i,
+        /* .device    = */ nullptr,
         /* .context   = */ s_kompute_context,
     };

@@ -2017,23 +2016,3 @@ ggml_backend_t ggml_backend_kompute_init(int device) {
 bool ggml_backend_is_kompute(ggml_backend_t backend) {
     return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid());
 }
-
-static ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
-    GGML_UNUSED(params);
-    return ggml_backend_kompute_init(intptr_t(user_data));
-}
-
-extern "C" int ggml_backend_kompute_reg_devices();
-
-int ggml_backend_kompute_reg_devices() {
-    auto devices = ggml_vk_available_devices_internal(0);
-    for (const auto & device : devices) {
-        ggml_backend_register(
-            ggml_kompute_format_name(device.index).c_str(),
-            ggml_backend_reg_kompute_init,
-            ggml_backend_kompute_buffer_type(device.index),
-            reinterpret_cast<void *>(intptr_t(device.index))
-        );
-    }
-    return devices.size();
-}
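For backends that have not been ported to the registry yet, the pattern is simpler: the per-backend ggml_backend_*_reg_devices() entry point goes away and the new .device fields are filled with nullptr for now. A short caller-side sketch of what remains; the header name ggml-kompute.h and the wrapper function are assumptions for illustration, only ggml_backend_kompute_init() itself is taken from the diff:

    #include "ggml-kompute.h"

    // with ggml_backend_kompute_reg_devices() removed, the backend is created directly;
    // its ->device member stays nullptr until the Kompute backend is ported to the registry
    ggml_backend_t make_kompute_backend(int device_index) {
        return ggml_backend_kompute_init(device_index);
    }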
ggml/src/ggml-metal.m
CHANGED
@@ -3201,13 +3201,13 @@ static void ggml_backend_metal_free_device(void) {
     }
 }
 
-GGML_CALL static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
+static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
     return "Metal";
 
     UNUSED(buffer);
 }
 
-GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
     for (int i = 0; i < ctx->n_buffers; i++) {

@@ -3226,25 +3226,25 @@ GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_
     free(ctx);
 }
 
-GGML_CALL static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
+static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
     return ctx->all_data;
 }
 
-GGML_CALL static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
     UNUSED(buffer);
 }
 
-GGML_CALL static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);
 
     UNUSED(buffer);
 }
 
-GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
     if (ggml_backend_buffer_is_host(src->buffer)) {
         memcpy(dst->data, src->data, ggml_nbytes(src));
         return true;

@@ -3254,7 +3254,7 @@ GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t
     UNUSED(buffer);
 }
 
-GGML_CALL static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
     memset(ctx->all_data, value, ctx->all_size);

@@ -3275,7 +3275,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
 
 // default buffer type
 
-GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "Metal";
 
     UNUSED(buft);

@@ -3306,7 +3306,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
     UNUSED(size_aligned);
 }
 
-GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
     const size_t size_page = sysconf(_SC_PAGESIZE);

@@ -3348,12 +3348,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
     return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
 }
 
-GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 32;
     UNUSED(buft);
 }
 
-GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
     id<MTLDevice> device = ggml_backend_metal_get_device();
     size_t max_size = device.maxBufferLength;
     ggml_backend_metal_free_device();

@@ -3363,13 +3363,13 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend
     UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
     UNUSED(buft);
 }
 
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
+ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
         /* .iface = */ {
             /* .get_name         = */ ggml_backend_metal_buffer_type_get_name,

@@ -3379,6 +3379,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
         },
+        /* .device  = */ NULL,
         /* .context = */ NULL,
     };

@@ -3387,7 +3388,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
 
 // buffer from ptr
 
-GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
+ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
     ctx->all_data = data;

@@ -3467,37 +3468,37 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
 
 // backend
 
-GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) {
+static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";
 
     UNUSED(backend);
 }
 
-GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) {
+static void ggml_backend_metal_free(ggml_backend_t backend) {
     struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
     ggml_metal_free(ctx);
     free(backend);
 }
 
-GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
+static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_metal_buffer_type();
 
     UNUSED(backend);
 }
 
-GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
 
     return ggml_metal_graph_compute(metal_ctx, cgraph);
 }
 
-GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
 
     return ggml_metal_supports_op(metal_ctx, op);
 }
 
-GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
 
     UNUSED(backend);

@@ -3538,11 +3539,8 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .supports_op       = */ ggml_backend_metal_supports_op,
     /* .supports_buft     = */ ggml_backend_metal_supports_buft,
     /* .offload_op        = */ NULL,
-    /* .event_new         = */ NULL,
-    /* .event_free        = */ NULL,
     /* .event_record      = */ NULL,
     /* .event_wait        = */ NULL,
-    /* .event_synchronize = */ NULL,
 };
 
 void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {

@@ -3567,6 +3565,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
     *backend = (struct ggml_backend) {
         /* .guid      = */ ggml_backend_metal_guid(),
         /* .interface = */ ggml_backend_metal_i,
+        /* .device    = */ NULL,
         /* .context   = */ ctx,
     };

@@ -3603,9 +3602,9 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
     ctx->capture_next_compute = true;
 }
 
-GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
+ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
 
-GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
+ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
     return ggml_backend_metal_init();
 
     GGML_UNUSED(params);
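The Metal hunks are mostly mechanical: the GGML_CALL qualifier drops from the signatures, the buffer type and backend initializers gain a NULL .device, and the event_new/event_free/event_synchronize slots disappear from ggml_backend_i. One detail worth noting is how supports_buft identifies the backend's own buffer type; a sketch of the check used inside ggml-metal.m above (the helper name is illustrative only):

    // a backend recognises its own buffer type by comparing the get_name function pointer,
    // not the string it returns; this identity check is unchanged by the interface reshuffle
    static bool buft_is_metal(ggml_backend_buffer_type_t buft) {
        return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
    }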
ggml/src/ggml-rpc.cpp
CHANGED
@@ -319,12 +319,12 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
     return sock;
 }
 
-GGML_CALL static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
+static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     return ctx->name.c_str();
 }
 
-GGML_CALL static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     // input serialization format: | remote_ptr (8 bytes) |
     std::vector<uint8_t> input(sizeof(uint64_t), 0);

@@ -337,7 +337,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t
     delete ctx;
 }
 
-GGML_CALL static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
+static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     if (ctx->base_cache.find(buffer) != ctx->base_cache.end()) {
         return ctx->base_cache[buffer];

@@ -388,7 +388,7 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
     return result;
 }
 
-GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     UNUSED(buffer);
     if (ggml_is_quantized(tensor->type)) {
         // TODO: this check is due to MATRIX_ROW_PADDING in CUDA and should be generalized

@@ -396,7 +396,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t
     }
 }
 
-GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     // input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) |
     size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + size;

@@ -410,7 +410,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t b
     GGML_ASSERT(status);
 }
 
-GGML_CALL static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     // input serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) |
     int input_size = sizeof(rpc_tensor) + 2*sizeof(uint64_t);

@@ -427,7 +427,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t b
     memcpy(data, output.data(), size);
 }
 
-GGML_CALL static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
+static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
     // check if src and dst are on the same server
     ggml_backend_buffer_t src_buffer = src->buffer;
     ggml_backend_rpc_buffer_context * src_ctx = (ggml_backend_rpc_buffer_context *)src_buffer->context;

@@ -452,7 +452,7 @@ GGML_CALL static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t b
     return output[0];
 }
 
-GGML_CALL static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     // serialization format: | bufptr (8 bytes) | value (1 byte) |
     int input_size = sizeof(uint64_t) + sizeof(uint8_t);

@@ -477,12 +477,12 @@ static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
     /* .reset = */ NULL,
 };
 
-GGML_CALL static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t buft) {
+static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t buft) {
     ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
     return buft_ctx->name.c_str();
 }
 
-GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
     // input serialization format: | size (8 bytes) |
     int input_size = sizeof(uint64_t);

@@ -522,7 +522,7 @@ static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
     return alignment;
 }
 
-GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
     return buft_ctx->alignment;
 }

@@ -540,12 +540,12 @@ static size_t get_max_size(const std::shared_ptr<socket_t> & sock) {
     return max_size;
 }
 
-GGML_CALL static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
+static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
     ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
     return buft_ctx->max_size;
 }
 
-GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     UNUSED(buft);
     return ggml_nbytes(tensor);
 }

@@ -559,24 +559,24 @@ static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
     /* .is_host = */ NULL,
 };
 
-GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
+static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
     ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
 
     return rpc_ctx->name.c_str();
 }
 
-GGML_CALL static void ggml_backend_rpc_free(ggml_backend_t backend) {
+static void ggml_backend_rpc_free(ggml_backend_t backend) {
     ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
     delete rpc_ctx;
     delete backend;
 }
 
-GGML_CALL static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
+static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
     ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
     return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
 }
 
-GGML_CALL static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
+static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
     UNUSED(backend);
     // this is no-op because we don't have any async operations
 }

@@ -618,7 +618,7 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & o
     memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
 }
 
-GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
     std::vector<uint8_t> input;
     serialize_graph(cgraph, input);

@@ -630,14 +630,14 @@ GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t
     return (enum ggml_status)output[0];
 }
 
-GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
     UNUSED(backend);
     UNUSED(op);
     //TODO: call the remote backend and cache the results
     return true;
 }
 
-GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
     if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
         return false;
     }

@@ -662,14 +662,11 @@ static ggml_backend_i ggml_backend_rpc_interface = {
     /* .supports_op       = */ ggml_backend_rpc_supports_op,
     /* .supports_buft     = */ ggml_backend_rpc_supports_buft,
     /* .offload_op        = */ NULL,
-    /* .event_new         = */ NULL,
-    /* .event_free        = */ NULL,
     /* .event_record      = */ NULL,
     /* .event_wait        = */ NULL,
-    /* .event_synchronize = */ NULL,
 };
 
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
+GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
     // NOTE: buffer types are allocated and never freed; this is by design

@@ -694,13 +691,14 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
 
     ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type {
         /* .iface   = */ ggml_backend_rpc_buffer_type_interface,
+        /* .device  = */ nullptr,
         /* .context = */ buft_ctx
     };
     buft_map[endpoint] = buft;
     return buft;
 }
 
-GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
+ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
     ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
         /* .endpoint = */ endpoint,
         /* .name     = */ "RPC[" + std::string(endpoint) + "]",

@@ -709,12 +707,13 @@ GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
     ggml_backend_t backend = new ggml_backend {
         /* .guid      = */ ggml_backend_rpc_guid(),
         /* .interface = */ ggml_backend_rpc_interface,
+        /* .device    = */ nullptr,
         /* .context   = */ ctx
     };
     return backend;
 }
 
-GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend) {
+GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend) {
     return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid());
 }
 

@@ -734,7 +733,7 @@ static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * f
     *total = total_mem;
 }
 
-GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
+GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
     auto sock = get_socket(endpoint);
     if (sock == nullptr) {
         *free = 0;
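The RPC backend keeps its public entry points but loses GGML_CALL and gains the same nullptr .device placeholders. A small usage sketch of the functions touched above; the endpoint string is a placeholder, not something defined by this commit:

    #include "ggml-backend.h"
    #include "ggml-rpc.h"

    void probe_rpc(void) {
        const char * endpoint = "localhost:50052";   // placeholder address
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);

        ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
        if (backend != NULL && ggml_backend_is_rpc(backend)) {
            ggml_backend_free(backend);
        }
    }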
ggml/src/ggml-sycl.cpp
CHANGED
|
@@ -4038,7 +4038,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
|
|
| 4038 |
return true;
|
| 4039 |
}
|
| 4040 |
|
| 4041 |
-
GGML_API
|
| 4042 |
GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
|
| 4043 |
for(int i=0;i<max_len;i++) id_list[i] = -1;
|
| 4044 |
|
|
@@ -4068,7 +4068,7 @@ catch (sycl::exception const &exc) {
|
|
| 4068 |
std::exit(1);
|
| 4069 |
}
|
| 4070 |
|
| 4071 |
-
GGML_API
|
| 4072 |
size_t description_size) try {
|
| 4073 |
GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
|
| 4074 |
dpct::device_info prop;
|
|
@@ -4082,7 +4082,7 @@ catch (sycl::exception const &exc) {
|
|
| 4082 |
std::exit(1);
|
| 4083 |
}
|
| 4084 |
|
| 4085 |
-
|
| 4086 |
size_t *total) try {
|
| 4087 |
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
|
| 4088 |
ggml_sycl_set_device(device);
|
|
@@ -4135,12 +4135,12 @@ struct ggml_backend_sycl_buffer_context {
|
|
| 4135 |
}
|
| 4136 |
};
|
| 4137 |
|
| 4138 |
-
|
| 4139 |
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
| 4140 |
return ctx->name.c_str();
|
| 4141 |
}
|
| 4142 |
|
| 4143 |
-
|
| 4144 |
return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
|
| 4145 |
}
|
| 4146 |
|
|
@@ -4162,7 +4162,7 @@ static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
| 4162 |
return ctx->dev_ptr;
|
| 4163 |
}
|
| 4164 |
|
| 4165 |
-
|
| 4166 |
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
| 4167 |
ggml_tensor *tensor) try {
|
| 4168 |
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
|
@@ -4237,7 +4237,7 @@ catch (sycl::exception const &exc) {
|
|
| 4237 |
std::exit(1);
|
| 4238 |
}
|
| 4239 |
|
| 4240 |
-
|
| 4241 |
ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
| 4242 |
const ggml_tensor *src,
|
| 4243 |
ggml_tensor *dst) try {
|
|
@@ -4339,12 +4339,12 @@ struct ggml_backend_sycl_buffer_type_context {
|
|
| 4339 |
queue_ptr stream = nullptr;
|
| 4340 |
};
|
| 4341 |
|
| 4342 |
-
|
| 4343 |
ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
|
| 4344 |
|
| 4345 |
return ctx->name.c_str();
|
| 4346 |
}
|
| 4347 |
-
|
| 4348 |
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
| 4349 |
size_t size) try {
|
| 4350 |
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
|
|
@@ -4368,7 +4368,7 @@ catch (sycl::exception const &exc) {
|
|
| 4368 |
std::exit(1);
|
| 4369 |
}
|
| 4370 |
|
| 4371 |
-
|
| 4372 |
return 128;
|
| 4373 |
UNUSED(buft);
|
| 4374 |
}
|
|
@@ -4379,7 +4379,7 @@ static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_typ
|
|
| 4379 |
UNUSED(buft);
|
| 4380 |
}
|
| 4381 |
|
| 4382 |
-
|
| 4383 |
size_t size = ggml_nbytes(tensor);
|
| 4384 |
int64_t ne0 = tensor->ne[0];
|
| 4385 |
|
|
@@ -4424,6 +4424,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
|
| 4424 |
queue_ptr stream = &(device_i.default_queue());
|
| 4425 |
ggml_backend_sycl_buffer_types[i] = {
|
| 4426 |
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
|
|
|
| 4427 |
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream},
|
| 4428 |
};
|
| 4429 |
}
|
|
@@ -4449,6 +4450,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte
|
|
| 4449 |
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
|
| 4450 |
ggml_backend_sycl_buffer_types[i] = {
|
| 4451 |
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
|
|
|
| 4452 |
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), ctx->stream(i, 0)},
|
| 4453 |
};
|
| 4454 |
}
|
|
@@ -4513,7 +4515,7 @@ struct ggml_backend_sycl_split_buffer_context {
|
|
| 4513 |
std::vector<queue_ptr> streams;
|
| 4514 |
};
|
| 4515 |
|
| 4516 |
-
|
| 4517 |
return GGML_SYCL_NAME "_Split";
|
| 4518 |
|
| 4519 |
UNUSED(buffer);
|
|
@@ -4523,19 +4525,19 @@ static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
|
|
| 4523 |
return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
|
| 4524 |
}
|
| 4525 |
|
| 4526 |
-
|
| 4527 |
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
| 4528 |
delete ctx;
|
| 4529 |
}
|
| 4530 |
|
| 4531 |
-
|
| 4532 |
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
|
| 4533 |
return (void *)0x1000;
|
| 4534 |
|
| 4535 |
UNUSED(buffer);
|
| 4536 |
}
|
| 4537 |
|
| 4538 |
-
|
| 4539 |
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
| 4540 |
ggml_tensor *tensor) try {
|
| 4541 |
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
|
@@ -4618,7 +4620,7 @@ catch (sycl::exception const &exc) {
|
|
| 4618 |
std::exit(1);
|
| 4619 |
}
|
| 4620 |
|
| 4621 |
-
|
| 4622 |
ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
| 4623 |
ggml_tensor *tensor, const void *data,
|
| 4624 |
size_t offset, size_t size) try {
|
|
@@ -4671,7 +4673,7 @@ catch (sycl::exception const &exc) {
|
|
| 4671 |
std::exit(1);
|
| 4672 |
}
|
| 4673 |
|
| 4674 |
-
|
| 4675 |
ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
| 4676 |
const ggml_tensor *tensor, void *data,
|
| 4677 |
size_t offset, size_t size) try {
|
|
@@ -4724,7 +4726,7 @@ catch (sycl::exception const &exc) {
|
|
| 4724 |
std::exit(1);
|
| 4725 |
}
|
| 4726 |
|
| 4727 |
-
|
| 4728 |
UNUSED(buffer);
|
| 4729 |
UNUSED(value);
|
| 4730 |
}
|
|
@@ -4742,13 +4744,13 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
|
|
| 4742 |
/* .reset = */ NULL,
|
| 4743 |
};
|
| 4744 |
|
| 4745 |
-
|
| 4746 |
return GGML_SYCL_NAME "_Split";
|
| 4747 |
|
| 4748 |
UNUSED(buft);
|
| 4749 |
}
|
| 4750 |
|
| 4751 |
-
|
| 4752 |
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
| 4753 |
// instead, we allocate them for each tensor separately in init_tensor
|
| 4754 |
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
|
|
@@ -4758,12 +4760,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc
|
|
| 4758 |
return ggml_backend_buffer_init(buft, ggml_backend_sycl_split_buffer_interface, ctx, size);
|
| 4759 |
}
|
| 4760 |
|
| 4761 |
-
|
| 4762 |
return 128;
|
| 4763 |
UNUSED(buft);
|
| 4764 |
}
|
| 4765 |
|
| 4766 |
-
|
| 4767 |
ggml_backend_sycl_split_buffer_type_context * ctx = (ggml_backend_sycl_split_buffer_type_context *)buft->context;
|
| 4768 |
|
| 4769 |
size_t total_size = 0;
|
|
@@ -4790,7 +4792,7 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
|
|
| 4790 |
return total_size;
|
| 4791 |
}
|
| 4792 |
|
| 4793 |
-
|
| 4794 |
return false;
|
| 4795 |
|
| 4796 |
UNUSED(buft);
|
|
@@ -4805,7 +4807,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
|
|
| 4805 |
/* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
|
| 4806 |
};
|
| 4807 |
|
| 4808 |
-
|
| 4809 |
static std::mutex mutex;
|
| 4810 |
std::lock_guard<std::mutex> lock(mutex);
|
| 4811 |
|
|
@@ -4837,6 +4839,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f
|
|
| 4837 |
|
| 4838 |
struct ggml_backend_buffer_type buft {
|
| 4839 |
/* .iface = */ ggml_backend_sycl_split_buffer_type_interface,
|
|
|
|
| 4840 |
/* .context = */ new ggml_backend_sycl_split_buffer_type_context{tensor_split_arr},
|
| 4841 |
};
|
| 4842 |
|
|
@@ -4846,13 +4849,13 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f
|
|
| 4846 |
|
| 4847 |
// host buffer type
|
| 4848 |
|
| 4849 |
-
|
| 4850 |
return GGML_SYCL_NAME "_Host";
|
| 4851 |
|
| 4852 |
UNUSED(buft);
|
| 4853 |
}
|
| 4854 |
|
| 4855 |
-
|
| 4856 |
return GGML_SYCL_NAME "_Host";
|
| 4857 |
|
| 4858 |
UNUSED(buffer);
|
|
@@ -4890,6 +4893,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
|
|
| 4890 |
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
| 4891 |
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
| 4892 |
},
|
|
|
|
| 4893 |
/* .context = */ nullptr,
|
| 4894 |
};
|
| 4895 |
|
|
@@ -4898,14 +4902,14 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
|
|
| 4898 |
|
| 4899 |
// backend
|
| 4900 |
|
| 4901 |
-
|
| 4902 |
|
| 4903 |
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
| 4904 |
|
| 4905 |
return sycl_ctx->name.c_str();
|
| 4906 |
}
|
| 4907 |
|
| 4908 |
-
|
| 4909 |
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
| 4910 |
|
| 4911 |
delete sycl_ctx;
|
|
@@ -4913,12 +4917,12 @@ GGML_CALL static void ggml_backend_sycl_free(ggml_backend_t backend) {
|
|
| 4913 |
}
|
| 4914 |
|
| 4915 |
|
| 4916 |
-
|
| 4917 |
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
| 4918 |
return ggml_backend_sycl_buffer_type(sycl_ctx->device);
|
| 4919 |
}
|
| 4920 |
|
| 4921 |
-
|
| 4922 |
ggml_tensor *tensor,
|
| 4923 |
const void *data, size_t offset,
|
| 4924 |
size_t size) try {
|
|
@@ -4936,7 +4940,7 @@ catch (sycl::exception const &exc) {
|
|
| 4936 |
std::exit(1);
|
| 4937 |
}
|
| 4938 |
|
| 4939 |
-
|
| 4940 |
const ggml_tensor *tensor,
|
| 4941 |
void *data, size_t offset,
|
| 4942 |
size_t size) try {
|
|
@@ -4954,9 +4958,9 @@ catch (sycl::exception const &exc) {
|
|
| 4954 |
std::exit(1);
|
| 4955 |
}
|
| 4956 |
|
| 4957 |
-
|
| 4958 |
-
|
| 4959 |
-
|
| 4960 |
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
| 4961 |
if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) {
|
| 4962 |
/*
|
|
@@ -4991,7 +4995,7 @@ catch (sycl::exception const &exc) {
|
|
| 4991 |
std::exit(1);
|
| 4992 |
}
|
| 4993 |
|
| 4994 |
-
|
| 4995 |
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
| 4996 |
ggml_sycl_set_main_device(sycl_ctx->device);
|
| 4997 |
|
|
@@ -5019,7 +5023,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
|
|
| 5019 |
return GGML_STATUS_SUCCESS;
|
| 5020 |
}
|
| 5021 |
|
| 5022 |
-
|
| 5023 |
switch (op->op) {
|
| 5024 |
case GGML_OP_CONV_TRANSPOSE_1D:
|
| 5025 |
{
|
|
@@ -5166,13 +5170,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
|
| 5166 |
UNUSED(backend);
|
| 5167 |
}
|
| 5168 |
|
| 5169 |
-
|
| 5170 |
const int min_batch_size = 32;
|
| 5171 |
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
|
| 5172 |
GGML_UNUSED(backend);
|
| 5173 |
}
|
| 5174 |
|
| 5175 |
-
|
| 5176 |
if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
|
| 5177 |
return false;
|
| 5178 |
}
|
|
@@ -5197,11 +5201,8 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
|
| 5197 |
/* .supports_op = */ ggml_backend_sycl_supports_op,
|
| 5198 |
/* .supports_buft = */ ggml_backend_sycl_supports_buft,
|
| 5199 |
/* .offload_op = */ ggml_backend_sycl_offload_op,
|
| 5200 |
-
/* .event_new = */ NULL,
|
| 5201 |
-
/* .event_free = */ NULL,
|
| 5202 |
/* .event_record = */ NULL,
|
| 5203 |
/* .event_wait = */ NULL,
|
| 5204 |
-
/* .event_synchronize = */ NULL,
|
| 5205 |
};
|
| 5206 |
|
| 5207 |
static ggml_guid_t ggml_backend_sycl_guid() {
|
|
@@ -5209,7 +5210,7 @@ static ggml_guid_t ggml_backend_sycl_guid() {
|
|
| 5209 |
return &guid;
|
| 5210 |
}
|
| 5211 |
|
| 5212 |
-
|
| 5213 |
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
|
| 5214 |
ggml_check_sycl();
|
| 5215 |
|
|
@@ -5224,6 +5225,7 @@ GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
|
|
| 5224 |
ggml_backend_t sycl_backend = new ggml_backend {
|
| 5225 |
/* .guid = */ ggml_backend_sycl_guid(),
|
| 5226 |
/* .interface = */ ggml_backend_sycl_interface,
|
|
|
|
| 5227 |
/* .context = */ ctx
|
| 5228 |
};
|
| 5229 |
|
|
@@ -5234,26 +5236,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
|
|
| 5234 |
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
|
| 5235 |
}
|
| 5236 |
|
| 5237 |
-
|
| 5238 |
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
|
| 5239 |
return ggml_sycl_info().device_count;
|
| 5240 |
}
|
| 5241 |
-
|
| 5242 |
-
GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
|
| 5243 |
-
ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data);
|
| 5244 |
-
return sycl_backend;
|
| 5245 |
-
|
| 5246 |
-
UNUSED(params);
|
| 5247 |
-
}
|
| 5248 |
-
|
| 5249 |
-
extern "C" int ggml_backend_sycl_reg_devices();
|
| 5250 |
-
|
| 5251 |
-
int ggml_backend_sycl_reg_devices() {
|
| 5252 |
-
assert(ggml_sycl_info().device_count>0);
|
| 5253 |
-
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
|
| 5254 |
-
char name[128];
|
| 5255 |
-
snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, i);
|
| 5256 |
-
ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(i), (void *) (intptr_t) i);
|
| 5257 |
-
}
|
| 5258 |
-
return ggml_sycl_info().device_count;
|
| 5259 |
-
}
|
|
|
|
| 4038 |
return true;
|
| 4039 |
}
|
| 4040 |
|
| 4041 |
+
GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
|
| 4042 |
GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
|
| 4043 |
for(int i=0;i<max_len;i++) id_list[i] = -1;
|
| 4044 |
|
|
|
|
| 4068 |
std::exit(1);
|
| 4069 |
}
|
| 4070 |
|
| 4071 |
+
GGML_API void ggml_sycl_get_device_description(int device, char *description,
|
| 4072 |
size_t description_size) try {
|
| 4073 |
GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
|
| 4074 |
dpct::device_info prop;
|
|
|
|
| 4082 |
std::exit(1);
|
| 4083 |
}
|
| 4084 |
|
| 4085 |
+
void ggml_backend_sycl_get_device_memory(int device, size_t *free,
|
| 4086 |
size_t *total) try {
|
| 4087 |
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
|
| 4088 |
ggml_sycl_set_device(device);
|
|
|
|
| 4135 |
}
|
| 4136 |
};
|
| 4137 |
|
| 4138 |
+
static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 4139 |
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
| 4140 |
return ctx->name.c_str();
|
| 4141 |
}
|
| 4142 |
|
| 4143 |
+
static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
|
| 4144 |
return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
|
| 4145 |
}
|
| 4146 |
|
|
|
|
| 4162 |
return ctx->dev_ptr;
|
| 4163 |
}
|
| 4164 |
|
| 4165 |
+
static void
|
| 4166 |
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
| 4167 |
ggml_tensor *tensor) try {
|
| 4168 |
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
|
|
|
| 4237 |
std::exit(1);
|
| 4238 |
}
|
| 4239 |
|
| 4240 |
+
static bool
|
| 4241 |
ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
| 4242 |
const ggml_tensor *src,
|
| 4243 |
ggml_tensor *dst) try {
|
|
|
|
| 4339 |
queue_ptr stream = nullptr;
|
| 4340 |
};
|
| 4341 |
|
| 4342 |
+
static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
| 4343 |
ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
|
| 4344 |
|
| 4345 |
return ctx->name.c_str();
|
| 4346 |
}
|
| 4347 |
+
static ggml_backend_buffer_t
|
| 4348 |
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
| 4349 |
size_t size) try {
|
| 4350 |
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
|
|
|
|
| 4368 |
std::exit(1);
|
| 4369 |
}
|
| 4370 |
|
| 4371 |
+
static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 4372 |
return 128;
|
| 4373 |
UNUSED(buft);
|
| 4374 |
}
|
|
|
|
| 4379 |
UNUSED(buft);
|
| 4380 |
}
|
| 4381 |
|
| 4382 |
+
static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
| 4383 |
size_t size = ggml_nbytes(tensor);
|
| 4384 |
int64_t ne0 = tensor->ne[0];
|
| 4385 |
|
|
|
|
| 4424 |
queue_ptr stream = &(device_i.default_queue());
|
| 4425 |
ggml_backend_sycl_buffer_types[i] = {
|
| 4426 |
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
| 4427 |
+
/* .device = */ nullptr,
|
| 4428 |
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream},
|
| 4429 |
};
|
| 4430 |
}
|
|
|
|
| 4450 |
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
|
| 4451 |
ggml_backend_sycl_buffer_types[i] = {
|
| 4452 |
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
| 4453 |
+
/* .device = */ nullptr,
|
| 4454 |
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), ctx->stream(i, 0)},
|
| 4455 |
};
|
| 4456 |
}
|
|
|
|
| 4515 |
std::vector<queue_ptr> streams;
|
| 4516 |
};
|
| 4517 |
|
| 4518 |
+
static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 4519 |
return GGML_SYCL_NAME "_Split";
|
| 4520 |
|
| 4521 |
UNUSED(buffer);
|
|
|
|
| 4525 |
return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
|
| 4526 |
}
|
| 4527 |
|
| 4528 |
+
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 4529 |
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
| 4530 |
delete ctx;
|
| 4531 |
}
|
| 4532 |
|
| 4533 |
+
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 4534 |
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
|
| 4535 |
return (void *)0x1000;
|
| 4536 |
|
| 4537 |
UNUSED(buffer);
|
| 4538 |
}
|
| 4539 |
|
| 4540 |
+
static void
|
| 4541 |
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
| 4542 |
ggml_tensor *tensor) try {
|
| 4543 |
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
|
|
|
| 4620 |
std::exit(1);
|
| 4621 |
}
|
| 4622 |
|
| 4623 |
+
static void
|
| 4624 |
ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
| 4625 |
ggml_tensor *tensor, const void *data,
|
| 4626 |
size_t offset, size_t size) try {
|
|
|
|
| 4673 |
std::exit(1);
|
| 4674 |
}
|
| 4675 |
|
| 4676 |
+
static void
|
| 4677 |
ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
| 4678 |
const ggml_tensor *tensor, void *data,
|
| 4679 |
size_t offset, size_t size) try {
|
|
|
|
| 4726 |
std::exit(1);
|
| 4727 |
}
|
| 4728 |
|
| 4729 |
+
static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 4730 |
UNUSED(buffer);
|
| 4731 |
UNUSED(value);
|
| 4732 |
}
|
|
|
|
| 4744 |
/* .reset = */ NULL,
|
| 4745 |
};
|
| 4746 |
|
| 4747 |
+
static const char * ggml_backend_sycl_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
| 4748 |
return GGML_SYCL_NAME "_Split";
|
| 4749 |
|
| 4750 |
UNUSED(buft);
|
| 4751 |
}
|
| 4752 |
|
| 4753 |
+
static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 4754 |
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
| 4755 |
// instead, we allocate them for each tensor separately in init_tensor
|
| 4756 |
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
|
|
|
|
| 4760 |
return ggml_backend_buffer_init(buft, ggml_backend_sycl_split_buffer_interface, ctx, size);
|
| 4761 |
}
|
| 4762 |
|
| 4763 |
+
static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 4764 |
return 128;
|
| 4765 |
UNUSED(buft);
|
| 4766 |
}
|
| 4767 |
|
| 4768 |
+
static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
| 4769 |
ggml_backend_sycl_split_buffer_type_context * ctx = (ggml_backend_sycl_split_buffer_type_context *)buft->context;
|
| 4770 |
|
| 4771 |
size_t total_size = 0;
|
|
|
|
| 4792 |
return total_size;
|
| 4793 |
}
|
| 4794 |
|
| 4795 |
+
static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
| 4796 |
return false;
|
| 4797 |
|
| 4798 |
UNUSED(buft);
|
|
|
|
| 4807 |
/* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
|
| 4808 |
};
|
| 4809 |
|
| 4810 |
+
ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
|
| 4811 |
static std::mutex mutex;
|
| 4812 |
std::lock_guard<std::mutex> lock(mutex);
|
| 4813 |
|
|
|
|
| 4839 |
|
| 4840 |
struct ggml_backend_buffer_type buft {
|
| 4841 |
/* .iface = */ ggml_backend_sycl_split_buffer_type_interface,
|
| 4842 |
+
/* .device = */ nullptr,
|
| 4843 |
/* .context = */ new ggml_backend_sycl_split_buffer_type_context{tensor_split_arr},
|
| 4844 |
};
|
| 4845 |
|
|
|
|
| 4849 |
|
| 4850 |
// host buffer type
|
| 4851 |
|
| 4852 |
+
static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
| 4853 |
return GGML_SYCL_NAME "_Host";
|
| 4854 |
|
| 4855 |
UNUSED(buft);
|
| 4856 |
}
|
| 4857 |
|
| 4858 |
+
static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
|
| 4859 |
return GGML_SYCL_NAME "_Host";
|
| 4860 |
|
| 4861 |
UNUSED(buffer);
|
|
|
|
| 4893 |
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
| 4894 |
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
| 4895 |
},
|
| 4896 |
+
/* .device = */ nullptr,
|
| 4897 |
/* .context = */ nullptr,
|
| 4898 |
};
|
| 4899 |
|
|
|
|
| 4902 |
|
| 4903 |
// backend
|
| 4904 |
|
| 4905 |
+
static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
|
| 4906 |
|
| 4907 |
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
| 4908 |
|
| 4909 |
return sycl_ctx->name.c_str();
|
| 4910 |
}
|
| 4911 |
|
| 4912 |
+
static void ggml_backend_sycl_free(ggml_backend_t backend) {
|
| 4913 |
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
| 4914 |
|
| 4915 |
delete sycl_ctx;
|
|
|
|
| 4917 |
}
|
| 4918 |
|
| 4919 |
|
| 4920 |
+
static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
|
| 4921 |
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
| 4922 |
return ggml_backend_sycl_buffer_type(sycl_ctx->device);
|
| 4923 |
}
|
| 4924 |
|
| 4925 |
+
static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
| 4926 |
ggml_tensor *tensor,
|
| 4927 |
const void *data, size_t offset,
|
| 4928 |
size_t size) try {
|
|
|
|
| 4940 |
std::exit(1);
|
| 4941 |
}
|
| 4942 |
|
| 4943 |
+
static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
| 4944 |
const ggml_tensor *tensor,
|
| 4945 |
void *data, size_t offset,
|
| 4946 |
size_t size) try {
|
|
|
|
| 4958 |
std::exit(1);
|
| 4959 |
}
|
| 4960 |
|
| 4961 |
+
static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
|
| 4962 |
+
const ggml_tensor *src,
|
| 4963 |
+
ggml_tensor *dst) try {
|
| 4964 |
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
| 4965 |
if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) {
|
| 4966 |
/*
|
|
|
|
| 4995 |
std::exit(1);
|
| 4996 |
}
|
| 4997 |
|
| 4998 |
+
static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
| 4999 |
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
| 5000 |
ggml_sycl_set_main_device(sycl_ctx->device);
|
| 5001 |
|
|
|
|
| 5023 |
return GGML_STATUS_SUCCESS;
|
| 5024 |
}
|
| 5025 |
|
| 5026 |
+
static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
| 5027 |
switch (op->op) {
|
| 5028 |
case GGML_OP_CONV_TRANSPOSE_1D:
|
| 5029 |
{
|
|
|
|
| 5170 |
UNUSED(backend);
|
| 5171 |
}
|
| 5172 |
|
| 5173 |
+
static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
| 5174 |
const int min_batch_size = 32;
|
| 5175 |
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
|
| 5176 |
GGML_UNUSED(backend);
|
| 5177 |
}
|
| 5178 |
|
| 5179 |
+
static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
| 5180 |
if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
|
| 5181 |
return false;
|
| 5182 |
}
|
|
|
|
| 5201 |
/* .supports_op = */ ggml_backend_sycl_supports_op,
|
| 5202 |
/* .supports_buft = */ ggml_backend_sycl_supports_buft,
|
| 5203 |
/* .offload_op = */ ggml_backend_sycl_offload_op,
|
|
|
|
|
|
|
| 5204 |
/* .event_record = */ NULL,
|
| 5205 |
/* .event_wait = */ NULL,
|
|
|
|
| 5206 |
};
|
| 5207 |
|
| 5208 |
static ggml_guid_t ggml_backend_sycl_guid() {
|
|
|
|
| 5210 |
return &guid;
|
| 5211 |
}
|
| 5212 |
|
| 5213 |
+
ggml_backend_t ggml_backend_sycl_init(int device) {
|
| 5214 |
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
|
| 5215 |
ggml_check_sycl();
|
| 5216 |
|
|
|
|
| 5225 |
ggml_backend_t sycl_backend = new ggml_backend {
|
| 5226 |
/* .guid = */ ggml_backend_sycl_guid(),
|
| 5227 |
/* .interface = */ ggml_backend_sycl_interface,
|
| 5228 |
+
/* .device = */ nullptr,
|
| 5229 |
/* .context = */ ctx
|
| 5230 |
};
|
| 5231 |
|
|
|
|
| 5236 |
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
|
| 5237 |
}
|
| 5238 |
|
| 5239 |
+
int ggml_backend_sycl_get_device_count() {
|
| 5240 |
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
|
| 5241 |
return ggml_sycl_info().device_count;
|
| 5242 |
}
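Apart from the new `/* .device = */ nullptr` member, the public entry points of the SYCL backend are unchanged, so existing callers keep working. A minimal usage sketch, assuming only the functions visible in this diff plus the generic `ggml_backend_name`/`ggml_backend_free` helpers from ggml-backend.h:

```cpp
// Minimal sketch (not part of the patch): bring up SYCL device 0 and confirm
// the backend identifies itself. Error handling is kept to the bare minimum.
#include "ggml-backend.h"
#include "ggml-sycl.h"
#include <cstdio>

int main() {
    if (ggml_backend_sycl_get_device_count() == 0) {
        std::printf("no SYCL devices found\n");
        return 1;
    }
    ggml_backend_t backend = ggml_backend_sycl_init(0);  // device index 0
    if (backend == NULL) {
        return 1;
    }
    std::printf("backend: %s, is_sycl: %d\n",
                ggml_backend_name(backend), ggml_backend_is_sycl(backend));
    ggml_backend_free(backend);  // ends up calling ggml_backend_sycl_free
    return 0;
}
```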
|
ggml/src/ggml-vulkan.cpp
CHANGED
|
@@ -119,11 +119,11 @@ struct ggml_backend_vk_buffer_type_context {
|
|
| 119 |
vk_device device;
|
| 120 |
};
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
|
| 128 |
/* .get_name = */ ggml_backend_vk_buffer_type_name,
|
| 129 |
/* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
|
|
@@ -622,7 +622,7 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor);
|
|
| 622 |
|
| 623 |
typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
| 624 |
|
| 625 |
-
|
| 626 |
|
| 627 |
// variables to track number of compiles in progress
|
| 628 |
static uint32_t compile_count = 0;
|
|
@@ -1953,6 +1953,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|
| 1953 |
|
| 1954 |
device->buffer_type = {
|
| 1955 |
/* .iface = */ ggml_backend_vk_buffer_type_interface,
|
|
|
|
| 1956 |
/* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
|
| 1957 |
};
|
| 1958 |
|
|
@@ -6147,13 +6148,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
|
|
| 6147 |
ctx->device->device.destroyFence(ctx->fence);
|
| 6148 |
}
|
| 6149 |
|
| 6150 |
-
|
| 6151 |
ggml_vk_instance_init();
|
| 6152 |
|
| 6153 |
return vk_instance.device_indices.size();
|
| 6154 |
}
|
| 6155 |
|
| 6156 |
-
|
| 6157 |
ggml_vk_instance_init();
|
| 6158 |
|
| 6159 |
std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
|
|
@@ -6170,36 +6171,36 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
|
|
| 6170 |
|
| 6171 |
// device backend
|
| 6172 |
|
| 6173 |
-
|
| 6174 |
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
| 6175 |
return ctx->name.c_str();
|
| 6176 |
}
|
| 6177 |
|
| 6178 |
-
|
| 6179 |
return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
|
| 6180 |
}
|
| 6181 |
|
| 6182 |
-
|
| 6183 |
VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
|
| 6184 |
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
| 6185 |
ggml_vk_destroy_buffer(ctx->dev_buffer);
|
| 6186 |
delete ctx;
|
| 6187 |
}
|
| 6188 |
|
| 6189 |
-
|
| 6190 |
return vk_ptr_base;
|
| 6191 |
|
| 6192 |
UNUSED(buffer);
|
| 6193 |
}
|
| 6194 |
|
| 6195 |
-
|
| 6196 |
VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
|
| 6197 |
if (tensor->view_src != nullptr) {
|
| 6198 |
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
|
| 6199 |
}
|
| 6200 |
}
|
| 6201 |
|
| 6202 |
-
|
| 6203 |
VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
|
| 6204 |
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
| 6205 |
vk_buffer buf = buf_ctx->dev_buffer;
|
|
@@ -6207,7 +6208,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
|
|
| 6207 |
ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
|
| 6208 |
}
|
| 6209 |
|
| 6210 |
-
|
| 6211 |
VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
|
| 6212 |
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
| 6213 |
|
|
@@ -6216,7 +6217,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
|
|
| 6216 |
ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
|
| 6217 |
}
|
| 6218 |
|
| 6219 |
-
|
| 6220 |
if (ggml_backend_buffer_is_vk(src->buffer)) {
|
| 6221 |
ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
|
| 6222 |
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
|
|
@@ -6233,7 +6234,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
|
|
| 6233 |
UNUSED(buffer);
|
| 6234 |
}
|
| 6235 |
|
| 6236 |
-
|
| 6237 |
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
| 6238 |
|
| 6239 |
ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size);
|
|
@@ -6253,13 +6254,13 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
|
|
| 6253 |
};
|
| 6254 |
|
| 6255 |
// vk buffer type
|
| 6256 |
-
|
| 6257 |
ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
|
| 6258 |
|
| 6259 |
return ctx->name.c_str();
|
| 6260 |
}
|
| 6261 |
|
| 6262 |
-
|
| 6263 |
VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
|
| 6264 |
ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
|
| 6265 |
|
|
@@ -6275,23 +6276,23 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
|
|
| 6275 |
return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size);
|
| 6276 |
}
|
| 6277 |
|
| 6278 |
-
|
| 6279 |
ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
|
| 6280 |
return ctx->device->properties.limits.minStorageBufferOffsetAlignment;
|
| 6281 |
}
|
| 6282 |
|
| 6283 |
-
|
| 6284 |
ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
|
| 6285 |
return ctx->device->max_memory_allocation_size;
|
| 6286 |
}
|
| 6287 |
|
| 6288 |
-
|
| 6289 |
return ggml_nbytes(tensor);
|
| 6290 |
|
| 6291 |
UNUSED(buft);
|
| 6292 |
}
|
| 6293 |
|
| 6294 |
-
|
| 6295 |
ggml_vk_instance_init();
|
| 6296 |
|
| 6297 |
VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");
|
|
@@ -6303,24 +6304,24 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num)
|
|
| 6303 |
|
| 6304 |
// host buffer type
|
| 6305 |
|
| 6306 |
-
|
| 6307 |
return GGML_VK_NAME "_Host";
|
| 6308 |
|
| 6309 |
UNUSED(buft);
|
| 6310 |
}
|
| 6311 |
|
| 6312 |
-
|
| 6313 |
return GGML_VK_NAME "_Host";
|
| 6314 |
|
| 6315 |
UNUSED(buffer);
|
| 6316 |
}
|
| 6317 |
|
| 6318 |
-
|
| 6319 |
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
|
| 6320 |
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
|
| 6321 |
}
|
| 6322 |
|
| 6323 |
-
|
| 6324 |
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
|
| 6325 |
|
| 6326 |
size += 32; // Behave like the CPU buffer type
|
|
@@ -6344,7 +6345,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
|
|
| 6344 |
UNUSED(buft);
|
| 6345 |
}
|
| 6346 |
|
| 6347 |
-
|
| 6348 |
return vk_instance.devices[0]->properties.limits.minMemoryMapAlignment;
|
| 6349 |
|
| 6350 |
UNUSED(buft);
|
|
@@ -6352,7 +6353,7 @@ GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_back
|
|
| 6352 |
|
| 6353 |
// Should be changed to return device-specific host buffer type
|
| 6354 |
// but that probably requires changes in llama.cpp
|
| 6355 |
-
|
| 6356 |
static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = {
|
| 6357 |
/* .iface = */ {
|
| 6358 |
/* .get_name = */ ggml_backend_vk_host_buffer_type_name,
|
|
@@ -6362,6 +6363,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
|
|
| 6362 |
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
| 6363 |
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
| 6364 |
},
|
|
|
|
| 6365 |
/* .context = */ nullptr,
|
| 6366 |
};
|
| 6367 |
|
|
@@ -6375,13 +6377,13 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
|
|
| 6375 |
|
| 6376 |
// backend
|
| 6377 |
|
| 6378 |
-
|
| 6379 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6380 |
|
| 6381 |
return ctx->name.c_str();
|
| 6382 |
}
|
| 6383 |
|
| 6384 |
-
|
| 6385 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6386 |
VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");
|
| 6387 |
|
|
@@ -6391,13 +6393,13 @@ GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
|
|
| 6391 |
delete backend;
|
| 6392 |
}
|
| 6393 |
|
| 6394 |
-
|
| 6395 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6396 |
|
| 6397 |
return &ctx->device->buffer_type;
|
| 6398 |
}
|
| 6399 |
|
| 6400 |
-
|
| 6401 |
VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
|
| 6402 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6403 |
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
|
@@ -6420,7 +6422,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
|
|
| 6420 |
ggml_vk_buffer_write_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
|
| 6421 |
}
|
| 6422 |
|
| 6423 |
-
|
| 6424 |
VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
|
| 6425 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6426 |
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
|
@@ -6443,7 +6445,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
|
|
| 6443 |
ggml_vk_buffer_read_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
|
| 6444 |
}
|
| 6445 |
|
| 6446 |
-
|
| 6447 |
VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
|
| 6448 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6449 |
if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
|
|
@@ -6471,7 +6473,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
|
|
| 6471 |
return false;
|
| 6472 |
}
|
| 6473 |
|
| 6474 |
-
|
| 6475 |
VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
|
| 6476 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6477 |
if(ctx->transfer_ctx.expired()) {
|
|
@@ -6501,7 +6503,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) {
|
|
| 6501 |
return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
|
| 6502 |
}
|
| 6503 |
|
| 6504 |
-
|
| 6505 |
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
|
| 6506 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6507 |
|
|
@@ -6564,7 +6566,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
|
|
| 6564 |
UNUSED(backend);
|
| 6565 |
}
|
| 6566 |
|
| 6567 |
-
|
| 6568 |
// ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
|
| 6569 |
|
| 6570 |
switch (op->op) {
|
|
@@ -6687,7 +6689,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|
| 6687 |
UNUSED(backend);
|
| 6688 |
}
|
| 6689 |
|
| 6690 |
-
|
| 6691 |
const int min_batch_size = 32;
|
| 6692 |
|
| 6693 |
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
|
@@ -6696,7 +6698,7 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
|
|
| 6696 |
UNUSED(backend);
|
| 6697 |
}
|
| 6698 |
|
| 6699 |
-
|
| 6700 |
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
|
| 6701 |
return false;
|
| 6702 |
}
|
|
@@ -6724,11 +6726,8 @@ static ggml_backend_i ggml_backend_vk_interface = {
|
|
| 6724 |
/* .supports_op = */ ggml_backend_vk_supports_op,
|
| 6725 |
/* .supports_buft = */ ggml_backend_vk_supports_buft,
|
| 6726 |
/* .offload_op = */ ggml_backend_vk_offload_op,
|
| 6727 |
-
/* .event_new = */ NULL,
|
| 6728 |
-
/* .event_free = */ NULL,
|
| 6729 |
/* .event_record = */ NULL,
|
| 6730 |
/* .event_wait = */ NULL,
|
| 6731 |
-
/* .event_synchronize = */ NULL,
|
| 6732 |
};
|
| 6733 |
|
| 6734 |
static ggml_guid_t ggml_backend_vk_guid() {
|
|
@@ -6736,7 +6735,7 @@ static ggml_guid_t ggml_backend_vk_guid() {
|
|
| 6736 |
return &guid;
|
| 6737 |
}
|
| 6738 |
|
| 6739 |
-
|
| 6740 |
VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
|
| 6741 |
|
| 6742 |
ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
|
|
@@ -6745,25 +6744,26 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
|
|
| 6745 |
ggml_backend_t vk_backend = new ggml_backend {
|
| 6746 |
/* .guid = */ ggml_backend_vk_guid(),
|
| 6747 |
/* .interface = */ ggml_backend_vk_interface,
|
|
|
|
| 6748 |
/* .context = */ ctx,
|
| 6749 |
};
|
| 6750 |
|
| 6751 |
return vk_backend;
|
| 6752 |
}
|
| 6753 |
|
| 6754 |
-
|
| 6755 |
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
|
| 6756 |
}
|
| 6757 |
|
| 6758 |
-
|
| 6759 |
return ggml_vk_get_device_count();
|
| 6760 |
}
|
| 6761 |
|
| 6762 |
-
|
| 6763 |
ggml_vk_get_device_description(device, description, description_size);
|
| 6764 |
}
|
| 6765 |
|
| 6766 |
-
|
| 6767 |
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
| 6768 |
|
| 6769 |
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
|
@@ -6779,27 +6779,6 @@ GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size
|
|
| 6779 |
}
|
| 6780 |
}
|
| 6781 |
|
- // backend registry
- GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, void * user_data) {
-     ggml_backend_t vk_backend = ggml_backend_vk_init((int) (intptr_t) user_data);
-     return vk_backend;
-
-     UNUSED(params);
- }
-
- extern "C" GGML_CALL int ggml_backend_vk_reg_devices();
-
- GGML_CALL int ggml_backend_vk_reg_devices() {
-     ggml_vk_instance_init();
-
-     for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
-         char name[128];
-         snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i);
-         ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i); // NOLINT
-     }
-     return vk_instance.device_indices.size();
- }
-
|
| 6803 |
// Extension availability
|
| 6804 |
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
|
| 6805 |
#ifdef GGML_VULKAN_VALIDATE
|
|
|
|
| 119 |
vk_device device;
|
| 120 |
};
|
| 121 |
|
| 122 |
+
static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
|
| 123 |
+
static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
|
| 124 |
+
static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
|
| 125 |
+
static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft);
|
| 126 |
+
static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor);
|
| 127 |
static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
|
| 128 |
/* .get_name = */ ggml_backend_vk_buffer_type_name,
|
| 129 |
/* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
|
|
|
|
| 622 |
|
| 623 |
typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
| 624 |
|
| 625 |
+
static void ggml_backend_vk_free(ggml_backend_t backend);
|
| 626 |
|
| 627 |
// variables to track number of compiles in progress
|
| 628 |
static uint32_t compile_count = 0;
|
|
|
|
| 1953 |
|
| 1954 |
device->buffer_type = {
|
| 1955 |
/* .iface = */ ggml_backend_vk_buffer_type_interface,
|
| 1956 |
+
/* .device = */ nullptr,
|
| 1957 |
/* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
|
| 1958 |
};
|
| 1959 |
|
|
|
|
| 6148 |
ctx->device->device.destroyFence(ctx->fence);
|
| 6149 |
}
|
| 6150 |
|
| 6151 |
+
static int ggml_vk_get_device_count() {
|
| 6152 |
ggml_vk_instance_init();
|
| 6153 |
|
| 6154 |
return vk_instance.device_indices.size();
|
| 6155 |
}
|
| 6156 |
|
| 6157 |
+
static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
|
| 6158 |
ggml_vk_instance_init();
|
| 6159 |
|
| 6160 |
std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
|
|
|
|
| 6171 |
|
| 6172 |
// device backend
|
| 6173 |
|
| 6174 |
+
static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 6175 |
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
| 6176 |
return ctx->name.c_str();
|
| 6177 |
}
|
| 6178 |
|
| 6179 |
+
static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
|
| 6180 |
return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
|
| 6181 |
}
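Note the identity check: a buffer is recognized as a Vulkan buffer purely by comparing the `get_name` function pointer in its interface, not by string comparison, because every buffer this backend creates is initialized with exactly that callback. The same idiom, sketched for a made-up backend (the "my_backend" names are illustrative only; the pattern is the one used above):

```cpp
// Illustrative sketch of the function-pointer ownership test.
// Backend implementations include ggml-backend-impl.h, which exposes buffer->iface.
#include "ggml-backend-impl.h"

static const char * my_backend_buffer_get_name(ggml_backend_buffer_t buffer) {
    return "MyBackend";
    GGML_UNUSED(buffer);
}

static bool my_backend_buffer_is_mine(ggml_backend_buffer_t buffer) {
    // every buffer this backend allocates carries an interface whose get_name
    // points at my_backend_buffer_get_name, so pointer equality is a cheap,
    // unambiguous ownership test; no string comparison needed
    return buffer->iface.get_name == my_backend_buffer_get_name;
}
```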
|
| 6182 |
|
| 6183 |
+
static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 6184 |
VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
|
| 6185 |
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
| 6186 |
ggml_vk_destroy_buffer(ctx->dev_buffer);
|
| 6187 |
delete ctx;
|
| 6188 |
}
|
| 6189 |
|
| 6190 |
+
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 6191 |
return vk_ptr_base;
|
| 6192 |
|
| 6193 |
UNUSED(buffer);
|
| 6194 |
}
|
| 6195 |
|
| 6196 |
+
static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
| 6197 |
VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
|
| 6198 |
if (tensor->view_src != nullptr) {
|
| 6199 |
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
|
| 6200 |
}
|
| 6201 |
}
|
| 6202 |
|
| 6203 |
+
static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 6204 |
VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
|
| 6205 |
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
| 6206 |
vk_buffer buf = buf_ctx->dev_buffer;
|
|
|
|
| 6208 |
ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
|
| 6209 |
}
|
| 6210 |
|
| 6211 |
+
static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 6212 |
VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
|
| 6213 |
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
| 6214 |
|
|
|
|
| 6217 |
ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
|
| 6218 |
}
|
| 6219 |
|
| 6220 |
+
static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
| 6221 |
if (ggml_backend_buffer_is_vk(src->buffer)) {
|
| 6222 |
ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
|
| 6223 |
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
|
|
|
|
| 6234 |
UNUSED(buffer);
|
| 6235 |
}
|
| 6236 |
|
| 6237 |
+
static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 6238 |
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
| 6239 |
|
| 6240 |
ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size);
|
|
|
|
| 6254 |
};
|
| 6255 |
|
| 6256 |
// vk buffer type
|
| 6257 |
+
static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
| 6258 |
ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
|
| 6259 |
|
| 6260 |
return ctx->name.c_str();
|
| 6261 |
}
|
| 6262 |
|
| 6263 |
+
static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 6264 |
VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
|
| 6265 |
ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
|
| 6266 |
|
|
|
|
| 6276 |
return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size);
|
| 6277 |
}
|
| 6278 |
|
| 6279 |
+
static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 6280 |
ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
|
| 6281 |
return ctx->device->properties.limits.minStorageBufferOffsetAlignment;
|
| 6282 |
}
|
| 6283 |
|
| 6284 |
+
static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
|
| 6285 |
ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
|
| 6286 |
return ctx->device->max_memory_allocation_size;
|
| 6287 |
}
|
| 6288 |
|
| 6289 |
+
static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
| 6290 |
return ggml_nbytes(tensor);
|
| 6291 |
|
| 6292 |
UNUSED(buft);
|
| 6293 |
}
|
| 6294 |
|
| 6295 |
+
ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
|
| 6296 |
ggml_vk_instance_init();
|
| 6297 |
|
| 6298 |
VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");
|
|
|
|
| 6304 |
|
| 6305 |
// host buffer type
|
| 6306 |
|
| 6307 |
+
static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
| 6308 |
return GGML_VK_NAME "_Host";
|
| 6309 |
|
| 6310 |
UNUSED(buft);
|
| 6311 |
}
|
| 6312 |
|
| 6313 |
+
static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) {
|
| 6314 |
return GGML_VK_NAME "_Host";
|
| 6315 |
|
| 6316 |
UNUSED(buffer);
|
| 6317 |
}
|
| 6318 |
|
| 6319 |
+
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 6320 |
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
|
| 6321 |
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
|
| 6322 |
}
|
| 6323 |
|
| 6324 |
+
static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 6325 |
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
|
| 6326 |
|
| 6327 |
size += 32; // Behave like the CPU buffer type
|
|
|
|
| 6345 |
UNUSED(buft);
|
| 6346 |
}
|
| 6347 |
|
| 6348 |
+
static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 6349 |
return vk_instance.devices[0]->properties.limits.minMemoryMapAlignment;
|
| 6350 |
|
| 6351 |
UNUSED(buft);
|
|
|
|
| 6353 |
|
| 6354 |
// Should be changed to return device-specific host buffer type
|
| 6355 |
// but that probably requires changes in llama.cpp
|
| 6356 |
+
ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
|
| 6357 |
static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = {
|
| 6358 |
/* .iface = */ {
|
| 6359 |
/* .get_name = */ ggml_backend_vk_host_buffer_type_name,
|
|
|
|
| 6363 |
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
| 6364 |
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
| 6365 |
},
|
| 6366 |
+
/* .device = */ nullptr,
|
| 6367 |
/* .context = */ nullptr,
|
| 6368 |
};
|
| 6369 |
|
|
|
|
| 6377 |
|
| 6378 |
// backend
|
| 6379 |
|
| 6380 |
+
static const char * ggml_backend_vk_name(ggml_backend_t backend) {
|
| 6381 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6382 |
|
| 6383 |
return ctx->name.c_str();
|
| 6384 |
}
|
| 6385 |
|
| 6386 |
+
static void ggml_backend_vk_free(ggml_backend_t backend) {
|
| 6387 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6388 |
VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");
|
| 6389 |
|
|
|
|
| 6393 |
delete backend;
|
| 6394 |
}
|
| 6395 |
|
| 6396 |
+
static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) {
|
| 6397 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6398 |
|
| 6399 |
return &ctx->device->buffer_type;
|
| 6400 |
}
|
| 6401 |
|
| 6402 |
+
static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 6403 |
VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
|
| 6404 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6405 |
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
|
|
|
| 6422 |
ggml_vk_buffer_write_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
|
| 6423 |
}
|
| 6424 |
|
| 6425 |
+
static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 6426 |
VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
|
| 6427 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6428 |
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
|
|
|
| 6445 |
ggml_vk_buffer_read_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
|
| 6446 |
}
|
| 6447 |
|
| 6448 |
+
static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
|
| 6449 |
VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
|
| 6450 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6451 |
if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
|
|
|
|
| 6473 |
return false;
|
| 6474 |
}
|
| 6475 |
|
| 6476 |
+
static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
|
| 6477 |
VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
|
| 6478 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6479 |
if(ctx->transfer_ctx.expired()) {
|
|
|
|
| 6503 |
return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
|
| 6504 |
}
|
| 6505 |
|
| 6506 |
+
static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
| 6507 |
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
|
| 6508 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 6509 |
|
|
|
|
| 6566 |
UNUSED(backend);
|
| 6567 |
}
|
| 6568 |
|
| 6569 |
+
static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
| 6570 |
// ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
|
| 6571 |
|
| 6572 |
switch (op->op) {
|
|
|
|
| 6689 |
UNUSED(backend);
|
| 6690 |
}
|
| 6691 |
|
| 6692 |
+
static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
| 6693 |
const int min_batch_size = 32;
|
| 6694 |
|
| 6695 |
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
|
|
|
| 6698 |
UNUSED(backend);
|
| 6699 |
}
|
| 6700 |
|
| 6701 |
+
static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
| 6702 |
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
|
| 6703 |
return false;
|
| 6704 |
}
|
|
|
|
| 6726 |
/* .supports_op = */ ggml_backend_vk_supports_op,
|
| 6727 |
/* .supports_buft = */ ggml_backend_vk_supports_buft,
|
| 6728 |
/* .offload_op = */ ggml_backend_vk_offload_op,
|
|
|
|
|
|
|
| 6729 |
/* .event_record = */ NULL,
|
| 6730 |
/* .event_wait = */ NULL,
|
|
|
|
| 6731 |
};
|
| 6732 |
|
| 6733 |
static ggml_guid_t ggml_backend_vk_guid() {
|
|
|
|
| 6735 |
return &guid;
|
| 6736 |
}
|
| 6737 |
|
| 6738 |
+
ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
|
| 6739 |
VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
|
| 6740 |
|
| 6741 |
ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
|
|
|
|
| 6744 |
ggml_backend_t vk_backend = new ggml_backend {
|
| 6745 |
/* .guid = */ ggml_backend_vk_guid(),
|
| 6746 |
/* .interface = */ ggml_backend_vk_interface,
|
| 6747 |
+
/* .device = */ nullptr,
|
| 6748 |
/* .context = */ ctx,
|
| 6749 |
};
|
| 6750 |
|
| 6751 |
return vk_backend;
|
| 6752 |
}
|
| 6753 |
|
| 6754 |
+
bool ggml_backend_is_vk(ggml_backend_t backend) {
|
| 6755 |
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
|
| 6756 |
}
|
| 6757 |
|
| 6758 |
+
int ggml_backend_vk_get_device_count() {
|
| 6759 |
return ggml_vk_get_device_count();
|
| 6760 |
}
|
| 6761 |
|
| 6762 |
+
void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
|
| 6763 |
ggml_vk_get_device_description(device, description, description_size);
|
| 6764 |
}
|
| 6765 |
|
| 6766 |
+
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
| 6767 |
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
| 6768 |
|
| 6769 |
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
|
|
|
| 6779 |
}
|
| 6780 |
}
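With the ad-hoc registry gone (see the removed `ggml_backend_vk_reg_devices` block above), these query functions are the way to enumerate Vulkan devices before initializing one. A minimal sketch using only functions that appear in this diff, plus `ggml_backend_free` from ggml-backend.h:

```cpp
// Minimal sketch (not part of the patch): list the available Vulkan devices
// with the query functions kept by this commit, then initialize the first one.
#include "ggml-backend.h"
#include "ggml-vulkan.h"
#include <cstdio>

int main() {
    const int n_dev = ggml_backend_vk_get_device_count();
    for (int i = 0; i < n_dev; ++i) {
        char desc[256];
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
        ggml_backend_vk_get_device_memory(i, &free_mem, &total_mem);
        std::printf("device %d: %s (%zu / %zu bytes free)\n", i, desc, free_mem, total_mem);
    }
    if (n_dev > 0) {
        ggml_backend_t backend = ggml_backend_vk_init(0);
        // ... build a ggml_cgraph and run it with ggml_backend_graph_compute(backend, graph) ...
        ggml_backend_free(backend);
    }
    return 0;
}
```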
|
| 6781 |
|
| 6782 |
// Extension availability
|
| 6783 |
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
|
| 6784 |
#ifdef GGML_VULKAN_VALIDATE
|
ggml/src/ggml.c
CHANGED
|
@@ -461,7 +461,7 @@ struct ggml_arm_arch_features_type {
|
|
| 461 |
} ggml_arm_arch_features = {-1, -1, -1, 0};
|
| 462 |
#endif
|
| 463 |
|
| 464 |
-
|
| 465 |
switch (status) {
|
| 466 |
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
|
| 467 |
case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
|
|
@@ -3384,19 +3384,19 @@ void ggml_print_objects(const struct ggml_context * ctx) {
|
|
| 3384 |
GGML_PRINT("%s: --- end ---\n", __func__);
|
| 3385 |
}
|
| 3386 |
|
| 3387 |
-
|
| 3388 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 3389 |
|
| 3390 |
return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
| 3391 |
}
|
| 3392 |
|
| 3393 |
-
|
| 3394 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 3395 |
|
| 3396 |
return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
| 3397 |
}
|
| 3398 |
|
| 3399 |
-
|
| 3400 |
size_t nbytes;
|
| 3401 |
size_t blck_size = ggml_blck_size(tensor->type);
|
| 3402 |
if (blck_size == 1) {
|
|
@@ -3419,15 +3419,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
|
|
| 3419 |
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
| 3420 |
}
|
| 3421 |
|
| 3422 |
-
|
| 3423 |
return type_traits[type].blck_size;
|
| 3424 |
}
|
| 3425 |
|
| 3426 |
-
|
| 3427 |
return type_traits[type].type_size;
|
| 3428 |
}
|
| 3429 |
|
| 3430 |
-
|
| 3431 |
assert(ne % ggml_blck_size(type) == 0);
|
| 3432 |
return ggml_type_size(type)*ne/ggml_blck_size(type);
|
| 3433 |
}
|
|
@@ -3436,15 +3436,15 @@ double ggml_type_sizef(enum ggml_type type) {
|
|
| 3436 |
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
|
| 3437 |
}
|
| 3438 |
|
| 3439 |
-
|
| 3440 |
return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
|
| 3441 |
}
|
| 3442 |
|
| 3443 |
-
|
| 3444 |
return type_traits[type].is_quantized;
|
| 3445 |
}
|
| 3446 |
|
| 3447 |
-
|
| 3448 |
return GGML_OP_NAME[op];
|
| 3449 |
}
|
| 3450 |
|
|
@@ -3456,7 +3456,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
|
|
| 3456 |
return GGML_UNARY_OP_NAME[op];
|
| 3457 |
}
|
| 3458 |
|
| 3459 |
-
|
| 3460 |
if (t->op == GGML_OP_UNARY) {
|
| 3461 |
enum ggml_unary_op uop = ggml_get_unary_op(t);
|
| 3462 |
return ggml_unary_op_name(uop);
|
|
@@ -3464,7 +3464,7 @@ GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
|
|
| 3464 |
return ggml_op_name(t->op);
|
| 3465 |
}
|
| 3466 |
|
| 3467 |
-
|
| 3468 |
return ggml_type_size(tensor->type);
|
| 3469 |
}
|
| 3470 |
|
|
@@ -3557,7 +3557,7 @@ size_t ggml_tensor_overhead(void) {
|
|
| 3557 |
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
|
| 3558 |
}
|
| 3559 |
|
| 3560 |
-
|
| 3561 |
return tensor->nb[0] > tensor->nb[1];
|
| 3562 |
}
|
| 3563 |
|
|
@@ -3583,23 +3583,23 @@ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
|
|
| 3583 |
return true;
|
| 3584 |
}
|
| 3585 |
|
| 3586 |
-
|
| 3587 |
return ggml_is_contiguous_0(tensor);
|
| 3588 |
}
|
| 3589 |
|
| 3590 |
-
|
| 3591 |
return ggml_is_contiguous_n(tensor, 0);
|
| 3592 |
}
|
| 3593 |
|
| 3594 |
-
|
| 3595 |
return ggml_is_contiguous_n(tensor, 1);
|
| 3596 |
}
|
| 3597 |
|
| 3598 |
-
|
| 3599 |
return ggml_is_contiguous_n(tensor, 2);
|
| 3600 |
}
|
| 3601 |
|
| 3602 |
-
|
| 3603 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 3604 |
|
| 3605 |
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
|
|
@@ -3614,7 +3614,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
|
|
| 3614 |
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
| 3615 |
}
|
| 3616 |
|
| 3617 |
-
|
| 3618 |
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
| 3619 |
if (tensor->ne[i] == 0) {
|
| 3620 |
// empty if any dimension has no elements
|
|
@@ -4634,7 +4634,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
|
|
| 4634 |
return (float *)(tensor->data);
|
| 4635 |
}
|
| 4636 |
|
| 4637 |
-
|
| 4638 |
GGML_ASSERT(tensor->op == GGML_OP_UNARY);
|
| 4639 |
return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
|
| 4640 |
}
|
|
@@ -12834,6 +12834,10 @@ static void ggml_compute_forward_out_prod_f32(
|
|
| 12834 |
|
| 12835 |
GGML_TENSOR_BINARY_OP_LOCALS
|
| 12836 |
|
| 12837 |
const int ith = params->ith;
|
| 12838 |
const int nth = params->nth;
|
| 12839 |
|
|
@@ -14163,7 +14167,7 @@ static void ggml_rope_cache_init(
|
|
| 14163 |
}
|
| 14164 |
}
|
| 14165 |
|
| 14166 |
-
|
| 14167 |
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
| 14168 |
) {
|
| 14169 |
// start and end correction dims
|
|
|
|
| 461 |
} ggml_arm_arch_features = {-1, -1, -1, 0};
|
| 462 |
#endif
|
| 463 |
|
| 464 |
+
const char * ggml_status_to_string(enum ggml_status status) {
|
| 465 |
switch (status) {
|
| 466 |
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
|
| 467 |
case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
|
|
|
|
| 3384 |
GGML_PRINT("%s: --- end ---\n", __func__);
|
| 3385 |
}
|
| 3386 |
|
| 3387 |
+
int64_t ggml_nelements(const struct ggml_tensor * tensor) {
|
| 3388 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 3389 |
|
| 3390 |
return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
| 3391 |
}
|
| 3392 |
|
| 3393 |
+
int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
| 3394 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 3395 |
|
| 3396 |
return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
| 3397 |
}
|
| 3398 |
|
| 3399 |
+
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
| 3400 |
size_t nbytes;
|
| 3401 |
size_t blck_size = ggml_blck_size(tensor->type);
|
| 3402 |
if (blck_size == 1) {
|
|
|
|
| 3419 |
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
| 3420 |
}
|
| 3421 |
|
| 3422 |
+
int64_t ggml_blck_size(enum ggml_type type) {
|
| 3423 |
return type_traits[type].blck_size;
|
| 3424 |
}
|
| 3425 |
|
| 3426 |
+
size_t ggml_type_size(enum ggml_type type) {
|
| 3427 |
return type_traits[type].type_size;
|
| 3428 |
}
|
| 3429 |
|
| 3430 |
+
size_t ggml_row_size(enum ggml_type type, int64_t ne) {
|
| 3431 |
assert(ne % ggml_blck_size(type) == 0);
|
| 3432 |
return ggml_type_size(type)*ne/ggml_blck_size(type);
|
| 3433 |
}
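`ggml_row_size` keeps its semantics: a row of `ne` elements occupies `type_size * ne / blck_size` bytes, so for quantized types `ne` must be a multiple of the block size (hence the assert). A short worked example; the Q4_0 block constants (32 elements per block, 18 bytes per block) are upstream ggml values quoted from memory, not taken from this diff:

```cpp
// Worked example, assuming the usual Q4_0 layout: 32-element blocks of 18 bytes.
#include "ggml.h"
#include <cstdio>

int main() {
    const int64_t ne = 4096;  // elements per row, must be a multiple of the block size

    // f32: 4 bytes per element, block size 1 -> 16384 bytes per row
    std::printf("f32  row: %zu bytes\n", ggml_row_size(GGML_TYPE_F32, ne));

    // q4_0: 18 bytes per 32-element block -> 4096 / 32 * 18 = 2304 bytes per row
    std::printf("q4_0 row: %zu bytes\n", ggml_row_size(GGML_TYPE_Q4_0, ne));
    return 0;
}
```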
|
|
|
|
| 3436 |
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
|
| 3437 |
}
|
| 3438 |
|
| 3439 |
+
const char * ggml_type_name(enum ggml_type type) {
|
| 3440 |
return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
|
| 3441 |
}
|
| 3442 |
|
| 3443 |
+
bool ggml_is_quantized(enum ggml_type type) {
|
| 3444 |
return type_traits[type].is_quantized;
|
| 3445 |
}
|
| 3446 |
|
| 3447 |
+
const char * ggml_op_name(enum ggml_op op) {
|
| 3448 |
return GGML_OP_NAME[op];
|
| 3449 |
}
|
| 3450 |
|
|
|
|
| 3456 |
return GGML_UNARY_OP_NAME[op];
|
| 3457 |
}
|
| 3458 |
|
| 3459 |
+
const char * ggml_op_desc(const struct ggml_tensor * t) {
|
| 3460 |
if (t->op == GGML_OP_UNARY) {
|
| 3461 |
enum ggml_unary_op uop = ggml_get_unary_op(t);
|
| 3462 |
return ggml_unary_op_name(uop);
|
|
|
|
| 3464 |
return ggml_op_name(t->op);
|
| 3465 |
}
|
| 3466 |
|
| 3467 |
+
size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
| 3468 |
return ggml_type_size(tensor->type);
|
| 3469 |
}
|
| 3470 |
|
|
|
|
| 3557 |
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
|
| 3558 |
}
|
| 3559 |
|
| 3560 |
+
bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
| 3561 |
return tensor->nb[0] > tensor->nb[1];
|
| 3562 |
}
|
| 3563 |
|
|
|
|
| 3583 |
return true;
|
| 3584 |
}
|
| 3585 |
|
| 3586 |
+
bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
| 3587 |
return ggml_is_contiguous_0(tensor);
|
| 3588 |
}
|
| 3589 |
|
| 3590 |
+
bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
|
| 3591 |
return ggml_is_contiguous_n(tensor, 0);
|
| 3592 |
}
|
| 3593 |
|
| 3594 |
+
bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
|
| 3595 |
return ggml_is_contiguous_n(tensor, 1);
|
| 3596 |
}
|
| 3597 |
|
| 3598 |
+
bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
|
| 3599 |
return ggml_is_contiguous_n(tensor, 2);
|
| 3600 |
}
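These layout predicates are all thin wrappers over the stride array `nb[]`: `ggml_is_contiguous_n(t, n)` accepts any layout that is densely packed from dimension `n` upwards, and `ggml_is_transposed` simply checks `nb[0] > nb[1]`. A small sketch of how a transposed view trips them, using the standard ggml context API (nothing here is specific to this patch):

```cpp
// Sketch: a transposed view swaps the first two strides, so it is reported as
// transposed and no longer contiguous.
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = { /* .mem_size   = */ 16*1024*1024,
                                       /* .mem_buffer = */ NULL,
                                       /* .no_alloc   = */ false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    struct ggml_tensor * at = ggml_transpose(ctx, a);  // view with nb[0] and nb[1] swapped

    std::printf("a : contiguous=%d transposed=%d\n", ggml_is_contiguous(a),  ggml_is_transposed(a));
    std::printf("at: contiguous=%d transposed=%d\n", ggml_is_contiguous(at), ggml_is_transposed(at));

    ggml_free(ctx);
    return 0;
}
```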
|
| 3601 |
|
| 3602 |
+
bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
| 3603 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 3604 |
|
| 3605 |
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
|
|
|
|
| 3614 |
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
| 3615 |
}
|
| 3616 |
|
| 3617 |
+
bool ggml_is_empty(const struct ggml_tensor * tensor) {
|
| 3618 |
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
| 3619 |
if (tensor->ne[i] == 0) {
|
| 3620 |
// empty if any dimension has no elements
|
|
|
|
| 4634 |
return (float *)(tensor->data);
|
| 4635 |
}
|
| 4636 |
|
| 4637 |
+
enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
|
| 4638 |
GGML_ASSERT(tensor->op == GGML_OP_UNARY);
|
| 4639 |
return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
|
| 4640 |
}
|
|
|
|
| 12834 |
|
| 12835 |
GGML_TENSOR_BINARY_OP_LOCALS
|
| 12836 |
|
| 12837 |
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
| 12838 |
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 12839 |
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
| 12840 |
+
|
| 12841 |
const int ith = params->ith;
|
| 12842 |
const int nth = params->nth;
|
| 12843 |
|
|
|
|
| 14167 |
}
|
| 14168 |
}
|
| 14169 |
|
| 14170 |
+
void ggml_rope_yarn_corr_dims(
|
| 14171 |
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
| 14172 |
) {
|
| 14173 |
// start and end correction dims
|