Diego Devesa, JohannesGaessler · committed
Commit 1bdb50a · Parent: 7d7ac98

ggml-backend : add device and backend reg interfaces (llama/9707)

ggml/include/ggml-backend.h CHANGED
@@ -12,43 +12,52 @@ extern "C" {
  typedef struct ggml_backend_event * ggml_backend_event_t;
  typedef struct ggml_backend * ggml_backend_t;
  typedef void * ggml_backend_graph_plan_t;
+ typedef struct ggml_backend_reg * ggml_backend_reg_t;
+ typedef struct ggml_backend_device * ggml_backend_dev_t;
+

  //
- // Backend buffer
+ // Backend buffer type
  //

- // buffer type
- GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
- GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
- GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
- GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
- GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
- GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
+ GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
+ GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
+ GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+ GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
+ GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+ GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
+ GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
+
+ //
+ // Backend buffer
+ //

- // buffer
  enum ggml_backend_buffer_usage {
      GGML_BACKEND_BUFFER_USAGE_ANY = 0,
      GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
      GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
  };

- GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
- GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
- GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
- GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
- GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
- GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
- GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
- GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
- GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
- GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
- GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
- GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
+ GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
+ GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
+ GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
+ GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
+ GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
+ GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+ GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
+ GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
+
+ // tensor copy between different backends
+ GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

  //
- // Backend
+ // Backend (stream)
  //

  GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
@@ -64,9 +73,9 @@ extern "C" {
  GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);

  // "offset" refers to the offset of the tensor data for setting/getting data
- GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
- GGML_API GGML_CALL void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
+ GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);

  GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

@@ -76,65 +85,121 @@ extern "C" {
  GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
  GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
  GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+ // NOTE: will be removed, use device version instead
  GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
  GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
  GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

- // tensor copy between different backends
- GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-
  // asynchronous copy
  // the copy is performed after all the currently queued operations in backend_src
  // backend_dst will wait for the copy to complete before performing other operations
  // automatic fallback to sync copy if async is not supported
  GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);

- // events
- GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
- GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
- GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
- GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
- GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
+ GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);

  //
- // CPU backend
+ // Events
  //

- GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-
- GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
- GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
- GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
- GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-
- // Create a backend buffer from an existing pointer
- GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
-
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-
- #ifdef GGML_USE_CPU_HBM
- GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
- #endif
+ GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
+ GGML_API void ggml_backend_event_free(ggml_backend_event_t event);
+ GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
+ GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
+ GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);
+
+ //
+ // Backend device
+ //
+
+ enum ggml_backend_dev_type {
+     GGML_BACKEND_DEVICE_TYPE_CPU,
+     GGML_BACKEND_DEVICE_TYPE_GPU,
+     // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
+     GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
+     GGML_BACKEND_DEVICE_TYPE_GPU_FULL
+ };
+
+ // functionality supported by the device
+ struct ggml_backend_dev_caps {
+     // asynchronous operations
+     bool async;
+     // pinned host buffer
+     bool host_buffer;
+     // event synchronization
+     bool events;
+ };
+
+ // all the device properties
+ struct ggml_backend_dev_props {
+     const char * name;
+     const char * description;
+     size_t memory_free;
+     size_t memory_total;
+     enum ggml_backend_dev_type type;
+     struct ggml_backend_dev_caps caps;
+ };
+
+ GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
+ GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device);
+ GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
+ GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
+ GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
+ GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
+ GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
+ GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
+
+ GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+ GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
+ GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+
+ //
+ // Backend (reg)
+ //
+
+ GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg);
+ GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
+ GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
+ GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
+ GGML_API void ggml_backend_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data);
+
+ // Functions that may be obtained using ggml_backend_reg_get_proc_address
+ typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);

  //
  // Backend registry
  //

- // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
+ // Backend (reg) enumeration
+ GGML_API size_t ggml_backend_reg_count(void);
+ GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
+ GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);
+
+ // Device enumeration
+ GGML_API size_t ggml_backend_dev_count(void);
+ GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
+ GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
+ GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
+
+ // Set the log callback for all registered backends
+ GGML_API void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data);

- GGML_API size_t ggml_backend_reg_get_count(void);
- GGML_API size_t ggml_backend_reg_find_by_name(const char * name); // returns index of backend with name, or SIZE_MAX if not found
- GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
- GGML_API const char * ggml_backend_reg_get_name(size_t i);
- GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
- GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
- GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
+ // Direct backend (stream) initialization
+ // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
+ GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
+ // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
+ GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
+ // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
+ GGML_API ggml_backend_t ggml_backend_init_best(void);

  //
  // Backend scheduler
  //

- // The backend scheduler allows for multiple backends to be used together
+ // The backend scheduler allows for multiple backend devices to be used together
  // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
  // The backends are selected based on:
  // - the backend that supports the operation
@@ -169,9 +234,9 @@ extern "C" {
  }
  */

- struct ggml_backend_sched;
  typedef struct ggml_backend_sched * ggml_backend_sched_t;

+ // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
  // when ask == true, the scheduler wants to know if the user wants to observe this node
  // this allows the scheduler to batch nodes together in order to evaluate them in a single call
  //
@@ -226,7 +291,7 @@ extern "C" {
  GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
  GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);

- typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+ typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

  // Compare the output of two backends
  GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
@@ -235,6 +300,26 @@ extern "C" {
  GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
  GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);

+ //
+ // CPU backend
+ //
+
+ GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+
+ GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
+ GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
+ GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
+ GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
+
+ // Create a backend buffer from an existing pointer
+ GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+
+ GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
+
+ #ifdef GGML_USE_CPU_HBM
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+ #endif

  #ifdef __cplusplus
  }
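
The declarations above define the new device and registry entry points. As an orientation only, here is a minimal usage sketch in C; it is not part of this commit. It enumerates the registered devices, prints their properties, and initializes a backend stream through the new generic path. It assumes only the functions declared in this header, plus the pre-existing ggml_backend_free().

// Hypothetical usage sketch (not part of the diff): enumerate devices via the
// new registry interfaces and initialize a backend with the generic path.
#include <stdio.h>
#include "ggml-backend.h"

int main(void) {
    // list all devices known to the registry
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        printf("device %zu: %s (%s), %zu/%zu bytes free, async=%d host_buffer=%d events=%d\n",
               i, props.name, props.description,
               props.memory_free, props.memory_total,
               props.caps.async, props.caps.host_buffer, props.caps.events);
    }

    // initialize the "best" full-capability backend
    // (per the header comment: GPU_FULL if available, otherwise CPU_FULL)
    ggml_backend_t backend = ggml_backend_init_best();
    if (backend == NULL) {
        fprintf(stderr, "no usable backend\n");
        return 1;
    }

    // the device a stream belongs to can be queried back from the backend
    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
    printf("initialized backend on device: %s\n", ggml_backend_dev_name(dev));

    ggml_backend_free(backend); // pre-existing API, not shown in this diff
    return 0;
}

As the header comments note, ggml_backend_init_by_name() and ggml_backend_init_by_type() are shorthands for ggml_backend_dev_init() applied to the result of the corresponding device lookup.
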
ggml/include/ggml-blas.h CHANGED
@@ -9,13 +9,13 @@ extern "C" {
  #endif

  // backend API
- GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
+ GGML_API ggml_backend_t ggml_backend_blas_init(void);

- GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
+ GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);

  // number of threads used for conversion to float
  // for openblas and blis, this will also set the number of threads used for blas operations
- GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
+ GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);

  #ifdef __cplusplus
ggml/include/ggml-cann.h CHANGED
@@ -44,7 +44,7 @@ extern "C" {
   * @param device The index of the device to initialize.
   * @return A pointer to the initialized backend instance, or nullptr on failure.
   */
- GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device);
+ GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);

  /**
   * @brief Checks if a given backend is a CANN backend.
@@ -55,7 +55,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device);
   * @param backend The backend instance to check.
   * @return True if the backend is a CANN backend, false otherwise.
   */
- GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend);
+ GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);

  /**
   * @brief Retrieves the CANN buffer type for a specified device.
@@ -67,7 +67,7 @@ GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend);
   * @return A pointer to the buffer type interface for the specified device, or
   *         nullptr if the device index is out of range.
   */
- GGML_API GGML_CALL ggml_backend_buffer_type_t
+ GGML_API ggml_backend_buffer_type_t
  ggml_backend_cann_buffer_type(int32_t device);

  /**
@@ -78,14 +78,14 @@ ggml_backend_cann_buffer_type(int32_t device);
   *
   * @return The number of CANN devices available.
   */
- GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
+ GGML_API int32_t ggml_backend_cann_get_device_count(void);

  /**
   * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
   *
   * @return A pointer to the host buffer type interface.
   */
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);

  /**
   * @brief Retrieves the description of a specific CANN device.
@@ -97,7 +97,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type
   * @param description Pointer to a buffer where the description will be written.
   * @param description_size Size of the description buffer.
   */
- GGML_API GGML_CALL void ggml_backend_cann_get_device_description(
+ GGML_API void ggml_backend_cann_get_device_description(
      int32_t device, char* description, size_t description_size);

  /**
@@ -112,9 +112,9 @@ GGML_API GGML_CALL void ggml_backend_cann_get_device_description(
   * @param total Pointer to a variable where the total memory size will be
   *              stored.
   */
- GGML_API GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device,
-                                                             size_t* free,
-                                                             size_t* total);
+ GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
+                                                   size_t* free,
+                                                   size_t* total);

  /**
   * @brief Set the logging callback for GGML.
ggml/include/ggml-cuda.h CHANGED
@@ -3,6 +3,10 @@
  #include "ggml.h"
  #include "ggml-backend.h"

+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
  #ifdef GGML_USE_HIPBLAS
  #define GGML_CUDA_NAME "ROCm"
  #define GGML_CUBLAS_NAME "hipBLAS"
@@ -13,35 +17,33 @@
  #define GGML_CUDA_NAME "CUDA"
  #define GGML_CUBLAS_NAME "cuBLAS"
  #endif
-
- #ifdef __cplusplus
- extern "C" {
- #endif
-
  #define GGML_CUDA_MAX_DEVICES 16

  // backend API
- GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
+ GGML_API ggml_backend_t ggml_backend_cuda_init(int device);

- GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
+ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);

  // device buffer
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

  // split tensor buffer that splits matrices by rows across multiple devices
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);

  // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

- GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
- GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
- GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+ GGML_API int ggml_backend_cuda_get_device_count(void);
+ GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+ GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

- GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
- GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
+ GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+ GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);

  GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
+ GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
+
  #ifdef __cplusplus
  }
  #endif
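
The per-device query functions declared above pair naturally with the backend initializer. A small sketch follows; it is illustrative only (init_last_cuda_device is a hypothetical helper, and error handling is minimal), using only functions from ggml-cuda.h as shown in this diff.

// Hypothetical sketch: query the available CUDA devices and create a backend
// for the last one.
#include <stdio.h>
#include "ggml-cuda.h"

ggml_backend_t init_last_cuda_device(void) {
    int count = ggml_backend_cuda_get_device_count();
    if (count == 0) {
        return NULL;
    }
    int device = count - 1;

    char desc[128];
    size_t free_mem = 0, total_mem = 0;
    ggml_backend_cuda_get_device_description(device, desc, sizeof(desc));
    ggml_backend_cuda_get_device_memory(device, &free_mem, &total_mem);
    printf("CUDA%d: %s, %zu/%zu bytes free\n", device, desc, free_mem, total_mem);

    // may return NULL on failure
    return ggml_backend_cuda_init(device);
}
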
ggml/include/ggml-metal.h CHANGED
@@ -1,3 +1,5 @@
+ // Note: this description is outdated
+ //
  // An interface allowing to compute ggml_cgraph with Metal
  //
  // This is a fully functional interface that extends ggml with GPU support for Apple devices.
@@ -43,11 +45,11 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);

  GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

- GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+ GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);

  GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);

- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

  // helper to check if the device supports a specific family
  // ideally, the user code should be doing these checks
ggml/include/ggml-rpc.h CHANGED
@@ -10,14 +10,14 @@ extern "C" {
  #define GGML_RPC_MAX_SERVERS 16

  // backend API
- GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
- GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
+ GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+ GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);

- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);

- GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+ GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);

- GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+ GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);

  #ifdef __cplusplus
  }
ggml/include/ggml-sycl.h CHANGED
@@ -23,20 +23,20 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
  GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);

  // split tensor buffer that splits matrices by rows across multiple devices
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);

  // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
  GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

- GGML_API void ggml_backend_sycl_print_sycl_devices(void);
- GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
- GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
- GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
- GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+ GGML_API void ggml_backend_sycl_print_sycl_devices(void);
+ GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len);
+ GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
+ GGML_API int ggml_backend_sycl_get_device_count();
+ GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);

  // SYCL doesn't support registering host memory, keep here for reference
- // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
- // GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
+ // GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
+ // GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
  #ifdef __cplusplus
  }
  #endif
ggml/include/ggml-vulkan.h CHANGED
@@ -13,16 +13,16 @@ extern "C" {
  GGML_API void ggml_vk_instance_init(void);

  // backend API
- GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
+ GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);

- GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
- GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void);
- GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
- GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
+ GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
+ GGML_API int ggml_backend_vk_get_device_count(void);
+ GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
+ GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);

- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
  // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);

  #ifdef __cplusplus
  }
ggml/include/ggml.h CHANGED
@@ -187,16 +187,6 @@
  # define GGML_API
  #endif

- #ifdef GGML_MULTIPLATFORM
- #   if defined(_WIN32)
- #       define GGML_CALL
- #   else
- #       define GGML_CALL __attribute__((__ms_abi__))
- #   endif
- #else
- #   define GGML_CALL
- #endif
-
  // TODO: support for clang
  #ifdef __GNUC__
  #   define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
@@ -340,7 +330,7 @@ extern "C" {
  };

  // get ggml_status name string
- GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
+ GGML_API const char * ggml_status_to_string(enum ggml_status status);

  // ieee 754-2008 half-precision float16
  // todo: make this not an integral type
@@ -717,46 +707,46 @@ extern "C" {
  GGML_API void ggml_print_object (const struct ggml_object * obj);
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);

- GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
- GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
- GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
- GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+ GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
+ GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
+ GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN

- GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type);
- GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
- GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+ GGML_API int64_t ggml_blck_size(enum ggml_type type);
+ GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
+ GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row

  GGML_DEPRECATED(
      GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
      "use ggml_row_size() instead");

- GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
- GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
- GGML_API const char * ggml_op_symbol(enum ggml_op op);
+ GGML_API const char * ggml_type_name(enum ggml_type type);
+ GGML_API const char * ggml_op_name (enum ggml_op op);
+ GGML_API const char * ggml_op_symbol(enum ggml_op op);

- GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
- GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+ GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+ GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name

- GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
+ GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

- GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
+ GGML_API bool ggml_is_quantized(enum ggml_type type);

  // TODO: temporary until model loading of ggml examples is refactored
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

- GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
- GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
- GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
- GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
+ GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars

- GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor);
- GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
- GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
- GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
+ GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+ GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+ GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2

  GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
  GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@@ -848,7 +838,7 @@ extern "C" {
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

- GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+ GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);

  GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
  GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
@@ -1568,7 +1558,7 @@ extern "C" {
      "use ggml_rope_ext_inplace instead");

  // compute correction dims for YaRN RoPE scaling
- GGML_CALL void ggml_rope_yarn_corr_dims(
+ void ggml_rope_yarn_corr_dims(
      int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);

  // rotary position embedding backward, i.e compute dx from dy
ggml/src/CMakeLists.txt CHANGED
@@ -1325,7 +1325,7 @@ add_library(ggml
      ../include/ggml-backend.h
      ggml.c
      ggml-alloc.c
-     ggml-backend.c
+     ggml-backend.cpp
      ggml-quants.c
      ggml-quants.h
      ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
ggml/src/ggml-backend-impl.h CHANGED
@@ -9,145 +9,229 @@ extern "C" {
9
  #endif
10
 
11
  //
12
- // Backend buffer
13
  //
14
 
15
- // buffer type
16
- typedef void * ggml_backend_buffer_type_context_t;
17
-
18
  struct ggml_backend_buffer_type_i {
19
- const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
20
  // allocate a buffer of this type
21
- ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
22
  // tensor alignment
23
- size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
24
- // max buffer size that can be allocated
25
- size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
26
- // data size needed to allocate the tensor, including padding
27
- size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
28
- // check if tensor data is in host memory
29
- bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
30
  };
31
 
32
  struct ggml_backend_buffer_type {
33
  struct ggml_backend_buffer_type_i iface;
34
- ggml_backend_buffer_type_context_t context;
 
35
  };
36
 
37
- // buffer
38
- typedef void * ggml_backend_buffer_context_t;
 
39
 
40
  struct ggml_backend_buffer_i {
41
- const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
42
- void (*GGML_CALL free_buffer) (ggml_backend_buffer_t buffer);
43
- void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
44
- void (*GGML_CALL init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
45
- void (*GGML_CALL memset_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
46
- void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
47
- void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
48
- bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
49
- void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
50
- void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
 
 
 
 
 
 
 
51
  };
52
 
53
  struct ggml_backend_buffer {
54
  struct ggml_backend_buffer_i iface;
55
  ggml_backend_buffer_type_t buft;
56
- ggml_backend_buffer_context_t context;
57
  size_t size;
58
  enum ggml_backend_buffer_usage usage;
59
  };
60
 
61
- GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
62
- ggml_backend_buffer_type_t buft,
63
- struct ggml_backend_buffer_i iface,
64
- ggml_backend_buffer_context_t context,
65
- size_t size);
66
 
67
  // do not use directly, use ggml_backend_tensor_copy instead
68
  bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
69
 
 
70
  // buffer that contains a collection of buffers
71
- GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
72
- GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
73
- GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
74
 
75
  //
76
- // Backend
77
  //
78
 
79
- typedef void * ggml_backend_context_t;
80
-
81
  struct ggml_backend_i {
82
- const char * (*GGML_CALL get_name)(ggml_backend_t backend);
83
 
84
- void (*GGML_CALL free)(ggml_backend_t backend);
85
 
86
  // buffer allocation
87
- ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
88
 
89
  // (optional) asynchronous tensor data access
90
- void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
91
- void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
92
- bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
93
 
94
  // (optional) complete all pending operations
95
- void (*GGML_CALL synchronize)(ggml_backend_t backend);
96
 
97
- // compute graph with a plan (not used currently)
98
- // create a new plan for a graph
99
- ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
100
- void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
101
  // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
102
- void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
103
  // compute the graph with the plan
104
- enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
 
 
105
 
106
- // compute graph without a plan (async)
107
- enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
108
 
 
109
  // check if the backend can compute an operation
110
- bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
111
 
112
  // check if the backend can use tensors allocated in a buffer type
113
- bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
114
 
115
  // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
116
  // these should be expensive operations with large batch sizes that may benefit from running on this backend
117
  // even if the weight has to be copied from the CPU temporarily
118
- bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
119
 
120
  // (optional) event synchronization
121
- // create a new event that can record events on this backend instance
122
- ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
123
- void (*GGML_CALL event_free) (ggml_backend_event_t event);
124
- // record an event on the backend instance that created it
125
- void (*GGML_CALL event_record) (ggml_backend_event_t event);
126
- // wait for an event on on a different backend instance
127
- void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
128
- // block until an event is recorded
129
- void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
130
  };
131
 
132
  struct ggml_backend {
133
  ggml_guid_t guid;
134
-
135
  struct ggml_backend_i iface;
136
- ggml_backend_context_t context;
 
137
  };
138
 
139
  struct ggml_backend_event {
140
- ggml_backend_t backend;
141
  void * context;
142
  };
143
 
144
  //
145
- // Backend registry
146
  //
147
 
148
- typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
- GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
 
 
 
 
151
 
152
  #ifdef __cplusplus
153
  }
 
9
  #endif
10
 
11
  //
12
+ // Backend buffer type
13
  //
14
 
 
 
 
15
  struct ggml_backend_buffer_type_i {
16
+ const char * (*get_name) (ggml_backend_buffer_type_t buft);
17
  // allocate a buffer of this type
18
+ ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
19
  // tensor alignment
20
+ size_t (*get_alignment) (ggml_backend_buffer_type_t buft);
21
+ // (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
22
+ size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
23
+ // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
24
+ size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
25
+ // (optional) check if tensor data is in host memory (defaults to false)
26
+ bool (*is_host) (ggml_backend_buffer_type_t buft);
27
  };
28
 
29
  struct ggml_backend_buffer_type {
30
  struct ggml_backend_buffer_type_i iface;
31
+ ggml_backend_dev_t device;
32
+ void * context;
33
  };
34
 
35
+ //
36
+ // Backend buffer
37
+ //
38
 
39
  struct ggml_backend_buffer_i {
40
+ const char * (*get_name) (ggml_backend_buffer_t buffer);
41
+ // (optional) free the buffer
42
+ void (*free_buffer) (ggml_backend_buffer_t buffer);
43
+ // base address of the buffer
44
+ void * (*get_base) (ggml_backend_buffer_t buffer);
45
+ // (optional) initialize a tensor in the buffer (eg. add tensor extras)
46
+ void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
47
+ // tensor data access
48
+ void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
49
+ void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
50
+ void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
51
+ // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
52
+ bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
53
+ // clear the entire buffer
54
+ void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
55
+ // (optional) reset any internal state due to tensor initialization, such as tensor extras
56
+ void (*reset) (ggml_backend_buffer_t buffer);
57
  };
58
 
59
  struct ggml_backend_buffer {
60
  struct ggml_backend_buffer_i iface;
61
  ggml_backend_buffer_type_t buft;
62
+ void * context;
63
  size_t size;
64
  enum ggml_backend_buffer_usage usage;
65
  };
66
 
67
+ ggml_backend_buffer_t ggml_backend_buffer_init(
68
+ ggml_backend_buffer_type_t buft,
69
+ struct ggml_backend_buffer_i iface,
70
+ void * context,
71
+ size_t size);
72
 
73
  // do not use directly, use ggml_backend_tensor_copy instead
74
  bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
75
 
76
+ // multi-buffer
77
  // buffer that contains a collection of buffers
78
+ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
79
+ bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
80
+ void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
81
 
82
  //
83
+ // Backend (stream)
84
  //
85
 
 
 
86
  struct ggml_backend_i {
87
+ const char * (*get_name)(ggml_backend_t backend);
88
 
89
+ void (*free)(ggml_backend_t backend);
90
 
91
  // buffer allocation
92
+ ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
93
 
94
  // (optional) asynchronous tensor data access
95
+ void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
96
+ void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
97
+ bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
98
 
99
  // (optional) complete all pending operations
100
+ void (*synchronize)(ggml_backend_t backend);
101
 
102
+ // (optional) compute graph with a plan (not used currently)
103
+ ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
104
+ void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
105
  // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
106
+ void (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
107
  // compute the graph with the plan
108
+ enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
109
+
110
+ // compute graph (always async if supported by the backend)
111
+ enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
112
 
113
+ // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
114
+ // new backends should implement the device interface instead
115
 
116
+ // These functions are being moved to the device interface
117
  // check if the backend can compute an operation
118
+ bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
119
 
120
  // check if the backend can use tensors allocated in a buffer type
121
+ bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
122
 
123
  // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
124
  // these should be expensive operations with large batch sizes that may benefit from running on this backend
125
  // even if the weight has to be copied from the CPU temporarily
126
+ bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
127
 
128
  // (optional) event synchronization
129
+ // record an event on this stream
130
+ void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
131
+ // wait for an event recorded on a different stream
132
+ void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
 
 
 
 
 
133
  };
134
 
135
  struct ggml_backend {
136
  ggml_guid_t guid;
 
137
  struct ggml_backend_i iface;
138
+ ggml_backend_dev_t device;
139
+ void * context;
140
  };
141
 
142
  struct ggml_backend_event {
143
+ struct ggml_backend_device * device;
144
  void * context;
145
  };
146
 
147
  //
148
+ // Backend device
149
  //
150
 
151
+ // Note: if additional properties are needed, we should add a struct with all of them
152
+ // the current functions to obtain the properties can remain, since they are more convenient for frequently used properties
153
+ struct ggml_backend_device_i {
154
+ // device name: short identifier for this device, such as "CPU" or "CUDA0"
155
+ const char * (*get_name)(ggml_backend_dev_t dev);
156
+
157
+ // device description: short informative description of the device, could be the model name
158
+ const char * (*get_description)(ggml_backend_dev_t dev);
159
+
160
+ // device memory in bytes
161
+ void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
162
+
163
+ // device type
164
+ enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
165
+
166
+ // device properties
167
+ void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
168
+
169
+ // backend (stream) initialization
170
+ ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params);
171
+
172
+ // preferred buffer type
173
+ ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev);
174
+
175
+ // (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
176
+ ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev);
177
+
178
+ // (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
179
+ ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);
180
+
181
+ // check if the backend can compute an operation
182
+ bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
183
+
184
+ // check if the backend can use tensors allocated in a buffer type
185
+ bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
186
+
187
+ // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
188
+ // these should be expensive operations with large batch sizes that may benefit from running on this backend
189
+ // even if the weight has to be copied from the CPU temporarily
190
+ bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
191
+
192
+ // (optional) event synchronization
193
+ ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
194
+ void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
195
+ void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
196
+ };
197
+
198
+ struct ggml_backend_device {
199
+ struct ggml_backend_device_i iface;
200
+ ggml_backend_reg_t reg;
201
+ void * context;
202
+ };
203
+
204
+ //
205
+ // Backend (reg)
206
+ //
207
+
208
+ struct ggml_backend_reg_i {
209
+ const char * (*get_name)(ggml_backend_reg_t reg);
210
+
211
+ // enumerate available devices
212
+ size_t (*get_device_count)(ggml_backend_reg_t reg);
213
+ ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index);
214
+
215
+ // (optional) get a pointer to a function in the backend
216
+ // backends can add custom functions that are not part of the standard ggml-backend interface
217
+ void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
218
+
219
+ // (optional) set the log callback for the backend
220
+ void (*set_log_callback)(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data);
221
+ };
222
+
223
+ struct ggml_backend_reg {
224
+ // int api_version; // TODO: for dynamic loading
225
+ struct ggml_backend_reg_i iface;
226
+ void * context;
227
+ };
228
+
229
 
230
+ // Internal backend registry API
231
+ void ggml_backend_register(ggml_backend_reg_t reg);
232
+ void ggml_backend_device_register(ggml_backend_dev_t device);
233
+ // TODO: backends can be loaded as dynamic libraries, in which case they need to export this function
234
+ // typedef ggml_backend_register_t * (*ggml_backend_init)(void);
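
These entry points are how a backend makes itself visible to the registry implemented in ggml-backend.cpp; registering a backend also registers every device it enumerates. A hedged sketch of how a hypothetical out-of-tree backend could hook in (my_backend_reg() is assumed, not part of this commit):

    // hypothetical: register a backend (and, transitively, its devices) once at startup
    static void my_backend_register_once(void) {
        static bool registered = false;
        if (!registered) {
            ggml_backend_register(my_backend_reg());
            registered = true;
        }
    }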
235
 
236
  #ifdef __cplusplus
237
  }
ggml/src/ggml-backend.cpp ADDED
@@ -0,0 +1,2496 @@
1
+ // Note: porting this file to C++ is a work in progress
2
+
3
+ #include "ggml-backend-impl.h"
4
+ #include "ggml-alloc.h"
5
+ #include "ggml-impl.h"
6
+
7
+ #include <assert.h>
8
+ #include <limits.h>
9
+ #include <stdarg.h>
10
+ #include <stdio.h>
11
+ #include <stdlib.h>
12
+ #include <string.h>
13
+
14
+ #include <vector>
15
+
16
+ // backend buffer type
17
+
18
+ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
19
+ return buft->iface.get_name(buft);
20
+ }
21
+
22
+ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
23
+ return buft->iface.alloc_buffer(buft, size);
24
+ }
25
+
26
+ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
27
+ return buft->iface.get_alignment(buft);
28
+ }
29
+
30
+ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
31
+ // get_max_size is optional, defaults to SIZE_MAX
32
+ if (buft->iface.get_max_size) {
33
+ return buft->iface.get_max_size(buft);
34
+ }
35
+ return SIZE_MAX;
36
+ }
37
+
38
+ size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
39
+ // get_alloc_size is optional, defaults to ggml_nbytes
40
+ if (buft->iface.get_alloc_size) {
41
+ size_t size = buft->iface.get_alloc_size(buft, tensor);
42
+ assert(size >= ggml_nbytes(tensor));
43
+ return size;
44
+ }
45
+ return ggml_nbytes(tensor);
46
+ }
47
+
48
+ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
49
+ if (buft->iface.is_host) {
50
+ return buft->iface.is_host(buft);
51
+ }
52
+ return false;
53
+ }
54
+
55
+ ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
56
+ return buft->device;
57
+ }
58
+
59
+ // backend buffer
60
+
61
+ ggml_backend_buffer_t ggml_backend_buffer_init(
62
+ ggml_backend_buffer_type_t buft,
63
+ struct ggml_backend_buffer_i iface,
64
+ void * context,
65
+ size_t size) {
66
+ ggml_backend_buffer_t buffer = new ggml_backend_buffer {
67
+ /* .interface = */ iface,
68
+ /* .buft = */ buft,
69
+ /* .context = */ context,
70
+ /* .size = */ size,
71
+ /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
72
+ };
73
+
74
+ return buffer;
75
+ }
76
+
77
+ const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
78
+ return buffer->iface.get_name(buffer);
79
+ }
80
+
81
+ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
82
+ if (buffer == NULL) {
83
+ return;
84
+ }
85
+
86
+ if (buffer->iface.free_buffer != NULL) {
87
+ buffer->iface.free_buffer(buffer);
88
+ }
89
+ delete buffer;
90
+ }
91
+
92
+ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
93
+ return buffer->size;
94
+ }
95
+
96
+ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
97
+ void * base = buffer->iface.get_base(buffer);
98
+
99
+ GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
100
+
101
+ return base;
102
+ }
103
+
104
+ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
105
+ // init_tensor is optional
106
+ if (buffer->iface.init_tensor) {
107
+ buffer->iface.init_tensor(buffer, tensor);
108
+ }
109
+ }
110
+
111
+ size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
112
+ return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
113
+ }
114
+
115
+ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
116
+ return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
117
+ }
118
+
119
+ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
120
+ return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
121
+ }
122
+
123
+ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
124
+ buffer->iface.clear(buffer, value);
125
+ }
126
+
127
+ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
128
+ return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
129
+ }
130
+
131
+ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
132
+ buffer->usage = usage;
133
+
134
+ // FIXME: add a generic callback to the buffer interface
135
+ if (ggml_backend_buffer_is_multi_buffer(buffer)) {
136
+ ggml_backend_multi_buffer_set_usage(buffer, usage);
137
+ }
138
+ }
139
+
140
+ enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
141
+ return buffer->usage;
142
+ }
143
+
144
+ ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
145
+ return buffer->buft;
146
+ }
147
+
148
+ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
149
+ if (buffer->iface.reset) {
150
+ buffer->iface.reset(buffer);
151
+ }
152
+ }
153
+
154
+ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
155
+ ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
156
+ if (dst_buf->iface.cpy_tensor) {
157
+ return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
158
+ }
159
+ return false;
160
+ }
161
+
162
+ // backend
163
+
164
+ ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
165
+ if (backend == NULL) {
166
+ return NULL;
167
+ }
168
+ return backend->guid;
169
+ }
170
+
171
+ const char * ggml_backend_name(ggml_backend_t backend) {
172
+ if (backend == NULL) {
173
+ return "NULL";
174
+ }
175
+ return backend->iface.get_name(backend);
176
+ }
177
+
178
+ void ggml_backend_free(ggml_backend_t backend) {
179
+ if (backend == NULL) {
180
+ return;
181
+ }
182
+
183
+ backend->iface.free(backend);
184
+ }
185
+
186
+ ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
187
+ return backend->iface.get_default_buffer_type(backend);
188
+ }
189
+
190
+ ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
191
+ return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
192
+ }
193
+
194
+ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
195
+ return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
196
+ }
197
+
198
+ size_t ggml_backend_get_max_size(ggml_backend_t backend) {
199
+ return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
200
+ }
201
+
202
+ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
203
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
204
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
205
+
206
+ if (backend->iface.set_tensor_async == NULL) {
207
+ ggml_backend_tensor_set(tensor, data, offset, size);
208
+ } else {
209
+ backend->iface.set_tensor_async(backend, tensor, data, offset, size);
210
+ }
211
+ }
212
+
213
+ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
214
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
215
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
216
+
217
+ if (backend->iface.get_tensor_async == NULL) {
218
+ ggml_backend_tensor_get(tensor, data, offset, size);
219
+ } else {
220
+ backend->iface.get_tensor_async(backend, tensor, data, offset, size);
221
+ }
222
+ }
223
+
224
+ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
225
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
226
+
227
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
228
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
229
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
230
+
231
+ if (!size) {
232
+ return;
233
+ }
234
+
235
+ buf->iface.set_tensor(buf, tensor, data, offset, size);
236
+ }
237
+
238
+ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
239
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
240
+
241
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
242
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
243
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
244
+
245
+ if (!size) {
246
+ return;
247
+ }
248
+
249
+ buf->iface.get_tensor(buf, tensor, data, offset, size);
250
+ }
251
+
252
+ GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
253
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
254
+
255
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
256
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
257
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
258
+
259
+ if (!size) {
260
+ return;
261
+ }
262
+
263
+ GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
264
+
265
+ buf->iface.memset_tensor(buf, tensor, value, offset, size);
266
+ }
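
In client code these entry points are what uploads weights or inputs once the tensors have been allocated in a backend buffer. A small usage sketch (the tensor t is assumed to be allocated and at least sizeof(data) bytes large):

    // hypothetical usage: move data between host memory and a backend tensor
    float data[16] = {0};
    ggml_backend_tensor_set(t, data, 0, sizeof(data));    // host -> backend buffer
    ggml_backend_tensor_get(t, data, 0, sizeof(data));    // backend buffer -> host
    ggml_backend_tensor_memset(t, 0, 0, ggml_nbytes(t));  // requires memset_tensor in the buffer iface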
267
+
268
+ void ggml_backend_synchronize(ggml_backend_t backend) {
269
+ if (backend->iface.synchronize == NULL) {
270
+ return;
271
+ }
272
+
273
+ backend->iface.synchronize(backend);
274
+ }
275
+
276
+ ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
277
+ GGML_ASSERT(backend->iface.graph_plan_create != NULL);
278
+
279
+ return backend->iface.graph_plan_create(backend, cgraph);
280
+ }
281
+
282
+ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
283
+ GGML_ASSERT(backend->iface.graph_plan_free != NULL);
284
+
285
+ backend->iface.graph_plan_free(backend, plan);
286
+ }
287
+
288
+ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
289
+ GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
290
+
291
+ return backend->iface.graph_plan_compute(backend, plan);
292
+ }
293
+
294
+ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
295
+ enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
296
+ ggml_backend_synchronize(backend);
297
+ return err;
298
+ }
299
+
300
+ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
301
+ return backend->iface.graph_compute(backend, cgraph);
302
+ }
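
A hedged sketch of the intended call pattern (building the graph with the usual ggml graph API is assumed and omitted):

    // hypothetical usage: synchronous graph evaluation on an already-initialized backend
    // struct ggml_cgraph * gf = ...; // built with ggml_new_graph / ggml_build_forward_expand
    enum ggml_status status = ggml_backend_graph_compute(backend, gf);
    if (status != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "%s: graph compute failed\n", __func__);
    }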
303
+
304
+ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
305
+ // helper to ease transition to device interface
306
+ if (backend->device) {
307
+ return ggml_backend_dev_supports_op(backend->device, op);
308
+ }
309
+
310
+ return backend->iface.supports_op(backend, op);
311
+ }
312
+
313
+ bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
314
+ // helper to ease transition to device interface
315
+ if (backend->device) {
316
+ return ggml_backend_dev_supports_buft(backend->device, buft);
317
+ }
318
+
319
+ return backend->iface.supports_buft(backend, buft);
320
+ }
321
+
322
+ bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
323
+ // helper to ease transition to device interface
324
+ if (backend->device) {
325
+ return ggml_backend_dev_offload_op(backend->device, op);
326
+ }
327
+
328
+ if (backend->iface.offload_op != NULL) {
329
+ return backend->iface.offload_op(backend, op);
330
+ }
331
+ return false;
332
+ }
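
These helpers are what the scheduler consults when assigning graph nodes; a hedged sketch of a manual check before dispatching a node (gpu_backend, cpu_backend and node are assumed to exist in the caller):

    // hypothetical usage: fall back to the CPU backend for unsupported ops
    ggml_backend_t target = ggml_backend_supports_op(gpu_backend, node) ? gpu_backend : cpu_backend;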
333
+
334
+ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
335
+ return backend->device;
336
+ }
337
+
338
+ // backend copy
339
+
340
+ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
341
+ if (a->type != b->type) {
342
+ return false;
343
+ }
344
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
345
+ if (a->ne[i] != b->ne[i]) {
346
+ return false;
347
+ }
348
+ if (a->nb[i] != b->nb[i]) {
349
+ return false;
350
+ }
351
+ }
352
+ return true;
353
+ }
354
+
355
+ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
356
+ GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
357
+
358
+ if (src == dst) {
359
+ return;
360
+ }
361
+
362
+ if (ggml_backend_buffer_is_host(src->buffer)) {
363
+ ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
364
+ } else if (ggml_backend_buffer_is_host(dst->buffer)) {
365
+ ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
366
+ } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
367
+ #ifndef NDEBUG
368
+ fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
369
+ #endif
370
+ size_t nbytes = ggml_nbytes(src);
371
+ void * data = malloc(nbytes);
372
+ ggml_backend_tensor_get(src, data, 0, nbytes);
373
+ ggml_backend_tensor_set(dst, data, 0, nbytes);
374
+ free(data);
375
+ }
376
+ }
377
+
378
+ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
379
+ GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
380
+
381
+ if (src == dst) {
382
+ return;
383
+ }
384
+
385
+ if (backend_dst->iface.cpy_tensor_async != NULL) {
386
+ if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
387
+ return;
388
+ }
389
+ }
390
+
391
+ // an async copy would normally happen after all the queued operations on both backends are completed
392
+ // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
393
+ ggml_backend_synchronize(backend_src);
394
+ ggml_backend_synchronize(backend_dst);
395
+ ggml_backend_tensor_copy(src, dst);
396
+ }
397
+
398
+ // events
399
+
400
+ ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
401
+ // null device is allowed for the transition period to the device interface
402
+ if (device == NULL || device->iface.event_new == NULL) {
403
+ return NULL;
404
+ }
405
+ return device->iface.event_new(device);
406
+ }
407
+
408
+ void ggml_backend_event_free(ggml_backend_event_t event) {
409
+ if (event == NULL) {
410
+ return;
411
+ }
412
+ event->device->iface.event_free(event->device, event);
413
+ }
414
+
415
+ void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
416
+ GGML_ASSERT(backend->iface.event_record != NULL);
417
+
418
+ backend->iface.event_record(backend, event);
419
+ }
420
+
421
+ void ggml_backend_event_synchronize(ggml_backend_event_t event) {
422
+ GGML_ASSERT(event->device->iface.event_synchronize);
423
+
424
+ event->device->iface.event_synchronize(event->device, event);
425
+ }
426
+
427
+ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
428
+ GGML_ASSERT(backend->iface.event_wait != NULL);
429
+
430
+ backend->iface.event_wait(backend, event);
431
+ }
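
Events are the primitive used for cross-stream ordering (e.g. by the scheduler for pipeline parallelism): record on the producing backend, wait on the consuming one. A hedged sketch, assuming backend_a and backend_b were created on devices that implement the event callbacks:

    // hypothetical usage: make backend_b wait for work already queued on backend_a
    ggml_backend_event_t ev = ggml_backend_event_new(ggml_backend_get_device(backend_a));
    if (ev != NULL) {
        ggml_backend_event_record(ev, backend_a);  // record after queuing work on backend_a
        ggml_backend_event_wait(backend_b, ev);    // backend_b's queue waits for the event
        ggml_backend_event_synchronize(ev);        // block the host until the event has completed
        ggml_backend_event_free(ev);
    }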
432
+
433
+ // Backend device
434
+
435
+ const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
436
+ return device->iface.get_name(device);
437
+ }
438
+
439
+ const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
440
+ return device->iface.get_description(device);
441
+ }
442
+
443
+ void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
444
+ device->iface.get_memory(device, free, total);
445
+ }
446
+
447
+ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
448
+ return device->iface.get_type(device);
449
+ }
450
+
451
+ void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
452
+ device->iface.get_props(device, props);
453
+ }
454
+
455
+ ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
456
+ return device->reg;
457
+ }
458
+
459
+ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
460
+ return device->iface.init_backend(device, params);
461
+ }
462
+
463
+ ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
464
+ return device->iface.get_buffer_type(device);
465
+ }
466
+
467
+ ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
468
+ return device->iface.get_host_buffer_type(device);
469
+ }
470
+
471
+ ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
472
+ return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
473
+ }
474
+
475
+ bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
476
+ return device->iface.supports_op(device, op);
477
+ }
478
+
479
+ bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
480
+ return device->iface.supports_buft(device, buft);
481
+ }
482
+
483
+ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
484
+ return device->iface.offload_op(device, op);
485
+ }
486
+
487
+ // Backend (reg)
488
+
489
+ const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
490
+ return reg->iface.get_name(reg);
491
+ }
492
+
493
+ size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
494
+ return reg->iface.get_device_count(reg);
495
+ }
496
+
497
+ ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
498
+ return reg->iface.get_device(reg, index);
499
+ }
500
+
501
+ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
502
+ if (!reg->iface.get_proc_address) {
503
+ return NULL;
504
+ }
505
+ return reg->iface.get_proc_address(reg, name);
506
+ }
507
+
508
+ void ggml_backend_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) {
509
+ if (reg->iface.set_log_callback) {
510
+ reg->iface.set_log_callback(reg, log_callback, user_data);
511
+ }
512
+ }
513
+
514
+ // Backend registry
515
+
516
+ #ifdef GGML_USE_CUDA
517
+ #include "ggml-cuda.h"
518
+ #endif
519
+
520
+ struct ggml_backend_registry {
521
+ std::vector<ggml_backend_reg_t> backends;
522
+ std::vector<ggml_backend_dev_t> devices;
523
+
524
+ ggml_backend_registry() {
525
+ #ifdef GGML_USE_CUDA
526
+ register_backend(ggml_backend_cuda_reg());
527
+ #endif
528
+
529
+ register_backend(ggml_backend_cpu_reg());
530
+
531
+ // TODO: sycl, metal, vulkan, kompute, cann
532
+ }
533
+
534
+ void register_backend(ggml_backend_reg_t reg) {
535
+ #ifndef NDEBUG
536
+ fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
537
+ __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
538
+ #endif
539
+ backends.push_back(reg);
540
+ for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
541
+ register_device(ggml_backend_reg_dev_get(reg, i));
542
+ }
543
+ }
544
+
545
+ void register_device(ggml_backend_dev_t device) {
546
+ #ifndef NDEBUG
547
+ fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
548
+ #endif
549
+ devices.push_back(device);
550
+ }
551
+ };
552
+
553
+ static ggml_backend_registry & get_reg() {
554
+ static ggml_backend_registry reg;
555
+ return reg;
556
+ }
557
+
558
+ // Internal API
559
+ void ggml_backend_register(ggml_backend_reg_t reg) {
560
+ get_reg().register_backend(reg);
561
+ }
562
+
563
+ void ggml_backend_device_register(ggml_backend_dev_t device) {
564
+ get_reg().register_device(device);
565
+ }
566
+
567
+ // Backend (reg) enumeration
568
+ size_t ggml_backend_reg_count() {
569
+ return get_reg().backends.size();
570
+ }
571
+
572
+ ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
573
+ GGML_ASSERT(index < ggml_backend_reg_count());
574
+ return get_reg().backends[index];
575
+ }
576
+
577
+ ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
578
+ for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
579
+ ggml_backend_reg_t reg = ggml_backend_reg_get(i);
580
+ if (strcmp(ggml_backend_reg_name(reg), name) == 0) {
581
+ return reg;
582
+ }
583
+ }
584
+ return NULL;
585
+ }
586
+
587
+ // Device enumeration
588
+ size_t ggml_backend_dev_count() {
589
+ return get_reg().devices.size();
590
+ }
591
+
592
+ ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
593
+ GGML_ASSERT(index < ggml_backend_dev_count());
594
+ return get_reg().devices[index];
595
+ }
596
+
597
+ ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
598
+ for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
599
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
600
+ if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
601
+ return dev;
602
+ }
603
+ }
604
+ return NULL;
605
+ }
606
+
607
+ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
608
+ for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
609
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
610
+ if (ggml_backend_dev_type(dev) == type) {
611
+ return dev;
612
+ }
613
+ }
614
+ return NULL;
615
+ }
616
+
617
+ void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data) {
618
+ for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
619
+ ggml_backend_reg_t reg = ggml_backend_reg_get(i);
620
+ ggml_backend_reg_set_log_callback(reg, log_callback, user_data);
621
+ }
622
+ }
623
+
624
+ // Convenience functions
625
+ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
626
+ ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
627
+ if (!dev) {
628
+ return NULL;
629
+ }
630
+ return ggml_backend_dev_init(dev, params);
631
+ }
632
+
633
+ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
634
+ ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
635
+ if (!dev) {
636
+ return NULL;
637
+ }
638
+ return ggml_backend_dev_init(dev, params);
639
+ }
640
+
641
+ ggml_backend_t ggml_backend_init_best(void) {
642
+ ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
643
+ if (!dev) {
644
+ dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
645
+ }
646
+ if (!dev) {
647
+ return NULL;
648
+ }
649
+ return ggml_backend_dev_init(dev, NULL);
650
+ }
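
Taken together, the enumeration and convenience functions above form the typical application-side bootstrap. A hedged usage sketch:

    // hypothetical usage: list the registered devices, then initialize the "best" backend
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        size_t mem_free, mem_total;
        ggml_backend_dev_memory(dev, &mem_free, &mem_total);
        printf("device %zu: %s (%s), %zu/%zu bytes free\n", i,
            ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), mem_free, mem_total);
    }
    ggml_backend_t backend = ggml_backend_init_best();
    if (backend == NULL) {
        fprintf(stderr, "no usable backend found\n");
    }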
651
+
652
+ // backend CPU
653
+
654
+ static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
655
+
656
+ static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
657
+ return "CPU";
658
+
659
+ GGML_UNUSED(buffer);
660
+ }
661
+
662
+ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
663
+ uintptr_t data = (uintptr_t)buffer->context;
664
+
665
+ // align the buffer
666
+ if (data % TENSOR_ALIGNMENT != 0) {
667
+ data = GGML_PAD(data, TENSOR_ALIGNMENT);
668
+ }
669
+
670
+ return (void *)data;
671
+ }
672
+
673
+ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
674
+ free(buffer->context);
675
+ }
676
+
677
+ static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
678
+ memset((char *)tensor->data + offset, value, size);
679
+
680
+ GGML_UNUSED(buffer);
681
+ }
682
+
683
+ static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
684
+ memcpy((char *)tensor->data + offset, data, size);
685
+
686
+ GGML_UNUSED(buffer);
687
+ }
688
+
689
+ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
690
+ memcpy(data, (const char *)tensor->data + offset, size);
691
+
692
+ GGML_UNUSED(buffer);
693
+ }
694
+
695
+ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
696
+ if (ggml_backend_buffer_is_host(src->buffer)) {
697
+ memcpy(dst->data, src->data, ggml_nbytes(src));
698
+ return true;
699
+ }
700
+ return false;
701
+
702
+ GGML_UNUSED(buffer);
703
+ }
704
+
705
+ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
706
+ memset(buffer->context, value, buffer->size);
707
+ }
708
+
709
+ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
710
+ /* .get_name = */ ggml_backend_cpu_buffer_get_name,
711
+ /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
712
+ /* .get_base = */ ggml_backend_cpu_buffer_get_base,
713
+ /* .init_tensor = */ NULL, // no initialization required
714
+ /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
715
+ /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
716
+ /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
717
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
718
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
719
+ /* .reset = */ NULL,
720
+ };
721
+
722
+ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
723
+ /* .get_name = */ ggml_backend_cpu_buffer_get_name,
724
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
725
+ /* .get_base = */ ggml_backend_cpu_buffer_get_base,
726
+ /* .init_tensor = */ NULL, // no initialization required
727
+ /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
728
+ /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
729
+ /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
730
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
731
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
732
+ /* .reset = */ NULL,
733
+ };
734
+
735
+ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
736
+ return "CPU";
737
+
738
+ GGML_UNUSED(buft);
739
+ }
740
+
741
+ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
742
+ size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
743
+ void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
744
+ if (data == NULL) {
745
+ fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
746
+ return NULL;
747
+ }
748
+
749
+ return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
750
+ }
751
+
752
+ static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
753
+ return TENSOR_ALIGNMENT;
754
+
755
+ GGML_UNUSED(buft);
756
+ }
757
+
758
+ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
759
+ return true;
760
+
761
+ GGML_UNUSED(buft);
762
+ }
763
+
764
+ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
765
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
766
+ /* .iface = */ {
767
+ /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
768
+ /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
769
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
770
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
771
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
772
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
773
+ },
774
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
775
+ /* .context = */ NULL,
776
+ };
777
+
778
+ return &ggml_backend_cpu_buffer_type;
779
+ }
780
+
781
+ #ifdef GGML_USE_CPU_HBM
782
+
783
+ // buffer type HBM
784
+
785
+ #include <hbwmalloc.h>
786
+
787
+ static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
788
+ return "CPU_HBM";
789
+
790
+ GGML_UNUSED(buft);
791
+ }
792
+
793
+ static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
794
+ return "CPU_HBM";
795
+
796
+ GGML_UNUSED(buf);
797
+ }
798
+
799
+ static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
800
+ hbw_free(buffer->context);
801
+ }
802
+
803
+ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
804
+ //void * ptr = hbw_malloc(size);
805
+ void * ptr;
806
+ int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
807
+ if (result != 0) {
808
+ fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
809
+ return NULL;
810
+ }
811
+
812
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
813
+ buffer->buft = buft;
814
+ buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
815
+ buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
816
+
817
+ return buffer;
818
+ }
819
+
820
+ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
821
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
822
+ /* .iface = */ {
823
+ /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
824
+ /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
825
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
826
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
827
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
828
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
829
+ },
830
+ /* .context = */ NULL,
831
+ };
832
+
833
+ return &ggml_backend_cpu_buffer_type_hbm;
834
+ }
835
+ #endif
836
+
837
+ struct ggml_backend_cpu_context {
838
+ int n_threads;
839
+ ggml_threadpool_t threadpool;
840
+
841
+ uint8_t * work_data;
842
+ size_t work_size;
843
+
844
+ ggml_abort_callback abort_callback;
845
+ void * abort_callback_data;
846
+ };
847
+
848
+ static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
849
+ return "CPU";
850
+
851
+ GGML_UNUSED(backend);
852
+ }
853
+
854
+ static void ggml_backend_cpu_free(ggml_backend_t backend) {
855
+ struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
856
+ delete[] cpu_ctx->work_data;
857
+ delete cpu_ctx;
858
+ delete backend;
859
+ }
860
+
861
+ static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
862
+ return ggml_backend_cpu_buffer_type();
863
+
864
+ GGML_UNUSED(backend);
865
+ }
866
+
867
+ struct ggml_backend_plan_cpu {
868
+ struct ggml_cplan cplan;
869
+ struct ggml_cgraph cgraph;
870
+ };
871
+
872
+ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
873
+ struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
874
+
875
+ struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
876
+
877
+ cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
878
+ cpu_plan->cgraph = *cgraph; // FIXME: deep copy
879
+
880
+ if (cpu_plan->cplan.work_size > 0) {
881
+ cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
882
+ if (cpu_plan->cplan.work_data == NULL) {
883
+ delete cpu_plan;
884
+ return NULL;
885
+ }
886
+ }
887
+
888
+ cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
889
+ cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
890
+
891
+ return cpu_plan;
892
+ }
893
+
894
+ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
895
+ struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
896
+
897
+ delete[] cpu_plan->cplan.work_data;
898
+ delete cpu_plan;
899
+
900
+ GGML_UNUSED(backend);
901
+ }
902
+
903
+ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
904
+ struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
905
+
906
+ return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
907
+
908
+ GGML_UNUSED(backend);
909
+ }
910
+
911
+ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
912
+ struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
913
+
914
+ struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
915
+
916
+ if (cpu_ctx->work_size < cplan.work_size) {
917
+ delete[] cpu_ctx->work_data;
918
+ cpu_ctx->work_data = new uint8_t[cplan.work_size];
919
+ if (cpu_ctx->work_data == NULL) {
920
+ cpu_ctx->work_size = 0;
921
+ return GGML_STATUS_ALLOC_FAILED;
922
+ }
923
+ cpu_ctx->work_size = cplan.work_size;
924
+ }
925
+ cplan.work_data = (uint8_t *)cpu_ctx->work_data;
926
+
927
+ cplan.abort_callback = cpu_ctx->abort_callback;
928
+ cplan.abort_callback_data = cpu_ctx->abort_callback_data;
929
+
930
+ return ggml_graph_compute(cgraph, &cplan);
931
+ }
932
+
933
+ static const struct ggml_backend_i ggml_backend_cpu_i = {
934
+ /* .get_name = */ ggml_backend_cpu_get_name,
935
+ /* .free = */ ggml_backend_cpu_free,
936
+ /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
937
+ /* .set_tensor_async = */ NULL,
938
+ /* .get_tensor_async = */ NULL,
939
+ /* .cpy_tensor_async = */ NULL,
940
+ /* .synchronize = */ NULL,
941
+ /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
942
+ /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
943
+ /* .graph_plan_update = */ NULL,
944
+ /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
945
+ /* .graph_compute = */ ggml_backend_cpu_graph_compute,
946
+ /* .supports_op = */ NULL,
947
+ /* .supports_buft = */ NULL,
948
+ /* .offload_op = */ NULL,
949
+ /* .event_record = */ NULL,
950
+ /* .event_wait = */ NULL,
951
+ };
952
+
953
+ static ggml_guid_t ggml_backend_cpu_guid(void) {
954
+ static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
955
+ return &guid;
956
+ }
957
+
958
+ ggml_backend_t ggml_backend_cpu_init(void) {
959
+ struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
960
+ if (ctx == NULL) {
961
+ return NULL;
962
+ }
963
+
964
+ ctx->n_threads = GGML_DEFAULT_N_THREADS;
965
+ ctx->threadpool = NULL;
966
+ ctx->work_data = NULL;
967
+ ctx->work_size = 0;
968
+ ctx->abort_callback = NULL;
969
+ ctx->abort_callback_data = NULL;
970
+
971
+ ggml_backend_t cpu_backend = new ggml_backend {
972
+ /* .guid = */ ggml_backend_cpu_guid(),
973
+ /* .interface = */ ggml_backend_cpu_i,
974
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
975
+ /* .context = */ ctx,
976
+ };
977
+
978
+ if (cpu_backend == NULL) {
979
+ delete ctx;
980
+ return NULL;
981
+ }
982
+
983
+ return cpu_backend;
984
+ }
985
+
986
+ bool ggml_backend_is_cpu(ggml_backend_t backend) {
987
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
988
+ }
989
+
990
+ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
991
+ GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
992
+
993
+ struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
994
+ ctx->n_threads = n_threads;
995
+ }
996
+
997
+ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
998
+ GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
999
+
1000
+ struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
1001
+
1002
+ if (ctx->threadpool && ctx->threadpool != threadpool) {
1003
+ // already had a different threadpool, pause/suspend it before switching
1004
+ ggml_threadpool_pause(ctx->threadpool);
1005
+ }
1006
+ ctx->threadpool = threadpool;
1007
+ }
1008
+
1009
+ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
1010
+ GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
1011
+
1012
+ struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
1013
+ ctx->abort_callback = abort_callback;
1014
+ ctx->abort_callback_data = abort_callback_data;
1015
+ }
1016
+
1017
+ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
1018
+ GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
1019
+ return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
1020
+ }
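
A short usage sketch of the CPU backend entry points above (the thread count is an arbitrary example value):

    // hypothetical usage: create and configure a CPU backend stream
    ggml_backend_t cpu = ggml_backend_cpu_init();
    ggml_backend_cpu_set_n_threads(cpu, 8);
    // ... allocate buffers, build and compute graphs ...
    ggml_backend_free(cpu);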
1021
+
1022
+ ////////////////////////
1023
+
1024
+ static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
1025
+ return "CPU";
1026
+
1027
+ GGML_UNUSED(dev);
1028
+ }
1029
+
1030
+ static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
1031
+ // TODO
1032
+ return "CPU";
1033
+
1034
+ GGML_UNUSED(dev);
1035
+ }
1036
+
1037
+ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
1038
+ // TODO
1039
+ *free = 0;
1040
+ *total = 0;
1041
+
1042
+ GGML_UNUSED(dev);
1043
+ }
1044
+
1045
+ static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
1046
+ return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
1047
+
1048
+ GGML_UNUSED(dev);
1049
+ }
1050
+
1051
+ static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
1052
+ props->name = ggml_backend_cpu_device_get_name(dev);
1053
+ props->description = ggml_backend_cpu_device_get_description(dev);
1054
+ props->type = ggml_backend_cpu_device_get_type(dev);
1055
+ ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
1056
+ props->caps = {
1057
+ /* async */ false,
1058
+ /* host_buffer */ false,
1059
+ /* events */ false,
1060
+ };
1061
+ }
1062
+
1063
+ static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) {
1064
+ return ggml_backend_cpu_init();
1065
+
1066
+ GGML_UNUSED(dev);
1067
+ GGML_UNUSED(params);
1068
+ }
1069
+
1070
+ static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
1071
+ return ggml_backend_cpu_buffer_type();
1072
+
1073
+ GGML_UNUSED(dev);
1074
+ }
1075
+
1076
+ static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
1077
+ return ggml_backend_cpu_buffer_from_ptr(ptr, size);
1078
+
1079
+ GGML_UNUSED(dev);
1080
+ GGML_UNUSED(max_tensor_size);
1081
+ }
1082
+
1083
+ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
1084
+ switch (op->op) {
1085
+ case GGML_OP_CPY:
1086
+ return
1087
+ op->type != GGML_TYPE_IQ2_XXS &&
1088
+ op->type != GGML_TYPE_IQ2_XS &&
1089
+ op->type != GGML_TYPE_IQ1_S &&
1090
+ op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
1091
+ case GGML_OP_MUL_MAT:
1092
+ return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
1093
+ case GGML_OP_ROPE_BACK:
1094
+ return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
1095
+ case GGML_OP_IM2COL_BACK:
1096
+ return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
1097
+ case GGML_OP_OUT_PROD:
1098
+ return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
1099
+ default:
1100
+ return true;
1101
+ }
1102
+
1103
+ GGML_UNUSED(dev);
1104
+ }
1105
+
1106
+ static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
1107
+ return ggml_backend_buft_is_host(buft);
1108
+
1109
+ GGML_UNUSED(dev);
1110
+ }
1111
+
1112
+ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
1113
+ /* .get_name = */ ggml_backend_cpu_device_get_name,
1114
+ /* .get_description = */ ggml_backend_cpu_device_get_description,
1115
+ /* .get_memory = */ ggml_backend_cpu_device_get_memory,
1116
+ /* .get_type = */ ggml_backend_cpu_device_get_type,
1117
+ /* .get_props = */ ggml_backend_cpu_device_get_props,
1118
+ /* .init_backend = */ ggml_backend_cpu_device_init,
1119
+ /* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
1120
+ /* .get_host_buffer_type = */ NULL,
1121
+ /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr,
1122
+ /* .supports_op = */ ggml_backend_cpu_device_supports_op,
1123
+ /* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
1124
+ /* .offload_op = */ NULL,
1125
+ /* .event_new = */ NULL,
1126
+ /* .event_free = */ NULL,
1127
+ /* .event_synchronize = */ NULL,
1128
+ };
1129
+
1130
+ ////////////////////////
1131
+
1132
+ static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
1133
+ return "CPU";
1134
+
1135
+ GGML_UNUSED(reg);
1136
+ }
1137
+
1138
+ static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
1139
+ return 1;
1140
+
1141
+ GGML_UNUSED(reg);
1142
+ }
1143
+
1144
+ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
1145
+ GGML_ASSERT(index == 0);
1146
+
1147
+ static ggml_backend_device ggml_backend_cpu_device = {
1148
+ /* .iface = */ ggml_backend_cpu_device_i,
1149
+ /* .reg = */ reg,
1150
+ /* .context = */ NULL,
1151
+ };
1152
+
1153
+ return &ggml_backend_cpu_device;
1154
+
1155
+ GGML_UNUSED(reg);
1156
+ GGML_UNUSED(index);
1157
+ }
1158
+
1159
+ static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
1160
+ /* .get_name = */ ggml_backend_cpu_reg_get_name,
1161
+ /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
1162
+ /* .get_device = */ ggml_backend_cpu_reg_get_device,
1163
+ /* .get_proc_address = */ NULL,
1164
+ /* .set_log_callback = */ NULL,
1165
+ };
1166
+
1167
+ ggml_backend_reg_t ggml_backend_cpu_reg(void) {
1168
+ static struct ggml_backend_reg ggml_backend_cpu_reg = {
1169
+ /* .iface = */ ggml_backend_cpu_reg_i,
1170
+ /* .context = */ NULL,
1171
+ };
1172
+
1173
+ return &ggml_backend_cpu_reg;
1174
+ }
1175
+
1176
+ // multi-buffer buffer
1177
+
1178
+ struct ggml_backend_multi_buffer_context {
1179
+ ggml_backend_buffer_t * buffers;
1180
+ size_t n_buffers;
1181
+ };
1182
+
1183
+ static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
1184
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
1185
+
1186
+ return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
1187
+ }
1188
+
1189
+ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1190
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
1191
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
1192
+ ggml_backend_buffer_free(ctx->buffers[i]);
1193
+ }
1194
+
1195
+ free(ctx->buffers);
1196
+ free(ctx);
1197
+ }
1198
+
1199
+ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
1200
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
1201
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
1202
+ ggml_backend_buffer_clear(ctx->buffers[i], value);
1203
+ }
1204
+ }
1205
+
1206
+ static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
1207
+ /* .get_name = */ ggml_backend_multi_buffer_get_name,
1208
+ /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
1209
+ /* .get_base = */ NULL,
1210
+ /* .init_tensor = */ NULL,
1211
+ /* .memset_tensor = */ NULL,
1212
+ /* .set_tensor = */ NULL,
1213
+ /* .get_tensor = */ NULL,
1214
+ /* .cpy_tensor = */ NULL,
1215
+ /* .clear = */ ggml_backend_multi_buffer_clear,
1216
+ /* .reset = */ NULL,
1217
+ };
1218
+
1219
+ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
1220
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context));
1221
+ ctx->n_buffers = n_buffers;
1222
+ ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
1223
+
1224
+ GGML_ASSERT(ctx->buffers != NULL);
1225
+
1226
+ size_t total_size = 0;
1227
+ for (size_t i = 0; i < n_buffers; i++) {
1228
+ ctx->buffers[i] = buffers[i];
1229
+ total_size += ggml_backend_buffer_get_size(buffers[i]);
1230
+ }
1231
+
1232
+ return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size);
1233
+ }
1234
+
1235
+ bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
1236
+ return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
1237
+ }
1238
+
1239
+ void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
1240
+ GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
1241
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
1242
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
1243
+ ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
1244
+ }
1245
+ }
1246
+
1247
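The multi-buffer above is a thin wrapper that forwards clear, free, and usage changes to its children. A minimal usage sketch (not part of this change), assuming two buffers a and b that were allocated elsewhere:

    // illustration only: group two existing buffers and mark both as weight storage
    static ggml_backend_buffer_t example_make_weight_multi_buffer(ggml_backend_buffer_t a, ggml_backend_buffer_t b) {
        ggml_backend_buffer_t parts[2] = { a, b };
        ggml_backend_buffer_t multi = ggml_backend_multi_buffer_alloc_buffer(parts, 2);
        GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(multi));
        ggml_backend_multi_buffer_set_usage(multi, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); // forwarded to a and b
        // note: freeing `multi` later also frees `a` and `b` (see ggml_backend_multi_buffer_free_buffer above)
        return multi;
    }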
+ // creates a copy of the tensor with the same memory layout
1248
+ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
1249
+ struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
1250
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
1251
+ dup->nb[i] = tensor->nb[i];
1252
+ }
1253
+ return dup;
1254
+ }
1255
+
1256
+ static bool ggml_is_view_op(enum ggml_op op) {
1257
+ return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
1258
+ }
1259
+
1260
+ // scheduler
1261
+
1262
+ #ifndef GGML_SCHED_MAX_BACKENDS
1263
+ #define GGML_SCHED_MAX_BACKENDS 16
1264
+ #endif
1265
+
1266
+ #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
1267
+ #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
1268
+ #endif
1269
+
1270
+ #ifndef GGML_SCHED_MAX_COPIES
1271
+ #define GGML_SCHED_MAX_COPIES 4
1272
+ #endif
1273
+
1274
+ struct ggml_backend_sched_split {
1275
+ int backend_id;
1276
+ int i_start;
1277
+ int i_end;
1278
+ struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
1279
+ int n_inputs;
1280
+ // graph view of this split
1281
+ struct ggml_cgraph graph;
1282
+ };
1283
+
1284
+ struct ggml_backend_sched {
1285
+ bool is_reset; // true if the scheduler has been reset since the last graph split
1286
+ bool is_alloc;
1287
+
1288
+ int n_backends;
1289
+
1290
+ ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
1291
+ ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
1292
+ ggml_gallocr_t galloc;
1293
+
1294
+ // hash map of the nodes in the graph
1295
+ struct ggml_hash_set hash_set;
1296
+ int * hv_tensor_backend_ids; // [hash_set.size]
1297
+ struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
1298
+
1299
+ int * node_backend_ids; // [graph_size]
1300
+ int * leaf_backend_ids; // [graph_size]
1301
+
1302
+ int * prev_node_backend_ids; // [graph_size]
1303
+ int * prev_leaf_backend_ids; // [graph_size]
1304
+
1305
+ // copy of the graph with modified inputs
1306
+ struct ggml_cgraph graph;
1307
+
1308
+ // graph splits
1309
+ struct ggml_backend_sched_split * splits;
1310
+ int n_splits;
1311
+ int splits_capacity;
1312
+
1313
+ // pipeline parallelism support
1314
+ int n_copies;
1315
+ int cur_copy;
1316
+ ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
1317
+ struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
1318
+ int n_graph_inputs;
1319
+
1320
+ struct ggml_context * ctx;
1321
+
1322
+ ggml_backend_sched_eval_callback callback_eval;
1323
+ void * callback_eval_user_data;
1324
+
1325
+ char * context_buffer;
1326
+ size_t context_buffer_size;
1327
+
1328
+ bool debug;
1329
+ };
1330
+
1331
+ #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
1332
+ #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
1333
+ #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
1334
+ #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
1335
+
1336
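As a worked example of the flat [hash_set.size][n_backends][n_copies] layout used by tensor_id_copy above: with n_backends = 2 and n_copies = 4, the copy slot for hash id 3, backend 1, copy 2 is hv_tensor_copies[3*2*4 + 1*4 + 2] = hv_tensor_copies[30].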
+ // returns the priority of the backend; a lower id means a higher priority
1337
+ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
1338
+ for (int i = 0; i < sched->n_backends; i++) {
1339
+ if (sched->backends[i] == backend) {
1340
+ return i;
1341
+ }
1342
+ }
1343
+ return -1;
1344
+ }
1345
+
1346
+ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
1347
+ ggml_backend_buffer_t buffer = tensor->buffer;
1348
+ if (buffer == NULL) {
1349
+ return -1;
1350
+ }
1351
+
1352
+ // find highest prio backend that supports the buffer type and the op
1353
+ for (int i = 0; i < sched->n_backends; i++) {
1354
+ if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
1355
+ ggml_backend_supports_op(sched->backends[i], op)) {
1356
+ return i;
1357
+ }
1358
+ }
1359
+
1360
+ #ifndef NDEBUG
1361
+ fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
1362
+ __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
1363
+ #endif
1364
+
1365
+ return -1;
1366
+ }
1367
+
1368
+ #if 0
1369
+ #define GGML_SCHED_MAX_SPLITS_DEBUG 4096
1370
+ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1371
+ #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
1372
+ #define GET_CAUSE(node) causes[hash_id(node)]
1373
+ #else
1374
+ #define SET_CAUSE(node, ...)
1375
+ #define GET_CAUSE(node) ""
1376
+ #endif
1377
+
1378
+ // returns the backend that should be used for the node based on the current locations
1379
+ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
1380
+ // TODO: use supports_op to check if the backend supports the op
1381
+
1382
+ // assign pre-allocated nodes to their backend
1383
+ int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
1384
+ if (cur_backend_id != -1) {
1385
+ SET_CAUSE(tensor, "1.dst");
1386
+ return cur_backend_id;
1387
+ }
1388
+
1389
+ // view_src
1390
+ if (tensor->view_src != NULL) {
1391
+ cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
1392
+ if (cur_backend_id != -1) {
1393
+ SET_CAUSE(tensor, "1.vsrc");
1394
+ return cur_backend_id;
1395
+ }
1396
+ }
1397
+
1398
+ if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
1399
+ // since the tensor is pre-allocated, it cannot be moved to another backend
1400
+ GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
1401
+ }
1402
+
1403
+ // graph input
1404
+ if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
1405
+ cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
1406
+ SET_CAUSE(tensor, "1.inp");
1407
+ return cur_backend_id;
1408
+ }
1409
+
1410
+ // operations with weights are preferably run on the same backend as the weights
1411
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
1412
+ const struct ggml_tensor * src = tensor->src[i];
1413
+ if (src == NULL) {
1414
+ continue;
1415
+ }
1416
+ if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1417
+ int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
1418
+ // check if a backend with higher prio wants to offload the op
1419
+ if (src_backend_id == sched->n_backends - 1) {
1420
+ for (int b = 0; b < src_backend_id; b++) {
1421
+ if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
1422
+ SET_CAUSE(tensor, "1.off");
1423
+ return b;
1424
+ }
1425
+ }
1426
+ }
1427
+ SET_CAUSE(tensor, "1.wgt%d", i);
1428
+ return src_backend_id;
1429
+ }
1430
+ }
1431
+
1432
+ return -1;
1433
+ }
1434
+
1435
+ static char * fmt_size(size_t size) {
1436
+ static char buffer[128];
1437
+ if (size >= 1024*1024) {
1438
+ snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
1439
+ } else {
1440
+ snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
1441
+ }
1442
+ return buffer;
1443
+ }
1444
+
1445
+ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1446
+ int cur_split = 0;
1447
+ for (int i = 0; i < graph->n_nodes; i++) {
1448
+ if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1449
+ ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
1450
+ fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
1451
+ sched->splits[cur_split].n_inputs);
1452
+ for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
1453
+ fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
1454
+ fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
1455
+ }
1456
+ fprintf(stderr, "\n");
1457
+ cur_split++;
1458
+ }
1459
+ struct ggml_tensor * node = graph->nodes[i];
1460
+ if (ggml_is_view_op(node->op)) {
1461
+ continue;
1462
+ }
1463
+ ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
1464
+ fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
1465
+ fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1466
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1467
+ struct ggml_tensor * src = node->src[j];
1468
+ if (src == NULL) {
1469
+ continue;
1470
+ }
1471
+ ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
1472
+ fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1473
+ fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1474
+ }
1475
+ fprintf(stderr, "\n");
1476
+ }
1477
+ }
1478
+
1479
+ static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
1480
+ ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
1481
+ ggml_backend_buffer_type_t buft = NULL;
1482
+
1483
+ if (buf) {
1484
+ // the tensor is already allocated
1485
+ buft = buf->buft;
1486
+ } else {
1487
+ // see if the tensor already has a backend assigned, and use the buffer type of that backend
1488
+ int tensor_backend_id = tensor_backend_id(t);
1489
+ if (tensor_backend_id == -1 && t->view_src) {
1490
+ tensor_backend_id = tensor_backend_id(t->view_src);
1491
+ }
1492
+ if (tensor_backend_id != -1) {
1493
+ buft = sched->bufts[tensor_backend_id];
1494
+ }
1495
+ }
1496
+
1497
+ return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
1498
+ }
1499
+
1500
+ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
1501
+ if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
1502
+ *node_backend_id = cur_backend_id;
1503
+ SET_CAUSE(node, "2.sup");
1504
+ }
1505
+ }
1506
+
1507
+ // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1508
+ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1509
+ // reset splits
1510
+ sched->n_splits = 0;
1511
+ sched->n_graph_inputs = 0;
1512
+ sched->is_reset = false;
1513
+
1514
+ struct ggml_init_params params = {
1515
+ /* .mem_size = */ sched->context_buffer_size,
1516
+ /* .mem_buffer = */ sched->context_buffer,
1517
+ /* .no_alloc = */ true
1518
+ };
1519
+
1520
+ ggml_free(sched->ctx);
1521
+
1522
+ sched->ctx = ggml_init(params);
1523
+ if (sched->ctx == NULL) {
1524
+ GGML_ABORT("%s: failed to initialize context\n", __func__);
1525
+ }
1526
+
1527
+ // pass 1: assign backends to ops with pre-allocated inputs
1528
+ for (int i = 0; i < graph->n_leafs; i++) {
1529
+ struct ggml_tensor * leaf = graph->leafs[i];
1530
+ int * leaf_backend_id = &tensor_backend_id(leaf);
1531
+ // do not overwrite user assignments
1532
+ if (*leaf_backend_id == -1) {
1533
+ *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
1534
+ }
1535
+ }
1536
+
1537
+ for (int i = 0; i < graph->n_nodes; i++) {
1538
+ struct ggml_tensor * node = graph->nodes[i];
1539
+ int * node_backend_id = &tensor_backend_id(node);
1540
+ // do not overwrite user assignments
1541
+ if (*node_backend_id == -1) {
1542
+ *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
1543
+
1544
+ #if 0
1545
+ // src
1546
+ if (node->op == GGML_OP_NONE) {
1547
+ continue;
1548
+ }
1549
+
1550
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1551
+ struct ggml_tensor * src = node->src[j];
1552
+ if (src == NULL) {
1553
+ continue;
1554
+ }
1555
+ int * src_backend_id = &tensor_backend_id(src);
1556
+ if (*src_backend_id == -1) {
1557
+ *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
1558
+ }
1559
+ }
1560
+ #endif
1561
+ }
1562
+ }
1563
+
1564
+ // pass 2: expand current backend assignments
1565
+ // assign the same backend to adjacent nodes
1566
+ // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1567
+ // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1568
+ // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of their inputs are known
1569
+ // expand gpu down
1570
+ {
1571
+ int cur_backend_id = -1;
1572
+ for (int i = 0; i < graph->n_nodes; i++) {
1573
+ struct ggml_tensor * node = graph->nodes[i];
1574
+ if (ggml_is_view_op(node->op)) {
1575
+ continue;
1576
+ }
1577
+ int * node_backend_id = &tensor_backend_id(node);
1578
+ if (*node_backend_id != -1) {
1579
+ if (*node_backend_id == sched->n_backends - 1) {
1580
+ // skip cpu (lowest prio backend)
1581
+ cur_backend_id = -1;
1582
+ } else {
1583
+ cur_backend_id = *node_backend_id;
1584
+ }
1585
+ } else if (cur_backend_id != -1) {
1586
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1587
+ }
1588
+ }
1589
+ }
1590
+ // expand gpu up
1591
+ {
1592
+ int cur_backend_id = -1;
1593
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1594
+ struct ggml_tensor * node = graph->nodes[i];
1595
+ if (ggml_is_view_op(node->op)) {
1596
+ continue;
1597
+ }
1598
+ int * node_backend_id = &tensor_backend_id(node);
1599
+ if (*node_backend_id != -1) {
1600
+ if (*node_backend_id == sched->n_backends - 1) {
1601
+ // skip cpu (lowest prio backend)
1602
+ cur_backend_id = -1;
1603
+ } else {
1604
+ cur_backend_id = *node_backend_id;
1605
+ }
1606
+ } else if (cur_backend_id != -1) {
1607
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1608
+ }
1609
+ }
1610
+ }
1611
+ // expand rest down
1612
+ {
1613
+ int cur_backend_id = -1;
1614
+ for (int i = 0; i < graph->n_nodes; i++) {
1615
+ struct ggml_tensor * node = graph->nodes[i];
1616
+ if (ggml_is_view_op(node->op)) {
1617
+ continue;
1618
+ }
1619
+ int * node_backend_id = &tensor_backend_id(node);
1620
+ if (*node_backend_id != -1) {
1621
+ cur_backend_id = *node_backend_id;
1622
+ } else if (cur_backend_id != -1) {
1623
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1624
+ }
1625
+ }
1626
+ }
1627
+ // expand rest up
1628
+ {
1629
+ int cur_backend_id = -1;
1630
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1631
+ struct ggml_tensor * node = graph->nodes[i];
1632
+ if (ggml_is_view_op(node->op)) {
1633
+ continue;
1634
+ }
1635
+ int * node_backend_id = &tensor_backend_id(node);
1636
+ if (*node_backend_id != -1) {
1637
+ cur_backend_id = *node_backend_id;
1638
+ } else if (cur_backend_id != -1) {
1639
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1640
+ }
1641
+ }
1642
+ }
1643
+
1644
+ // pass 3: upgrade nodes to higher prio backends with compatible buffer types
1645
+ // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
1646
+ // however, we also need to verify that the sources are in compatible buffer types
1647
+ // (*) the actual requirement is more relaxed: the buffer type of the backend should be supported by all the users of this tensor further down the graph
1648
+ // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
1649
+ // this is not uncommon since multiple backends can use host memory with the same buffer type (e.g. BLAS and CPU)
1650
+ // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1651
+ // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1652
+ for (int i = 0; i < graph->n_nodes; i++) {
1653
+ struct ggml_tensor * node = graph->nodes[i];
1654
+ if (ggml_is_view_op(node->op)) {
1655
+ continue;
1656
+ }
1657
+ int * node_backend_id = &tensor_backend_id(node);
1658
+ if (*node_backend_id == -1) {
1659
+ // unassigned node: find the backend with the most supported inputs
1660
+ int n_supported_best = -1;
1661
+ for (int b = 0; b < sched->n_backends; b++) {
1662
+ if (ggml_backend_supports_op(sched->backends[b], node)) {
1663
+ int n_supported = 0;
1664
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1665
+ struct ggml_tensor * src = node->src[j];
1666
+ if (src == NULL) {
1667
+ continue;
1668
+ }
1669
+ if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
1670
+ n_supported++;
1671
+ }
1672
+ }
1673
+ if (n_supported > n_supported_best) {
1674
+ n_supported_best = n_supported;
1675
+ *node_backend_id = b;
1676
+ SET_CAUSE(node, "3.best");
1677
+ }
1678
+ }
1679
+ }
1680
+ } else {
1681
+ // assigned node: upgrade to higher prio backend if possible
1682
+ for (int b = 0; b < *node_backend_id; b++) {
1683
+ if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
1684
+ bool supported = true;
1685
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1686
+ struct ggml_tensor * src = node->src[j];
1687
+ if (src == NULL) {
1688
+ continue;
1689
+ }
1690
+ if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
1691
+ supported = false;
1692
+ break;
1693
+ }
1694
+ }
1695
+ if (supported) {
1696
+ *node_backend_id = b;
1697
+ SET_CAUSE(node, "3.upg");
1698
+ break;
1699
+ }
1700
+ }
1701
+ }
1702
+ }
1703
+ }
1704
+
1705
+ // pass 4: assign backends to remaining src from dst and view_src
1706
+ for (int i = 0; i < graph->n_nodes; i++) {
1707
+ struct ggml_tensor * node = graph->nodes[i];
1708
+ int * cur_backend_id = &tensor_backend_id(node);
1709
+ if (node->view_src != NULL && *cur_backend_id == -1) {
1710
+ *cur_backend_id = tensor_backend_id(node->view_src);
1711
+ SET_CAUSE(node, "4.vsrc");
1712
+ }
1713
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1714
+ struct ggml_tensor * src = node->src[j];
1715
+ if (src == NULL) {
1716
+ continue;
1717
+ }
1718
+ int * src_backend_id = &tensor_backend_id(src);
1719
+ if (*src_backend_id == -1) {
1720
+ if (src->view_src != NULL) {
1721
+ // views are always on the same backend as the source
1722
+ *src_backend_id = tensor_backend_id(src->view_src);
1723
+ SET_CAUSE(src, "4.vsrc");
1724
+ } else {
1725
+ *src_backend_id = *cur_backend_id;
1726
+ SET_CAUSE(src, "4.cur");
1727
+ }
1728
+ }
1729
+ }
1730
+ }
1731
+
1732
+ // pass 5: split graph, find tensors that need to be copied
1733
+ {
1734
+ int i_split = 0;
1735
+ struct ggml_backend_sched_split * split = &sched->splits[0];
1736
+ // find the backend of the first split, skipping view ops
1737
+ int i = 0;
1738
+ for (; i < graph->n_nodes; i++) {
1739
+ struct ggml_tensor * node = graph->nodes[i];
1740
+ if (!ggml_is_view_op(node->op)) {
1741
+ split->backend_id = tensor_backend_id(node);
1742
+ break;
1743
+ }
1744
+ }
1745
+ split->i_start = 0;
1746
+ split->n_inputs = 0;
1747
+ int cur_backend_id = split->backend_id;
1748
+ for (; i < graph->n_nodes; i++) {
1749
+ struct ggml_tensor * node = graph->nodes[i];
1750
+
1751
+ if (ggml_is_view_op(node->op)) {
1752
+ continue;
1753
+ }
1754
+
1755
+ const int node_backend_id = tensor_backend_id(node);
1756
+
1757
+ assert(node_backend_id != -1); // all nodes should be assigned by now
1758
+
1759
+ // check if we should start a new split based on the sources of the current node
1760
+ bool need_new_split = false;
1761
+ if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1762
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1763
+ struct ggml_tensor * src = node->src[j];
1764
+ if (src == NULL) {
1765
+ continue;
1766
+ }
1767
+ // check if a weight is on a different backend
1768
+ // by starting a new split, the memory of the previously offloaded weights can be reused
1769
+ if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1770
+ int src_backend_id = tensor_backend_id(src);
1771
+ if (src_backend_id != cur_backend_id) {
1772
+ need_new_split = true;
1773
+ break;
1774
+ }
1775
+ }
1776
+ // check if the split has too many inputs
1777
+ // FIXME: count the number of inputs instead of only checking when full
1778
+ if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
1779
+ const size_t id = hash_id(src);
1780
+ int src_backend_id = sched->hv_tensor_backend_ids[id];
1781
+ bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1782
+ if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1783
+ //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1784
+ need_new_split = true;
1785
+ break;
1786
+ }
1787
+ }
1788
+ }
1789
+ }
1790
+
1791
+ if (node_backend_id != cur_backend_id || need_new_split) {
1792
+ split->i_end = i;
1793
+ i_split++;
1794
+ if (i_split >= sched->splits_capacity) {
1795
+ sched->splits_capacity *= 2;
1796
+ sched->splits = (ggml_backend_sched_split *)
1797
+ realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
1798
+ GGML_ASSERT(sched->splits != NULL);
1799
+ }
1800
+ split = &sched->splits[i_split];
1801
+ split->backend_id = node_backend_id;
1802
+ split->i_start = i;
1803
+ split->n_inputs = 0;
1804
+ cur_backend_id = node_backend_id;
1805
+ }
1806
+
1807
+ // find inputs that are not on the same backend
1808
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1809
+ struct ggml_tensor * src = node->src[j];
1810
+ if (src == NULL) {
1811
+ continue;
1812
+ }
1813
+
1814
+ size_t src_id = hash_id(src);
1815
+ const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1816
+ assert(src_backend_id != -1); // all inputs should be assigned by now
1817
+
1818
+ if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1819
+ if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
1820
+ ggml_backend_t backend = sched->backends[src_backend_id];
1821
+ for (int c = 0; c < sched->n_copies; c++) {
1822
+ struct ggml_tensor * tensor_copy;
1823
+ if (c == sched->cur_copy) {
1824
+ tensor_copy = src; // use the original tensor as the current copy
1825
+ } else {
1826
+ tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1827
+ ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
1828
+ }
1829
+ if (sched->n_copies > 1) {
1830
+ ggml_set_input(tensor_copy);
1831
+ ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1832
+ }
1833
+ tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
1834
+ SET_CAUSE(tensor_copy, "4.cpy");
1835
+ }
1836
+ int n_graph_inputs = sched->n_graph_inputs++;
1837
+ GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1838
+ sched->graph_inputs[n_graph_inputs] = src;
1839
+ }
1840
+ }
1841
+
1842
+ if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1843
+ // create a copy of the input in the split's backend
1844
+ if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
1845
+ ggml_backend_t backend = sched->backends[cur_backend_id];
1846
+ for (int c = 0; c < sched->n_copies; c++) {
1847
+ struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1848
+ ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
1849
+ if (sched->n_copies > 1) {
1850
+ ggml_set_input(tensor_copy);
1851
+ ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1852
+ }
1853
+ tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
1854
+ SET_CAUSE(tensor_copy, "4.cpy");
1855
+ }
1856
+ int n_inputs = split->n_inputs++;
1857
+ GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1858
+ split->inputs[n_inputs] = src;
1859
+ }
1860
+ node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
1861
+ }
1862
+ }
1863
+ }
1864
+ split->i_end = graph->n_nodes;
1865
+ sched->n_splits = i_split + 1;
1866
+ }
1867
+
1868
+ if (sched->debug) {
1869
+ ggml_backend_sched_print_assignments(sched, graph);
1870
+ }
1871
+
1872
+ // swap node_backend_ids and leaf_backend_ids with prevs
1873
+ {
1874
+ int * tmp = sched->node_backend_ids;
1875
+ sched->node_backend_ids = sched->prev_node_backend_ids;
1876
+ sched->prev_node_backend_ids = tmp;
1877
+
1878
+ tmp = sched->leaf_backend_ids;
1879
+ sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
1880
+ sched->prev_leaf_backend_ids = tmp;
1881
+ }
1882
+
1883
+ int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
1884
+ if (sched->graph.size < graph_size) {
1885
+ sched->graph.size = graph_size;
1886
+ sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
1887
+ sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
1888
+ GGML_ASSERT(sched->graph.nodes != NULL);
1889
+ GGML_ASSERT(sched->graph.leafs != NULL);
1890
+ }
1891
+ sched->graph.n_nodes = 0;
1892
+ sched->graph.n_leafs = 0;
1893
+
1894
+ struct ggml_cgraph * graph_copy = &sched->graph;
1895
+
1896
+ for (int i = 0; i < sched->n_splits; i++) {
1897
+ struct ggml_backend_sched_split * split = &sched->splits[i];
1898
+ split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
1899
+
1900
+ // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1901
+ for (int j = 0; j < split->n_inputs; j++) {
1902
+ assert(graph_copy->size > (graph_copy->n_nodes + 1));
1903
+
1904
+ struct ggml_tensor * input = split->inputs[j];
1905
+ const size_t input_id = hash_id(input);
1906
+ struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
1907
+
1908
+ // add a dependency to the input source so that it is not freed before the copy is done
1909
+ struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
1910
+ input_dep->src[0] = input;
1911
+ sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
1912
+ graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1913
+
1914
+ // add a dependency to the input copy so that it is allocated at the start of the split
1915
+ sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
1916
+ graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1917
+ }
1918
+
1919
+ for (int j = split->i_start; j < split->i_end; j++) {
1920
+ assert(graph_copy->size > graph_copy->n_nodes);
1921
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1922
+ graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1923
+ }
1924
+ }
1925
+
1926
+ if (sched->n_copies > 1) {
1927
+ // add input copies as leafs so that they are allocated first
1928
+ for (int i = 0; i < sched->n_graph_inputs; i++) {
1929
+ struct ggml_tensor * input = sched->graph_inputs[i];
1930
+ size_t id = hash_id(input);
1931
+ int backend_id = tensor_backend_id(input);
1932
+ for (int c = 0; c < sched->n_copies; c++) {
1933
+ struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1934
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1935
+ assert(graph_copy->size > graph_copy->n_leafs);
1936
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1937
+ }
1938
+ }
1939
+
1940
+ for (int i = 0; i < sched->n_splits; i++) {
1941
+ struct ggml_backend_sched_split * split = &sched->splits[i];
1942
+ int backend_id = split->backend_id;
1943
+ for (int j = 0; j < split->n_inputs; j++) {
1944
+ struct ggml_tensor * input = split->inputs[j];
1945
+ size_t id = hash_id(input);
1946
+ for (int c = 0; c < sched->n_copies; c++) {
1947
+ struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1948
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1949
+ assert(graph_copy->size > graph_copy->n_leafs);
1950
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1951
+ }
1952
+ }
1953
+ }
1954
+ }
1955
+
1956
+ // add leafs from the original graph
1957
+ for (int i = 0; i < graph->n_leafs; i++) {
1958
+ struct ggml_tensor * leaf = graph->leafs[i];
1959
+ sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1960
+ assert(graph_copy->size > graph_copy->n_leafs);
1961
+ graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1962
+ }
1963
+ }
1964
+
1965
+ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1966
+ bool backend_ids_changed = false;
1967
+ for (int i = 0; i < sched->graph.n_nodes; i++) {
1968
+ if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
1969
+ sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
1970
+ backend_ids_changed = true;
1971
+ break;
1972
+ }
1973
+ }
1974
+ if (!backend_ids_changed) {
1975
+ for (int i = 0; i < sched->graph.n_leafs; i++) {
1976
+ if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
1977
+ sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
1978
+ backend_ids_changed = true;
1979
+ break;
1980
+ }
1981
+ }
1982
+ }
1983
+
1984
+ // allocate graph
1985
+ if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1986
+ // the re-allocation may cause the split inputs to be moved to a different address
1987
+ ggml_backend_sched_synchronize(sched);
1988
+ #ifndef NDEBUG
1989
+ fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
1990
+ #endif
1991
+ ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
1992
+ if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1993
+ fprintf(stderr, "%s: failed to allocate graph\n", __func__);
1994
+ return false;
1995
+ }
1996
+ }
1997
+
1998
+ return true;
1999
+ }
2000
+
2001
+ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
2002
+ struct ggml_backend_sched_split * splits = sched->splits;
2003
+
2004
+ for (int i = 0; i < sched->n_splits; i++) {
2005
+ struct ggml_backend_sched_split * split = &splits[i];
2006
+ int split_backend_id = split->backend_id;
2007
+ ggml_backend_t split_backend = sched->backends[split_backend_id];
2008
+
2009
+ // copy the input tensors to the split backend
2010
+ for (int j = 0; j < split->n_inputs; j++) {
2011
+ ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
2012
+ struct ggml_tensor * input = split->inputs[j];
2013
+ struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
2014
+
2015
+ if (input->flags & GGML_TENSOR_FLAG_INPUT) {
2016
+ // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
2017
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2018
+ ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
2019
+ } else {
2020
+ ggml_backend_synchronize(split_backend);
2021
+ }
2022
+ ggml_backend_tensor_copy(input, input_cpy);
2023
+ } else {
2024
+ // wait for the split backend to finish using the input before overwriting it
2025
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2026
+ ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
2027
+ } else {
2028
+ ggml_backend_synchronize(split_backend);
2029
+ }
2030
+ // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
2031
+ // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
2032
+ if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
2033
+ ggml_backend_synchronize(input_backend);
2034
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2035
+ ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
2036
+ } else {
2037
+ ggml_backend_synchronize(split_backend);
2038
+ }
2039
+ ggml_backend_tensor_copy(input, input_cpy);
2040
+ }
2041
+ }
2042
+ }
2043
+
2044
+ if (!sched->callback_eval) {
2045
+ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
2046
+ if (ec != GGML_STATUS_SUCCESS) {
2047
+ return ec;
2048
+ }
2049
+ } else {
2050
+ // similar to ggml_backend_compare_graph_backend
2051
+ for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
2052
+ struct ggml_tensor * t = split->graph.nodes[j0];
2053
+
2054
+ // check if the user needs data from this node
2055
+ bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
2056
+
2057
+ int j1 = j0;
2058
+
2059
+ // determine the range [j0, j1] of nodes that can be computed together
2060
+ while (!need && j1 < split->graph.n_nodes - 1) {
2061
+ t = split->graph.nodes[++j1];
2062
+ need = sched->callback_eval(t, true, sched->callback_eval_user_data);
2063
+ }
2064
+
2065
+ struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
2066
+
2067
+ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
2068
+ if (ec != GGML_STATUS_SUCCESS) {
2069
+ return ec;
2070
+ }
2071
+
2072
+ // TODO: pass backend to the callback, then the user can decide if they want to synchronize
2073
+ ggml_backend_synchronize(split_backend);
2074
+
2075
+ if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
2076
+ break;
2077
+ }
2078
+
2079
+ j0 = j1;
2080
+ }
2081
+ }
2082
+
2083
+ // record the event of this copy
2084
+ if (split->n_inputs > 0) {
2085
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2086
+ ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
2087
+ }
2088
+ }
2089
+ }
2090
+
2091
+ sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
2092
+
2093
+ return GGML_STATUS_SUCCESS;
2094
+ }
2095
+
2096
+ ggml_backend_sched_t ggml_backend_sched_new(
2097
+ ggml_backend_t * backends,
2098
+ ggml_backend_buffer_type_t * bufts,
2099
+ int n_backends,
2100
+ size_t graph_size,
2101
+ bool parallel) {
2102
+ GGML_ASSERT(n_backends > 0);
2103
+ GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
2104
+ GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
2105
+
2106
+ struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
2107
+
2108
+ sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
2109
+ sched->n_backends = n_backends;
2110
+ sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
2111
+
2112
+ // initialize hash table
2113
+ // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
2114
+ sched->hash_set = ggml_hash_set_new(graph_size);
2115
+ sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
2116
+ sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
2117
+
2118
+ const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
2119
+ const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
2120
+ sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
2121
+ sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
2122
+ sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
2123
+ sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
2124
+
2125
+ sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
2126
+ sched->context_buffer = (char *) malloc(sched->context_buffer_size);
2127
+
2128
+ const int initial_splits_capacity = 16;
2129
+ sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
2130
+ sched->splits_capacity = initial_splits_capacity;
2131
+
2132
+ for (int b = 0; b < n_backends; b++) {
2133
+ sched->backends[b] = backends[b];
2134
+ sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
2135
+ GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
2136
+ if (sched->n_copies > 1) {
2137
+ for (int c = 0; c < sched->n_copies; c++) {
2138
+ sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
2139
+ }
2140
+ }
2141
+ }
2142
+
2143
+ sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
2144
+
2145
+ ggml_backend_sched_reset(sched);
2146
+
2147
+ return sched;
2148
+ }
2149
+
2150
+ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
2151
+ if (sched == NULL) {
2152
+ return;
2153
+ }
2154
+ for (int b = 0; b < sched->n_backends; b++) {
2155
+ for (int c = 0; c < sched->n_copies; c++) {
2156
+ ggml_backend_event_free(sched->events[b][c]);
2157
+ }
2158
+ }
2159
+ ggml_gallocr_free(sched->galloc);
2160
+ ggml_free(sched->ctx);
2161
+ ggml_hash_set_free(&sched->hash_set);
2162
+ free(sched->splits);
2163
+ free(sched->hv_tensor_backend_ids);
2164
+ free(sched->hv_tensor_copies);
2165
+ free(sched->node_backend_ids);
2166
+ free(sched->leaf_backend_ids);
2167
+ free(sched->prev_node_backend_ids);
2168
+ free(sched->prev_leaf_backend_ids);
2169
+ free(sched->context_buffer);
2170
+ free(sched->graph.nodes);
2171
+ free(sched->graph.leafs);
2172
+ free(sched);
2173
+ }
2174
+
2175
+ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
2176
+ // reset state for the next run
2177
+ if (!sched->is_reset) {
2178
+ ggml_hash_set_reset(&sched->hash_set);
2179
+ memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
2180
+ memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
2181
+ sched->is_reset = true;
2182
+ }
2183
+ sched->is_alloc = false;
2184
+ }
2185
+
2186
+ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
2187
+ GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
2188
+
2189
+ ggml_backend_sched_split_graph(sched, measure_graph);
2190
+
2191
+ if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
2192
+ return false;
2193
+ }
2194
+
2195
+ ggml_backend_sched_reset(sched);
2196
+ ggml_backend_sched_synchronize(sched);
2197
+
2198
+ return true;
2199
+ }
2200
+
2201
+ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
2202
+ GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
2203
+
2204
+ ggml_backend_sched_split_graph(sched, graph);
2205
+
2206
+
2207
+ if (!ggml_backend_sched_alloc_splits(sched)) {
2208
+ return false;
2209
+ }
2210
+
2211
+ sched->is_alloc = true;
2212
+
2213
+ return true;
2214
+ }
2215
+
2216
+ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
2217
+ enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
2218
+ ggml_backend_sched_synchronize(sched);
2219
+ return err;
2220
+ }
2221
+
2222
+ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
2223
+ if (!sched->is_reset && !sched->is_alloc) {
2224
+ ggml_backend_sched_reset(sched);
2225
+ }
2226
+
2227
+ if (!sched->is_alloc) {
2228
+ if (!ggml_backend_sched_alloc_graph(sched, graph)) {
2229
+ return GGML_STATUS_ALLOC_FAILED;
2230
+ }
2231
+ }
2232
+
2233
+ return ggml_backend_sched_compute_splits(sched);
2234
+ }
2235
+
2236
+ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
2237
+ for (int i = 0; i < sched->n_backends; i++) {
2238
+ ggml_backend_synchronize(sched->backends[i]);
2239
+ }
2240
+ }
2241
+
2242
+ void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
2243
+ sched->callback_eval = callback;
2244
+ sched->callback_eval_user_data = user_data;
2245
+ }
2246
+
2247
+ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
2248
+ return sched->n_splits;
2249
+ }
2250
+
2251
+ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
2252
+ return sched->n_copies;
2253
+ }
2254
+
2255
+ int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
2256
+ return sched->n_backends;
2257
+ }
2258
+
2259
+ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
2260
+ GGML_ASSERT(i >= 0 && i < sched->n_backends);
2261
+ return sched->backends[i];
2262
+ }
2263
+
2264
+ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
2265
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
2266
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2267
+
2268
+ return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
2269
+ }
2270
+
2271
+ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
2272
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
2273
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2274
+ tensor_backend_id(node) = backend_index;
2275
+ SET_CAUSE(node, "usr");
2276
+ sched->is_reset = false;
2277
+ }
2278
+
2279
+ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
2280
+ int backend_index = tensor_backend_id(node);
2281
+ if (backend_index == -1) {
2282
+ return NULL;
2283
+ }
2284
+ return sched->backends[backend_index];
2285
+ }
2286
+
2287
+ // utils
2288
+
2289
+ void ggml_backend_view_init(struct ggml_tensor * tensor) {
2290
+ GGML_ASSERT(tensor->buffer == NULL);
2291
+ GGML_ASSERT(tensor->view_src != NULL);
2292
+ GGML_ASSERT(tensor->view_src->buffer != NULL);
2293
+ GGML_ASSERT(tensor->view_src->data != NULL);
2294
+
2295
+ tensor->buffer = tensor->view_src->buffer;
2296
+ tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
2297
+ ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
2298
+ }
2299
+
2300
+ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
2301
+ GGML_ASSERT(tensor->buffer == NULL);
2302
+ GGML_ASSERT(tensor->data == NULL);
2303
+ GGML_ASSERT(tensor->view_src == NULL);
2304
+ GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
2305
+ GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
2306
+ (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
2307
+
2308
+ tensor->buffer = buffer;
2309
+ tensor->data = addr;
2310
+ ggml_backend_buffer_init_tensor(buffer, tensor);
2311
+ }
2312
+
2313
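A minimal sketch (not part of this change) of the manual placement path above, assuming a freshly created, unallocated tensor that fits at the start of the buffer:

    // illustration only: place an unallocated tensor at the base address of a backend buffer
    static void example_place_tensor_at_base(ggml_backend_buffer_t buffer, struct ggml_tensor * t) {
        void * base = ggml_backend_buffer_get_base(buffer);
        ggml_backend_tensor_alloc(buffer, t, base); // the asserts above enforce bounds and that t is not a view
    }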
+ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
2314
+ struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
2315
+
2316
+ GGML_ASSERT(src != NULL);
2317
+ GGML_ASSERT(src->data && "graph must be allocated");
2318
+
2319
+ size_t id = ggml_hash_insert(&hash_set, src);
2320
+ if (id == GGML_HASHSET_ALREADY_EXISTS) {
2321
+ return node_copies[ggml_hash_find(&hash_set, src)];
2322
+ }
2323
+
2324
+ struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
2325
+ if (src->view_src != NULL) {
2326
+ dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
2327
+ dst->view_offs = src->view_offs;
2328
+ }
2329
+ dst->op = src->op;
2330
+ memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
2331
+ ggml_set_name(dst, src->name);
2332
+
2333
+ // copy src
2334
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
2335
+ struct ggml_tensor * s = src->src[i];
2336
+ if (s == NULL) {
2337
+ continue;
2338
+ }
2339
+ dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
2340
+ }
2341
+
2342
+ node_copies[id] = dst;
2343
+ return dst;
2344
+ }
2345
+
2346
+ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
2347
+ size_t id = ggml_hash_find(hash_set, src);
2348
+ if (node_init[id]) {
2349
+ return;
2350
+ }
2351
+ node_init[id] = true;
2352
+
2353
+ struct ggml_tensor * dst = node_copies[id];
2354
+ if (dst->view_src != NULL) {
2355
+ graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
2356
+ ggml_backend_view_init(dst);
2357
+ }
2358
+ else {
2359
+ ggml_backend_tensor_copy(src, dst);
2360
+ }
2361
+
2362
+ // init src
2363
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
2364
+ struct ggml_tensor * s = src->src[i];
2365
+ if (s == NULL) {
2366
+ continue;
2367
+ }
2368
+ graph_copy_init_tensor(hash_set, node_copies, node_init, s);
2369
+ }
2370
+ }
2371
+
2372
+ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
2373
+ struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
2374
+ struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
2375
+ bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
2376
+
2377
+ struct ggml_init_params params = {
2378
+ /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
2379
+ /* .mem_buffer = */ NULL,
2380
+ /* .no_alloc = */ true
2381
+ };
2382
+
2383
+ struct ggml_context * ctx_allocated = ggml_init(params);
2384
+ struct ggml_context * ctx_unallocated = ggml_init(params);
2385
+
2386
+ if (ctx_allocated == NULL || ctx_unallocated == NULL) {
2387
+ fprintf(stderr, "failed to allocate context for graph copy\n");
2388
+ ggml_hash_set_free(&hash_set);
2389
+ free(node_copies);
2390
+ free(node_init);
2391
+ ggml_free(ctx_allocated);
2392
+ ggml_free(ctx_unallocated);
2393
+ return {
2394
+ /* .buffer = */ NULL,
2395
+ /* .ctx_allocated = */ NULL,
2396
+ /* .ctx_unallocated = */ NULL,
2397
+ /* .graph = */ NULL,
2398
+ };
2399
+ }
2400
+
2401
+ // dup nodes
2402
+ for (int i = 0; i < graph->n_nodes; i++) {
2403
+ struct ggml_tensor * node = graph->nodes[i];
2404
+ graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
2405
+ }
2406
+
2407
+ // allocate nodes
2408
+ ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
2409
+ if (buffer == NULL) {
2410
+ fprintf(stderr, "failed to allocate buffer for graph copy\n");
2411
+ ggml_hash_set_free(&hash_set);
2412
+ free(node_copies);
2413
+ free(node_init);
2414
+ ggml_free(ctx_allocated);
2415
+ ggml_free(ctx_unallocated);
2416
+ return {
2417
+ /* .buffer = */ NULL,
2418
+ /* .ctx_allocated = */ NULL,
2419
+ /* .ctx_unallocated = */ NULL,
2420
+ /* .graph = */ NULL,
2421
+ };
2422
+ }
2423
+
2424
+ //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
2425
+
2426
+ // copy data and init views
2427
+ for (int i = 0; i < graph->n_nodes; i++) {
2428
+ struct ggml_tensor * node = graph->nodes[i];
2429
+ graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
2430
+ }
2431
+
2432
+ // build graph copy
2433
+ struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
2434
+ for (int i = 0; i < graph->n_nodes; i++) {
2435
+ struct ggml_tensor * node = graph->nodes[i];
2436
+ struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
2437
+ graph_copy->nodes[i] = node_copy;
2438
+ }
2439
+ graph_copy->n_nodes = graph->n_nodes;
2440
+
2441
+ ggml_hash_set_free(&hash_set);
2442
+ free(node_copies);
2443
+ free(node_init);
2444
+
2445
+ return {
2446
+ /* .buffer = */ buffer,
2447
+ /* .ctx_allocated = */ ctx_allocated,
2448
+ /* .ctx_unallocated = */ ctx_unallocated,
2449
+ /* .graph = */ graph_copy,
2450
+ };
2451
+ }
2452
+
2453
+ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
2454
+ ggml_backend_buffer_free(copy.buffer);
2455
+ ggml_free(copy.ctx_allocated);
2456
+ ggml_free(copy.ctx_unallocated);
2457
+ }
2458
+
2459
+ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
2460
+ struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
2461
+ if (copy.buffer == NULL) {
2462
+ return false;
2463
+ }
2464
+
2465
+ struct ggml_cgraph * g1 = graph;
2466
+ struct ggml_cgraph * g2 = copy.graph;
2467
+
2468
+ assert(g1->n_nodes == g2->n_nodes);
2469
+
2470
+ for (int i = 0; i < g1->n_nodes; i++) {
2471
+ //printf("eval %d/%d\n", i, g1->n_nodes);
2472
+ struct ggml_tensor * t1 = g1->nodes[i];
2473
+ struct ggml_tensor * t2 = g2->nodes[i];
2474
+
2475
+ assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
2476
+
2477
+ struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
2478
+ struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
2479
+
2480
+ ggml_backend_graph_compute(backend1, &g1v);
2481
+ ggml_backend_graph_compute(backend2, &g2v);
2482
+
2483
+ if (ggml_is_view_op(t1->op)) {
2484
+ continue;
2485
+ }
2486
+
2487
+ // compare results, calculate rms etc
2488
+ if (!callback(i, t1, t2, user_data)) {
2489
+ break;
2490
+ }
2491
+ }
2492
+
2493
+ ggml_backend_graph_copy_free(copy);
2494
+
2495
+ return true;
2496
+ }
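Before the per-backend changes below, a minimal usage sketch of the public scheduler entry points defined above (not part of this change); the backend array, the worst-case measure graph, and the graph to run are assumed to be prepared by the caller, with the CPU backend last as ggml_backend_sched_new requires.

    // illustration only: reserve once with a worst-case graph, then compute
    static void example_run_scheduler(ggml_backend_t * backends, int n_backends,
                                      struct ggml_cgraph * measure_graph, struct ggml_cgraph * graph) {
        // NULL bufts -> each backend's default buffer type; parallel = false -> a single copy, no pipelining
        ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, n_backends,
                                                            GGML_DEFAULT_GRAPH_SIZE, false);
        // graph_size must be large enough for n_nodes + n_leafs (see the assert in ggml_backend_sched_reserve)
        GGML_ASSERT(ggml_backend_sched_reserve(sched, measure_graph));
        GGML_ASSERT(ggml_backend_sched_graph_compute(sched, graph) == GGML_STATUS_SUCCESS);
        ggml_backend_sched_reset(sched); // ready for the next, possibly different, graph
        ggml_backend_sched_free(sched);
    }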
ggml/src/ggml-blas.cpp CHANGED
@@ -235,25 +235,25 @@ static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct g
235
 
236
  // backend interface
237
 
238
- GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) {
239
  return "BLAS";
240
 
241
  GGML_UNUSED(backend);
242
  }
243
 
244
- GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) {
245
  ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
246
  delete ctx;
247
  delete backend;
248
  }
249
 
250
- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
251
  return ggml_backend_cpu_buffer_type();
252
 
253
  GGML_UNUSED(backend);
254
  }
255
 
256
- GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
257
  ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
258
 
259
  for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -285,7 +285,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t
285
  GGML_UNUSED(backend);
286
  }
287
 
288
- GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
289
  const struct ggml_tensor * src0 = op->src[0];
290
  const struct ggml_tensor * src1 = op->src[1];
291
 
@@ -300,7 +300,7 @@ GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, cons
300
  GGML_UNUSED(backend);
301
  }
302
 
303
- GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
304
  return ggml_backend_buft_is_host(buft);
305
 
306
  GGML_UNUSED(backend);
@@ -322,11 +322,8 @@ static struct ggml_backend_i blas_backend_i = {
322
  /* .supports_op = */ ggml_backend_blas_supports_op,
323
  /* .supports_buft = */ ggml_backend_blas_supports_buft,
324
  /* .offload_op = */ NULL,
325
- /* .event_new = */ NULL,
326
- /* .event_free = */ NULL,
327
  /* .event_record = */ NULL,
328
  /* .event_wait = */ NULL,
329
- /* .event_synchronize = */ NULL,
330
  };
331
 
332
  static ggml_guid_t ggml_backend_blas_guid(void) {
@@ -340,6 +337,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
340
  ggml_backend_t backend = new ggml_backend {
341
  /* .guid = */ ggml_backend_blas_guid(),
342
  /* .interface = */ blas_backend_i,
 
343
  /* .context = */ ctx,
344
  };
345
 
@@ -356,7 +354,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
356
  return backend;
357
  }
358
 
359
- GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) {
360
  return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
361
  }
362
 
 
235
 
236
  // backend interface
237
 
238
+ static const char * ggml_backend_blas_name(ggml_backend_t backend) {
239
  return "BLAS";
240
 
241
  GGML_UNUSED(backend);
242
  }
243
 
244
+ static void ggml_backend_blas_free(ggml_backend_t backend) {
245
  ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
246
  delete ctx;
247
  delete backend;
248
  }
249
 
250
+ static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
251
  return ggml_backend_cpu_buffer_type();
252
 
253
  GGML_UNUSED(backend);
254
  }
255
 
256
+ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
257
  ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
258
 
259
  for (int i = 0; i < cgraph->n_nodes; i++) {
 
285
  GGML_UNUSED(backend);
286
  }
287
 
288
+ static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
289
  const struct ggml_tensor * src0 = op->src[0];
290
  const struct ggml_tensor * src1 = op->src[1];
291
 
 
300
  GGML_UNUSED(backend);
301
  }
302
 
303
+ static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
304
  return ggml_backend_buft_is_host(buft);
305
 
306
  GGML_UNUSED(backend);
 
322
  /* .supports_op = */ ggml_backend_blas_supports_op,
323
  /* .supports_buft = */ ggml_backend_blas_supports_buft,
324
  /* .offload_op = */ NULL,
 
 
325
  /* .event_record = */ NULL,
326
  /* .event_wait = */ NULL,
 
327
  };
328
 
329
  static ggml_guid_t ggml_backend_blas_guid(void) {
 
337
  ggml_backend_t backend = new ggml_backend {
338
  /* .guid = */ ggml_backend_blas_guid(),
339
  /* .interface = */ blas_backend_i,
340
+ /* .device = */ nullptr,
341
  /* .context = */ ctx,
342
  };
343
 
 
354
  return backend;
355
  }
356
 
357
+ bool ggml_backend_is_blas(ggml_backend_t backend) {
358
  return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
359
  }
360
 
ggml/src/ggml-cann.cpp CHANGED
@@ -560,7 +560,7 @@ struct ggml_backend_cann_buffer_context {
560
  * @return A pointer to a C-string containing the name of the buffer.
561
  */
562
 
563
- GGML_CALL static const char* ggml_backend_cann_buffer_get_name(
564
  ggml_backend_buffer_t buffer) {
565
  return "CANN";
566
 
@@ -576,7 +576,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_get_name(
576
  * @param buffer The buffer to check.
577
  * @return true if the buffer is a CANN buffer, false otherwise.
578
  */
579
- GGML_CALL static bool ggml_backend_buffer_is_cann(
580
  ggml_backend_buffer_t buffer) {
581
  return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
582
  }
@@ -589,7 +589,7 @@ GGML_CALL static bool ggml_backend_buffer_is_cann(
589
  *
590
  * @param buffer The CANN buffer to free.
591
  */
592
- GGML_CALL static void ggml_backend_cann_buffer_free_buffer(
593
  ggml_backend_buffer_t buffer) {
594
  ggml_backend_cann_buffer_context* ctx =
595
  (ggml_backend_cann_buffer_context*)buffer->context;
@@ -605,7 +605,7 @@ GGML_CALL static void ggml_backend_cann_buffer_free_buffer(
605
  * @param buffer The CANN buffer whose base pointer is to be retrieved.
606
  * @return A pointer to the base of the device memory allocated for the buffer.
607
  */
608
- GGML_CALL static void* ggml_backend_cann_buffer_get_base(
609
  ggml_backend_buffer_t buffer) {
610
  ggml_backend_cann_buffer_context* ctx =
611
  (ggml_backend_cann_buffer_context*)buffer->context;
@@ -625,9 +625,9 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
625
  * @param dst Pointer to the destination buffer where transformed data will be
626
  * stored.
627
  */
628
- GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
629
- const void* src,
630
- void* dst) {
631
 
632
  int64_t n_elems = ggml_nelements(tensor);
633
  int64_t groups = n_elems / QK4_0;
@@ -677,7 +677,7 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
677
  * @param dst Pointer to the destination buffer where the Q4.0 formatted data
678
  * will be stored.
679
  */
680
- GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
681
  const ggml_tensor* tensor, void* src, void* dst) {
682
 
683
  int64_t n_elems = ggml_nelements(tensor);
@@ -726,9 +726,9 @@ GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
726
  * @param dst Pointer to the destination buffer where transformed data will be
727
  * stored.
728
  */
729
- GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
730
- const void* src,
731
- void* dst) {
732
  int64_t n_elems = ggml_nelements(tensor);
733
  int64_t groups = n_elems / QK8_0;
734
  size_t quant_bytes = n_elems * sizeof(uint8_t);
@@ -760,7 +760,7 @@ GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
760
  * @param dst Pointer to the destination buffer where the Q8.0 formatted data
761
  * will be stored.
762
  */
763
- GGML_CALL static void ggml_backend_cann_transform_back_q8_0(
764
  const ggml_tensor* tensor, const void* src, void* dst) {
765
  int64_t n_elems = ggml_nelements(tensor);
766
  int64_t groups = n_elems / QK8_0;
@@ -792,8 +792,8 @@ GGML_CALL static void ggml_backend_cann_transform_back_q8_0(
792
  * @param dst Pointer to the destination buffer where transformed data will be
793
  * stored.
794
  */
795
- GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor,
796
- const void* src, void* dst) {
797
  switch (tensor->type) {
798
  case GGML_TYPE_Q4_0:
799
  ggml_backend_cann_transform_q4_0(tensor, src, dst);
@@ -818,7 +818,7 @@ GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor,
818
  * @param dst Pointer to the destination buffer where transformed tensor data
819
  * will be stored.
820
  */
821
- GGML_CALL static void ggml_backend_cann_transform_back(
822
  const ggml_tensor* tensor, void* src, void* dst) {
823
  switch (tensor->type) {
824
  case GGML_TYPE_Q4_0:
@@ -841,7 +841,7 @@ GGML_CALL static void ggml_backend_cann_transform_back(
841
  * @param type The tensor type to check.
842
  * @return true if transformation is needed, false otherwise.
843
  */
844
- GGML_CALL static bool need_transform(ggml_type type) {
845
  switch (type) {
846
  case GGML_TYPE_Q4_0:
847
  case GGML_TYPE_Q8_0:
@@ -860,7 +860,7 @@ GGML_CALL static bool need_transform(ggml_type type) {
860
  * @param buffer The CANN buffer from which to initialize the tensor.
861
  * @param tensor Pointer to the tensor to be initialized.
862
  */
863
- GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
864
  ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
865
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
866
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
@@ -896,7 +896,7 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
896
  * @param offset Offset in the source data from where to start copying.
897
  * @param size Size of the data to be copied, in bytes.
898
  */
899
- GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
900
  ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
901
  size_t offset, size_t size) {
902
  ggml_backend_cann_buffer_context *ctx =
@@ -941,7 +941,7 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
941
  * @param offset Offset in the destination buffer where to start copying.
942
  * @param size Size of the data to be copied, in bytes.
943
  */
944
- GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
945
  ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
946
  size_t offset, size_t size) {
947
  ggml_backend_cann_buffer_context* ctx =
@@ -975,7 +975,7 @@ GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
975
  * @param dst Pointer to the destination tensor where the data will be copied.
976
  * @return true if the copy operation succeeded, false otherwise.
977
  */
978
- GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor(
979
  ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
980
  if (ggml_backend_buffer_is_cann(src->buffer)) {
981
  ggml_backend_cann_buffer_context* src_ctx =
@@ -1017,7 +1017,7 @@ GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor(
1017
  * @param buffer The CANN buffer to be cleared.
1018
  * @param value The value to which each byte in the buffer will be set.
1019
  */
1020
- GGML_CALL static void ggml_backend_cann_buffer_clear(
1021
  ggml_backend_buffer_t buffer, uint8_t value) {
1022
  ggml_backend_cann_buffer_context* ctx =
1023
  (ggml_backend_cann_buffer_context*)buffer->context;
@@ -1065,7 +1065,7 @@ struct ggml_backend_cann_buffer_type_context {
1065
  * @param buft Pointer to the buffer type context.
1066
  * @return Const pointer to the C-style string containing the name.
1067
  */
1068
- GGML_CALL static const char* ggml_backend_cann_buffer_type_name(
1069
  ggml_backend_buffer_type_t buft) {
1070
  return "CANN";
1071
 
@@ -1082,7 +1082,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_type_name(
1082
  * @param size Size in bytes of the buffer to allocate.
1083
  * @return Pointer to the allocated buffer, or nullptr if allocation fails.
1084
  */
1085
- GGML_CALL static ggml_backend_buffer_t
1086
  ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1087
  size_t size) {
1088
  ggml_backend_cann_buffer_type_context* buft_ctx =
@@ -1121,7 +1121,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1121
  * @return The alignment requirement in bytes (fixed at 128 bytes for CANN
1122
  * buffers).
1123
  */
1124
- GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment(
1125
  ggml_backend_buffer_type_t buft) {
1126
  return 128;
1127
 
@@ -1142,7 +1142,7 @@ GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment(
1142
  * @return The total allocation size in bytes required for the tensor in the
1143
  * CANN buffer.
1144
  */
1145
- GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1146
  ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
1147
  size_t size = ggml_nbytes(tensor);
1148
  int64_t ne0 = tensor->ne[0];
@@ -1193,7 +1193,7 @@ static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
1193
  * @return A pointer to the buffer type interface for the specified device, or
1194
  * nullptr if the device index is out of range.
1195
  */
1196
- GGML_CALL ggml_backend_buffer_type_t
1197
  ggml_backend_cann_buffer_type(int32_t device) {
1198
  static std::mutex mutex;
1199
  std::lock_guard<std::mutex> lock(mutex);
@@ -1231,7 +1231,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
1231
  * @param buft Pointer to the host buffer type context.
1232
  * @return Const pointer to the C-style string containing the name.
1233
  */
1234
- GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
1235
  return "CANN_Host";
1236
 
1237
  GGML_UNUSED(buft);
@@ -1246,7 +1246,7 @@ GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backe
1246
  * @param buft Pointer to the host buffer context.
1247
  * @return Const pointer to the C-style string containing the name.
1248
  */
1249
- GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
1250
  return "CANN_Host";
1251
 
1252
  GGML_UNUSED(buffer);
@@ -1260,7 +1260,7 @@ GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_bu
1260
  *
1261
  * @param buffer The CANN host buffer to free.
1262
  */
1263
- GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
1264
  ACL_CHECK(aclrtFreeHost(buffer->context));
1265
  }
1266
 
@@ -1294,7 +1294,7 @@ static void * ggml_cann_host_malloc(size_t size) {
1294
  * @param size Size in bytes of the host buffer to allocate.
1295
  * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
1296
  */
1297
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1298
  void * hostPtr = ggml_cann_host_malloc(size);
1299
 
1300
  if (hostPtr == nullptr) {
@@ -1316,7 +1316,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_
1316
  * Provides function pointers for allocating, querying properties, and managing
1317
  * memory for CANN buffer types in the GGML backend.
1318
  */
1319
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
1320
  static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
1321
  /* .iface = */ {
1322
  /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
@@ -1326,6 +1326,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
1326
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1327
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1328
  },
 
1329
  /* .context = */ nullptr,
1330
  };
1331
 
@@ -1495,7 +1496,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1495
  * @param backend Pointer to the CANN backend structure.
1496
  * @return A pointer to a constant string representing the backend name.
1497
  */
1498
- GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1499
  ggml_backend_cann_context* cann_ctx =
1500
  (ggml_backend_cann_context*)backend->context;
1501
 
@@ -1510,7 +1511,7 @@ GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1510
  *
1511
  * @param backend Pointer to the CANN backend structure to be freed.
1512
  */
1513
- GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
1514
  ggml_backend_cann_context* cann_ctx =
1515
  (ggml_backend_cann_context*)backend->context;
1516
  ACL_CHECK(aclrtSynchronizeDevice());
@@ -1535,7 +1536,7 @@ GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
1535
  * @param backend Pointer to the CANN backend structure.
1536
  * @return Pointer to the buffer type structure for the CANN backend.
1537
  */
1538
- GGML_CALL static ggml_backend_buffer_type_t
1539
  ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
1540
  ggml_backend_cann_context* cann_ctx =
1541
  (ggml_backend_cann_context*)backend->context;
@@ -1556,11 +1557,11 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
1556
  * @param offset Offset in bytes within the host data.
1557
  * @param size Size of the data to copy in bytes.
1558
  */
1559
- GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1560
- ggml_tensor *tensor,
1561
- const void *data,
1562
- size_t offset,
1563
- size_t size) {
1564
  ggml_backend_cann_context *cann_ctx =
1565
  (ggml_backend_cann_context *)backend->context;
1566
 
@@ -1587,7 +1588,7 @@ GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1587
  }
1588
  }
1589
 
1590
- GGML_CALL static void ggml_backend_cann_get_tensor_async(
1591
  ggml_backend_t backend, const ggml_tensor *tensor, void *data,
1592
  size_t offset, size_t size) {
1593
  ggml_backend_cann_context *cann_ctx =
@@ -1626,7 +1627,7 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async(
1626
  * @param dst Pointer to the destination tensor to copy data to.
1627
  * @return true if the copy operation succeeds, false otherwise.
1628
  */
1629
- GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
1630
  ggml_backend_t backend_src, ggml_backend_t backend_dst,
1631
  const ggml_tensor* src, ggml_tensor* dst) {
1632
  GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
@@ -1694,7 +1695,7 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
1694
  *
1695
  * @param backend Pointer to the CANN backend structure to synchronize.
1696
  */
1697
- GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1698
  ggml_backend_cann_context* cann_ctx =
1699
  (ggml_backend_cann_context*)backend->context;
1700
 
@@ -1715,7 +1716,7 @@ GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1715
  * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
1716
  * completes successfully, otherwise an appropriate error status.
1717
  */
1718
- GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
1719
  ggml_backend_t backend, ggml_cgraph* cgraph) {
1720
  ggml_backend_cann_context* cann_ctx =
1721
  (ggml_backend_cann_context*)backend->context;
@@ -1753,7 +1754,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
1753
  * @return bool Returns true if the operation is supported by the backend,
1754
  * otherwise false.
1755
  */
1756
- GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1757
  const ggml_tensor* op) {
1758
  switch (op->op) {
1759
  case GGML_OP_UNARY:
@@ -1875,7 +1876,7 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
1875
  * @return bool Returns true if the CANN backend supports the buffer type,
1876
  * otherwise false.
1877
  */
1878
- GGML_CALL static bool ggml_backend_cann_supports_buft(
1879
  ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
1880
  if (ggml_backend_buft_is_cann(buft)) {
1881
  ggml_backend_cann_context * cann_ctx =
@@ -1901,7 +1902,7 @@ GGML_CALL static bool ggml_backend_cann_supports_buft(
1901
  * @return bool Returns true if the operation should be offloaded, otherwise
1902
  * false.
1903
  */
1904
- GGML_CALL static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
1905
  const ggml_tensor* op) {
1906
  const int min_batch_size = 32;
1907
  GGML_UNUSED(backend);
@@ -2021,11 +2022,8 @@ static ggml_backend_i ggml_backend_cann_interface = {
2021
  /* .supports_op = */ ggml_backend_cann_supports_op,
2022
  /* .supports_buft = */ ggml_backend_cann_supports_buft,
2023
  /* .offload_op = */ ggml_backend_cann_offload_op,
2024
- /* .event_new = */ ggml_backend_cann_event_new,
2025
- /* .event_free = */ ggml_backend_cann_event_free,
2026
  /* .event_record = */ ggml_backend_cann_event_record,
2027
  /* .event_wait = */ ggml_backend_cann_event_wait,
2028
- /* .event_synchronize = */ ggml_backend_cann_event_synchronize,
2029
  };
2030
 
2031
  /**
@@ -2042,7 +2040,7 @@ static ggml_guid_t ggml_backend_cann_guid() {
2042
  return &guid;
2043
  }
2044
 
2045
- GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
2046
  aclInit(nullptr);
2047
  if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
2048
  GGML_CANN_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
@@ -2058,75 +2056,30 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
2058
  ggml_backend_t cann_backend =
2059
  new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
2060
  /* .interface = */ ggml_backend_cann_interface,
 
2061
  /* .context = */ ctx};
2062
 
2063
  return cann_backend;
2064
  }
2065
 
2066
- GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend) {
2067
  return backend != NULL &&
2068
  ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
2069
  }
2070
 
2071
- GGML_CALL int32_t ggml_backend_cann_get_device_count() {
2072
  return ggml_cann_info().device_count;
2073
  }
2074
 
2075
- GGML_CALL void ggml_backend_cann_get_device_description(
2076
  int32_t device, char* description, size_t description_size) {
2077
  ggml_cann_set_device(device);
2078
  const char* soc_name = aclrtGetSocName();
2079
  snprintf(description, description_size, "%s", soc_name);
2080
  }
2081
 
2082
- GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
2083
- size_t* total) {
2084
  ggml_cann_set_device(device);
2085
  ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
2086
  }
2087
-
2088
- // backend registry
2089
- /**
2090
- * @brief Initializes a CANN backend based on the provided parameters.
2091
- *
2092
- * This function initializes a CANN backend using the device index and then
2093
- * initializes the backend using `ggml_backend_cann_init`.
2094
- *
2095
- * @param params Parameters for initialization (unused in this implementation).
2096
- * @param user_data User data containing the device index to initialize the
2097
- * backend.
2098
- * @return ggml_backend_t The initialized CANN backend.
2099
- */
2100
- GGML_CALL static ggml_backend_t ggml_backend_reg_cann_init(const char* params,
2101
- void* user_data) {
2102
- ggml_backend_t cann_backend =
2103
- ggml_backend_cann_init((int)(intptr_t)user_data);
2104
- return cann_backend;
2105
-
2106
- GGML_UNUSED(params);
2107
- }
2108
-
2109
- extern "C" GGML_CALL int ggml_backend_cann_reg_devices();
2110
-
2111
- /**
2112
- * @brief Registers CANN (Ascend) devices as backend options.
2113
- *
2114
- * This function initializes ACL, retrieves the number of available CANN
2115
- * devices, and registers each device as a backend option using
2116
- * `ggml_backend_register`. Each device is given a unique name based on
2117
- * `GGML_CANN_NAME` followed by its index.
2118
- *
2119
- * @return int The number of CANN devices registered.
2120
- */
2121
- GGML_CALL int ggml_backend_cann_reg_devices() {
2122
- uint32_t device_count = ggml_backend_cann_get_device_count();
2123
- // initialization
2124
- for (uint32_t i = 0; i < device_count; i++) {
2125
- char name[128];
2126
- snprintf(name, sizeof(name), "CANN%d", i);
2127
- ggml_backend_register(name, ggml_backend_reg_cann_init,
2128
- ggml_backend_cann_buffer_type(i),
2129
- (void*)(intptr_t)i);
2130
- }
2131
- return device_count;
2132
- }
 
560
  * @return A pointer to a C-string containing the name of the buffer.
561
  */
562
 
563
+ static const char* ggml_backend_cann_buffer_get_name(
564
  ggml_backend_buffer_t buffer) {
565
  return "CANN";
566
 
 
576
  * @param buffer The buffer to check.
577
  * @return true if the buffer is a CANN buffer, false otherwise.
578
  */
579
+ static bool ggml_backend_buffer_is_cann(
580
  ggml_backend_buffer_t buffer) {
581
  return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
582
  }
 
589
  *
590
  * @param buffer The CANN buffer to free.
591
  */
592
+ static void ggml_backend_cann_buffer_free_buffer(
593
  ggml_backend_buffer_t buffer) {
594
  ggml_backend_cann_buffer_context* ctx =
595
  (ggml_backend_cann_buffer_context*)buffer->context;
 
605
  * @param buffer The CANN buffer whose base pointer is to be retrieved.
606
  * @return A pointer to the base of the device memory allocated for the buffer.
607
  */
608
+ static void* ggml_backend_cann_buffer_get_base(
609
  ggml_backend_buffer_t buffer) {
610
  ggml_backend_cann_buffer_context* ctx =
611
  (ggml_backend_cann_buffer_context*)buffer->context;
 
625
  * @param dst Pointer to the destination buffer where transformed data will be
626
  * stored.
627
  */
628
+ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
629
+ const void* src,
630
+ void* dst) {
631
 
632
  int64_t n_elems = ggml_nelements(tensor);
633
  int64_t groups = n_elems / QK4_0;
 
677
  * @param dst Pointer to the destination buffer where the Q4.0 formatted data
678
  * will be stored.
679
  */
680
+ static void ggml_backend_cann_transform_back_q4_0(
681
  const ggml_tensor* tensor, void* src, void* dst) {
682
 
683
  int64_t n_elems = ggml_nelements(tensor);
 
726
  * @param dst Pointer to the destination buffer where transformed data will be
727
  * stored.
728
  */
729
+ static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
730
+ const void* src,
731
+ void* dst) {
732
  int64_t n_elems = ggml_nelements(tensor);
733
  int64_t groups = n_elems / QK8_0;
734
  size_t quant_bytes = n_elems * sizeof(uint8_t);
 
760
  * @param dst Pointer to the destination buffer where the Q8.0 formatted data
761
  * will be stored.
762
  */
763
+ static void ggml_backend_cann_transform_back_q8_0(
764
  const ggml_tensor* tensor, const void* src, void* dst) {
765
  int64_t n_elems = ggml_nelements(tensor);
766
  int64_t groups = n_elems / QK8_0;
 
792
  * @param dst Pointer to the destination buffer where transformed data will be
793
  * stored.
794
  */
795
+ static void ggml_backend_cann_transform(ggml_tensor* tensor,
796
+ const void* src, void* dst) {
797
  switch (tensor->type) {
798
  case GGML_TYPE_Q4_0:
799
  ggml_backend_cann_transform_q4_0(tensor, src, dst);
 
818
  * @param dst Pointer to the destination buffer where transformed tensor data
819
  * will be stored.
820
  */
821
+ static void ggml_backend_cann_transform_back(
822
  const ggml_tensor* tensor, void* src, void* dst) {
823
  switch (tensor->type) {
824
  case GGML_TYPE_Q4_0:
 
841
  * @param type The tensor type to check.
842
  * @return true if transformation is needed, false otherwise.
843
  */
844
+ static bool need_transform(ggml_type type) {
845
  switch (type) {
846
  case GGML_TYPE_Q4_0:
847
  case GGML_TYPE_Q8_0:
 
860
  * @param buffer The CANN buffer from which to initialize the tensor.
861
  * @param tensor Pointer to the tensor to be initialized.
862
  */
863
+ static void ggml_backend_cann_buffer_init_tensor(
864
  ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
865
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
866
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
 
896
  * @param offset Offset in the source data from where to start copying.
897
  * @param size Size of the data to be copied, in bytes.
898
  */
899
+ static void ggml_backend_cann_buffer_set_tensor(
900
  ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
901
  size_t offset, size_t size) {
902
  ggml_backend_cann_buffer_context *ctx =
 
941
  * @param offset Offset in the destination buffer where to start copying.
942
  * @param size Size of the data to be copied, in bytes.
943
  */
944
+ static void ggml_backend_cann_buffer_get_tensor(
945
  ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
946
  size_t offset, size_t size) {
947
  ggml_backend_cann_buffer_context* ctx =
 
975
  * @param dst Pointer to the destination tensor where the data will be copied.
976
  * @return true if the copy operation succeeded, false otherwise.
977
  */
978
+ static bool ggml_backend_cann_buffer_cpy_tensor(
979
  ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
980
  if (ggml_backend_buffer_is_cann(src->buffer)) {
981
  ggml_backend_cann_buffer_context* src_ctx =
 
1017
  * @param buffer The CANN buffer to be cleared.
1018
  * @param value The value to which each byte in the buffer will be set.
1019
  */
1020
+ static void ggml_backend_cann_buffer_clear(
1021
  ggml_backend_buffer_t buffer, uint8_t value) {
1022
  ggml_backend_cann_buffer_context* ctx =
1023
  (ggml_backend_cann_buffer_context*)buffer->context;
 
1065
  * @param buft Pointer to the buffer type context.
1066
  * @return Const pointer to the C-style string containing the name.
1067
  */
1068
+ static const char* ggml_backend_cann_buffer_type_name(
1069
  ggml_backend_buffer_type_t buft) {
1070
  return "CANN";
1071
 
 
1082
  * @param size Size in bytes of the buffer to allocate.
1083
  * @return Pointer to the allocated buffer, or nullptr if allocation fails.
1084
  */
1085
+ static ggml_backend_buffer_t
1086
  ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1087
  size_t size) {
1088
  ggml_backend_cann_buffer_type_context* buft_ctx =
 
1121
  * @return The alignment requirement in bytes (fixed at 128 bytes for CANN
1122
  * buffers).
1123
  */
1124
+ static size_t ggml_backend_cann_buffer_type_get_alignment(
1125
  ggml_backend_buffer_type_t buft) {
1126
  return 128;
1127
 
 
1142
  * @return The total allocation size in bytes required for the tensor in the
1143
  * CANN buffer.
1144
  */
1145
+ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1146
  ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
1147
  size_t size = ggml_nbytes(tensor);
1148
  int64_t ne0 = tensor->ne[0];
 
1193
  * @return A pointer to the buffer type interface for the specified device, or
1194
  * nullptr if the device index is out of range.
1195
  */
1196
+ ggml_backend_buffer_type_t
1197
  ggml_backend_cann_buffer_type(int32_t device) {
1198
  static std::mutex mutex;
1199
  std::lock_guard<std::mutex> lock(mutex);
 
1231
  * @param buft Pointer to the host buffer type context.
1232
  * @return Const pointer to the C-style string containing the name.
1233
  */
1234
+ static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
1235
  return "CANN_Host";
1236
 
1237
  GGML_UNUSED(buft);
 
1246
  * @param buft Pointer to the host buffer context.
1247
  * @return Const pointer to the C-style string containing the name.
1248
  */
1249
+ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
1250
  return "CANN_Host";
1251
 
1252
  GGML_UNUSED(buffer);
 
1260
  *
1261
  * @param buffer The CANN host buffer to free.
1262
  */
1263
+ static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
1264
  ACL_CHECK(aclrtFreeHost(buffer->context));
1265
  }
1266
 
 
1294
  * @param size Size in bytes of the host buffer to allocate.
1295
  * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
1296
  */
1297
+ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1298
  void * hostPtr = ggml_cann_host_malloc(size);
1299
 
1300
  if (hostPtr == nullptr) {
 
1316
  * Provides function pointers for allocating, querying properties, and managing
1317
  * memory for CANN buffer types in the GGML backend.
1318
  */
1319
+ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
1320
  static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
1321
  /* .iface = */ {
1322
  /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
 
1326
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1327
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1328
  },
1329
+ /* .device = */ nullptr,
1330
  /* .context = */ nullptr,
1331
  };
1332
 
 
1496
  * @param backend Pointer to the CANN backend structure.
1497
  * @return A pointer to a constant string representing the backend name.
1498
  */
1499
+ static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1500
  ggml_backend_cann_context* cann_ctx =
1501
  (ggml_backend_cann_context*)backend->context;
1502
 
 
1511
  *
1512
  * @param backend Pointer to the CANN backend structure to be freed.
1513
  */
1514
+ static void ggml_backend_cann_free(ggml_backend_t backend) {
1515
  ggml_backend_cann_context* cann_ctx =
1516
  (ggml_backend_cann_context*)backend->context;
1517
  ACL_CHECK(aclrtSynchronizeDevice());
 
1536
  * @param backend Pointer to the CANN backend structure.
1537
  * @return Pointer to the buffer type structure for the CANN backend.
1538
  */
1539
+ static ggml_backend_buffer_type_t
1540
  ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
1541
  ggml_backend_cann_context* cann_ctx =
1542
  (ggml_backend_cann_context*)backend->context;
 
1557
  * @param offset Offset in bytes within the host data.
1558
  * @param size Size of the data to copy in bytes.
1559
  */
1560
+ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1561
+ ggml_tensor *tensor,
1562
+ const void *data,
1563
+ size_t offset,
1564
+ size_t size) {
1565
  ggml_backend_cann_context *cann_ctx =
1566
  (ggml_backend_cann_context *)backend->context;
1567
 
 
1588
  }
1589
  }
1590
 
1591
+ static void ggml_backend_cann_get_tensor_async(
1592
  ggml_backend_t backend, const ggml_tensor *tensor, void *data,
1593
  size_t offset, size_t size) {
1594
  ggml_backend_cann_context *cann_ctx =
 
1627
  * @param dst Pointer to the destination tensor to copy data to.
1628
  * @return true if the copy operation succeeds, false otherwise.
1629
  */
1630
+ static bool ggml_backend_cann_cpy_tensor_async(
1631
  ggml_backend_t backend_src, ggml_backend_t backend_dst,
1632
  const ggml_tensor* src, ggml_tensor* dst) {
1633
  GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
 
1695
  *
1696
  * @param backend Pointer to the CANN backend structure to synchronize.
1697
  */
1698
+ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1699
  ggml_backend_cann_context* cann_ctx =
1700
  (ggml_backend_cann_context*)backend->context;
1701
 
 
1716
  * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
1717
  * completes successfully, otherwise an appropriate error status.
1718
  */
1719
+ static enum ggml_status ggml_backend_cann_graph_compute(
1720
  ggml_backend_t backend, ggml_cgraph* cgraph) {
1721
  ggml_backend_cann_context* cann_ctx =
1722
  (ggml_backend_cann_context*)backend->context;
 
1754
  * @return bool Returns true if the operation is supported by the backend,
1755
  * otherwise false.
1756
  */
1757
+ static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1758
  const ggml_tensor* op) {
1759
  switch (op->op) {
1760
  case GGML_OP_UNARY:
 
1876
  * @return bool Returns true if the CANN backend supports the buffer type,
1877
  * otherwise false.
1878
  */
1879
+ static bool ggml_backend_cann_supports_buft(
1880
  ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
1881
  if (ggml_backend_buft_is_cann(buft)) {
1882
  ggml_backend_cann_context * cann_ctx =
 
1902
  * @return bool Returns true if the operation should be offloaded, otherwise
1903
  * false.
1904
  */
1905
+ static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
1906
  const ggml_tensor* op) {
1907
  const int min_batch_size = 32;
1908
  GGML_UNUSED(backend);
 
2022
  /* .supports_op = */ ggml_backend_cann_supports_op,
2023
  /* .supports_buft = */ ggml_backend_cann_supports_buft,
2024
  /* .offload_op = */ ggml_backend_cann_offload_op,
 
 
2025
  /* .event_record = */ ggml_backend_cann_event_record,
2026
  /* .event_wait = */ ggml_backend_cann_event_wait,
 
2027
  };
2028
 
2029
  /**
 
2040
  return &guid;
2041
  }
2042
 
2043
+ ggml_backend_t ggml_backend_cann_init(int32_t device) {
2044
  aclInit(nullptr);
2045
  if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
2046
  GGML_CANN_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
 
2056
  ggml_backend_t cann_backend =
2057
  new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
2058
  /* .interface = */ ggml_backend_cann_interface,
2059
+ /* .device = */ nullptr,
2060
  /* .context = */ ctx};
2061
 
2062
  return cann_backend;
2063
  }
2064
 
2065
+ bool ggml_backend_is_cann(ggml_backend_t backend) {
2066
  return backend != NULL &&
2067
  ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
2068
  }
2069
 
2070
+ int32_t ggml_backend_cann_get_device_count() {
2071
  return ggml_cann_info().device_count;
2072
  }
2073
 
2074
+ void ggml_backend_cann_get_device_description(
2075
  int32_t device, char* description, size_t description_size) {
2076
  ggml_cann_set_device(device);
2077
  const char* soc_name = aclrtGetSocName();
2078
  snprintf(description, description_size, "%s", soc_name);
2079
  }
2080
 
2081
+ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
2082
+ size_t* total) {
2083
  ggml_cann_set_device(device);
2084
  ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
2085
  }
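For reference, the CANN device-query entry points above likewise keep their signatures with GGML_CALL removed. A minimal sketch of enumerating devices through them (not part of this commit; the ggml-cann.h header name and the 128-byte description buffer size are assumptions):

#include <cstdint>
#include <cstdio>
#include "ggml-cann.h"    // assumed public header for the CANN backend

int main() {
    const int32_t n_dev = ggml_backend_cann_get_device_count();
    for (int32_t i = 0; i < n_dev; i++) {
        char   desc[128];                 // assumed large enough for the SoC name
        size_t mem_free = 0, mem_total = 0;
        ggml_backend_cann_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cann_get_device_memory(i, &mem_free, &mem_total);
        std::printf("CANN%d: %s (%zu / %zu bytes free)\n", i, desc, mem_free, mem_total);
    }
    return 0;
}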
 
ggml/src/ggml-cuda.cu CHANGED
@@ -101,11 +101,11 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
101
  int id = -1; // in case cudaGetDevice fails
102
  cudaGetDevice(&id);
103
 
104
- GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
105
  GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
106
  GGML_CUDA_LOG_ERROR(" %s\n", stmt);
107
- // abort with GGML_ASSERT to get a stack trace
108
- GGML_ABORT("CUDA error");
109
  }
110
 
111
  // this is faster on Windows
@@ -329,7 +329,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
329
  return;
330
  }
331
  }
332
- GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
333
  ggml_cuda_set_device(device);
334
  CUDA_CHECK(cudaFree(ptr));
335
  pool_size -= size;
@@ -459,26 +459,26 @@ struct ggml_backend_cuda_buffer_context {
459
  }
460
  };
461
 
462
- GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
463
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
464
  return ctx->name.c_str();
465
  }
466
 
467
- GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
468
  return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
469
  }
470
 
471
- GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
472
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
473
  delete ctx;
474
  }
475
 
476
- GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
477
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
478
  return ctx->dev_ptr;
479
  }
480
 
481
- GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
482
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
483
 
484
  if (tensor->view_src != NULL) {
@@ -498,7 +498,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
498
  }
499
  }
500
 
501
- GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
502
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
503
 
504
  ggml_cuda_set_device(ctx->device);
@@ -506,7 +506,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer
506
  CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
507
  }
508
 
509
- GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
510
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
511
 
512
  ggml_cuda_set_device(ctx->device);
@@ -514,7 +514,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
514
  CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
515
  }
516
 
517
- GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
518
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
519
 
520
  ggml_cuda_set_device(ctx->device);
@@ -522,7 +522,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t
522
  CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
523
  }
524
 
525
- GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
526
  if (ggml_backend_buffer_is_cuda(src->buffer)) {
527
  ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
528
  ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
@@ -543,7 +543,7 @@ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t
543
  GGML_UNUSED(buffer);
544
  }
545
 
546
- GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
547
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
548
 
549
  ggml_cuda_set_device(ctx->device);
@@ -552,7 +552,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffe
552
  CUDA_CHECK(cudaDeviceSynchronize());
553
  }
554
 
555
- static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
556
  /* .get_name = */ ggml_backend_cuda_buffer_get_name,
557
  /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
558
  /* .get_base = */ ggml_backend_cuda_buffer_get_base,
@@ -571,17 +571,17 @@ struct ggml_backend_cuda_buffer_type_context {
571
  std::string name;
572
  };
573
 
574
- GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
575
  ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
576
 
577
  return ctx->name.c_str();
578
  }
579
 
580
  static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
581
- return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
582
  }
583
 
584
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
585
  ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
586
 
587
  ggml_cuda_set_device(buft_ctx->device);
@@ -602,13 +602,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
602
  return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
603
  }
604
 
605
- GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
606
  return 128;
607
 
608
  GGML_UNUSED(buft);
609
  }
610
 
611
- GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
612
  size_t size = ggml_nbytes(tensor);
613
  int64_t ne0 = tensor->ne[0];
614
 
@@ -623,8 +623,8 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
623
  GGML_UNUSED(buft);
624
  }
625
 
626
- static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
627
- /* .get_name = */ ggml_backend_cuda_buffer_type_name,
628
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
629
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
630
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
@@ -632,7 +632,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
632
  /* .is_host = */ NULL,
633
  };
634
 
635
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
636
  static std::mutex mutex;
637
  std::lock_guard<std::mutex> lock(mutex);
638
 
@@ -645,9 +645,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
645
  static bool ggml_backend_cuda_buffer_type_initialized = false;
646
 
647
  if (!ggml_backend_cuda_buffer_type_initialized) {
648
- for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
649
  ggml_backend_cuda_buffer_types[i] = {
650
  /* .iface = */ ggml_backend_cuda_buffer_type_interface,
 
651
  /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
652
  };
653
  }
@@ -717,7 +718,7 @@ struct ggml_backend_cuda_split_buffer_context {
717
  std::vector<ggml_tensor_extra_gpu *> tensor_extras;
718
  };
719
 
720
- GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
721
  return GGML_CUDA_NAME "_Split";
722
 
723
  GGML_UNUSED(buffer);
@@ -728,19 +729,19 @@ static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
728
  GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
729
  }
730
 
731
- GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
732
  ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
733
  delete ctx;
734
  }
735
 
736
- GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
737
  // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
738
  return (void *)0x1000;
739
 
740
  GGML_UNUSED(buffer);
741
  }
742
 
743
- GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
744
  GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
745
 
746
  ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
@@ -788,7 +789,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
788
  tensor->extra = extra;
789
  }
790
 
791
- GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
792
  // split tensors must always be set in their entirety at once
793
  GGML_ASSERT(offset == 0);
794
  GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -826,7 +827,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf
826
  }
827
  }
828
 
829
- GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
830
  // split tensors must always be set in their entirety at once
831
  GGML_ASSERT(offset == 0);
832
  GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -864,12 +865,12 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf
864
  }
865
  }
866
 
867
- GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
868
  GGML_UNUSED(buffer);
869
  GGML_UNUSED(value);
870
  }
871
 
872
- static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
873
  /* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
874
  /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
875
  /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
@@ -884,17 +885,17 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
884
 
885
  // cuda split buffer type
886
 
887
- GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
888
  return GGML_CUDA_NAME "_Split";
889
 
890
  GGML_UNUSED(buft);
891
  }
892
 
893
  static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
894
- return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
895
  }
896
 
897
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
898
  // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
899
  // instead, we allocate them for each tensor separately in init_tensor
900
  // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
@@ -904,13 +905,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc
904
  return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
905
  }
906
 
907
- GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
908
  return 128;
909
 
910
  GGML_UNUSED(buft);
911
  }
912
 
913
- GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
914
  ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
915
 
916
  size_t total_size = 0;
@@ -937,14 +938,14 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
937
  return total_size;
938
  }
939
 
940
- GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
941
  return false;
942
 
943
  GGML_UNUSED(buft);
944
  }
945
 
946
- static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
947
- /* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
948
  /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
949
  /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
950
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
@@ -952,7 +953,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
952
  /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
953
  };
954
 
955
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
956
  static std::mutex mutex;
957
  std::lock_guard<std::mutex> lock(mutex);
958
 
@@ -981,6 +982,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const f
981
 
982
  struct ggml_backend_buffer_type buft {
983
  /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
 
984
  /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
985
  };
986
 
@@ -990,19 +992,19 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const f
990
 
991
  // host buffer type
992
 
993
- GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
994
  return GGML_CUDA_NAME "_Host";
995
 
996
  GGML_UNUSED(buft);
997
  }
998
 
999
- GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
1000
  return GGML_CUDA_NAME "_Host";
1001
 
1002
  GGML_UNUSED(buffer);
1003
  }
1004
 
1005
- GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1006
  CUDA_CHECK(cudaFreeHost(buffer->context));
1007
  }
1008
 
@@ -1024,7 +1026,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
1024
  return ptr;
1025
  }
1026
 
1027
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1028
  void * ptr = ggml_cuda_host_malloc(size);
1029
 
1030
  if (ptr == nullptr) {
@@ -1040,7 +1042,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_
1040
  return buffer;
1041
  }
1042
 
1043
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
1044
  static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
1045
  /* .iface = */ {
1046
  /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
@@ -1050,6 +1052,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
1050
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1051
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1052
  },
 
1053
  /* .context = */ nullptr,
1054
  };
1055
 
@@ -2383,26 +2386,26 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
2383
 
2384
  // backend
2385
 
2386
- GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
2387
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2388
 
2389
  return cuda_ctx->name.c_str();
2390
  }
2391
 
2392
- GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
2393
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2394
 
2395
  delete cuda_ctx;
2396
  delete backend;
2397
  }
2398
 
2399
- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
2400
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2401
 
2402
  return ggml_backend_cuda_buffer_type(cuda_ctx->device);
2403
  }
2404
 
2405
- GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2406
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2407
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2408
 
@@ -2411,7 +2414,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
2411
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
2412
  }
2413
 
2414
- GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2415
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2416
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2417
 
@@ -2420,7 +2423,7 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
2420
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
2421
  }
2422
 
2423
- GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
2424
  ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
2425
  ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
2426
 
@@ -2475,7 +2478,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
2475
  return true;
2476
  }
2477
 
2478
- GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
2479
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2480
 
2481
  CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
@@ -2534,7 +2537,7 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
2534
  return true;
2535
  }
2536
 
2537
- GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
2538
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2539
 
2540
  ggml_cuda_set_device(cuda_ctx->device);
@@ -2806,8 +2809,187 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
2806
  return GGML_STATUS_SUCCESS;
2807
  }
2808
 
2809
- GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
2810
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
2811
  switch (op->op) {
2812
  case GGML_OP_UNARY:
2813
  switch (ggml_get_unary_op(op)) {
@@ -3021,7 +3203,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
3021
  if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
3022
  return true;
3023
  }
3024
- const int cc = ggml_cuda_info().devices[cuda_ctx->device].cc;
3025
  return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
3026
  }
3027
  case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -3031,115 +3213,170 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
3031
  default:
3032
  return false;
3033
  }
3034
-
3035
- GGML_UNUSED(backend);
3036
  }
3037
 
3038
- GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
3039
  if (ggml_backend_buft_is_cuda_split(buft)) {
3040
  return true;
3041
  }
3042
 
3043
  if (ggml_backend_buft_is_cuda(buft)) {
3044
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
3045
  ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
3046
- return buft_ctx->device == cuda_ctx->device;
3047
  }
3048
 
3049
  return false;
3050
  }
3051
 
3052
- GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
3053
  const int min_batch_size = 32;
3054
 
3055
  return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
3056
  (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
3057
 
3058
- GGML_UNUSED(backend);
3059
  }
3060
 
3061
- static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
3062
  #ifdef GGML_CUDA_NO_PEER_COPY
3063
  return nullptr;
3064
  #else
3065
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
3066
 
3067
- ggml_cuda_set_device(cuda_ctx->device);
3068
 
3069
  cudaEvent_t event;
3070
  CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
3071
 
3072
  return new ggml_backend_event {
3073
- /* .backend = */ backend,
3074
  /* .context = */ event,
3075
  };
3076
  #endif
3077
  }
3078
 
3079
- static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
3080
- CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
3081
 
 
3082
  delete event;
3083
  }
3084
 
3085
- static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
3086
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
 
3087
 
3088
- CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
3089
  }
3090
 
3091
- static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
3092
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
 
3093
 
3094
- if (ggml_backend_is_cuda(event->backend)) {
3095
- CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
3096
- } else {
3097
- #if 0
3098
- // untested
3099
- auto wait_fn = [](void * user_data) {
3100
- ggml_backend_event_t event = (ggml_backend_event_t)user_data;
3101
- ggml_backend_event_synchronize(event);
3102
- };
3103
 
3104
- CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
3105
- #endif
3106
- GGML_ABORT("fatal error");
3107
  }
3108
  }
3109
 
3110
- static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
3111
- CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
 
3112
  }
3113
 
3114
- static ggml_backend_i ggml_backend_cuda_interface = {
3115
- /* .get_name = */ ggml_backend_cuda_name,
3116
- /* .free = */ ggml_backend_cuda_free,
3117
- /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
3118
- /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
3119
- /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
3120
- /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
3121
- /* .synchronize = */ ggml_backend_cuda_synchronize,
3122
- /* .graph_plan_create = */ NULL,
3123
- /* .graph_plan_free = */ NULL,
3124
- /* .graph_plan_update = */ NULL,
3125
- /* .graph_plan_compute = */ NULL,
3126
- /* .graph_compute = */ ggml_backend_cuda_graph_compute,
3127
- /* .supports_op = */ ggml_backend_cuda_supports_op,
3128
- /* .supports_buft = */ ggml_backend_cuda_supports_buft,
3129
- /* .offload_op = */ ggml_backend_cuda_offload_op,
3130
- /* .event_new = */ ggml_backend_cuda_event_new,
3131
- /* .event_free = */ ggml_backend_cuda_event_free,
3132
- /* .event_record = */ ggml_backend_cuda_event_record,
3133
- /* .event_wait = */ ggml_backend_cuda_event_wait,
3134
- /* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
3135
  };
3136
 
3137
- static ggml_guid_t ggml_backend_cuda_guid() {
3138
- static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
3139
- return &guid;
3140
  }
3141
 
3142
- GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
3143
  if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
3144
  GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
3145
  return nullptr;
@@ -3154,82 +3391,9 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
3154
  ggml_backend_t cuda_backend = new ggml_backend {
3155
  /* .guid = */ ggml_backend_cuda_guid(),
3156
  /* .interface = */ ggml_backend_cuda_interface,
3157
- /* .context = */ ctx
 
3158
  };
3159
 
3160
  return cuda_backend;
3161
  }
3162
-
3163
- GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
3164
- return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
3165
- }
3166
-
3167
- GGML_CALL int ggml_backend_cuda_get_device_count() {
3168
- return ggml_cuda_info().device_count;
3169
- }
3170
-
3171
- GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
3172
- cudaDeviceProp prop;
3173
- CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
3174
- snprintf(description, description_size, "%s", prop.name);
3175
- }
3176
-
3177
- GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
3178
- ggml_cuda_set_device(device);
3179
-
3180
- CUDA_CHECK(cudaMemGetInfo(free, total));
3181
- }
3182
-
3183
- GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
3184
- if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
3185
- return false;
3186
- }
3187
-
3188
- #if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
3189
- cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
3190
- if (err != cudaSuccess) {
3191
- // clear the error
3192
- cudaGetLastError();
3193
-
3194
- GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
3195
- size / 1024.0 / 1024.0, cudaGetErrorString(err));
3196
- return false;
3197
- }
3198
- return true;
3199
- #else
3200
- return false;
3201
- #endif
3202
- }
3203
-
3204
- GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
3205
- if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
3206
- return;
3207
- }
3208
-
3209
- cudaError_t err = cudaHostUnregister(buffer);
3210
- if (err != cudaSuccess) {
3211
- // clear the error
3212
- cudaGetLastError();
3213
- }
3214
- }
3215
-
3216
- // backend registry
3217
- GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
3218
- ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
3219
- return cuda_backend;
3220
-
3221
- GGML_UNUSED(params);
3222
- }
3223
-
3224
- extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
3225
-
3226
- GGML_CALL int ggml_backend_cuda_reg_devices() {
3227
- int device_count = ggml_backend_cuda_get_device_count();
3228
- //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
3229
- for (int i = 0; i < device_count; i++) {
3230
- char name[128];
3231
- snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
3232
- ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
3233
- }
3234
- return device_count;
3235
- }
 
101
  int id = -1; // in case cudaGetDevice fails
102
  cudaGetDevice(&id);
103
 
104
+ GGML_CUDA_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
105
  GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
106
  GGML_CUDA_LOG_ERROR(" %s\n", stmt);
107
+ // abort with GGML_ABORT to get a stack trace
108
+ GGML_ABORT(GGML_CUDA_NAME " error");
109
  }
110
 
111
  // this is faster on Windows
 
329
  return;
330
  }
331
  }
332
+ GGML_CUDA_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
333
  ggml_cuda_set_device(device);
334
  CUDA_CHECK(cudaFree(ptr));
335
  pool_size -= size;
 
459
  }
460
  };
461
 
462
+ static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
463
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
464
  return ctx->name.c_str();
465
  }
466
 
467
+ static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
468
  return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
469
  }
470
 
471
+ static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
472
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
473
  delete ctx;
474
  }
475
 
476
+ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
477
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
478
  return ctx->dev_ptr;
479
  }
480
 
481
+ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
482
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
483
 
484
  if (tensor->view_src != NULL) {
 
498
  }
499
  }
500
 
501
+ static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
502
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
503
 
504
  ggml_cuda_set_device(ctx->device);
 
506
  CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
507
  }
508
 
509
+ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
510
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
511
 
512
  ggml_cuda_set_device(ctx->device);
 
514
  CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
515
  }
516
 
517
+ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
518
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
519
 
520
  ggml_cuda_set_device(ctx->device);
 
522
  CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
523
  }
524
 
525
+ static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
526
  if (ggml_backend_buffer_is_cuda(src->buffer)) {
527
  ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
528
  ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
 
543
  GGML_UNUSED(buffer);
544
  }
545
 
546
+ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
547
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
548
 
549
  ggml_cuda_set_device(ctx->device);
 
552
  CUDA_CHECK(cudaDeviceSynchronize());
553
  }
554
 
555
+ static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
556
  /* .get_name = */ ggml_backend_cuda_buffer_get_name,
557
  /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
558
  /* .get_base = */ ggml_backend_cuda_buffer_get_base,
 
571
  std::string name;
572
  };
573
 
574
+ static const char * ggml_backend_cuda_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
575
  ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
576
 
577
  return ctx->name.c_str();
578
  }
579
 
580
  static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
581
+ return buft->iface.get_name == ggml_backend_cuda_buffer_type_get_name;
582
  }
583
 
584
+ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
585
  ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
586
 
587
  ggml_cuda_set_device(buft_ctx->device);
 
602
  return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
603
  }
604
 
605
+ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
606
  return 128;
607
 
608
  GGML_UNUSED(buft);
609
  }
610
 
611
+ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
612
  size_t size = ggml_nbytes(tensor);
613
  int64_t ne0 = tensor->ne[0];
614
 
 
623
  GGML_UNUSED(buft);
624
  }
625
 
626
+ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
627
+ /* .get_name = */ ggml_backend_cuda_buffer_type_get_name,
628
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
629
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
630
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
 
632
  /* .is_host = */ NULL,
633
  };
634
 
635
+ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
636
  static std::mutex mutex;
637
  std::lock_guard<std::mutex> lock(mutex);
638
 
 
645
  static bool ggml_backend_cuda_buffer_type_initialized = false;
646
 
647
  if (!ggml_backend_cuda_buffer_type_initialized) {
648
+ for (int i = 0; i < ggml_backend_cuda_get_device_count(); i++) {
649
  ggml_backend_cuda_buffer_types[i] = {
650
  /* .iface = */ ggml_backend_cuda_buffer_type_interface,
651
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), i),
652
  /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
653
  };
654
  }
 
718
  std::vector<ggml_tensor_extra_gpu *> tensor_extras;
719
  };
720
 
721
+ static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
722
  return GGML_CUDA_NAME "_Split";
723
 
724
  GGML_UNUSED(buffer);
 
729
  GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
730
  }
731
 
732
+ static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
733
  ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
734
  delete ctx;
735
  }
736
 
737
+ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
738
  // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
739
  return (void *)0x1000;
740
 
741
  GGML_UNUSED(buffer);
742
  }
743
 
744
+ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
745
  GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
746
 
747
  ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
 
789
  tensor->extra = extra;
790
  }
791
 
792
+ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
793
  // split tensors must always be set in their entirety at once
794
  GGML_ASSERT(offset == 0);
795
  GGML_ASSERT(size == ggml_nbytes(tensor));
 
827
  }
828
  }
829
 
830
+ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
831
  // split tensors must always be set in their entirety at once
832
  GGML_ASSERT(offset == 0);
833
  GGML_ASSERT(size == ggml_nbytes(tensor));
 
865
  }
866
  }
867
 
868
+ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
869
  GGML_UNUSED(buffer);
870
  GGML_UNUSED(value);
871
  }
872
 
873
+ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
874
  /* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
875
  /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
876
  /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
 
885
 
886
  // cuda split buffer type
887
 
888
+ static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
889
  return GGML_CUDA_NAME "_Split";
890
 
891
  GGML_UNUSED(buft);
892
  }
893
 
894
  static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
895
+ return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_get_name;
896
  }
897
 
898
+ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
899
  // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
900
  // instead, we allocate them for each tensor separately in init_tensor
901
  // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
 
905
  return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
906
  }
907
 
908
+ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
909
  return 128;
910
 
911
  GGML_UNUSED(buft);
912
  }
913
 
914
+ static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
915
  ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
916
 
917
  size_t total_size = 0;
 
938
  return total_size;
939
  }
940
 
941
+ static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
942
  return false;
943
 
944
  GGML_UNUSED(buft);
945
  }
946
 
947
+ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
948
+ /* .get_name = */ ggml_backend_cuda_split_buffer_type_get_name,
949
  /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
950
  /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
951
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
 
953
  /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
954
  };
955
 
956
+ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
957
  static std::mutex mutex;
958
  std::lock_guard<std::mutex> lock(mutex);
959
 
 
982
 
983
  struct ggml_backend_buffer_type buft {
984
  /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
985
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
986
  /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
987
  };
988
 
 
992
 
993
  // host buffer type
994
 
995
+ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
996
  return GGML_CUDA_NAME "_Host";
997
 
998
  GGML_UNUSED(buft);
999
  }
1000
 
1001
+ static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
1002
  return GGML_CUDA_NAME "_Host";
1003
 
1004
  GGML_UNUSED(buffer);
1005
  }
1006
 
1007
+ static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1008
  CUDA_CHECK(cudaFreeHost(buffer->context));
1009
  }
1010
 
 
1026
  return ptr;
1027
  }
1028
 
1029
+ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1030
  void * ptr = ggml_cuda_host_malloc(size);
1031
 
1032
  if (ptr == nullptr) {
 
1042
  return buffer;
1043
  }
1044
 
1045
+ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
1046
  static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
1047
  /* .iface = */ {
1048
  /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
 
1052
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1053
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1054
  },
1055
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
1056
  /* .context = */ nullptr,
1057
  };
1058
 
 
2386
 
2387
  // backend
2388
 
2389
+ static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
2390
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2391
 
2392
  return cuda_ctx->name.c_str();
2393
  }
2394
 
2395
+ static void ggml_backend_cuda_free(ggml_backend_t backend) {
2396
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2397
 
2398
  delete cuda_ctx;
2399
  delete backend;
2400
  }
2401
 
2402
+ static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
2403
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2404
 
2405
  return ggml_backend_cuda_buffer_type(cuda_ctx->device);
2406
  }
2407
 
2408
+ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2409
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2410
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2411
 
 
2414
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
2415
  }
2416
 
2417
+ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2418
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2419
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2420
 
 
2423
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
2424
  }
2425
 
2426
+ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
2427
  ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
2428
  ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
2429
 
 
2478
  return true;
2479
  }
2480
 
2481
+ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
2482
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2483
 
2484
  CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
 
2537
  return true;
2538
  }
2539
 
2540
+ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
2541
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2542
 
2543
  ggml_cuda_set_device(cuda_ctx->device);
 
2809
  return GGML_STATUS_SUCCESS;
2810
  }
2811
 
2812
+ static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
2813
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2814
+
2815
+ CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
2816
+ }
2817
+
2818
+ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
2819
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2820
+
2821
+ if (ggml_backend_is_cuda(backend)) {
2822
+ CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
2823
+ } else {
2824
+ #if 0
2825
+ // untested
2826
+ auto wait_fn = [](void * user_data) {
2827
+ ggml_backend_event_t event = (ggml_backend_event_t)user_data;
2828
+ ggml_backend_event_synchronize(event);
2829
+ };
2830
+
2831
+ CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
2832
+ #endif
2833
+ GGML_ABORT("fatal error");
2834
+ }
2835
+ }
2836
+
2837
+ static const ggml_backend_i ggml_backend_cuda_interface = {
2838
+ /* .get_name = */ ggml_backend_cuda_get_name,
2839
+ /* .free = */ ggml_backend_cuda_free,
2840
+ /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
2841
+ /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
2842
+ /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
2843
+ /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
2844
+ /* .synchronize = */ ggml_backend_cuda_synchronize,
2845
+ /* .graph_plan_create = */ NULL,
2846
+ /* .graph_plan_free = */ NULL,
2847
+ /* .graph_plan_update = */ NULL,
2848
+ /* .graph_plan_compute = */ NULL,
2849
+ /* .graph_compute = */ ggml_backend_cuda_graph_compute,
2850
+ /* .supports_op = */ NULL, // moved to device
2851
+ /* .supports_buft = */ NULL, // moved to device
2852
+ /* .offload_op = */ NULL, // moved to device
2853
+ /* .event_record = */ ggml_backend_cuda_event_record,
2854
+ /* .event_wait = */ ggml_backend_cuda_event_wait,
2855
+ };
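For illustration only (not part of this change): a minimal sketch of how the slimmed-down event entries above could be driven through the interface table. It assumes the ggml_backend struct exposes its vtable as `iface`, the same convention the buffer and buffer-type structs use elsewhere in this diff; the event itself would come from the device-level event_new shown further down.

// hypothetical helper, for illustration only
static void cuda_event_sync_sketch(ggml_backend_t producer, ggml_backend_t consumer, ggml_backend_event_t event) {
    producer->iface.event_record(producer, event); // maps to ggml_backend_cuda_event_record
    consumer->iface.event_wait  (consumer, event); // maps to ggml_backend_cuda_event_wait
}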
2856
+
2857
+ static ggml_guid_t ggml_backend_cuda_guid() {
2858
+ static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
2859
+ return &guid;
2860
+ }
2861
+
2862
+ bool ggml_backend_is_cuda(ggml_backend_t backend) {
2863
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
2864
+ }
2865
+
2866
+ int ggml_backend_cuda_get_device_count() {
2867
+ return ggml_cuda_info().device_count;
2868
+ }
2869
+
2870
+ void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
2871
+ cudaDeviceProp prop;
2872
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
2873
+ snprintf(description, description_size, "%s", prop.name);
2874
+ }
2875
+
2876
+ void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
2877
+ ggml_cuda_set_device(device);
2878
+
2879
+ CUDA_CHECK(cudaMemGetInfo(free, total));
2880
+ }
2881
+
2882
+ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
2883
+ if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
2884
+ return false;
2885
+ }
2886
+
2887
+ #if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
2888
+ cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
2889
+ if (err != cudaSuccess) {
2890
+ // clear the error
2891
+ cudaGetLastError();
2892
+
2893
+ GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
2894
+ size / 1024.0 / 1024.0, cudaGetErrorString(err));
2895
+ return false;
2896
+ }
2897
+ return true;
2898
+ #else
2899
+ return false;
2900
+ #endif
2901
+ }
2902
+
2903
+ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
2904
+ if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
2905
+ return;
2906
+ }
2907
+
2908
+ cudaError_t err = cudaHostUnregister(buffer);
2909
+ if (err != cudaSuccess) {
2910
+ // clear the error
2911
+ cudaGetLastError();
2912
+ }
2913
+ }
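Usage sketch only (not from this commit): the two host-buffer helpers above are opt-in via the GGML_CUDA_REGISTER_HOST environment variable, so a caller that wants pinned host memory might do something like the following; data_size is a placeholder.

// sketch: pin an existing host allocation for faster host<->device transfers (requires <cstdlib>)
// the environment variable must be set before these calls, e.g. setenv("GGML_CUDA_REGISTER_HOST", "1", 1);
void * host_data = malloc(data_size);
if (ggml_backend_cuda_register_host_buffer(host_data, data_size)) {
    // ... use host_data as a staging buffer while it is registered ...
    ggml_backend_cuda_unregister_host_buffer(host_data);
}
free(host_data);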
2914
+
2915
+
2916
+ // backend device
2917
+
2918
+ struct ggml_backend_cuda_device_context {
2919
+ int device;
2920
+ std::string name;
2921
+ std::string description;
2922
+ };
2923
+
2924
+ static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
2925
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
2926
+ return ctx->name.c_str();
2927
+ }
2928
+
2929
+ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t dev) {
2930
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
2931
+ return ctx->description.c_str();
2932
+ }
2933
+
2934
+ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
2935
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
2936
+ ggml_cuda_set_device(ctx->device);
2937
+ CUDA_CHECK(cudaMemGetInfo(free, total));
2938
+ }
2939
+
2940
+ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
2941
+ GGML_UNUSED(dev);
2942
+ return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
2943
+ }
2944
+
2945
+ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
2946
+ props->name = ggml_backend_cuda_device_get_name(dev);
2947
+ props->description = ggml_backend_cuda_device_get_description(dev);
2948
+ props->type = ggml_backend_cuda_device_get_type(dev);
2949
+ ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
2950
+
2951
+ bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
2952
+ #ifdef GGML_CUDA_NO_PEER_COPY
2953
+ bool events = false;
2954
+ #else
2955
+ bool events = true;
2956
+ #endif
2957
+
2958
+ props->caps = {
2959
+ /* async */ true,
2960
+ /* host_buffer */ host_buffer,
2961
+ /* events */ events,
2962
+ };
2963
+ }
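For illustration only (not part of this change): reading back the properties filled in above. The ggml_backend_dev_props field names and the `iface` member name are inferred from this diff rather than from the final header.

// hypothetical probe, requires <cstdio>
static void cuda_device_props_sketch(ggml_backend_dev_t dev) {
    struct ggml_backend_dev_props props;
    dev->iface.get_props(dev, &props); // assumes the device struct exposes its vtable as `iface`
    printf("%s (%s): %zu free / %zu total, async=%d host_buffer=%d events=%d\n",
           props.name, props.description, props.memory_free, props.memory_total,
           props.caps.async, props.caps.host_buffer, props.caps.events);
}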
2964
+
2965
+ static ggml_backend_t ggml_backend_cuda_device_init(ggml_backend_dev_t dev, const char * params) {
2966
+ GGML_UNUSED(params);
2967
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
2968
+ return ggml_backend_cuda_init(ctx->device);
2969
+ }
2970
+
2971
+ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_buffer_type(ggml_backend_dev_t dev) {
2972
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
2973
+ return ggml_backend_cuda_buffer_type(ctx->device);
2974
+ }
2975
+
2976
+ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(ggml_backend_dev_t dev) {
2977
+ GGML_UNUSED(dev);
2978
+ return ggml_backend_cuda_host_buffer_type();
2979
+ }
2980
+
2981
+ static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
2982
+ GGML_UNUSED(dev);
2983
+ GGML_UNUSED(ptr);
2984
+ GGML_UNUSED(size);
2985
+ GGML_UNUSED(max_tensor_size);
2986
+ return nullptr;
2987
+ }
2988
+
2989
+ // TODO: move these functions here
2990
+ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
2991
+ ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
2992
+
2993
  switch (op->op) {
2994
  case GGML_OP_UNARY:
2995
  switch (ggml_get_unary_op(op)) {
 
3203
  if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
3204
  return true;
3205
  }
3206
+ const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
3207
  return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
3208
  }
3209
  case GGML_OP_CROSS_ENTROPY_LOSS:
 
3213
  default:
3214
  return false;
3215
  }
 
 
3216
  }
3217
 
3218
+ static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
3219
  if (ggml_backend_buft_is_cuda_split(buft)) {
3220
  return true;
3221
  }
3222
 
3223
  if (ggml_backend_buft_is_cuda(buft)) {
3224
+ ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
3225
  ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
3226
+ return buft_ctx->device == dev_ctx->device;
3227
  }
3228
 
3229
  return false;
3230
  }
3231
 
3232
+ static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
3233
  const int min_batch_size = 32;
3234
 
3235
  return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
3236
  (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
3237
 
3238
+ GGML_UNUSED(dev);
3239
  }
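Sketch only (not in this commit): the per-device hooks above are what a scheduler would consult per graph node; a minimal combined check might look like this, again assuming the device vtable is reachable as `iface`.

// sketch: should this node be offloaded to the CUDA device?
static bool cuda_should_offload_sketch(ggml_backend_dev_t dev, const struct ggml_tensor * node) {
    return dev->iface.supports_op(dev, node) &&  // op implemented at all
           dev->iface.offload_op(dev, node);     // and large enough to be worth the copy
}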
3240
 
3241
+ static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
3242
  #ifdef GGML_CUDA_NO_PEER_COPY
3243
  return nullptr;
3244
  #else
3245
+ ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
3246
 
3247
+ ggml_cuda_set_device(dev_ctx->device);
3248
 
3249
  cudaEvent_t event;
3250
  CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
3251
 
3252
  return new ggml_backend_event {
3253
+ /* .device = */ dev,
3254
  /* .context = */ event,
3255
  };
3256
  #endif
3257
  }
3258
 
3259
+ static void ggml_backend_cuda_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
3260
+ GGML_UNUSED(dev);
3261
 
3262
+ CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
3263
  delete event;
3264
  }
3265
 
3266
+ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
3267
+ GGML_UNUSED(dev);
3268
+ CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
3269
+ }
3270
+
3271
+ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
3272
+ /* .get_name = */ ggml_backend_cuda_device_get_name,
3273
+ /* .get_description = */ ggml_backend_cuda_device_get_description,
3274
+ /* .get_memory = */ ggml_backend_cuda_device_get_memory,
3275
+ /* .get_type = */ ggml_backend_cuda_device_get_type,
3276
+ /* .get_props = */ ggml_backend_cuda_device_get_props,
3277
+ /* .init_backend = */ ggml_backend_cuda_device_init,
3278
+ /* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
3279
+ /* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
3280
+ /* .buffer_from_host_ptr = */ ggml_backend_cuda_device_buffer_from_host_ptr,
3281
+ /* .supports_op = */ ggml_backend_cuda_device_supports_op,
3282
+ /* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
3283
+ /* .offload_op = */ ggml_backend_cuda_device_offload_op,
3284
+ /* .event_new = */ ggml_backend_cuda_device_event_new,
3285
+ /* .event_free = */ ggml_backend_cuda_device_event_free,
3286
+ /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
3287
+ };
3288
 
3289
+ // backend reg
3290
+
3291
+ struct ggml_backend_cuda_reg_context {
3292
+ std::vector<ggml_backend_dev_t> devices;
3293
+ };
3294
+
3295
+ static const char * ggml_backend_cuda_reg_get_name(ggml_backend_reg_t reg) {
3296
+ GGML_UNUSED(reg);
3297
+ return GGML_CUDA_NAME;
3298
  }
3299
 
3300
+ static size_t ggml_backend_cuda_reg_get_device_count(ggml_backend_reg_t reg) {
3301
+ ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
3302
+ return ctx->devices.size();
3303
+ }
3304
 
3305
+ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t reg, size_t index) {
3306
+ ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
3307
+ GGML_ASSERT(index < ctx->devices.size());
3308
+ return ctx->devices[index];
3309
+ }
 
3310
 
3311
+ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
3312
+ GGML_UNUSED(reg);
3313
+ if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
3314
+ return (void *)ggml_backend_cuda_split_buffer_type;
3315
+ }
3316
+ if (strcmp(name, "ggml_backend_register_host_buffer") == 0) {
3317
+ return (void *)ggml_backend_cuda_register_host_buffer;
3318
  }
3319
+ if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
3320
+ return (void *)ggml_backend_cuda_unregister_host_buffer;
3321
+ }
3322
+ return nullptr;
3323
  }
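For illustration only (not part of this change): resolving one of the names exported above. A generic ggml_backend_reg_get_proc_address wrapper that forwards to this entry is assumed here; the function-pointer type mirrors ggml_backend_cuda_split_buffer_type as declared earlier in this file.

// sketch: obtain the split-buffer-type constructor without referencing CUDA symbols directly
static ggml_backend_buffer_type_t cuda_split_buft_via_reg_sketch(void) {
    typedef ggml_backend_buffer_type_t (*split_buft_fn_t)(const float * tensor_split);
    split_buft_fn_t fn = (split_buft_fn_t)
        ggml_backend_reg_get_proc_address(ggml_backend_cuda_reg(), "ggml_backend_split_buffer_type");
    return fn ? fn(nullptr) : nullptr; // nullptr tensor_split = default split (assumption)
}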
3324
 
3325
+ static void ggml_backend_cuda_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) {
3326
+ GGML_UNUSED(reg);
3327
+ ggml_backend_cuda_log_set_callback(log_callback, user_data);
3328
  }
3329
 
3330
+ static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
3331
+ /* .get_name = */ ggml_backend_cuda_reg_get_name,
3332
+ /* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
3333
+ /* .get_device = */ ggml_backend_cuda_reg_get_device,
3334
+ /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
3335
+ /* .set_log_callback = */ ggml_backend_cuda_reg_set_log_callback,
3336
  };
3337
 
3338
+ // backend registry
3339
+ ggml_backend_reg_t ggml_backend_cuda_reg() {
3340
+ static ggml_backend_reg reg;
3341
+ static bool initialized = false;
3342
+
3343
+ {
3344
+ static std::mutex mutex;
3345
+ std::lock_guard<std::mutex> lock(mutex);
3346
+ if (!initialized) {
3347
+ ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
3348
+
3349
+ for (int i = 0; i < ggml_cuda_info().device_count; i++) {
3350
+ ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
3351
+ dev_ctx->device = i;
3352
+ dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
3353
+
3354
+ ggml_cuda_set_device(i);
3355
+ cudaDeviceProp prop;
3356
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
3357
+ dev_ctx->description = prop.name;
3358
+
3359
+ ggml_backend_dev_t dev = new ggml_backend_device {
3360
+ /* .interface = */ ggml_backend_cuda_device_interface,
3361
+ /* .reg = */ &reg,
3362
+ /* .context = */ dev_ctx
3363
+ };
3364
+ ctx->devices.push_back(dev);
3365
+ }
3366
+
3367
+ reg = ggml_backend_reg {
3368
+ /* .interface = */ ggml_backend_cuda_reg_interface,
3369
+ /* .context = */ ctx
3370
+ };
3371
+ }
3372
+
3373
+ initialized = true;
3374
+ }
3375
+
3376
+ return &reg;
3377
  }
3378
 
3379
+ ggml_backend_t ggml_backend_cuda_init(int device) {
3380
  if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
3381
  GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
3382
  return nullptr;
 
3391
  ggml_backend_t cuda_backend = new ggml_backend {
3392
  /* .guid = */ ggml_backend_cuda_guid(),
3393
  /* .interface = */ ggml_backend_cuda_interface,
3394
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
3395
+ /* .context = */ ctx,
3396
  };
3397
 
3398
  return cuda_backend;
3399
  }
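A hedged end-to-end sketch (not part of this commit) tying the new registry, device, and init paths together, using only calls that appear in this file:

// requires <cstdio>; for illustration only
static ggml_backend_t cuda_pick_first_device_sketch(void) {
    ggml_backend_reg_t reg = ggml_backend_cuda_reg();
    const int n_devices = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n_devices; i++) {
        ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i); // same helper used above
        char desc[128];
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        printf("CUDA device %d: %s\n", i, desc);
        (void) dev;
    }
    return n_devices > 0 ? ggml_backend_cuda_init(0) : nullptr;
}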
 
ggml/src/ggml-kompute.cpp CHANGED
@@ -1921,6 +1921,7 @@ ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
1921
  for (const auto & dev : devices) {
1922
  vec.push_back({
1923
  /* .iface = */ ggml_backend_kompute_buffer_type_interface,
 
1924
  /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment, dev.maxAlloc)
1925
  });
1926
  }
@@ -1989,11 +1990,8 @@ static struct ggml_backend_i kompute_backend_i = {
1989
  /* .supports_op = */ ggml_backend_kompute_supports_op,
1990
  /* .supports_buft = */ ggml_backend_kompute_supports_buft,
1991
  /* .offload_op = */ NULL,
1992
- /* .event_new = */ NULL,
1993
- /* .event_free = */ NULL,
1994
  /* .event_record = */ NULL,
1995
  /* .event_wait = */ NULL,
1996
- /* .event_synchronize = */ NULL,
1997
  };
1998
 
1999
  static ggml_guid_t ggml_backend_kompute_guid() {
@@ -2008,6 +2006,7 @@ ggml_backend_t ggml_backend_kompute_init(int device) {
2008
  ggml_backend_t kompute_backend = new ggml_backend {
2009
  /* .guid = */ ggml_backend_kompute_guid(),
2010
  /* .interface = */ kompute_backend_i,
 
2011
  /* .context = */ s_kompute_context,
2012
  };
2013
 
@@ -2017,23 +2016,3 @@ ggml_backend_t ggml_backend_kompute_init(int device) {
2017
  bool ggml_backend_is_kompute(ggml_backend_t backend) {
2018
  return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid());
2019
  }
2020
-
2021
- static ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
2022
- GGML_UNUSED(params);
2023
- return ggml_backend_kompute_init(intptr_t(user_data));
2024
- }
2025
-
2026
- extern "C" int ggml_backend_kompute_reg_devices();
2027
-
2028
- int ggml_backend_kompute_reg_devices() {
2029
- auto devices = ggml_vk_available_devices_internal(0);
2030
- for (const auto & device : devices) {
2031
- ggml_backend_register(
2032
- ggml_kompute_format_name(device.index).c_str(),
2033
- ggml_backend_reg_kompute_init,
2034
- ggml_backend_kompute_buffer_type(device.index),
2035
- reinterpret_cast<void *>(intptr_t(device.index))
2036
- );
2037
- }
2038
- return devices.size();
2039
- }
 
1921
  for (const auto & dev : devices) {
1922
  vec.push_back({
1923
  /* .iface = */ ggml_backend_kompute_buffer_type_interface,
1924
+ /* .device = */ nullptr,
1925
  /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment, dev.maxAlloc)
1926
  });
1927
  }
 
1990
  /* .supports_op = */ ggml_backend_kompute_supports_op,
1991
  /* .supports_buft = */ ggml_backend_kompute_supports_buft,
1992
  /* .offload_op = */ NULL,
1993
  /* .event_record = */ NULL,
1994
  /* .event_wait = */ NULL,
 
1995
  };
1996
 
1997
  static ggml_guid_t ggml_backend_kompute_guid() {
 
2006
  ggml_backend_t kompute_backend = new ggml_backend {
2007
  /* .guid = */ ggml_backend_kompute_guid(),
2008
  /* .interface = */ kompute_backend_i,
2009
+ /* .device = */ nullptr,
2010
  /* .context = */ s_kompute_context,
2011
  };
2012
 
 
2016
  bool ggml_backend_is_kompute(ggml_backend_t backend) {
2017
  return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid());
2018
  }
 
ggml/src/ggml-metal.m CHANGED
@@ -3201,13 +3201,13 @@ static void ggml_backend_metal_free_device(void) {
3201
  }
3202
  }
3203
 
3204
- GGML_CALL static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
3205
  return "Metal";
3206
 
3207
  UNUSED(buffer);
3208
  }
3209
 
3210
- GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
3211
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
3212
 
3213
  for (int i = 0; i < ctx->n_buffers; i++) {
@@ -3226,25 +3226,25 @@ GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_
3226
  free(ctx);
3227
  }
3228
 
3229
- GGML_CALL static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
3230
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
3231
 
3232
  return ctx->all_data;
3233
  }
3234
 
3235
- GGML_CALL static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
3236
  memcpy((char *)tensor->data + offset, data, size);
3237
 
3238
  UNUSED(buffer);
3239
  }
3240
 
3241
- GGML_CALL static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
3242
  memcpy(data, (const char *)tensor->data + offset, size);
3243
 
3244
  UNUSED(buffer);
3245
  }
3246
 
3247
- GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
3248
  if (ggml_backend_buffer_is_host(src->buffer)) {
3249
  memcpy(dst->data, src->data, ggml_nbytes(src));
3250
  return true;
@@ -3254,7 +3254,7 @@ GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t
3254
  UNUSED(buffer);
3255
  }
3256
 
3257
- GGML_CALL static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
3258
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
3259
 
3260
  memset(ctx->all_data, value, ctx->all_size);
@@ -3275,7 +3275,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
3275
 
3276
  // default buffer type
3277
 
3278
- GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
3279
  return "Metal";
3280
 
3281
  UNUSED(buft);
@@ -3306,7 +3306,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
3306
  UNUSED(size_aligned);
3307
  }
3308
 
3309
- GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
3310
  struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
3311
 
3312
  const size_t size_page = sysconf(_SC_PAGESIZE);
@@ -3348,12 +3348,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
3348
  return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
3349
  }
3350
 
3351
- GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
3352
  return 32;
3353
  UNUSED(buft);
3354
  }
3355
 
3356
- GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
3357
  id<MTLDevice> device = ggml_backend_metal_get_device();
3358
  size_t max_size = device.maxBufferLength;
3359
  ggml_backend_metal_free_device();
@@ -3363,13 +3363,13 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend
3363
  UNUSED(buft);
3364
  }
3365
 
3366
- GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
3367
  return true;
3368
 
3369
  UNUSED(buft);
3370
  }
3371
 
3372
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
3373
  static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
3374
  /* .iface = */ {
3375
  /* .get_name = */ ggml_backend_metal_buffer_type_get_name,
@@ -3379,6 +3379,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
3379
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
3380
  /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
3381
  },
 
3382
  /* .context = */ NULL,
3383
  };
3384
 
@@ -3387,7 +3388,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
3387
 
3388
  // buffer from ptr
3389
 
3390
- GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
3391
  struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
3392
 
3393
  ctx->all_data = data;
@@ -3467,37 +3468,37 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
3467
 
3468
  // backend
3469
 
3470
- GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) {
3471
  return "Metal";
3472
 
3473
  UNUSED(backend);
3474
  }
3475
 
3476
- GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) {
3477
  struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
3478
  ggml_metal_free(ctx);
3479
  free(backend);
3480
  }
3481
 
3482
- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
3483
  return ggml_backend_metal_buffer_type();
3484
 
3485
  UNUSED(backend);
3486
  }
3487
 
3488
- GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
3489
  struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
3490
 
3491
  return ggml_metal_graph_compute(metal_ctx, cgraph);
3492
  }
3493
 
3494
- GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
3495
  struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
3496
 
3497
  return ggml_metal_supports_op(metal_ctx, op);
3498
  }
3499
 
3500
- GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
3501
  return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
3502
 
3503
  UNUSED(backend);
@@ -3538,11 +3539,8 @@ static struct ggml_backend_i ggml_backend_metal_i = {
3538
  /* .supports_op = */ ggml_backend_metal_supports_op,
3539
  /* .supports_buft = */ ggml_backend_metal_supports_buft,
3540
  /* .offload_op = */ NULL,
3541
- /* .event_new = */ NULL,
3542
- /* .event_free = */ NULL,
3543
  /* .event_record = */ NULL,
3544
  /* .event_wait = */ NULL,
3545
- /* .event_synchronize = */ NULL,
3546
  };
3547
 
3548
  void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
@@ -3567,6 +3565,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
3567
  *backend = (struct ggml_backend) {
3568
  /* .guid = */ ggml_backend_metal_guid(),
3569
  /* .interface = */ ggml_backend_metal_i,
 
3570
  /* .context = */ ctx,
3571
  };
3572
 
@@ -3603,9 +3602,9 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
3603
  ctx->capture_next_compute = true;
3604
  }
3605
 
3606
- GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
3607
 
3608
- GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
3609
  return ggml_backend_metal_init();
3610
 
3611
  GGML_UNUSED(params);
 
3201
  }
3202
  }
3203
 
3204
+ static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
3205
  return "Metal";
3206
 
3207
  UNUSED(buffer);
3208
  }
3209
 
3210
+ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
3211
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
3212
 
3213
  for (int i = 0; i < ctx->n_buffers; i++) {
 
3226
  free(ctx);
3227
  }
3228
 
3229
+ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
3230
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
3231
 
3232
  return ctx->all_data;
3233
  }
3234
 
3235
+ static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
3236
  memcpy((char *)tensor->data + offset, data, size);
3237
 
3238
  UNUSED(buffer);
3239
  }
3240
 
3241
+ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
3242
  memcpy(data, (const char *)tensor->data + offset, size);
3243
 
3244
  UNUSED(buffer);
3245
  }
3246
 
3247
+ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
3248
  if (ggml_backend_buffer_is_host(src->buffer)) {
3249
  memcpy(dst->data, src->data, ggml_nbytes(src));
3250
  return true;
 
3254
  UNUSED(buffer);
3255
  }
3256
 
3257
+ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
3258
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
3259
 
3260
  memset(ctx->all_data, value, ctx->all_size);
 
3275
 
3276
  // default buffer type
3277
 
3278
+ static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
3279
  return "Metal";
3280
 
3281
  UNUSED(buft);
 
3306
  UNUSED(size_aligned);
3307
  }
3308
 
3309
+ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
3310
  struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
3311
 
3312
  const size_t size_page = sysconf(_SC_PAGESIZE);
 
3348
  return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
3349
  }
3350
 
3351
+ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
3352
  return 32;
3353
  UNUSED(buft);
3354
  }
3355
 
3356
+ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
3357
  id<MTLDevice> device = ggml_backend_metal_get_device();
3358
  size_t max_size = device.maxBufferLength;
3359
  ggml_backend_metal_free_device();
 
3363
  UNUSED(buft);
3364
  }
3365
 
3366
+ static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
3367
  return true;
3368
 
3369
  UNUSED(buft);
3370
  }
3371
 
3372
+ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
3373
  static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
3374
  /* .iface = */ {
3375
  /* .get_name = */ ggml_backend_metal_buffer_type_get_name,
 
3379
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
3380
  /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
3381
  },
3382
+ /* .device = */ NULL,
3383
  /* .context = */ NULL,
3384
  };
3385
 
 
3388
 
3389
  // buffer from ptr
3390
 
3391
+ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
3392
  struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
3393
 
3394
  ctx->all_data = data;
 
3468
 
3469
  // backend
3470
 
3471
+ static const char * ggml_backend_metal_name(ggml_backend_t backend) {
3472
  return "Metal";
3473
 
3474
  UNUSED(backend);
3475
  }
3476
 
3477
+ static void ggml_backend_metal_free(ggml_backend_t backend) {
3478
  struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
3479
  ggml_metal_free(ctx);
3480
  free(backend);
3481
  }
3482
 
3483
+ static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
3484
  return ggml_backend_metal_buffer_type();
3485
 
3486
  UNUSED(backend);
3487
  }
3488
 
3489
+ static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
3490
  struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
3491
 
3492
  return ggml_metal_graph_compute(metal_ctx, cgraph);
3493
  }
3494
 
3495
+ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
3496
  struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
3497
 
3498
  return ggml_metal_supports_op(metal_ctx, op);
3499
  }
3500
 
3501
+ static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
3502
  return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
3503
 
3504
  UNUSED(backend);
 
3539
  /* .supports_op = */ ggml_backend_metal_supports_op,
3540
  /* .supports_buft = */ ggml_backend_metal_supports_buft,
3541
  /* .offload_op = */ NULL,
3542
  /* .event_record = */ NULL,
3543
  /* .event_wait = */ NULL,
 
3544
  };
3545
 
3546
  void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
 
3565
  *backend = (struct ggml_backend) {
3566
  /* .guid = */ ggml_backend_metal_guid(),
3567
  /* .interface = */ ggml_backend_metal_i,
3568
+ /* .device = */ NULL,
3569
  /* .context = */ ctx,
3570
  };
3571
 
 
3602
  ctx->capture_next_compute = true;
3603
  }
3604
 
3605
+ ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
3606
 
3607
+ ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
3608
  return ggml_backend_metal_init();
3609
 
3610
  GGML_UNUSED(params);
ggml/src/ggml-rpc.cpp CHANGED
@@ -319,12 +319,12 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
319
  return sock;
320
  }
321
 
322
- GGML_CALL static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
323
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
324
  return ctx->name.c_str();
325
  }
326
 
327
- GGML_CALL static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
328
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
329
  // input serialization format: | remote_ptr (8 bytes) |
330
  std::vector<uint8_t> input(sizeof(uint64_t), 0);
@@ -337,7 +337,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t
337
  delete ctx;
338
  }
339
 
340
- GGML_CALL static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
341
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
342
  if (ctx->base_cache.find(buffer) != ctx->base_cache.end()) {
343
  return ctx->base_cache[buffer];
@@ -388,7 +388,7 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
388
  return result;
389
  }
390
 
391
- GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
392
  UNUSED(buffer);
393
  if (ggml_is_quantized(tensor->type)) {
394
  // TODO: this check is due to MATRIX_ROW_PADDING in CUDA and should be generalized
@@ -396,7 +396,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t
396
  }
397
  }
398
 
399
- GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
400
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
401
  // input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) |
402
  size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + size;
@@ -410,7 +410,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t b
410
  GGML_ASSERT(status);
411
  }
412
 
413
- GGML_CALL static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
414
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
415
  // input serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) |
416
  int input_size = sizeof(rpc_tensor) + 2*sizeof(uint64_t);
@@ -427,7 +427,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t b
427
  memcpy(data, output.data(), size);
428
  }
429
 
430
- GGML_CALL static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
431
  // check if src and dst are on the same server
432
  ggml_backend_buffer_t src_buffer = src->buffer;
433
  ggml_backend_rpc_buffer_context * src_ctx = (ggml_backend_rpc_buffer_context *)src_buffer->context;
@@ -452,7 +452,7 @@ GGML_CALL static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t b
452
  return output[0];
453
  }
454
 
455
- GGML_CALL static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
456
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
457
  // serialization format: | bufptr (8 bytes) | value (1 byte) |
458
  int input_size = sizeof(uint64_t) + sizeof(uint8_t);
@@ -477,12 +477,12 @@ static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
477
  /* .reset = */ NULL,
478
  };
479
 
480
- GGML_CALL static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t buft) {
481
  ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
482
  return buft_ctx->name.c_str();
483
  }
484
 
485
- GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
486
  ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
487
  // input serialization format: | size (8 bytes) |
488
  int input_size = sizeof(uint64_t);
@@ -522,7 +522,7 @@ static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
522
  return alignment;
523
  }
524
 
525
- GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
526
  ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
527
  return buft_ctx->alignment;
528
  }
@@ -540,12 +540,12 @@ static size_t get_max_size(const std::shared_ptr<socket_t> & sock) {
540
  return max_size;
541
  }
542
 
543
- GGML_CALL static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
544
  ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
545
  return buft_ctx->max_size;
546
  }
547
 
548
- GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
549
  UNUSED(buft);
550
  return ggml_nbytes(tensor);
551
  }
@@ -559,24 +559,24 @@ static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
559
  /* .is_host = */ NULL,
560
  };
561
 
562
- GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
563
  ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
564
 
565
  return rpc_ctx->name.c_str();
566
  }
567
 
568
- GGML_CALL static void ggml_backend_rpc_free(ggml_backend_t backend) {
569
  ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
570
  delete rpc_ctx;
571
  delete backend;
572
  }
573
 
574
- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
575
  ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
576
  return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
577
  }
578
 
579
- GGML_CALL static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
580
  UNUSED(backend);
581
  // this is a no-op because we don't have any async operations
582
  }
@@ -618,7 +618,7 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & o
618
  memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
619
  }
620
 
621
- GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
622
  ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
623
  std::vector<uint8_t> input;
624
  serialize_graph(cgraph, input);
@@ -630,14 +630,14 @@ GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t
630
  return (enum ggml_status)output[0];
631
  }
632
 
633
- GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
634
  UNUSED(backend);
635
  UNUSED(op);
636
  //TODO: call the remote backend and cache the results
637
  return true;
638
  }
639
 
640
- GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
641
  if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
642
  return false;
643
  }
@@ -662,14 +662,11 @@ static ggml_backend_i ggml_backend_rpc_interface = {
662
  /* .supports_op = */ ggml_backend_rpc_supports_op,
663
  /* .supports_buft = */ ggml_backend_rpc_supports_buft,
664
  /* .offload_op = */ NULL,
665
- /* .event_new = */ NULL,
666
- /* .event_free = */ NULL,
667
  /* .event_record = */ NULL,
668
  /* .event_wait = */ NULL,
669
- /* .event_synchronize = */ NULL,
670
  };
671
 
672
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
673
  static std::mutex mutex;
674
  std::lock_guard<std::mutex> lock(mutex);
675
  // NOTE: buffer types are allocated and never freed; this is by design
@@ -694,13 +691,14 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
694
 
695
  ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type {
696
  /* .iface = */ ggml_backend_rpc_buffer_type_interface,
 
697
  /* .context = */ buft_ctx
698
  };
699
  buft_map[endpoint] = buft;
700
  return buft;
701
  }
702
 
703
- GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
704
  ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
705
  /* .endpoint = */ endpoint,
706
  /* .name = */ "RPC[" + std::string(endpoint) + "]",
@@ -709,12 +707,13 @@ GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
709
  ggml_backend_t backend = new ggml_backend {
710
  /* .guid = */ ggml_backend_rpc_guid(),
711
  /* .interface = */ ggml_backend_rpc_interface,
 
712
  /* .context = */ ctx
713
  };
714
  return backend;
715
  }
716
 
717
- GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend) {
718
  return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid());
719
  }
720
 
@@ -734,7 +733,7 @@ static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * f
734
  *total = total_mem;
735
  }
736
 
737
- GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
738
  auto sock = get_socket(endpoint);
739
  if (sock == nullptr) {
740
  *free = 0;
 
319
  return sock;
320
  }
321
 
322
+ static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
323
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
324
  return ctx->name.c_str();
325
  }
326
 
327
+ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
328
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
329
  // input serialization format: | remote_ptr (8 bytes) |
330
  std::vector<uint8_t> input(sizeof(uint64_t), 0);
 
337
  delete ctx;
338
  }
339
 
340
+ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
341
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
342
  if (ctx->base_cache.find(buffer) != ctx->base_cache.end()) {
343
  return ctx->base_cache[buffer];
 
388
  return result;
389
  }
390
 
391
+ static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
392
  UNUSED(buffer);
393
  if (ggml_is_quantized(tensor->type)) {
394
  // TODO: this check is due to MATRIX_ROW_PADDING in CUDA and should be generalized
 
396
  }
397
  }
398
 
399
+ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
400
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
401
  // input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) |
402
  size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + size;
 
410
  GGML_ASSERT(status);
411
  }
412
 
413
+ static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
414
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
415
  // input serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) |
416
  int input_size = sizeof(rpc_tensor) + 2*sizeof(uint64_t);
 
427
  memcpy(data, output.data(), size);
428
  }
429
 
430
+ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
431
  // check if src and dst are on the same server
432
  ggml_backend_buffer_t src_buffer = src->buffer;
433
  ggml_backend_rpc_buffer_context * src_ctx = (ggml_backend_rpc_buffer_context *)src_buffer->context;
 
452
  return output[0];
453
  }
454
 
455
+ static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
456
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
457
  // serialization format: | bufptr (8 bytes) | value (1 byte) |
458
  int input_size = sizeof(uint64_t) + sizeof(uint8_t);
 
477
  /* .reset = */ NULL,
478
  };
479
 
480
+ static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t buft) {
481
  ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
482
  return buft_ctx->name.c_str();
483
  }
484
 
485
+ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
486
  ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
487
  // input serialization format: | size (8 bytes) |
488
  int input_size = sizeof(uint64_t);
 
522
  return alignment;
523
  }
524
 
525
+ static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
526
  ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
527
  return buft_ctx->alignment;
528
  }
 
540
  return max_size;
541
  }
542
 
543
+ static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
544
  ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
545
  return buft_ctx->max_size;
546
  }
547
 
548
+ static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
549
  UNUSED(buft);
550
  return ggml_nbytes(tensor);
551
  }
 
559
  /* .is_host = */ NULL,
560
  };
561
 
562
+ static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
563
  ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
564
 
565
  return rpc_ctx->name.c_str();
566
  }
567
 
568
+ static void ggml_backend_rpc_free(ggml_backend_t backend) {
569
  ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
570
  delete rpc_ctx;
571
  delete backend;
572
  }
573
 
574
+ static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
575
  ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
576
  return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
577
  }
578
 
579
+ static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
580
  UNUSED(backend);
581
  // this is a no-op because we don't have any async operations
582
  }
 
618
  memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
619
  }
620
 
621
+ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
622
  ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
623
  std::vector<uint8_t> input;
624
  serialize_graph(cgraph, input);
 
630
  return (enum ggml_status)output[0];
631
  }
632
 
633
+ static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
634
  UNUSED(backend);
635
  UNUSED(op);
636
  //TODO: call the remote backend and cache the results
637
  return true;
638
  }
639
 
640
+ static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
641
  if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
642
  return false;
643
  }
 
662
  /* .supports_op = */ ggml_backend_rpc_supports_op,
663
  /* .supports_buft = */ ggml_backend_rpc_supports_buft,
664
  /* .offload_op = */ NULL,
 
 
665
  /* .event_record = */ NULL,
666
  /* .event_wait = */ NULL,
 
667
  };
668
 
669
+ GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
670
  static std::mutex mutex;
671
  std::lock_guard<std::mutex> lock(mutex);
672
  // NOTE: buffer types are allocated and never freed; this is by design
 
691
 
692
  ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type {
693
  /* .iface = */ ggml_backend_rpc_buffer_type_interface,
694
+ /* .device = */ nullptr,
695
  /* .context = */ buft_ctx
696
  };
697
  buft_map[endpoint] = buft;
698
  return buft;
699
  }
700
 
701
+ ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
702
  ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
703
  /* .endpoint = */ endpoint,
704
  /* .name = */ "RPC[" + std::string(endpoint) + "]",
 
707
  ggml_backend_t backend = new ggml_backend {
708
  /* .guid = */ ggml_backend_rpc_guid(),
709
  /* .interface = */ ggml_backend_rpc_interface,
710
+ /* .device = */ nullptr,
711
  /* .context = */ ctx
712
  };
713
  return backend;
714
  }
715
 
716
+ GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend) {
717
  return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid());
718
  }
719
 
 
733
  *total = total_mem;
734
  }
735
 
736
+ GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
737
  auto sock = get_socket(endpoint);
738
  if (sock == nullptr) {
739
  *free = 0;
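
The RPC buffer and buffer-type callbacks above all follow the same round trip: pack the request into a flat byte vector using the serialization format noted in each function (for get_tensor, | rpc_tensor | offset (8 bytes) | size (8 bytes) |), send it over the socket held by the buffer context, and copy the raw reply back out. A minimal sketch of that pattern, assuming a serialize_tensor helper, a send_rpc_cmd(sock, cmd, input, output) helper, a RPC_CMD_GET_TENSOR command id, and a ctx->sock member; the actual names and signatures in ggml-rpc.cpp may differ:

// Sketch only: the get_tensor round trip implied by the format comment above.
static void rpc_get_tensor_sketch(ggml_backend_rpc_buffer_context * ctx,
                                  const ggml_tensor * tensor,
                                  void * data, size_t offset, size_t size) {
    rpc_tensor rpc = serialize_tensor(tensor);                 // assumed helper
    uint64_t   off = offset;
    uint64_t   len = size;
    std::vector<uint8_t> input(sizeof(rpc) + 2 * sizeof(uint64_t), 0);
    memcpy(input.data(),                             &rpc, sizeof(rpc));
    memcpy(input.data() + sizeof(rpc),               &off, sizeof(off));
    memcpy(input.data() + sizeof(rpc) + sizeof(off), &len, sizeof(len));
    std::vector<uint8_t> output;
    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, input, output); // assumed helper + command id
    GGML_ASSERT(status && output.size() == size);
    memcpy(data, output.data(), size);                         // reply body is the raw tensor bytes
}
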
ggml/src/ggml-sycl.cpp CHANGED
@@ -4038,7 +4038,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
4038
  return true;
4039
  }
4040
 
4041
- GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
4042
  GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
4043
  for(int i=0;i<max_len;i++) id_list[i] = -1;
4044
 
@@ -4068,7 +4068,7 @@ catch (sycl::exception const &exc) {
4068
  std::exit(1);
4069
  }
4070
 
4071
- GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
4072
  size_t description_size) try {
4073
  GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
4074
  dpct::device_info prop;
@@ -4082,7 +4082,7 @@ catch (sycl::exception const &exc) {
4082
  std::exit(1);
4083
  }
4084
 
4085
- GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
4086
  size_t *total) try {
4087
  GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
4088
  ggml_sycl_set_device(device);
@@ -4135,12 +4135,12 @@ struct ggml_backend_sycl_buffer_context {
4135
  }
4136
  };
4137
 
4138
- GGML_CALL static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
4139
  ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
4140
  return ctx->name.c_str();
4141
  }
4142
 
4143
- GGML_CALL static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
4144
  return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
4145
  }
4146
 
@@ -4162,7 +4162,7 @@ static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
4162
  return ctx->dev_ptr;
4163
  }
4164
 
4165
- GGML_CALL static void
4166
  ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
4167
  ggml_tensor *tensor) try {
4168
  ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
@@ -4237,7 +4237,7 @@ catch (sycl::exception const &exc) {
4237
  std::exit(1);
4238
  }
4239
 
4240
- GGML_CALL static bool
4241
  ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
4242
  const ggml_tensor *src,
4243
  ggml_tensor *dst) try {
@@ -4339,12 +4339,12 @@ struct ggml_backend_sycl_buffer_type_context {
4339
  queue_ptr stream = nullptr;
4340
  };
4341
 
4342
- GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) {
4343
  ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
4344
 
4345
  return ctx->name.c_str();
4346
  }
4347
- GGML_CALL static ggml_backend_buffer_t
4348
  ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
4349
  size_t size) try {
4350
  ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
@@ -4368,7 +4368,7 @@ catch (sycl::exception const &exc) {
4368
  std::exit(1);
4369
  }
4370
 
4371
- GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
4372
  return 128;
4373
  UNUSED(buft);
4374
  }
@@ -4379,7 +4379,7 @@ static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_typ
4379
  UNUSED(buft);
4380
  }
4381
 
4382
- GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
4383
  size_t size = ggml_nbytes(tensor);
4384
  int64_t ne0 = tensor->ne[0];
4385
 
@@ -4424,6 +4424,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
4424
  queue_ptr stream = &(device_i.default_queue());
4425
  ggml_backend_sycl_buffer_types[i] = {
4426
  /* .iface = */ ggml_backend_sycl_buffer_type_interface,
 
4427
  /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream},
4428
  };
4429
  }
@@ -4449,6 +4450,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte
4449
  for (int i = 0; i < ggml_sycl_info().device_count; i++) {
4450
  ggml_backend_sycl_buffer_types[i] = {
4451
  /* .iface = */ ggml_backend_sycl_buffer_type_interface,
 
4452
  /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), ctx->stream(i, 0)},
4453
  };
4454
  }
@@ -4513,7 +4515,7 @@ struct ggml_backend_sycl_split_buffer_context {
4513
  std::vector<queue_ptr> streams;
4514
  };
4515
 
4516
- GGML_CALL static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
4517
  return GGML_SYCL_NAME "_Split";
4518
 
4519
  UNUSED(buffer);
@@ -4523,19 +4525,19 @@ static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
4523
  return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
4524
  }
4525
 
4526
- GGML_CALL static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
4527
  ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
4528
  delete ctx;
4529
  }
4530
 
4531
- GGML_CALL static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
4532
  // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
4533
  return (void *)0x1000;
4534
 
4535
  UNUSED(buffer);
4536
  }
4537
 
4538
- GGML_CALL static void
4539
  ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
4540
  ggml_tensor *tensor) try {
4541
  GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
@@ -4618,7 +4620,7 @@ catch (sycl::exception const &exc) {
4618
  std::exit(1);
4619
  }
4620
 
4621
- GGML_CALL static void
4622
  ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
4623
  ggml_tensor *tensor, const void *data,
4624
  size_t offset, size_t size) try {
@@ -4671,7 +4673,7 @@ catch (sycl::exception const &exc) {
4671
  std::exit(1);
4672
  }
4673
 
4674
- GGML_CALL static void
4675
  ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
4676
  const ggml_tensor *tensor, void *data,
4677
  size_t offset, size_t size) try {
@@ -4724,7 +4726,7 @@ catch (sycl::exception const &exc) {
4724
  std::exit(1);
4725
  }
4726
 
4727
- GGML_CALL static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
4728
  UNUSED(buffer);
4729
  UNUSED(value);
4730
  }
@@ -4742,13 +4744,13 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
4742
  /* .reset = */ NULL,
4743
  };
4744
 
4745
- GGML_CALL static const char * ggml_backend_sycl_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
4746
  return GGML_SYCL_NAME "_Split";
4747
 
4748
  UNUSED(buft);
4749
  }
4750
 
4751
- GGML_CALL static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
4752
  // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
4753
  // instead, we allocate them for each tensor separately in init_tensor
4754
  // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
@@ -4758,12 +4760,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc
4758
  return ggml_backend_buffer_init(buft, ggml_backend_sycl_split_buffer_interface, ctx, size);
4759
  }
4760
 
4761
- GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
4762
  return 128;
4763
  UNUSED(buft);
4764
  }
4765
 
4766
- GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
4767
  ggml_backend_sycl_split_buffer_type_context * ctx = (ggml_backend_sycl_split_buffer_type_context *)buft->context;
4768
 
4769
  size_t total_size = 0;
@@ -4790,7 +4792,7 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
4790
  return total_size;
4791
  }
4792
 
4793
- GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
4794
  return false;
4795
 
4796
  UNUSED(buft);
@@ -4805,7 +4807,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
4805
  /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
4806
  };
4807
 
4808
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
4809
  static std::mutex mutex;
4810
  std::lock_guard<std::mutex> lock(mutex);
4811
 
@@ -4837,6 +4839,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f
4837
 
4838
  struct ggml_backend_buffer_type buft {
4839
  /* .iface = */ ggml_backend_sycl_split_buffer_type_interface,
 
4840
  /* .context = */ new ggml_backend_sycl_split_buffer_type_context{tensor_split_arr},
4841
  };
4842
 
@@ -4846,13 +4849,13 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f
4846
 
4847
  // host buffer type
4848
 
4849
- GGML_CALL static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
4850
  return GGML_SYCL_NAME "_Host";
4851
 
4852
  UNUSED(buft);
4853
  }
4854
 
4855
- GGML_CALL static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
4856
  return GGML_SYCL_NAME "_Host";
4857
 
4858
  UNUSED(buffer);
@@ -4890,6 +4893,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
4890
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
4891
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
4892
  },
 
4893
  /* .context = */ nullptr,
4894
  };
4895
 
@@ -4898,14 +4902,14 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
4898
 
4899
  // backend
4900
 
4901
- GGML_CALL static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
4902
 
4903
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
4904
 
4905
  return sycl_ctx->name.c_str();
4906
  }
4907
 
4908
- GGML_CALL static void ggml_backend_sycl_free(ggml_backend_t backend) {
4909
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
4910
 
4911
  delete sycl_ctx;
@@ -4913,12 +4917,12 @@ GGML_CALL static void ggml_backend_sycl_free(ggml_backend_t backend) {
4913
  }
4914
 
4915
 
4916
- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
4917
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
4918
  return ggml_backend_sycl_buffer_type(sycl_ctx->device);
4919
  }
4920
 
4921
- GGML_CALL static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
4922
  ggml_tensor *tensor,
4923
  const void *data, size_t offset,
4924
  size_t size) try {
@@ -4936,7 +4940,7 @@ catch (sycl::exception const &exc) {
4936
  std::exit(1);
4937
  }
4938
 
4939
- GGML_CALL static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
4940
  const ggml_tensor *tensor,
4941
  void *data, size_t offset,
4942
  size_t size) try {
@@ -4954,9 +4958,9 @@ catch (sycl::exception const &exc) {
4954
  std::exit(1);
4955
  }
4956
 
4957
- GGML_CALL static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
4958
- const ggml_tensor *src,
4959
- ggml_tensor *dst) try {
4960
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
4961
  if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) {
4962
  /*
@@ -4991,7 +4995,7 @@ catch (sycl::exception const &exc) {
4991
  std::exit(1);
4992
  }
4993
 
4994
- GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
4995
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
4996
  ggml_sycl_set_main_device(sycl_ctx->device);
4997
 
@@ -5019,7 +5023,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
5019
  return GGML_STATUS_SUCCESS;
5020
  }
5021
 
5022
- GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
5023
  switch (op->op) {
5024
  case GGML_OP_CONV_TRANSPOSE_1D:
5025
  {
@@ -5166,13 +5170,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
5166
  UNUSED(backend);
5167
  }
5168
 
5169
- GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
5170
  const int min_batch_size = 32;
5171
  return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
5172
  GGML_UNUSED(backend);
5173
  }
5174
 
5175
- GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
5176
  if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
5177
  return false;
5178
  }
@@ -5197,11 +5201,8 @@ static ggml_backend_i ggml_backend_sycl_interface = {
5197
  /* .supports_op = */ ggml_backend_sycl_supports_op,
5198
  /* .supports_buft = */ ggml_backend_sycl_supports_buft,
5199
  /* .offload_op = */ ggml_backend_sycl_offload_op,
5200
- /* .event_new = */ NULL,
5201
- /* .event_free = */ NULL,
5202
  /* .event_record = */ NULL,
5203
  /* .event_wait = */ NULL,
5204
- /* .event_synchronize = */ NULL,
5205
  };
5206
 
5207
  static ggml_guid_t ggml_backend_sycl_guid() {
@@ -5209,7 +5210,7 @@ static ggml_guid_t ggml_backend_sycl_guid() {
5209
  return &guid;
5210
  }
5211
 
5212
- GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
5213
  GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
5214
  ggml_check_sycl();
5215
 
@@ -5224,6 +5225,7 @@ GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
5224
  ggml_backend_t sycl_backend = new ggml_backend {
5225
  /* .guid = */ ggml_backend_sycl_guid(),
5226
  /* .interface = */ ggml_backend_sycl_interface,
 
5227
  /* .context = */ ctx
5228
  };
5229
 
@@ -5234,26 +5236,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
5234
  return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
5235
  }
5236
 
5237
- GGML_CALL int ggml_backend_sycl_get_device_count() {
5238
  GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
5239
  return ggml_sycl_info().device_count;
5240
  }
5241
-
5242
- GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
5243
- ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data);
5244
- return sycl_backend;
5245
-
5246
- UNUSED(params);
5247
- }
5248
-
5249
- extern "C" int ggml_backend_sycl_reg_devices();
5250
-
5251
- int ggml_backend_sycl_reg_devices() {
5252
- assert(ggml_sycl_info().device_count>0);
5253
- for (int i = 0; i < ggml_sycl_info().device_count; i++) {
5254
- char name[128];
5255
- snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, i);
5256
- ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(i), (void *) (intptr_t) i);
5257
- }
5258
- return ggml_sycl_info().device_count;
5259
- }
 
4038
  return true;
4039
  }
4040
 
4041
+ GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
4042
  GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
4043
  for(int i=0;i<max_len;i++) id_list[i] = -1;
4044
 
 
4068
  std::exit(1);
4069
  }
4070
 
4071
+ GGML_API void ggml_sycl_get_device_description(int device, char *description,
4072
  size_t description_size) try {
4073
  GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
4074
  dpct::device_info prop;
 
4082
  std::exit(1);
4083
  }
4084
 
4085
+ void ggml_backend_sycl_get_device_memory(int device, size_t *free,
4086
  size_t *total) try {
4087
  GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
4088
  ggml_sycl_set_device(device);
 
4135
  }
4136
  };
4137
 
4138
+ static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
4139
  ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
4140
  return ctx->name.c_str();
4141
  }
4142
 
4143
+ static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
4144
  return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
4145
  }
4146
 
 
4162
  return ctx->dev_ptr;
4163
  }
4164
 
4165
+ static void
4166
  ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
4167
  ggml_tensor *tensor) try {
4168
  ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
 
4237
  std::exit(1);
4238
  }
4239
 
4240
+ static bool
4241
  ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
4242
  const ggml_tensor *src,
4243
  ggml_tensor *dst) try {
 
4339
  queue_ptr stream = nullptr;
4340
  };
4341
 
4342
+ static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) {
4343
  ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
4344
 
4345
  return ctx->name.c_str();
4346
  }
4347
+ static ggml_backend_buffer_t
4348
  ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
4349
  size_t size) try {
4350
  ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
 
4368
  std::exit(1);
4369
  }
4370
 
4371
+ static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
4372
  return 128;
4373
  UNUSED(buft);
4374
  }
 
4379
  UNUSED(buft);
4380
  }
4381
 
4382
+ static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
4383
  size_t size = ggml_nbytes(tensor);
4384
  int64_t ne0 = tensor->ne[0];
4385
 
 
4424
  queue_ptr stream = &(device_i.default_queue());
4425
  ggml_backend_sycl_buffer_types[i] = {
4426
  /* .iface = */ ggml_backend_sycl_buffer_type_interface,
4427
+ /* .device = */ nullptr,
4428
  /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream},
4429
  };
4430
  }
 
4450
  for (int i = 0; i < ggml_sycl_info().device_count; i++) {
4451
  ggml_backend_sycl_buffer_types[i] = {
4452
  /* .iface = */ ggml_backend_sycl_buffer_type_interface,
4453
+ /* .device = */ nullptr,
4454
  /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), ctx->stream(i, 0)},
4455
  };
4456
  }
 
4515
  std::vector<queue_ptr> streams;
4516
  };
4517
 
4518
+ static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
4519
  return GGML_SYCL_NAME "_Split";
4520
 
4521
  UNUSED(buffer);
 
4525
  return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
4526
  }
4527
 
4528
+ static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
4529
  ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
4530
  delete ctx;
4531
  }
4532
 
4533
+ static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
4534
  // the pointers are stored in the tensor extras; this is just a dummy address and is never dereferenced
4535
  return (void *)0x1000;
4536
 
4537
  UNUSED(buffer);
4538
  }
4539
 
4540
+ static void
4541
  ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
4542
  ggml_tensor *tensor) try {
4543
  GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
 
4620
  std::exit(1);
4621
  }
4622
 
4623
+ static void
4624
  ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
4625
  ggml_tensor *tensor, const void *data,
4626
  size_t offset, size_t size) try {
 
4673
  std::exit(1);
4674
  }
4675
 
4676
+ static void
4677
  ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
4678
  const ggml_tensor *tensor, void *data,
4679
  size_t offset, size_t size) try {
 
4726
  std::exit(1);
4727
  }
4728
 
4729
+ static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
4730
  UNUSED(buffer);
4731
  UNUSED(value);
4732
  }
 
4744
  /* .reset = */ NULL,
4745
  };
4746
 
4747
+ static const char * ggml_backend_sycl_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
4748
  return GGML_SYCL_NAME "_Split";
4749
 
4750
  UNUSED(buft);
4751
  }
4752
 
4753
+ static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
4754
  // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
4755
  // instead, we allocate them for each tensor separately in init_tensor
4756
  // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
 
4760
  return ggml_backend_buffer_init(buft, ggml_backend_sycl_split_buffer_interface, ctx, size);
4761
  }
4762
 
4763
+ static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
4764
  return 128;
4765
  UNUSED(buft);
4766
  }
4767
 
4768
+ static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
4769
  ggml_backend_sycl_split_buffer_type_context * ctx = (ggml_backend_sycl_split_buffer_type_context *)buft->context;
4770
 
4771
  size_t total_size = 0;
 
4792
  return total_size;
4793
  }
4794
 
4795
+ static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
4796
  return false;
4797
 
4798
  UNUSED(buft);
 
4807
  /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
4808
  };
4809
 
4810
+ ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
4811
  static std::mutex mutex;
4812
  std::lock_guard<std::mutex> lock(mutex);
4813
 
 
4839
 
4840
  struct ggml_backend_buffer_type buft {
4841
  /* .iface = */ ggml_backend_sycl_split_buffer_type_interface,
4842
+ /* .device = */ nullptr,
4843
  /* .context = */ new ggml_backend_sycl_split_buffer_type_context{tensor_split_arr},
4844
  };
4845
 
 
4849
 
4850
  // host buffer type
4851
 
4852
+ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
4853
  return GGML_SYCL_NAME "_Host";
4854
 
4855
  UNUSED(buft);
4856
  }
4857
 
4858
+ static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
4859
  return GGML_SYCL_NAME "_Host";
4860
 
4861
  UNUSED(buffer);
 
4893
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
4894
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
4895
  },
4896
+ /* .device = */ nullptr,
4897
  /* .context = */ nullptr,
4898
  };
4899
 
 
4902
 
4903
  // backend
4904
 
4905
+ static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
4906
 
4907
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
4908
 
4909
  return sycl_ctx->name.c_str();
4910
  }
4911
 
4912
+ static void ggml_backend_sycl_free(ggml_backend_t backend) {
4913
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
4914
 
4915
  delete sycl_ctx;
 
4917
  }
4918
 
4919
 
4920
+ static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
4921
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
4922
  return ggml_backend_sycl_buffer_type(sycl_ctx->device);
4923
  }
4924
 
4925
+ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
4926
  ggml_tensor *tensor,
4927
  const void *data, size_t offset,
4928
  size_t size) try {
 
4940
  std::exit(1);
4941
  }
4942
 
4943
+ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
4944
  const ggml_tensor *tensor,
4945
  void *data, size_t offset,
4946
  size_t size) try {
 
4958
  std::exit(1);
4959
  }
4960
 
4961
+ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
4962
+ const ggml_tensor *src,
4963
+ ggml_tensor *dst) try {
4964
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
4965
  if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) {
4966
  /*
 
4995
  std::exit(1);
4996
  }
4997
 
4998
+ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
4999
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
5000
  ggml_sycl_set_main_device(sycl_ctx->device);
5001
 
 
5023
  return GGML_STATUS_SUCCESS;
5024
  }
5025
 
5026
+ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
5027
  switch (op->op) {
5028
  case GGML_OP_CONV_TRANSPOSE_1D:
5029
  {
 
5170
  UNUSED(backend);
5171
  }
5172
 
5173
+ static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
5174
  const int min_batch_size = 32;
5175
  return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
5176
  GGML_UNUSED(backend);
5177
  }
5178
 
5179
+ static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
5180
  if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
5181
  return false;
5182
  }
 
5201
  /* .supports_op = */ ggml_backend_sycl_supports_op,
5202
  /* .supports_buft = */ ggml_backend_sycl_supports_buft,
5203
  /* .offload_op = */ ggml_backend_sycl_offload_op,
 
 
5204
  /* .event_record = */ NULL,
5205
  /* .event_wait = */ NULL,
 
5206
  };
5207
 
5208
  static ggml_guid_t ggml_backend_sycl_guid() {
 
5210
  return &guid;
5211
  }
5212
 
5213
+ ggml_backend_t ggml_backend_sycl_init(int device) {
5214
  GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
5215
  ggml_check_sycl();
5216
 
 
5225
  ggml_backend_t sycl_backend = new ggml_backend {
5226
  /* .guid = */ ggml_backend_sycl_guid(),
5227
  /* .interface = */ ggml_backend_sycl_interface,
5228
+ /* .device = */ nullptr,
5229
  /* .context = */ ctx
5230
  };
5231
 
 
5236
  return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
5237
  }
5238
 
5239
+ int ggml_backend_sycl_get_device_count() {
5240
  GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
5241
  return ggml_sycl_info().device_count;
5242
  }
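
As in the RPC file, the SYCL changes drop the GGML_CALL annotation from every definition and add a .device member to each ggml_backend_buffer_type and ggml_backend initializer, set to nullptr because the backend is not yet attached to a device object; in addition, the old ggml_backend_sycl_reg_devices registration path is removed in favor of the new device and backend reg interfaces. A minimal sketch of the initializer pattern used throughout this commit (only the members visible in this diff are shown):

// Sketch only: how the backend objects are constructed after this change.
static ggml_backend_buffer_type_t make_buffer_type_sketch(ggml_backend_buffer_type_i iface, void * ctx) {
    return new ggml_backend_buffer_type {
        /* .iface   = */ iface,
        /* .device  = */ nullptr, // no device association yet
        /* .context = */ ctx,
    };
}
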
ggml/src/ggml-vulkan.cpp CHANGED
@@ -119,11 +119,11 @@ struct ggml_backend_vk_buffer_type_context {
119
  vk_device device;
120
  };
121
 
122
- GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
123
- GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
124
- GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
125
- GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft);
126
- GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor);
127
  static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
128
  /* .get_name = */ ggml_backend_vk_buffer_type_name,
129
  /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
@@ -622,7 +622,7 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor);
622
 
623
  typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
624
 
625
- GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
626
 
627
  // variables to track number of compiles in progress
628
  static uint32_t compile_count = 0;
@@ -1953,6 +1953,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
1953
 
1954
  device->buffer_type = {
1955
  /* .iface = */ ggml_backend_vk_buffer_type_interface,
 
1956
  /* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
1957
  };
1958
 
@@ -6147,13 +6148,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
6147
  ctx->device->device.destroyFence(ctx->fence);
6148
  }
6149
 
6150
- GGML_CALL static int ggml_vk_get_device_count() {
6151
  ggml_vk_instance_init();
6152
 
6153
  return vk_instance.device_indices.size();
6154
  }
6155
 
6156
- GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
6157
  ggml_vk_instance_init();
6158
 
6159
  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -6170,36 +6171,36 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
6170
 
6171
  // device backend
6172
 
6173
- GGML_CALL static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
6174
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6175
  return ctx->name.c_str();
6176
  }
6177
 
6178
- GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
6179
  return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
6180
  }
6181
 
6182
- GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
6183
  VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
6184
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6185
  ggml_vk_destroy_buffer(ctx->dev_buffer);
6186
  delete ctx;
6187
  }
6188
 
6189
- GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
6190
  return vk_ptr_base;
6191
 
6192
  UNUSED(buffer);
6193
  }
6194
 
6195
- GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
6196
  VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
6197
  if (tensor->view_src != nullptr) {
6198
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
6199
  }
6200
  }
6201
 
6202
- GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
6203
  VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
6204
  ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6205
  vk_buffer buf = buf_ctx->dev_buffer;
@@ -6207,7 +6208,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
6207
  ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
6208
  }
6209
 
6210
- GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
6211
  VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
6212
  ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6213
 
@@ -6216,7 +6217,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
6216
  ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
6217
  }
6218
 
6219
- GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
6220
  if (ggml_backend_buffer_is_vk(src->buffer)) {
6221
  ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
6222
  ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
@@ -6233,7 +6234,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
6233
  UNUSED(buffer);
6234
  }
6235
 
6236
- GGML_CALL static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
6237
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6238
 
6239
  ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size);
@@ -6253,13 +6254,13 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
6253
  };
6254
 
6255
  // vk buffer type
6256
- GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) {
6257
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
6258
 
6259
  return ctx->name.c_str();
6260
  }
6261
 
6262
- GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
6263
  VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
6264
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
6265
 
@@ -6275,23 +6276,23 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
6275
  return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size);
6276
  }
6277
 
6278
- GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
6279
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
6280
  return ctx->device->properties.limits.minStorageBufferOffsetAlignment;
6281
  }
6282
 
6283
- GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
6284
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
6285
  return ctx->device->max_memory_allocation_size;
6286
  }
6287
 
6288
- GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
6289
  return ggml_nbytes(tensor);
6290
 
6291
  UNUSED(buft);
6292
  }
6293
 
6294
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
6295
  ggml_vk_instance_init();
6296
 
6297
  VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");
@@ -6303,24 +6304,24 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num)
6303
 
6304
  // host buffer type
6305
 
6306
- GGML_CALL static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
6307
  return GGML_VK_NAME "_Host";
6308
 
6309
  UNUSED(buft);
6310
  }
6311
 
6312
- GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) {
6313
  return GGML_VK_NAME "_Host";
6314
 
6315
  UNUSED(buffer);
6316
  }
6317
 
6318
- GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
6319
  VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
6320
  ggml_vk_host_free(vk_instance.devices[0], buffer->context);
6321
  }
6322
 
6323
- GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
6324
  VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
6325
 
6326
  size += 32; // Behave like the CPU buffer type
@@ -6344,7 +6345,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
6344
  UNUSED(buft);
6345
  }
6346
 
6347
- GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
6348
  return vk_instance.devices[0]->properties.limits.minMemoryMapAlignment;
6349
 
6350
  UNUSED(buft);
@@ -6352,7 +6353,7 @@ GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_back
6352
 
6353
  // Should be changed to return device-specific host buffer type
6354
  // but that probably requires changes in llama.cpp
6355
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
6356
  static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = {
6357
  /* .iface = */ {
6358
  /* .get_name = */ ggml_backend_vk_host_buffer_type_name,
@@ -6362,6 +6363,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
6362
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
6363
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
6364
  },
 
6365
  /* .context = */ nullptr,
6366
  };
6367
 
@@ -6375,13 +6377,13 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
6375
 
6376
  // backend
6377
 
6378
- GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) {
6379
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6380
 
6381
  return ctx->name.c_str();
6382
  }
6383
 
6384
- GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
6385
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6386
  VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");
6387
 
@@ -6391,13 +6393,13 @@ GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
6391
  delete backend;
6392
  }
6393
 
6394
- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) {
6395
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6396
 
6397
  return &ctx->device->buffer_type;
6398
  }
6399
 
6400
- GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
6401
  VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
6402
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6403
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
@@ -6420,7 +6422,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
6420
  ggml_vk_buffer_write_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
6421
  }
6422
 
6423
- GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
6424
  VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
6425
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6426
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
@@ -6443,7 +6445,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
6443
  ggml_vk_buffer_read_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
6444
  }
6445
 
6446
- GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
6447
  VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
6448
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6449
  if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
@@ -6471,7 +6473,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
6471
  return false;
6472
  }
6473
 
6474
- GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
6475
  VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
6476
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6477
  if(ctx->transfer_ctx.expired()) {
@@ -6501,7 +6503,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) {
6501
  return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
6502
  }
6503
 
6504
- GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
6505
  VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
6506
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6507
 
@@ -6564,7 +6566,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
6564
  UNUSED(backend);
6565
  }
6566
 
6567
- GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
6568
  // ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
6569
 
6570
  switch (op->op) {
@@ -6687,7 +6689,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
6687
  UNUSED(backend);
6688
  }
6689
 
6690
- GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
6691
  const int min_batch_size = 32;
6692
 
6693
  return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
@@ -6696,7 +6698,7 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
6696
  UNUSED(backend);
6697
  }
6698
 
6699
- GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
6700
  if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
6701
  return false;
6702
  }
@@ -6724,11 +6726,8 @@ static ggml_backend_i ggml_backend_vk_interface = {
6724
  /* .supports_op = */ ggml_backend_vk_supports_op,
6725
  /* .supports_buft = */ ggml_backend_vk_supports_buft,
6726
  /* .offload_op = */ ggml_backend_vk_offload_op,
6727
- /* .event_new = */ NULL,
6728
- /* .event_free = */ NULL,
6729
  /* .event_record = */ NULL,
6730
  /* .event_wait = */ NULL,
6731
- /* .event_synchronize = */ NULL,
6732
  };
6733
 
6734
  static ggml_guid_t ggml_backend_vk_guid() {
@@ -6736,7 +6735,7 @@ static ggml_guid_t ggml_backend_vk_guid() {
6736
  return &guid;
6737
  }
6738
 
6739
- GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
6740
  VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
6741
 
6742
  ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
@@ -6745,25 +6744,26 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
6745
  ggml_backend_t vk_backend = new ggml_backend {
6746
  /* .guid = */ ggml_backend_vk_guid(),
6747
  /* .interface = */ ggml_backend_vk_interface,
 
6748
  /* .context = */ ctx,
6749
  };
6750
 
6751
  return vk_backend;
6752
  }
6753
 
6754
- GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) {
6755
  return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
6756
  }
6757
 
6758
- GGML_CALL int ggml_backend_vk_get_device_count() {
6759
  return ggml_vk_get_device_count();
6760
  }
6761
 
6762
- GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
6763
  ggml_vk_get_device_description(device, description, description_size);
6764
  }
6765
 
6766
- GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
6767
  GGML_ASSERT(device < (int) vk_instance.device_indices.size());
6768
 
6769
  vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
@@ -6779,27 +6779,6 @@ GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size
6779
  }
6780
  }
6781
 
6782
- // backend registry
6783
- GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, void * user_data) {
6784
- ggml_backend_t vk_backend = ggml_backend_vk_init((int) (intptr_t) user_data);
6785
- return vk_backend;
6786
-
6787
- UNUSED(params);
6788
- }
6789
-
6790
- extern "C" GGML_CALL int ggml_backend_vk_reg_devices();
6791
-
6792
- GGML_CALL int ggml_backend_vk_reg_devices() {
6793
- ggml_vk_instance_init();
6794
-
6795
- for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
6796
- char name[128];
6797
- snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i);
6798
- ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i); // NOLINT
6799
- }
6800
- return vk_instance.device_indices.size();
6801
- }
6802
-
6803
  // Extension availability
6804
  static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
6805
  #ifdef GGML_VULKAN_VALIDATE
 
119
  vk_device device;
120
  };
121
 
122
+ static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
123
+ static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
124
+ static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
125
+ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft);
126
+ static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor);
127
  static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
128
  /* .get_name = */ ggml_backend_vk_buffer_type_name,
129
  /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
 
622
 
623
  typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
624
 
625
+ static void ggml_backend_vk_free(ggml_backend_t backend);
626
 
627
  // variables to track number of compiles in progress
628
  static uint32_t compile_count = 0;
 
1953
 
1954
  device->buffer_type = {
1955
  /* .iface = */ ggml_backend_vk_buffer_type_interface,
1956
+ /* .device = */ nullptr,
1957
  /* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
1958
  };
1959
 
 
6148
  ctx->device->device.destroyFence(ctx->fence);
6149
  }
6150
 
6151
+ static int ggml_vk_get_device_count() {
6152
  ggml_vk_instance_init();
6153
 
6154
  return vk_instance.device_indices.size();
6155
  }
6156
 
6157
+ static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
6158
  ggml_vk_instance_init();
6159
 
6160
  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
 
6171
 
6172
  // device backend
6173
 
6174
+ static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
6175
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6176
  return ctx->name.c_str();
6177
  }
6178
 
6179
+ static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
6180
  return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
6181
  }
6182
 
6183
+ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
6184
  VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
6185
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6186
  ggml_vk_destroy_buffer(ctx->dev_buffer);
6187
  delete ctx;
6188
  }
6189
 
6190
+ static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
6191
  return vk_ptr_base;
6192
 
6193
  UNUSED(buffer);
6194
  }
6195
 
6196
+ static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
6197
  VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
6198
  if (tensor->view_src != nullptr) {
6199
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
6200
  }
6201
  }
6202
 
6203
+ static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
6204
  VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
6205
  ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6206
  vk_buffer buf = buf_ctx->dev_buffer;
 
6208
  ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
6209
  }
6210
 
6211
+ static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
6212
  VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
6213
  ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6214
 
 
6217
  ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
6218
  }
6219
 
6220
+ static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
6221
  if (ggml_backend_buffer_is_vk(src->buffer)) {
6222
  ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
6223
  ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
 
6234
  UNUSED(buffer);
6235
  }
6236
 
6237
+ static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
6238
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6239
 
6240
  ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size);
 
6254
  };
6255
 
6256
  // vk buffer type
6257
+ static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) {
6258
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
6259
 
6260
  return ctx->name.c_str();
6261
  }
6262
 
6263
+ static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
6264
  VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
6265
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
6266
 
 
6276
  return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size);
6277
  }
6278
 
6279
+ static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
6280
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
6281
  return ctx->device->properties.limits.minStorageBufferOffsetAlignment;
6282
  }
6283
 
6284
+ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
6285
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
6286
  return ctx->device->max_memory_allocation_size;
6287
  }
6288
 
6289
+ static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
6290
  return ggml_nbytes(tensor);
6291
 
6292
  UNUSED(buft);
6293
  }
6294
 
6295
+ ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
6296
  ggml_vk_instance_init();
6297
 
6298
  VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");
 
6304
 
6305
  // host buffer type
6306
 
6307
+ static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
6308
  return GGML_VK_NAME "_Host";
6309
 
6310
  UNUSED(buft);
6311
  }
6312
 
6313
+ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) {
6314
  return GGML_VK_NAME "_Host";
6315
 
6316
  UNUSED(buffer);
6317
  }
6318
 
6319
+ static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
6320
  VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
6321
  ggml_vk_host_free(vk_instance.devices[0], buffer->context);
6322
  }
6323
 
6324
+ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
6325
  VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
6326
 
6327
  size += 32; // Behave like the CPU buffer type
 
6345
  UNUSED(buft);
6346
  }
6347
 
6348
+ static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
6349
  return vk_instance.devices[0]->properties.limits.minMemoryMapAlignment;
6350
 
6351
  UNUSED(buft);
 
6353
 
6354
  // Should be changed to return device-specific host buffer type
6355
  // but that probably requires changes in llama.cpp
6356
+ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
6357
  static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = {
6358
  /* .iface = */ {
6359
  /* .get_name = */ ggml_backend_vk_host_buffer_type_name,
 
6363
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
6364
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
6365
  },
6366
+ /* .device = */ nullptr,
6367
  /* .context = */ nullptr,
6368
  };
6369
 
 
6377
 
6378
  // backend
6379
 
6380
+ static const char * ggml_backend_vk_name(ggml_backend_t backend) {
6381
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6382
 
6383
  return ctx->name.c_str();
6384
  }
6385
 
6386
+ static void ggml_backend_vk_free(ggml_backend_t backend) {
6387
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6388
  VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");
6389
 
 
6393
  delete backend;
6394
  }
6395
 
6396
+ static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) {
6397
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6398
 
6399
  return &ctx->device->buffer_type;
6400
  }
6401
 
6402
+ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
6403
  VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
6404
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6405
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
6422
  ggml_vk_buffer_write_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
6423
  }
6424
 
6425
+ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
6426
  VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
6427
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6428
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
6445
  ggml_vk_buffer_read_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
6446
  }
6447
 
6448
+ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
6449
  VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
6450
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6451
  if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
 
6473
  return false;
6474
  }
6475
 
6476
+ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
6477
  VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
6478
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6479
  if(ctx->transfer_ctx.expired()) {
 
6503
  return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
6504
  }
6505
 
6506
+ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
6507
  VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
6508
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6509
 
 
6566
  UNUSED(backend);
6567
  }
6568
 
6569
+ static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
6570
  // ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
6571
 
6572
  switch (op->op) {
 
6689
  UNUSED(backend);
6690
  }
6691
 
6692
+ static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
6693
  const int min_batch_size = 32;
6694
 
6695
  return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
 
6698
  UNUSED(backend);
6699
  }
6700
 
6701
+ static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
6702
  if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
6703
  return false;
6704
  }
 
6726
  /* .supports_op = */ ggml_backend_vk_supports_op,
6727
  /* .supports_buft = */ ggml_backend_vk_supports_buft,
6728
  /* .offload_op = */ ggml_backend_vk_offload_op,
 
 
6729
  /* .event_record = */ NULL,
6730
  /* .event_wait = */ NULL,
 
6731
  };
6732
 
6733
  static ggml_guid_t ggml_backend_vk_guid() {
 
6735
  return &guid;
6736
  }
6737
 
6738
+ ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
6739
  VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
6740
 
6741
  ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
 
6744
  ggml_backend_t vk_backend = new ggml_backend {
6745
  /* .guid = */ ggml_backend_vk_guid(),
6746
  /* .interface = */ ggml_backend_vk_interface,
6747
+ /* .device = */ nullptr,
6748
  /* .context = */ ctx,
6749
  };
6750
 
6751
  return vk_backend;
6752
  }
6753
 
6754
+ bool ggml_backend_is_vk(ggml_backend_t backend) {
6755
  return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
6756
  }
6757
 
6758
+ int ggml_backend_vk_get_device_count() {
6759
  return ggml_vk_get_device_count();
6760
  }
6761
 
6762
+ void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
6763
  ggml_vk_get_device_description(device, description, description_size);
6764
  }
6765
 
6766
+ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
6767
  GGML_ASSERT(device < (int) vk_instance.device_indices.size());
6768
 
6769
  vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
 
6779
  }
6780
  }
6781
 
6782
  // Extension availability
6783
  static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
6784
  #ifdef GGML_VULKAN_VALIDATE
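
With the backend-registry block removed from this file (ggml_backend_reg_vk_init and ggml_backend_vk_reg_devices are gone, and both the ggml_backend struct and the Vulkan buffer type now carry a .device field that this commit still leaves as nullptr), device discovery and backend creation go through the entry points that remain public above. A rough usage sketch, assuming only the signatures visible in this diff and that they are declared in ggml-vulkan.h; the helper function itself is hypothetical:

    #include "ggml-vulkan.h"
    #include <stdio.h>

    // enumerate Vulkan devices and create a backend for the first one
    static ggml_backend_t pick_first_vk_backend(void) {
        int n = ggml_backend_vk_get_device_count();
        for (int i = 0; i < n; i++) {
            char desc[256];
            ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
            printf("Vulkan device %d: %s\n", i, desc);
        }
        // wiring this backend into the new device/reg interfaces is left for a
        // follow-up; ggml_backend_vk_init() keeps working as before
        return n > 0 ? ggml_backend_vk_init(0) : NULL;
    }
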
ggml/src/ggml.c CHANGED
@@ -461,7 +461,7 @@ struct ggml_arm_arch_features_type {
461
  } ggml_arm_arch_features = {-1, -1, -1, 0};
462
  #endif
463
 
464
- GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
465
  switch (status) {
466
  case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
467
  case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
@@ -3384,19 +3384,19 @@ void ggml_print_objects(const struct ggml_context * ctx) {
3384
  GGML_PRINT("%s: --- end ---\n", __func__);
3385
  }
3386
 
3387
- GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
3388
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3389
 
3390
  return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
3391
  }
3392
 
3393
- GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
3394
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3395
 
3396
  return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
3397
  }
3398
 
3399
- GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
3400
  size_t nbytes;
3401
  size_t blck_size = ggml_blck_size(tensor->type);
3402
  if (blck_size == 1) {
@@ -3419,15 +3419,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
3419
  return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
3420
  }
3421
 
3422
- GGML_CALL int64_t ggml_blck_size(enum ggml_type type) {
3423
  return type_traits[type].blck_size;
3424
  }
3425
 
3426
- GGML_CALL size_t ggml_type_size(enum ggml_type type) {
3427
  return type_traits[type].type_size;
3428
  }
3429
 
3430
- GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
3431
  assert(ne % ggml_blck_size(type) == 0);
3432
  return ggml_type_size(type)*ne/ggml_blck_size(type);
3433
  }
@@ -3436,15 +3436,15 @@ double ggml_type_sizef(enum ggml_type type) {
3436
  return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
3437
  }
3438
 
3439
- GGML_CALL const char * ggml_type_name(enum ggml_type type) {
3440
  return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
3441
  }
3442
 
3443
- GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
3444
  return type_traits[type].is_quantized;
3445
  }
3446
 
3447
- GGML_CALL const char * ggml_op_name(enum ggml_op op) {
3448
  return GGML_OP_NAME[op];
3449
  }
3450
 
@@ -3456,7 +3456,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
3456
  return GGML_UNARY_OP_NAME[op];
3457
  }
3458
 
3459
- GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
3460
  if (t->op == GGML_OP_UNARY) {
3461
  enum ggml_unary_op uop = ggml_get_unary_op(t);
3462
  return ggml_unary_op_name(uop);
@@ -3464,7 +3464,7 @@ GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
3464
  return ggml_op_name(t->op);
3465
  }
3466
 
3467
- GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
3468
  return ggml_type_size(tensor->type);
3469
  }
3470
 
@@ -3557,7 +3557,7 @@ size_t ggml_tensor_overhead(void) {
3557
  return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
3558
  }
3559
 
3560
- GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
3561
  return tensor->nb[0] > tensor->nb[1];
3562
  }
3563
 
@@ -3583,23 +3583,23 @@ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
3583
  return true;
3584
  }
3585
 
3586
- GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
3587
  return ggml_is_contiguous_0(tensor);
3588
  }
3589
 
3590
- GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
3591
  return ggml_is_contiguous_n(tensor, 0);
3592
  }
3593
 
3594
- GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
3595
  return ggml_is_contiguous_n(tensor, 1);
3596
  }
3597
 
3598
- GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
3599
  return ggml_is_contiguous_n(tensor, 2);
3600
  }
3601
 
3602
- GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
3603
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3604
 
3605
  return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
@@ -3614,7 +3614,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
3614
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
3615
  }
3616
 
3617
- GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) {
3618
  for (int i = 0; i < GGML_MAX_DIMS; ++i) {
3619
  if (tensor->ne[i] == 0) {
3620
  // empty if any dimension has no elements
@@ -4634,7 +4634,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
4634
  return (float *)(tensor->data);
4635
  }
4636
 
4637
- GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
4638
  GGML_ASSERT(tensor->op == GGML_OP_UNARY);
4639
  return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
4640
  }
@@ -12834,6 +12834,10 @@ static void ggml_compute_forward_out_prod_f32(
12834
 
12835
  GGML_TENSOR_BINARY_OP_LOCALS
12836
 
12837
  const int ith = params->ith;
12838
  const int nth = params->nth;
12839
 
@@ -14163,7 +14167,7 @@ static void ggml_rope_cache_init(
14163
  }
14164
  }
14165
 
14166
- GGML_CALL void ggml_rope_yarn_corr_dims(
14167
  int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
14168
  ) {
14169
  // start and end correction dims
 
461
  } ggml_arm_arch_features = {-1, -1, -1, 0};
462
  #endif
463
 
464
+ const char * ggml_status_to_string(enum ggml_status status) {
465
  switch (status) {
466
  case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
467
  case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
 
3384
  GGML_PRINT("%s: --- end ---\n", __func__);
3385
  }
3386
 
3387
+ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
3388
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3389
 
3390
  return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
3391
  }
3392
 
3393
+ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
3394
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3395
 
3396
  return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
3397
  }
3398
 
3399
+ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
3400
  size_t nbytes;
3401
  size_t blck_size = ggml_blck_size(tensor->type);
3402
  if (blck_size == 1) {
 
3419
  return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
3420
  }
3421
 
3422
+ int64_t ggml_blck_size(enum ggml_type type) {
3423
  return type_traits[type].blck_size;
3424
  }
3425
 
3426
+ size_t ggml_type_size(enum ggml_type type) {
3427
  return type_traits[type].type_size;
3428
  }
3429
 
3430
+ size_t ggml_row_size(enum ggml_type type, int64_t ne) {
3431
  assert(ne % ggml_blck_size(type) == 0);
3432
  return ggml_type_size(type)*ne/ggml_blck_size(type);
3433
  }
 
3436
  return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
3437
  }
3438
 
3439
+ const char * ggml_type_name(enum ggml_type type) {
3440
  return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
3441
  }
3442
 
3443
+ bool ggml_is_quantized(enum ggml_type type) {
3444
  return type_traits[type].is_quantized;
3445
  }
3446
 
3447
+ const char * ggml_op_name(enum ggml_op op) {
3448
  return GGML_OP_NAME[op];
3449
  }
3450
 
 
3456
  return GGML_UNARY_OP_NAME[op];
3457
  }
3458
 
3459
+ const char * ggml_op_desc(const struct ggml_tensor * t) {
3460
  if (t->op == GGML_OP_UNARY) {
3461
  enum ggml_unary_op uop = ggml_get_unary_op(t);
3462
  return ggml_unary_op_name(uop);
 
3464
  return ggml_op_name(t->op);
3465
  }
3466
 
3467
+ size_t ggml_element_size(const struct ggml_tensor * tensor) {
3468
  return ggml_type_size(tensor->type);
3469
  }
3470
 
 
3557
  return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
3558
  }
3559
 
3560
+ bool ggml_is_transposed(const struct ggml_tensor * tensor) {
3561
  return tensor->nb[0] > tensor->nb[1];
3562
  }
3563
 
 
3583
  return true;
3584
  }
3585
 
3586
+ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
3587
  return ggml_is_contiguous_0(tensor);
3588
  }
3589
 
3590
+ bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
3591
  return ggml_is_contiguous_n(tensor, 0);
3592
  }
3593
 
3594
+ bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
3595
  return ggml_is_contiguous_n(tensor, 1);
3596
  }
3597
 
3598
+ bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
3599
  return ggml_is_contiguous_n(tensor, 2);
3600
  }
3601
 
3602
+ bool ggml_is_permuted(const struct ggml_tensor * tensor) {
3603
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3604
 
3605
  return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
 
3614
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
3615
  }
3616
 
3617
+ bool ggml_is_empty(const struct ggml_tensor * tensor) {
3618
  for (int i = 0; i < GGML_MAX_DIMS; ++i) {
3619
  if (tensor->ne[i] == 0) {
3620
  // empty if any dimension has no elements
 
4634
  return (float *)(tensor->data);
4635
  }
4636
 
4637
+ enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
4638
  GGML_ASSERT(tensor->op == GGML_OP_UNARY);
4639
  return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
4640
  }
 
12834
 
12835
  GGML_TENSOR_BINARY_OP_LOCALS
12836
 
12837
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
12838
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
12839
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
12840
+
12841
  const int ith = params->ith;
12842
  const int nth = params->nth;
12843
 
 
14167
  }
14168
  }
14169
 
14170
+ void ggml_rope_yarn_corr_dims(
14171
  int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
14172
  ) {
14173
  // start and end correction dims
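
On the ggml.c side the change is mechanical: the GGML_CALL annotation is dropped from these accessors while their signatures stay the same, and ggml_compute_forward_out_prod_f32 additionally gains the three F32 type asserts shown above. A small usage sketch of a few of the now plain C accessors, using only the signatures visible in these hunks; the describe() helper is hypothetical:

    #include "ggml.h"
    #include <stdio.h>

    // print basic shape/size information for a tensor via the accessors above
    static void describe(const struct ggml_tensor * t) {
        printf("%s: %lld elements, %zu bytes, type %s\n",
               ggml_op_desc(t),
               (long long) ggml_nelements(t),
               ggml_nbytes(t),
               ggml_type_name(t->type));
    }
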