Diego Devesa committed on
Commit 9d74d85 · 1 Parent(s): 5239c28

ggml-backend : add device and backend reg interfaces (llama/9707)


Also:

- metal : fix compute pass descriptor autorelease crash
- ggml-backend : add device description to CPU backend
- ggml: unify backend logging mechanism
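
For downstream code, the per-backend log hooks removed below (ggml_backend_cuda_log_set_callback, ggml_backend_metal_log_set_callback, ggml_backend_cann_log_set_callback, as well as ggml_backend_reg_set_log_callback / ggml_backend_set_log_callback) are replaced by the single ggml_log_set() added to ggml.h. A minimal migration sketch, not part of this diff — the callback signature is the existing ggml_log_callback typedef and GGML_LOG_LEVEL_ERROR one of the existing levels:

    #include <stdio.h>
    #include "ggml.h"

    // one callback now covers ggml and every backend (CPU, CUDA, Metal, CANN, ...)
    static void my_logger(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level == GGML_LOG_LEVEL_ERROR) {
            fprintf(stderr, "[ggml] %s", text);
        }
    }

    int main(void) {
        ggml_log_set(my_logger, NULL);  // second argument is the user_data pointer
        // ggml_log_set(NULL, NULL);    // a NULL callback restores the stderr default
        return 0;
    }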

ggml/include/ggml-backend.h CHANGED
@@ -164,7 +164,7 @@ extern "C" {
164
  GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
165
  GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
166
  GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
167
- GGML_API void ggml_backend_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data);
168
 
169
  // Functions that may be obtained using ggml_backend_reg_get_proc_address
170
  typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
@@ -184,9 +184,6 @@ extern "C" {
184
  GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
185
  GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
186
 
187
- // Set the log callback for all registered backends
188
- GGML_API void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data);
189
-
190
  // Direct backend (stream) initialization
191
  // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
192
  GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
 
164
  GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
165
  GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
166
  GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
167
+
168
 
169
  // Functions that may be obtained using ggml_backend_reg_get_proc_address
170
  typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
 
184
  GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
185
  GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
186
 
187
  // Direct backend (stream) initialization
188
  // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
189
  GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
ggml/include/ggml-cann.h CHANGED
@@ -116,17 +116,6 @@ GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
116
  size_t* free,
117
  size_t* total);
118
 
119
- /**
120
- * @brief Set the logging callback for GGML.
121
- *
122
- * This function sets the logging callback and user data for logging.
123
- *
124
- * @param log_callback The logging callback to set.
125
- * @param user_data User data to pass to the logging callback.
126
- */
127
- GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
128
- void* user_data);
129
-
130
  #ifdef __cplusplus
131
  }
132
  #endif
 
116
  size_t* free,
117
  size_t* total);
118
 
119
  #ifdef __cplusplus
120
  }
121
  #endif
ggml/include/ggml-cuda.h CHANGED
@@ -40,8 +40,6 @@ GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, siz
40
  GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
41
  GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
42
 
43
- GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
44
-
45
  GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
46
 
47
  #ifdef __cplusplus
 
40
  GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
41
  GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
42
 
43
  GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
44
 
45
  #ifdef __cplusplus
ggml/include/ggml-metal.h CHANGED
@@ -39,8 +39,6 @@ extern "C" {
39
  // user-code should use only these functions
40
  //
41
 
42
- GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
43
-
44
  GGML_API ggml_backend_t ggml_backend_metal_init(void);
45
 
46
  GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
39
  // user-code should use only these functions
40
  //
41
 
42
  GGML_API ggml_backend_t ggml_backend_metal_init(void);
43
 
44
  GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
ggml/include/ggml.h CHANGED
@@ -2174,6 +2174,10 @@ extern "C" {
2174
  typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
2175
  typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
2176
 
2177
  // optimization parameters
2178
  //
2179
  // see ggml.c (ggml_opt_default_params) for default values
 
2174
  typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
2175
  typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
2176
 
2177
+ // Set callback for all future logging events.
2178
+ // If this is not called, or NULL is supplied, everything is output on stderr.
2179
+ GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
2180
+
2181
  // optimization parameters
2182
  //
2183
  // see ggml.c (ggml_opt_default_params) for default values
ggml/src/ggml-backend-impl.h CHANGED
@@ -215,9 +215,6 @@ extern "C" {
215
  // (optional) get a pointer to a function in the backend
216
  // backends can add custom functions that are not part of the standard ggml-backend interface
217
  void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
218
-
219
- // (optional) set the log callback for the backend
220
- void (*set_log_callback)(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data);
221
  };
222
 
223
  struct ggml_backend_reg {
 
215
  // (optional) get a pointer to a function in the backend
216
  // backends can add custom functions that are not part of the standard ggml-backend interface
217
  void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
218
  };
219
 
220
  struct ggml_backend_reg {
ggml/src/ggml-backend.cpp CHANGED
@@ -1,5 +1,13 @@
1
  // Note: porting this file to C++ is a work in progress
2
 
3
  #include "ggml-backend-impl.h"
4
  #include "ggml-alloc.h"
5
  #include "ggml-impl.h"
@@ -10,9 +18,15 @@
10
  #include <stdio.h>
11
  #include <stdlib.h>
12
  #include <string.h>
13
-
14
  #include <vector>
15
 
16
  // backend buffer type
17
 
18
  const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
@@ -505,12 +519,6 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
505
  return reg->iface.get_proc_address(reg, name);
506
  }
507
 
508
- void ggml_backend_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) {
509
- if (reg->iface.set_log_callback) {
510
- reg->iface.set_log_callback(reg, log_callback, user_data);
511
- }
512
- }
513
-
514
  // Backend registry
515
 
516
  #ifdef GGML_USE_CUDA
@@ -614,13 +622,6 @@ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
614
  return NULL;
615
  }
616
 
617
- void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data) {
618
- for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
619
- ggml_backend_reg_t reg = ggml_backend_reg_get(i);
620
- ggml_backend_reg_set_log_callback(reg, log_callback, user_data);
621
- }
622
- }
623
-
624
  // Convenience functions
625
  ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
626
  ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
@@ -1021,6 +1022,70 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size)
1021
 
1022
  ////////////////////////
1023
 
1024
  static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
1025
  return "CPU";
1026
 
@@ -1028,10 +1093,9 @@ static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
1028
  }
1029
 
1030
  static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
1031
- // TODO
1032
- return "CPU";
1033
 
1034
- GGML_UNUSED(dev);
1035
  }
1036
 
1037
  static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
@@ -1144,10 +1208,11 @@ static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
1144
  static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
1145
  GGML_ASSERT(index == 0);
1146
 
1147
  static ggml_backend_device ggml_backend_cpu_device = {
1148
  /* .iface = */ ggml_backend_cpu_device_i,
1149
  /* .reg = */ reg,
1150
- /* .context = */ NULL,
1151
  };
1152
 
1153
  return &ggml_backend_cpu_device;
@@ -1161,7 +1226,6 @@ static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
1161
  /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
1162
  /* .get_device = */ ggml_backend_cpu_reg_get_device,
1163
  /* .get_proc_address = */ NULL,
1164
- /* .set_log_callback = */ NULL,
1165
  };
1166
 
1167
  ggml_backend_reg_t ggml_backend_cpu_reg(void) {
 
1
  // Note: porting this file to C++ is a work in progress
2
 
3
+ #ifdef _WIN32
4
+ #define WIN32_LEAN_AND_MEAN
5
+ #ifndef NOMINMAX
6
+ # define NOMINMAX
7
+ #endif
8
+ #include <windows.h>
9
+ #endif
10
+
11
  #include "ggml-backend-impl.h"
12
  #include "ggml-alloc.h"
13
  #include "ggml-impl.h"
 
18
  #include <stdio.h>
19
  #include <stdlib.h>
20
  #include <string.h>
21
+ #include <string>
22
  #include <vector>
23
 
24
+ #ifdef __APPLE__
25
+ #include <sys/types.h>
26
+ #include <sys/sysctl.h>
27
+ #endif
28
+
29
+
30
  // backend buffer type
31
 
32
  const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 
519
  return reg->iface.get_proc_address(reg, name);
520
  }
521
 
522
  // Backend registry
523
 
524
  #ifdef GGML_USE_CUDA
 
622
  return NULL;
623
  }
624
 
625
  // Convenience functions
626
  ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
627
  ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
 
1022
 
1023
  ////////////////////////
1024
 
1025
+ struct ggml_backend_cpu_device_context {
1026
+ std::string description = "CPU";
1027
+
1028
+ ggml_backend_cpu_device_context() {
1029
+ #ifdef __APPLE__
1030
+ size_t len = 0;
1031
+ if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
1032
+ description.resize(len);
1033
+ sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
1034
+ }
1035
+ #elif defined(__linux__)
1036
+ FILE * f = fopen("/proc/cpuinfo", "r");
1037
+ if (f) {
1038
+ char buf[1024];
1039
+ while (fgets(buf, sizeof(buf), f)) {
1040
+ if (strncmp(buf, "model name", 10) == 0) {
1041
+ char * p = strchr(buf, ':');
1042
+ if (p) {
1043
+ p++;
1044
+ while (std::isspace(*p)) {
1045
+ p++;
1046
+ }
1047
+ while (std::isspace(p[strlen(p) - 1])) {
1048
+ p[strlen(p) - 1] = '\0';
1049
+ }
1050
+ description = p;
1051
+ break;
1052
+ }
1053
+ }
1054
+ }
1055
+ fclose(f);
1056
+ }
1057
+ #elif defined(_WIN32)
1058
+ HKEY hKey;
1059
+ if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
1060
+ TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
1061
+ 0,
1062
+ KEY_READ,
1063
+ &hKey) == ERROR_SUCCESS) {
1064
+ DWORD cpu_brand_size = 0;
1065
+ if (RegQueryValueExA(hKey,
1066
+ TEXT("ProcessorNameString"),
1067
+ NULL,
1068
+ NULL,
1069
+ NULL,
1070
+ &cpu_brand_size) == ERROR_SUCCESS) {
1071
+ description.resize(cpu_brand_size);
1072
+ if (RegQueryValueExA(hKey,
1073
+ TEXT("ProcessorNameString"),
1074
+ NULL,
1075
+ NULL,
1076
+ (LPBYTE)&description[0], // NOLINT
1077
+ &cpu_brand_size) == ERROR_SUCCESS) {
1078
+ if (description.find('\0') != std::string::npos) {
1079
+ description.resize(description.find('\0'));
1080
+ }
1081
+ }
1082
+ }
1083
+ RegCloseKey(hKey);
1084
+ }
1085
+ #endif
1086
+ }
1087
+ };
1088
+
1089
  static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
1090
  return "CPU";
1091
 
 
1093
  }
1094
 
1095
  static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
1096
+ struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
 
1097
 
1098
+ return ctx->description.c_str();
1099
  }
1100
 
1101
  static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
 
1208
  static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
1209
  GGML_ASSERT(index == 0);
1210
 
1211
+ static ggml_backend_cpu_device_context ctx;
1212
  static ggml_backend_device ggml_backend_cpu_device = {
1213
  /* .iface = */ ggml_backend_cpu_device_i,
1214
  /* .reg = */ reg,
1215
+ /* .context = */ &ctx,
1216
  };
1217
 
1218
  return &ggml_backend_cpu_device;
 
1226
  /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
1227
  /* .get_device = */ ggml_backend_cpu_reg_get_device,
1228
  /* .get_proc_address = */ NULL,
1229
  };
1230
 
1231
  ggml_backend_reg_t ggml_backend_cpu_reg(void) {
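
With the device context above, the CPU backend now reports the host CPU brand string (sysctl on macOS, /proc/cpuinfo on Linux, the registry on Windows) instead of a hard-coded "CPU". A short sketch of reading it through the registry API, not part of this diff — ggml_backend_dev_description() is assumed to be the public wrapper in ggml-backend.h for the device's get_description entry:

    #include <stdio.h>
    #include "ggml-backend.h"

    int main(void) {
        ggml_backend_reg_t reg = ggml_backend_cpu_reg();
        if (ggml_backend_reg_dev_count(reg) > 0) {
            ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, 0);
            // prints e.g. "Apple M2" or "AMD Ryzen 9 7950X 16-Core Processor"
            printf("CPU device: %s\n", ggml_backend_dev_description(dev));
        }
        return 0;
    }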
ggml/src/ggml-cann.cpp CHANGED
@@ -39,69 +39,6 @@
39
 
40
  #include "ggml-common.h"
41
 
42
- /**
43
- * @brief Default logging callback for GGML.
44
- *
45
- * This function is the default logging callback that logs messages to stderr.
46
- *
47
- * @param level The log level.
48
- * @param msg The log message.
49
- * @param user_data User data passed to the callback.
50
- */
51
- static void ggml_cann_default_log_callback(enum ggml_log_level level,
52
- const char* msg, void* user_data) {
53
- GGML_UNUSED(level);
54
- GGML_UNUSED(user_data);
55
- fprintf(stderr, "%s", msg);
56
- }
57
-
58
- ggml_log_callback ggml_cann_log_callback = ggml_cann_default_log_callback;
59
- void* ggml_cann_log_user_data = NULL;
60
-
61
- GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
62
- void* user_data) {
63
- ggml_cann_log_callback = log_callback;
64
- ggml_cann_log_user_data = user_data;
65
- }
66
-
67
- #define GGML_CANN_LOG_INFO(...) ggml_cann_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
68
- #define GGML_CANN_LOG_WARN(...) ggml_cann_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
69
- #define GGML_CANN_LOG_ERROR(...) \
70
- ggml_cann_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
71
-
72
- GGML_ATTRIBUTE_FORMAT(2, 3)
73
-
74
- /**
75
- * @brief Log a message using the current logging callback.
76
- *
77
- * This function formats a log message and passes it to the current logging
78
- * callback.
79
- *
80
- * @param level The log level.
81
- * @param format The format string for the log message.
82
- * @param ... The arguments for the format string.
83
- */
84
- static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
85
- if (ggml_cann_log_callback != NULL) {
86
- va_list args;
87
- va_start(args, format);
88
- char buffer[128];
89
- int len = vsnprintf(buffer, 128, format, args);
90
- if (len < 128) {
91
- ggml_cann_log_callback(level, buffer, ggml_cann_log_user_data);
92
- } else {
93
- // vsnprintf adds a null terminator
94
- std::vector<char> buffer2(len + 1);
95
- va_end(args);
96
- va_start(args, format);
97
- vsnprintf(&buffer2[0], buffer2.size(), format, args);
98
- ggml_cann_log_callback(level, buffer2.data(),
99
- ggml_cann_log_user_data);
100
- }
101
- va_end(args);
102
- }
103
- }
104
-
105
  /**
106
  * @brief Handles CANN errors by printing an error message and aborting.
107
  *
@@ -116,10 +53,10 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
116
  int32_t id = -1;
117
  aclrtGetDevice(&id);
118
 
119
- GGML_CANN_LOG_ERROR("CANN error: %s\n", msg);
120
- GGML_CANN_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
121
  file, line);
122
- GGML_CANN_LOG_ERROR(" %s\n", stmt);
123
  // abort with GGML_ASSERT to get a stack trace
124
  GGML_ABORT("CANN error");
125
  }
@@ -165,7 +102,7 @@ static ggml_cann_device_info ggml_cann_init() {
165
  aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
166
 
167
  if (err != ACL_SUCCESS) {
168
- GGML_CANN_LOG_ERROR("%s: failed to initialize CANN: %s\n",
169
  __func__, aclGetRecentErrMsg());
170
  return info;
171
  }
@@ -315,7 +252,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
315
  *actual_size = look_ahead_size;
316
  pool_size += look_ahead_size;
317
  #ifdef DEBUG_CANN_MALLOC
318
- GGML_CANN_LOG_INFO(
319
  "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
320
  "requested %u MB\n",
321
  __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
@@ -470,7 +407,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
470
  // add to the pool
471
  pool_size += reserve_size;
472
 
473
- // GGML_CANN_LOG_INFO("cann pool[%d]: size increased to %llu MB (
474
  // reserved %llu MB)\n",
475
  // device, (unsigned long long) (pool_size/1024/1024),
476
  // (unsigned long long) (reserve_size/1024/1024));
@@ -483,7 +420,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
483
  pool_used += size;
484
 
485
  #ifdef DEBUG_CANN_MALLOC
486
- GGML_CANN_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
487
  (unsigned long long)size, (unsigned long long)ptr);
488
  #endif
489
  return ptr;
@@ -497,7 +434,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
497
  */
498
  void free(void* ptr, size_t size) override {
499
  #ifdef DEBUG_CANN_MALLOC
500
- GGML_CANN_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
501
  (unsigned long long)size, (unsigned long long)ptr);
502
  #endif
503
 
@@ -1095,7 +1032,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1095
  void* dev_ptr;
1096
  aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
1097
  if (err != ACL_SUCCESS) {
1098
- GGML_CANN_LOG_ERROR(
1099
  "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
1100
  __func__, size / 1024.0 / 1024.0, buft_ctx->device,
1101
  aclGetRecentErrMsg());
@@ -1280,7 +1217,7 @@ static void * ggml_cann_host_malloc(size_t size) {
1280
  aclError err = aclrtMallocHost((void **) &hostPtr, size);
1281
  if (err != ACL_SUCCESS) {
1282
 
1283
- GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1284
  size / 1024.0 / 1024.0, aclGetRecentErrMsg());
1285
  return nullptr;
1286
  }
@@ -1733,7 +1670,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
1733
  bool ok = ggml_cann_compute_forward(*cann_ctx, node);
1734
 
1735
  if (!ok) {
1736
- GGML_CANN_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
1737
  node->name, ggml_op_name(node->op));
1738
  }
1739
  GGML_ASSERT(ok);
@@ -2043,13 +1980,13 @@ static ggml_guid_t ggml_backend_cann_guid() {
2043
  ggml_backend_t ggml_backend_cann_init(int32_t device) {
2044
  aclInit(nullptr);
2045
  if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
2046
- GGML_CANN_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
2047
  return nullptr;
2048
  }
2049
 
2050
  ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
2051
  if (ctx == nullptr) {
2052
- GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
2053
  return nullptr;
2054
  }
2055
  ggml_cann_set_device(ctx->device);
 
39
 
40
  #include "ggml-common.h"
41
 
42
  /**
43
  * @brief Handles CANN errors by printing an error message and aborting.
44
  *
 
53
  int32_t id = -1;
54
  aclrtGetDevice(&id);
55
 
56
+ GGML_LOG_ERROR("CANN error: %s\n", msg);
57
+ GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
58
  file, line);
59
+ GGML_LOG_ERROR(" %s\n", stmt);
60
  // abort with GGML_ASSERT to get a stack trace
61
  GGML_ABORT("CANN error");
62
  }
 
102
  aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
103
 
104
  if (err != ACL_SUCCESS) {
105
+ GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
106
  __func__, aclGetRecentErrMsg());
107
  return info;
108
  }
 
252
  *actual_size = look_ahead_size;
253
  pool_size += look_ahead_size;
254
  #ifdef DEBUG_CANN_MALLOC
255
+ GGML_LOG_INFO(
256
  "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
257
  "requested %u MB\n",
258
  __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
 
407
  // add to the pool
408
  pool_size += reserve_size;
409
 
410
+ // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
411
  // reserved %llu MB)\n",
412
  // device, (unsigned long long) (pool_size/1024/1024),
413
  // (unsigned long long) (reserve_size/1024/1024));
 
420
  pool_used += size;
421
 
422
  #ifdef DEBUG_CANN_MALLOC
423
+ GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
424
  (unsigned long long)size, (unsigned long long)ptr);
425
  #endif
426
  return ptr;
 
434
  */
435
  void free(void* ptr, size_t size) override {
436
  #ifdef DEBUG_CANN_MALLOC
437
+ GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
438
  (unsigned long long)size, (unsigned long long)ptr);
439
  #endif
440
 
 
1032
  void* dev_ptr;
1033
  aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
1034
  if (err != ACL_SUCCESS) {
1035
+ GGML_LOG_ERROR(
1036
  "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
1037
  __func__, size / 1024.0 / 1024.0, buft_ctx->device,
1038
  aclGetRecentErrMsg());
 
1217
  aclError err = aclrtMallocHost((void **) &hostPtr, size);
1218
  if (err != ACL_SUCCESS) {
1219
 
1220
+ GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1221
  size / 1024.0 / 1024.0, aclGetRecentErrMsg());
1222
  return nullptr;
1223
  }
 
1670
  bool ok = ggml_cann_compute_forward(*cann_ctx, node);
1671
 
1672
  if (!ok) {
1673
+ GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
1674
  node->name, ggml_op_name(node->op));
1675
  }
1676
  GGML_ASSERT(ok);
 
1980
  ggml_backend_t ggml_backend_cann_init(int32_t device) {
1981
  aclInit(nullptr);
1982
  if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
1983
+ GGML_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
1984
  return nullptr;
1985
  }
1986
 
1987
  ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
1988
  if (ctx == nullptr) {
1989
+ GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
1990
  return nullptr;
1991
  }
1992
  ggml_cann_set_device(ctx->device);
ggml/src/ggml-cuda.cu CHANGED
@@ -58,52 +58,14 @@
58
 
59
  static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
60
 
61
- static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
62
- GGML_UNUSED(level);
63
- GGML_UNUSED(user_data);
64
- fprintf(stderr, "%s", msg);
65
- }
66
-
67
- ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
68
- void * ggml_cuda_log_user_data = NULL;
69
-
70
- GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
71
- ggml_cuda_log_callback = log_callback;
72
- ggml_cuda_log_user_data = user_data;
73
- }
74
-
75
- #define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
76
- #define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
77
- #define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
78
-
79
- GGML_ATTRIBUTE_FORMAT(2, 3)
80
- static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
81
- if (ggml_cuda_log_callback != NULL) {
82
- va_list args;
83
- va_start(args, format);
84
- char buffer[128];
85
- int len = vsnprintf(buffer, 128, format, args);
86
- if (len < 128) {
87
- ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
88
- } else {
89
- std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
90
- va_end(args);
91
- va_start(args, format);
92
- vsnprintf(&buffer2[0], buffer2.size(), format, args);
93
- ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
94
- }
95
- va_end(args);
96
- }
97
- }
98
-
99
  [[noreturn]]
100
  void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
101
  int id = -1; // in case cudaGetDevice fails
102
  cudaGetDevice(&id);
103
 
104
- GGML_CUDA_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
105
- GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
106
- GGML_CUDA_LOG_ERROR(" %s\n", stmt);
107
  // abort with GGML_ABORT to get a stack trace
108
  GGML_ABORT(GGML_CUDA_NAME " error");
109
  }
@@ -168,7 +130,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
168
 
169
  cudaError_t err = cudaGetDeviceCount(&info.device_count);
170
  if (err != cudaSuccess) {
171
- GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
172
  return info;
173
  }
174
 
@@ -176,16 +138,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
176
 
177
  int64_t total_vram = 0;
178
  #ifdef GGML_CUDA_FORCE_MMQ
179
- GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
180
  #else
181
- GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
182
  #endif // GGML_CUDA_FORCE_MMQ
183
  #ifdef GGML_CUDA_FORCE_CUBLAS
184
- GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
185
  #else
186
- GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
187
  #endif // GGML_CUDA_FORCE_CUBLAS
188
- GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
189
  for (int id = 0; id < info.device_count; ++id) {
190
  int device_vmm = 0;
191
 
@@ -206,7 +168,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
206
 
207
  cudaDeviceProp prop;
208
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
209
- GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
210
 
211
  info.default_tensor_split[id] = total_vram;
212
  total_vram += prop.totalGlobalMem;
@@ -314,7 +276,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
314
  *actual_size = look_ahead_size;
315
  pool_size += look_ahead_size;
316
  #ifdef DEBUG_CUDA_MALLOC
317
- GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
318
  (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
319
  #endif
320
  return ptr;
@@ -329,7 +291,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
329
  return;
330
  }
331
  }
332
- GGML_CUDA_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
333
  ggml_cuda_set_device(device);
334
  CUDA_CHECK(cudaFree(ptr));
335
  pool_size -= size;
@@ -593,7 +555,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
593
  if (err != cudaSuccess) {
594
  // clear the error
595
  cudaGetLastError();
596
- GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
597
  return nullptr;
598
  }
599
 
@@ -1018,7 +980,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
1018
  if (err != cudaSuccess) {
1019
  // clear the error
1020
  cudaGetLastError();
1021
- GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1022
  size / 1024.0 / 1024.0, cudaGetErrorString(err));
1023
  return nullptr;
1024
  }
@@ -2291,7 +2253,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
2291
  break;
2292
  case GGML_OP_MUL_MAT:
2293
  if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
2294
- GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
2295
  return false;
2296
  } else {
2297
  ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2375,7 +2337,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
2375
 
2376
  cudaError_t err = cudaGetLastError();
2377
  if (err != cudaSuccess) {
2378
- GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
2379
  CUDA_CHECK(err);
2380
  }
2381
 
@@ -2444,7 +2406,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
2444
 
2445
  if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
2446
  #ifndef NDEBUG
2447
- GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
2448
  #endif
2449
  return false;
2450
  }
@@ -2560,7 +2522,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
2560
  if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
2561
  cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
2562
  #ifndef NDEBUG
2563
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
2564
  #endif
2565
  }
2566
  }
@@ -2611,14 +2573,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
2611
  if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
2612
  use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
2613
  #ifndef NDEBUG
2614
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
2615
  #endif
2616
  }
2617
 
2618
  if (node->op == GGML_OP_MUL_MAT_ID) {
2619
  use_cuda_graph = false; // This node type is not supported by CUDA graph capture
2620
  #ifndef NDEBUG
2621
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
2622
  #endif
2623
  }
2624
 
@@ -2627,7 +2589,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
2627
  // Changes in batch size or context size can cause changes to the grid size of some kernels.
2628
  use_cuda_graph = false;
2629
  #ifndef NDEBUG
2630
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
2631
  #endif
2632
  }
2633
 
@@ -2639,7 +2601,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
2639
  if (!ptr) {
2640
  use_cuda_graph = false;
2641
  #ifndef NDEBUG
2642
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
2643
  #endif
2644
  } else {
2645
  if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
@@ -2663,7 +2625,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
2663
  if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
2664
  cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
2665
  #ifndef NDEBUG
2666
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
2667
  #endif
2668
  }
2669
  }
@@ -2702,7 +2664,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
2702
 
2703
  bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
2704
  if (!ok) {
2705
- GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
2706
  }
2707
  GGML_ASSERT(ok);
2708
  }
@@ -2721,7 +2683,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
2721
  use_cuda_graph = false;
2722
  cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
2723
  #ifndef NDEBUG
2724
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
2725
  #endif
2726
  } else {
2727
  graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2788,7 +2750,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
2788
  cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
2789
  if (stat == cudaErrorGraphExecUpdateFailure) {
2790
  #ifndef NDEBUG
2791
- GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
2792
  #endif
2793
  // The pre-existing graph exec cannot be updated due to violated constraints
2794
  // so instead clear error and re-instantiate
@@ -2890,7 +2852,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
2890
  // clear the error
2891
  cudaGetLastError();
2892
 
2893
- GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
2894
  size / 1024.0 / 1024.0, cudaGetErrorString(err));
2895
  return false;
2896
  }
@@ -3322,17 +3284,11 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
3322
  return nullptr;
3323
  }
3324
 
3325
- static void ggml_backend_cuda_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) {
3326
- GGML_UNUSED(reg);
3327
- ggml_backend_cuda_log_set_callback(log_callback, user_data);
3328
- }
3329
-
3330
  static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
3331
  /* .get_name = */ ggml_backend_cuda_reg_get_name,
3332
  /* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
3333
  /* .get_device_get = */ ggml_backend_cuda_reg_get_device,
3334
  /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
3335
- /* .set_log_callback = */ ggml_backend_cuda_reg_set_log_callback,
3336
  };
3337
 
3338
  // backend registry
@@ -3378,13 +3334,13 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
3378
 
3379
  ggml_backend_t ggml_backend_cuda_init(int device) {
3380
  if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
3381
- GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
3382
  return nullptr;
3383
  }
3384
 
3385
  ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
3386
  if (ctx == nullptr) {
3387
- GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
3388
  return nullptr;
3389
  }
3390
 
 
58
 
59
  static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
60
 
61
  [[noreturn]]
62
  void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
63
  int id = -1; // in case cudaGetDevice fails
64
  cudaGetDevice(&id);
65
 
66
+ GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
67
+ GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
68
+ GGML_LOG_ERROR(" %s\n", stmt);
69
  // abort with GGML_ABORT to get a stack trace
70
  GGML_ABORT(GGML_CUDA_NAME " error");
71
  }
 
130
 
131
  cudaError_t err = cudaGetDeviceCount(&info.device_count);
132
  if (err != cudaSuccess) {
133
+ GGML_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
134
  return info;
135
  }
136
 
 
138
 
139
  int64_t total_vram = 0;
140
  #ifdef GGML_CUDA_FORCE_MMQ
141
+ GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
142
  #else
143
+ GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
144
  #endif // GGML_CUDA_FORCE_MMQ
145
  #ifdef GGML_CUDA_FORCE_CUBLAS
146
+ GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
147
  #else
148
+ GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
149
  #endif // GGML_CUDA_FORCE_CUBLAS
150
+ GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
151
  for (int id = 0; id < info.device_count; ++id) {
152
  int device_vmm = 0;
153
 
 
168
 
169
  cudaDeviceProp prop;
170
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
171
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
172
 
173
  info.default_tensor_split[id] = total_vram;
174
  total_vram += prop.totalGlobalMem;
 
276
  *actual_size = look_ahead_size;
277
  pool_size += look_ahead_size;
278
  #ifdef DEBUG_CUDA_MALLOC
279
+ GGML_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
280
  (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
281
  #endif
282
  return ptr;
 
291
  return;
292
  }
293
  }
294
+ GGML_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
295
  ggml_cuda_set_device(device);
296
  CUDA_CHECK(cudaFree(ptr));
297
  pool_size -= size;
 
555
  if (err != cudaSuccess) {
556
  // clear the error
557
  cudaGetLastError();
558
+ GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
559
  return nullptr;
560
  }
561
 
 
980
  if (err != cudaSuccess) {
981
  // clear the error
982
  cudaGetLastError();
983
+ GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
984
  size / 1024.0 / 1024.0, cudaGetErrorString(err));
985
  return nullptr;
986
  }
 
2253
  break;
2254
  case GGML_OP_MUL_MAT:
2255
  if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
2256
+ GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
2257
  return false;
2258
  } else {
2259
  ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
 
2337
 
2338
  cudaError_t err = cudaGetLastError();
2339
  if (err != cudaSuccess) {
2340
+ GGML_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
2341
  CUDA_CHECK(err);
2342
  }
2343
 
 
2406
 
2407
  if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
2408
  #ifndef NDEBUG
2409
+ GGML_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
2410
  #endif
2411
  return false;
2412
  }
 
2522
  if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
2523
  cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
2524
  #ifndef NDEBUG
2525
+ GGML_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
2526
  #endif
2527
  }
2528
  }
 
2573
  if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
2574
  use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
2575
  #ifndef NDEBUG
2576
+ GGML_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
2577
  #endif
2578
  }
2579
 
2580
  if (node->op == GGML_OP_MUL_MAT_ID) {
2581
  use_cuda_graph = false; // This node type is not supported by CUDA graph capture
2582
  #ifndef NDEBUG
2583
+ GGML_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
2584
  #endif
2585
  }
2586
 
 
2589
  // Changes in batch size or context size can cause changes to the grid size of some kernels.
2590
  use_cuda_graph = false;
2591
  #ifndef NDEBUG
2592
+ GGML_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
2593
  #endif
2594
  }
2595
 
 
2601
  if (!ptr) {
2602
  use_cuda_graph = false;
2603
  #ifndef NDEBUG
2604
+ GGML_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
2605
  #endif
2606
  } else {
2607
  if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
 
2625
  if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
2626
  cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
2627
  #ifndef NDEBUG
2628
+ GGML_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
2629
  #endif
2630
  }
2631
  }
 
2664
 
2665
  bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
2666
  if (!ok) {
2667
+ GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
2668
  }
2669
  GGML_ASSERT(ok);
2670
  }
 
2683
  use_cuda_graph = false;
2684
  cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
2685
  #ifndef NDEBUG
2686
+ GGML_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
2687
  #endif
2688
  } else {
2689
  graph_evaluated_or_captured = true; // CUDA graph has been captured
 
2750
  cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
2751
  if (stat == cudaErrorGraphExecUpdateFailure) {
2752
  #ifndef NDEBUG
2753
+ GGML_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
2754
  #endif
2755
  // The pre-existing graph exec cannot be updated due to violated constraints
2756
  // so instead clear error and re-instantiate
 
2852
  // clear the error
2853
  cudaGetLastError();
2854
 
2855
+ GGML_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
2856
  size / 1024.0 / 1024.0, cudaGetErrorString(err));
2857
  return false;
2858
  }
 
3284
  return nullptr;
3285
  }
3286
 
3287
  static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
3288
  /* .get_name = */ ggml_backend_cuda_reg_get_name,
3289
  /* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
3290
  /* .get_device_get = */ ggml_backend_cuda_reg_get_device,
3291
  /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
3292
  };
3293
 
3294
  // backend registry
 
3334
 
3335
  ggml_backend_t ggml_backend_cuda_init(int device) {
3336
  if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
3337
+ GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device);
3338
  return nullptr;
3339
  }
3340
 
3341
  ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
3342
  if (ctx == nullptr) {
3343
+ GGML_LOG_ERROR("%s: failed to allocate context\n", __func__);
3344
  return nullptr;
3345
  }
3346
 
ggml/src/ggml-impl.h CHANGED
@@ -33,6 +33,21 @@ extern "C" {
33
  #endif
34
  #endif
35
 
36
  // bitset
37
 
38
  typedef uint32_t ggml_bitset_t;
 
33
  #endif
34
  #endif
35
 
36
+ //
37
+ // logging
38
+ //
39
+
40
+ GGML_ATTRIBUTE_FORMAT(2, 3)
41
+ void ggml_log_internal (enum ggml_log_level level, const char * format, ...);
42
+ void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
43
+
44
+ #define GGML_LOG(...) ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
45
+ #define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
46
+ #define GGML_LOG_WARN(...) ggml_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
47
+ #define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
48
+ #define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
49
+ #define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
50
+
51
  // bitset
52
 
53
  typedef uint32_t ggml_bitset_t;
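
These internal macros route every backend's messages through ggml_log_internal and on to the callback installed with ggml_log_set(). For reference, the per-backend default callbacks deleted in this commit (ggml_cuda_default_log_callback, ggml_metal_default_log_callback, ggml_cann_default_log_callback) all reduced to the same thing, and ggml_log_callback_default is expected to behave likewise — an illustrative sketch, not the actual ggml.c implementation:

    #include <stdio.h>

    // what the removed per-backend defaults did, and what the shared default
    // callback presumably does when no user callback has been installed
    static void example_default_logger(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        fputs(text, stderr);
    }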
ggml/src/ggml-metal.m CHANGED
@@ -18,19 +18,6 @@
18
  // max number of MTLCommandBuffer used to submit a graph for processing
19
  #define GGML_METAL_MAX_COMMAND_BUFFERS 8
20
 
21
- #ifdef GGML_METAL_NDEBUG
22
- #define GGML_METAL_LOG(...)
23
- #define GGML_METAL_LOG_INFO(...)
24
- #define GGML_METAL_LOG_WARN(...)
25
- #define GGML_METAL_LOG_ERROR(...)
26
- #else
27
- #define GGML_METAL_LOG(...) ggml_metal_log(GGML_LOG_LEVEL_NONE, __VA_ARGS__)
28
- #define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
29
- #define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
30
- #define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
31
- #define GGML_METAL_LOG_DEBUG(...) ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
32
- #endif
33
-
34
  #define UNUSED(x) (void)(x)
35
 
36
  struct ggml_metal_kernel {
@@ -230,8 +217,6 @@ struct ggml_backend_metal_context {
230
  id<MTLDevice> device;
231
  id<MTLCommandQueue> queue;
232
 
233
- MTLComputePassDescriptor * edesc;
234
-
235
  dispatch_queue_t d_queue;
236
 
237
  struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
@@ -277,51 +262,19 @@ struct ggml_backend_metal_context {
277
  @implementation GGMLMetalClass
278
  @end
279
 
280
- static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
281
- fprintf(stderr, "%s", msg);
282
-
283
- UNUSED(level);
284
- UNUSED(user_data);
285
- }
286
-
287
- ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
288
- void * ggml_metal_log_user_data = NULL;
289
-
290
- GGML_ATTRIBUTE_FORMAT(2, 3)
291
- static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
292
- if (ggml_metal_log_callback != NULL) {
293
- va_list args;
294
- va_start(args, format);
295
- char buffer[128];
296
- int len = vsnprintf(buffer, 128, format, args);
297
- if (len < 128) {
298
- ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
299
- } else {
300
- char* buffer2 = malloc(len+1);
301
- va_end(args);
302
- va_start(args, format);
303
- vsnprintf(buffer2, len+1, format, args);
304
- buffer2[len] = 0;
305
- ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
306
- free(buffer2);
307
- }
308
- va_end(args);
309
- }
310
- }
311
-
312
  static void * ggml_metal_host_malloc(size_t n) {
313
  void * data = NULL;
314
 
315
  #if TARGET_OS_OSX
316
  kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
317
  if (err != KERN_SUCCESS) {
318
- GGML_METAL_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
319
  return NULL;
320
  }
321
  #else
322
  const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
323
  if (result != 0) {
324
- GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
325
  return NULL;
326
  }
327
  #endif
@@ -330,27 +283,25 @@ static void * ggml_metal_host_malloc(size_t n) {
330
  }
331
 
332
  static struct ggml_backend_metal_context * ggml_metal_init(void) {
333
- GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
334
 
335
  #if TARGET_OS_OSX && !GGML_METAL_NDEBUG
336
  // Show all the Metal device instances in the system
337
  NSArray * devices = MTLCopyAllDevices();
338
  for (id<MTLDevice> device in devices) {
339
- GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
340
  }
341
  [devices release]; // since it was created by a *Copy* C method
342
  #endif
343
 
344
  // Pick and show default Metal device
345
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
346
- GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
347
 
348
  // Configure context
349
  struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
350
  ctx->device = device;
351
  ctx->queue = [ctx->device newCommandQueue];
352
- ctx->edesc = MTLComputePassDescriptor.computePassDescriptor;
353
- ctx->edesc.dispatchType = MTLDispatchTypeSerial;
354
  ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
355
 
356
  id<MTLLibrary> metal_library;
@@ -381,28 +332,28 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
381
  if (try_metallib && path_lib != nil) {
382
  // pre-compiled library found
383
  NSURL * libURL = [NSURL fileURLWithPath:path_lib];
384
- GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
385
 
386
  metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
387
  if (error) {
388
- GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
389
  return NULL;
390
  }
391
  } else {
392
  #if GGML_METAL_EMBED_LIBRARY
393
- GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);
394
 
395
  extern const char ggml_metallib_start[];
396
  extern const char ggml_metallib_end[];
397
 
398
  NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
399
  #else
400
- GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
401
 
402
  NSString * path_source;
403
  NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
404
 
405
- GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
406
 
407
  if (path_resource) {
408
  path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
@@ -411,15 +362,15 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
411
  }
412
 
413
  if (path_source == nil) {
414
- GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
415
  path_source = @"ggml-metal.metal";
416
  }
417
 
418
- GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
419
 
420
  NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
421
  if (error) {
422
- GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
423
  return NULL;
424
  }
425
  #endif // GGML_METAL_EMBED_LIBRARY
@@ -435,7 +386,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
435
 
436
  metal_library = [ctx->device newLibraryWithSource:src options:options error:&error];
437
  if (error) {
438
- GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
439
  return NULL;
440
  }
441
  }
@@ -443,7 +394,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
443
  }
444
 
445
  // print MTL GPU family:
446
- GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
447
 
448
  const NSInteger MTLGPUFamilyMetal3 = 5001;
449
 
@@ -453,21 +404,21 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
453
  {
454
  for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
455
  if ([ctx->device supportsFamily:i]) {
456
- GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
457
  break;
458
  }
459
  }
460
 
461
  for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
462
  if ([ctx->device supportsFamily:i]) {
463
- GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
464
  break;
465
  }
466
  }
467
 
468
  for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) {
469
  if ([ctx->device supportsFamily:i]) {
470
- GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i);
471
  break;
472
  }
473
  }
@@ -478,9 +429,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
478
 
479
  ctx->support_simdgroup_mm = [ctx->device supportsFamily:MTLGPUFamilyApple7];
480
 
481
- GGML_METAL_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false");
482
- GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false");
483
- GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
484
 
485
  ctx->capture_next_compute = false;
486
  ctx->capture_started = false;
@@ -494,13 +445,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
494
 
495
  #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
496
  if (@available(macOS 10.12, iOS 16.0, *)) {
497
- GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
498
  }
499
  #elif TARGET_OS_OSX
500
  if (ctx->device.maxTransferRate != 0) {
501
- GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
502
  } else {
503
- GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
504
  }
505
  #endif
506
 
@@ -513,7 +464,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
513
  }
514
 
515
  /*
516
- GGML_METAL_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
517
  (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
518
  (int) kernel->pipeline.threadExecutionWidth); \
519
  */
@@ -524,12 +475,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
524
  kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
525
  [metal_function release]; \
526
  if (error) { \
527
- GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
528
  [metal_library release]; \
529
  return NULL; \
530
  } \
531
  } else { \
532
- GGML_METAL_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \
533
  }
534
 
535
  // simd_sum and simd_max requires MTLGPUFamilyApple7
@@ -726,7 +677,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
726
  }
727
 
728
  static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
729
- GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
730
 
731
  for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
732
  [ctx->kernels[i].pipeline release];
@@ -764,7 +715,7 @@ struct ggml_backend_metal_buffer_context {
764
  // Metal buffer based on the host memory pointer
765
  //
766
  static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
767
- //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
768
 
769
  const int64_t tsize = ggml_nbytes(t);
770
 
@@ -776,17 +727,17 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
776
  for (int i = 0; i < buf_ctx->n_buffers; ++i) {
777
  const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
778
 
779
- //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
780
  if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
781
  *offs = (size_t) ioffs;
782
 
783
- //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
784
 
785
  return buf_ctx->buffers[i].metal;
786
  }
787
  }
788
 
789
- GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
790
 
791
  return nil;
792
  }
@@ -918,7 +869,7 @@ static void ggml_metal_encode_node(
918
 
919
  struct ggml_tensor * node = ggml_graph_node(gf, idx);
920
 
921
- //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op));
922
 
923
  struct ggml_tensor * src0 = node->src[0];
924
  struct ggml_tensor * src1 = node->src[1];
@@ -944,7 +895,7 @@ static void ggml_metal_encode_node(
944
  }
945
 
946
  if (!ggml_metal_supports_op(ctx, dst)) {
947
- GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
948
  GGML_ABORT("unsupported op");
949
  }
950
 
@@ -1002,17 +953,17 @@ static void ggml_metal_encode_node(
1002
  id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
1003
  id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil;
1004
 
1005
- //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
1006
  //if (src0) {
1007
- // GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
1008
  // ggml_is_contiguous(src0), src0->name);
1009
  //}
1010
  //if (src1) {
1011
- // GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
1012
  // ggml_is_contiguous(src1), src1->name);
1013
  //}
1014
  //if (dst) {
1015
- // GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
1016
  // dst->name);
1017
  //}
1018
 
@@ -1404,7 +1355,7 @@ static void ggml_metal_encode_node(
1404
  } break;
1405
  default:
1406
  {
1407
- GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
1408
  GGML_ABORT("fatal error");
1409
  }
1410
  } break;
@@ -1956,7 +1907,7 @@ static void ggml_metal_encode_node(
1956
  } break;
1957
  default:
1958
  {
1959
- GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
1960
  GGML_ABORT("not implemented");
1961
  }
1962
  };
@@ -2252,7 +2203,7 @@ static void ggml_metal_encode_node(
2252
  } break;
2253
  default:
2254
  {
2255
- GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
2256
  GGML_ABORT("not implemented");
2257
  }
2258
  };
@@ -2821,8 +2772,8 @@ static void ggml_metal_encode_node(
2821
  //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
2822
  default:
2823
  {
2824
- GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
2825
- GGML_METAL_LOG_ERROR("add template specialization for this size\n");
2826
  GGML_ABORT("add template specialization for this size");
2827
  }
2828
  }
@@ -2834,8 +2785,8 @@ static void ggml_metal_encode_node(
2834
  //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
2835
  default:
2836
  {
2837
- GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
2838
- GGML_METAL_LOG_ERROR("add template specialization for this size\n");
2839
  GGML_ABORT("add template specialization for this size");
2840
  }
2841
  }
@@ -2996,7 +2947,7 @@ static void ggml_metal_encode_node(
2996
  } break;
2997
  default:
2998
  {
2999
- GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
3000
  GGML_ABORT("fatal error");
3001
  }
3002
  }
@@ -3041,7 +2992,7 @@ static enum ggml_status ggml_metal_graph_compute(
3041
 
3042
  NSError * error = nil;
3043
  if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
3044
- GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
3045
  } else {
3046
  [ctx->capture_scope beginScope];
3047
  ctx->capture_started = true;
@@ -3060,7 +3011,7 @@ static enum ggml_status ggml_metal_graph_compute(
3060
  const int n_nodes_per_cb = ctx->n_nodes_per_cb;
3061
 
3062
  id<MTLCommandBuffer> command_buffer = ctx->command_buffers[cb_idx];
3063
- id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: ctx->edesc];
3064
 
3065
  int node_start = 0;
3066
  int node_end = n_nodes_0;
@@ -3122,9 +3073,9 @@ static enum ggml_status ggml_metal_graph_compute(
3122
 
3123
  MTLCommandBufferStatus status = [command_buffer status];
3124
  if (status != MTLCommandBufferStatusCompleted) {
3125
- GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
3126
  if (status == MTLCommandBufferStatusError) {
3127
- GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
3128
  }
3129
 
3130
  return GGML_STATUS_FAILED;
@@ -3137,9 +3088,9 @@ static enum ggml_status ggml_metal_graph_compute(
3137
 
3138
  MTLCommandBufferStatus status = [command_buffer status];
3139
  if (status != MTLCommandBufferStatusCompleted) {
3140
- GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
3141
  if (status == MTLCommandBufferStatusError) {
3142
- GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
3143
  }
3144
 
3145
  return GGML_STATUS_FAILED;
@@ -3156,7 +3107,7 @@ static enum ggml_status ggml_metal_graph_compute(
3156
  }
3157
 
3158
  if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
3159
- GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
3160
  return GGML_STATUS_ABORTED;
3161
  }
3162
 
@@ -3285,17 +3236,17 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
3285
  #ifndef GGML_METAL_NDEBUG
3286
  #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
3287
  if (@available(macOS 10.12, iOS 16.0, *)) {
3288
- GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
3289
  __func__,
3290
  size_aligned / 1024.0 / 1024.0,
3291
  device.currentAllocatedSize / 1024.0 / 1024.0,
3292
  device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
3293
 
3294
  if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
3295
- GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
3296
  }
3297
  } else {
3298
- GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
3299
  __func__,
3300
  size_aligned / 1024.0 / 1024.0,
3301
  device.currentAllocatedSize / 1024.0 / 1024.0);
@@ -3337,7 +3288,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
3337
  }
3338
 
3339
  if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
3340
- GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
3341
  free(ctx);
3342
  ggml_backend_metal_free_device();
3343
  return NULL;
@@ -3422,7 +3373,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
3422
  ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
3423
 
3424
  if (ctx->buffers[ctx->n_buffers].metal == nil) {
3425
- GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
3426
  return false;
3427
  }
3428
  }
@@ -3448,7 +3399,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
3448
  ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
3449
 
3450
  if (ctx->buffers[ctx->n_buffers].metal == nil) {
3451
- GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
3452
  return false;
3453
  }
3454
  }
@@ -3456,7 +3407,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
3456
  ggml_backend_metal_log_allocated_size(device, size_step_aligned);
3457
 
3458
  if (i + size_step < size) {
3459
- GGML_METAL_LOG_INFO("\n");
3460
  }
3461
 
3462
  ++ctx->n_buffers;
@@ -3513,7 +3464,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
3513
  ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);
3514
 
3515
  if (ctx->n_cb > 2) {
3516
- GGML_METAL_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade the performance in some cases\n", __func__, n_cb);
3517
  }
3518
  }
3519
 
@@ -3543,11 +3494,6 @@ static struct ggml_backend_i ggml_backend_metal_i = {
3543
  /* .event_wait = */ NULL,
3544
  };
3545
 
3546
- void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
3547
- ggml_metal_log_callback = log_callback;
3548
- ggml_metal_log_user_data = user_data;
3549
- }
3550
-
3551
  static ggml_guid_t ggml_backend_metal_guid(void) {
3552
  static ggml_guid guid = { 0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed, 0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6 };
3553
  return &guid;
@@ -3556,7 +3502,7 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
3556
  ggml_backend_t ggml_backend_metal_init(void) {
3557
  struct ggml_backend_metal_context * ctx = ggml_metal_init();
3558
  if (ctx == NULL) {
3559
- GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
3560
  return NULL;
3561
  }
3562
 
 
18
  // max number of MTLCommandBuffer used to submit a graph for processing
19
  #define GGML_METAL_MAX_COMMAND_BUFFERS 8
20
 
21
  #define UNUSED(x) (void)(x)
22
 
23
  struct ggml_metal_kernel {
 
217
  id<MTLDevice> device;
218
  id<MTLCommandQueue> queue;
219
 
 
 
220
  dispatch_queue_t d_queue;
221
 
222
  struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
 
262
  @implementation GGMLMetalClass
263
  @end
264
 
265
  static void * ggml_metal_host_malloc(size_t n) {
266
  void * data = NULL;
267
 
268
  #if TARGET_OS_OSX
269
  kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
270
  if (err != KERN_SUCCESS) {
271
+ GGML_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
272
  return NULL;
273
  }
274
  #else
275
  const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
276
  if (result != 0) {
277
+ GGML_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
278
  return NULL;
279
  }
280
  #endif
 
283
  }
284
 
285
  static struct ggml_backend_metal_context * ggml_metal_init(void) {
286
+ GGML_LOG_INFO("%s: allocating\n", __func__);
287
 
288
  #if TARGET_OS_OSX && !GGML_METAL_NDEBUG
289
  // Show all the Metal device instances in the system
290
  NSArray * devices = MTLCopyAllDevices();
291
  for (id<MTLDevice> device in devices) {
292
+ GGML_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
293
  }
294
  [devices release]; // since it was created by a *Copy* C method
295
  #endif
296
 
297
  // Pick and show default Metal device
298
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
299
+ GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
300
 
301
  // Configure context
302
  struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
303
  ctx->device = device;
304
  ctx->queue = [ctx->device newCommandQueue];
 
 
305
  ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
306
 
307
  id<MTLLibrary> metal_library;
 
332
  if (try_metallib && path_lib != nil) {
333
  // pre-compiled library found
334
  NSURL * libURL = [NSURL fileURLWithPath:path_lib];
335
+ GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
336
 
337
  metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
338
  if (error) {
339
+ GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
340
  return NULL;
341
  }
342
  } else {
343
  #if GGML_METAL_EMBED_LIBRARY
344
+ GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
345
 
346
  extern const char ggml_metallib_start[];
347
  extern const char ggml_metallib_end[];
348
 
349
  NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
350
  #else
351
+ GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
352
 
353
  NSString * path_source;
354
  NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
355
 
356
+ GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
357
 
358
  if (path_resource) {
359
  path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
 
362
  }
363
 
364
  if (path_source == nil) {
365
+ GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
366
  path_source = @"ggml-metal.metal";
367
  }
368
 
369
+ GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
370
 
371
  NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
372
  if (error) {
373
+ GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
374
  return NULL;
375
  }
376
  #endif // GGML_METAL_EMBED_LIBRARY
 
386
 
387
  metal_library = [ctx->device newLibraryWithSource:src options:options error:&error];
388
  if (error) {
389
+ GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
390
  return NULL;
391
  }
392
  }
 
394
  }
395
 
396
  // print MTL GPU family:
397
+ GGML_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
398
 
399
  const NSInteger MTLGPUFamilyMetal3 = 5001;
400
 
 
404
  {
405
  for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
406
  if ([ctx->device supportsFamily:i]) {
407
+ GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
408
  break;
409
  }
410
  }
411
 
412
  for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
413
  if ([ctx->device supportsFamily:i]) {
414
+ GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
415
  break;
416
  }
417
  }
418
 
419
  for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) {
420
  if ([ctx->device supportsFamily:i]) {
421
+ GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i);
422
  break;
423
  }
424
  }
 
429
 
430
  ctx->support_simdgroup_mm = [ctx->device supportsFamily:MTLGPUFamilyApple7];
431
 
432
+ GGML_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false");
433
+ GGML_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false");
434
+ GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
435
 
436
  ctx->capture_next_compute = false;
437
  ctx->capture_started = false;
 
445
 
446
  #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
447
  if (@available(macOS 10.12, iOS 16.0, *)) {
448
+ GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
449
  }
450
  #elif TARGET_OS_OSX
451
  if (ctx->device.maxTransferRate != 0) {
452
+ GGML_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
453
  } else {
454
+ GGML_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
455
  }
456
  #endif
457
 
 
464
  }
465
 
466
  /*
467
+ GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
468
  (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
469
  (int) kernel->pipeline.threadExecutionWidth); \
470
  */
 
475
  kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
476
  [metal_function release]; \
477
  if (error) { \
478
+ GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
479
  [metal_library release]; \
480
  return NULL; \
481
  } \
482
  } else { \
483
+ GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \
484
  }
485
 
486
  // simd_sum and simd_max requires MTLGPUFamilyApple7
 
677
  }
678
 
679
  static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
680
+ GGML_LOG_INFO("%s: deallocating\n", __func__);
681
 
682
  for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
683
  [ctx->kernels[i].pipeline release];
 
715
  // Metal buffer based on the host memory pointer
716
  //
717
  static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
718
+ //GGML_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
719
 
720
  const int64_t tsize = ggml_nbytes(t);
721
 
 
727
  for (int i = 0; i < buf_ctx->n_buffers; ++i) {
728
  const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
729
 
730
+ //GGML_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
731
  if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
732
  *offs = (size_t) ioffs;
733
 
734
+ //GGML_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
735
 
736
  return buf_ctx->buffers[i].metal;
737
  }
738
  }
739
 
740
+ GGML_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
741
 
742
  return nil;
743
  }
 
869
 
870
  struct ggml_tensor * node = ggml_graph_node(gf, idx);
871
 
872
+ //GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op));
873
 
874
  struct ggml_tensor * src0 = node->src[0];
875
  struct ggml_tensor * src1 = node->src[1];
 
895
  }
896
 
897
  if (!ggml_metal_supports_op(ctx, dst)) {
898
+ GGML_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
899
  GGML_ABORT("unsupported op");
900
  }
901
 
 
953
  id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
954
  id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil;
955
 
956
+ //GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
957
  //if (src0) {
958
+ // GGML_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
959
  // ggml_is_contiguous(src0), src0->name);
960
  //}
961
  //if (src1) {
962
+ // GGML_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
963
  // ggml_is_contiguous(src1), src1->name);
964
  //}
965
  //if (dst) {
966
+ // GGML_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
967
  // dst->name);
968
  //}
969
 
 
1355
  } break;
1356
  default:
1357
  {
1358
+ GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
1359
  GGML_ABORT("fatal error");
1360
  }
1361
  } break;
 
1907
  } break;
1908
  default:
1909
  {
1910
+ GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
1911
  GGML_ABORT("not implemented");
1912
  }
1913
  };
 
2203
  } break;
2204
  default:
2205
  {
2206
+ GGML_LOG_ERROR("Asserting on type %d\n", (int)src2t);
2207
  GGML_ABORT("not implemented");
2208
  }
2209
  };
 
2772
  //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
2773
  default:
2774
  {
2775
+ GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
2776
+ GGML_LOG_ERROR("add template specialization for this size\n");
2777
  GGML_ABORT("add template specialization for this size");
2778
  }
2779
  }
 
2785
  //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
2786
  default:
2787
  {
2788
+ GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
2789
+ GGML_LOG_ERROR("add template specialization for this size\n");
2790
  GGML_ABORT("add template specialization for this size");
2791
  }
2792
  }
 
2947
  } break;
2948
  default:
2949
  {
2950
+ GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
2951
  GGML_ABORT("fatal error");
2952
  }
2953
  }
 
2992
 
2993
  NSError * error = nil;
2994
  if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
2995
+ GGML_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
2996
  } else {
2997
  [ctx->capture_scope beginScope];
2998
  ctx->capture_started = true;
 
3011
  const int n_nodes_per_cb = ctx->n_nodes_per_cb;
3012
 
3013
  id<MTLCommandBuffer> command_buffer = ctx->command_buffers[cb_idx];
3014
+ id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
3015
 
3016
  int node_start = 0;
3017
  int node_end = n_nodes_0;
 
3073
 
3074
  MTLCommandBufferStatus status = [command_buffer status];
3075
  if (status != MTLCommandBufferStatusCompleted) {
3076
+ GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
3077
  if (status == MTLCommandBufferStatusError) {
3078
+ GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
3079
  }
3080
 
3081
  return GGML_STATUS_FAILED;
 
3088
 
3089
  MTLCommandBufferStatus status = [command_buffer status];
3090
  if (status != MTLCommandBufferStatusCompleted) {
3091
+ GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
3092
  if (status == MTLCommandBufferStatusError) {
3093
+ GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
3094
  }
3095
 
3096
  return GGML_STATUS_FAILED;
 
3107
  }
3108
 
3109
  if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
3110
+ GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i);
3111
  return GGML_STATUS_ABORTED;
3112
  }
3113
 
 
3236
  #ifndef GGML_METAL_NDEBUG
3237
  #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
3238
  if (@available(macOS 10.12, iOS 16.0, *)) {
3239
+ GGML_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
3240
  __func__,
3241
  size_aligned / 1024.0 / 1024.0,
3242
  device.currentAllocatedSize / 1024.0 / 1024.0,
3243
  device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
3244
 
3245
  if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
3246
+ GGML_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
3247
  }
3248
  } else {
3249
+ GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
3250
  __func__,
3251
  size_aligned / 1024.0 / 1024.0,
3252
  device.currentAllocatedSize / 1024.0 / 1024.0);
 
3288
  }
3289
 
3290
  if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
3291
+ GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
3292
  free(ctx);
3293
  ggml_backend_metal_free_device();
3294
  return NULL;
 
3373
  ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
3374
 
3375
  if (ctx->buffers[ctx->n_buffers].metal == nil) {
3376
+ GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
3377
  return false;
3378
  }
3379
  }
 
3399
  ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
3400
 
3401
  if (ctx->buffers[ctx->n_buffers].metal == nil) {
3402
+ GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
3403
  return false;
3404
  }
3405
  }
 
3407
  ggml_backend_metal_log_allocated_size(device, size_step_aligned);
3408
 
3409
  if (i + size_step < size) {
3410
+ GGML_LOG_INFO("\n");
3411
  }
3412
 
3413
  ++ctx->n_buffers;
 
3464
  ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);
3465
 
3466
  if (ctx->n_cb > 2) {
3467
+ GGML_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade the performance in some cases\n", __func__, n_cb);
3468
  }
3469
  }
3470
 
 
3494
  /* .event_wait = */ NULL,
3495
  };
3496
 
3497
  static ggml_guid_t ggml_backend_metal_guid(void) {
3498
  static ggml_guid guid = { 0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed, 0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6 };
3499
  return &guid;
 
3502
  ggml_backend_t ggml_backend_metal_init(void) {
3503
  struct ggml_backend_metal_context * ctx = ggml_metal_init();
3504
  if (ctx == NULL) {
3505
+ GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
3506
  return NULL;
3507
  }
3508
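Note (editor's addition, not part of the diff): with ggml_backend_metal_log_set_callback removed above, the Metal backend no longer has a backend-specific log setter; its GGML_LOG_* output goes through the single process-wide callback installed with ggml_log_set() from ggml/src/ggml.c below. A minimal C sketch under that assumption; the callback name and the level filter are illustrative, and only ggml_log_set(), ggml_log_callback and ggml_backend_metal_init() are taken from this commit:

    // requires <stdio.h> and "ggml.h"
    static void example_log_cb(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        // forward only warnings and errors, whether they come from the Metal backend or elsewhere
        if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR) {
            fputs(text, stderr);
        }
    }

    // during initialization (illustrative):
    //     ggml_log_set(example_log_cb, NULL);
    //     ggml_backend_t backend = ggml_backend_metal_init();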
 
ggml/src/ggml.c CHANGED
@@ -319,26 +319,63 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
319
  // logging
320
  //
321
 
322
  #if (GGML_DEBUG >= 1)
323
- #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
324
  #else
325
  #define GGML_PRINT_DEBUG(...)
326
  #endif
327
 
328
  #if (GGML_DEBUG >= 5)
329
- #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
330
  #else
331
  #define GGML_PRINT_DEBUG_5(...)
332
  #endif
333
 
334
  #if (GGML_DEBUG >= 10)
335
- #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
336
  #else
337
  #define GGML_PRINT_DEBUG_10(...)
338
  #endif
339
 
340
- #define GGML_PRINT(...) printf(__VA_ARGS__)
341
-
342
  //
343
  // end of logging block
344
  //
@@ -355,7 +392,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
355
  #else
356
  inline static void * ggml_aligned_malloc(size_t size) {
357
  if (size == 0) {
358
- GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
359
  return NULL;
360
  }
361
  void * aligned_memory = NULL;
@@ -377,7 +414,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
377
  error_desc = "insufficient memory";
378
  break;
379
  }
380
- GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
381
  GGML_ABORT("fatal error");
382
  return NULL;
383
  }
@@ -393,12 +430,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
393
 
394
  inline static void * ggml_malloc(size_t size) {
395
  if (size == 0) {
396
- GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
397
  return NULL;
398
  }
399
  void * result = malloc(size);
400
  if (result == NULL) {
401
- GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
402
  GGML_ABORT("fatal error");
403
  }
404
  return result;
@@ -407,12 +444,12 @@ inline static void * ggml_malloc(size_t size) {
407
  // calloc
408
  inline static void * ggml_calloc(size_t num, size_t size) {
409
  if (num == 0 || size == 0) {
410
- GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
411
  return NULL;
412
  }
413
  void * result = calloc(num, size);
414
  if (result == NULL) {
415
- GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
416
  GGML_ABORT("fatal error");
417
  }
418
  return result;
@@ -3349,7 +3386,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
3349
  if (fptr != NULL) {
3350
  char buf[42];
3351
  if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
3352
- GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
3353
  }
3354
  fclose(fptr);
3355
  }
@@ -3367,21 +3404,21 @@ bool ggml_is_numa(void) {
3367
  ////////////////////////////////////////////////////////////////////////////////
3368
 
3369
  void ggml_print_object(const struct ggml_object * obj) {
3370
- GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
3371
  obj->type, obj->offs, obj->size, (const void *) obj->next);
3372
  }
3373
 
3374
  void ggml_print_objects(const struct ggml_context * ctx) {
3375
  struct ggml_object * obj = ctx->objects_begin;
3376
 
3377
- GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx);
3378
 
3379
  while (obj != NULL) {
3380
  ggml_print_object(obj);
3381
  obj = obj->next;
3382
  }
3383
 
3384
- GGML_PRINT("%s: --- end ---\n", __func__);
3385
  }
3386
 
3387
  int64_t ggml_nelements(const struct ggml_tensor * tensor) {
@@ -3964,7 +4001,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
3964
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
3965
 
3966
  if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
3967
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
3968
  __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
3969
  assert(false);
3970
  return NULL;
@@ -4028,7 +4065,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4028
  if (ctx->scratch.data != NULL) {
4029
  // allocate tensor data in the scratch buffer
4030
  if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4031
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4032
  __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4033
  assert(false);
4034
  return NULL;
@@ -20136,7 +20173,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
20136
  }
20137
  #else
20138
  if (n_threads > threadpool->n_threads_max) {
20139
- GGML_PRINT("WARNING: cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
20140
  n_threads = threadpool->n_threads_max;
20141
  }
20142
 
@@ -20675,30 +20712,30 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
20675
  }
20676
 
20677
  void ggml_graph_print(const struct ggml_cgraph * cgraph) {
20678
- GGML_PRINT("=== GRAPH ===\n");
20679
 
20680
- GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
20681
  for (int i = 0; i < cgraph->n_nodes; i++) {
20682
  struct ggml_tensor * node = cgraph->nodes[i];
20683
 
20684
- GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n",
20685
  i,
20686
  node->ne[0], node->ne[1], node->ne[2],
20687
  ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ");
20688
  }
20689
 
20690
- GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs);
20691
  for (int i = 0; i < cgraph->n_leafs; i++) {
20692
  struct ggml_tensor * node = cgraph->leafs[i];
20693
 
20694
- GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
20695
  i,
20696
  node->ne[0], node->ne[1],
20697
  ggml_op_name(node->op),
20698
  ggml_get_name(node));
20699
  }
20700
 
20701
- GGML_PRINT("========================================\n");
20702
  }
20703
 
20704
  // check if node is part of the graph
@@ -20869,7 +20906,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
20869
 
20870
  fclose(fp);
20871
 
20872
- GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
20873
  }
20874
 
20875
  ////////////////////////////////////////////////////////////////////////////////
@@ -23364,4 +23401,9 @@ int ggml_cpu_get_sve_cnt(void) {
23364
  return 0;
23365
  #endif
23366
  }
23367
  ////////////////////////////////////////////////////////////////////////////////
 
319
  // logging
320
  //
321
 
322
+ struct ggml_logger_state {
323
+ ggml_log_callback log_callback;
324
+ void * log_callback_user_data;
325
+ };
326
+ static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL};
327
+
328
+ static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
329
+ if (format == NULL)
330
+ return;
331
+ va_list args_copy;
332
+ va_copy(args_copy, args);
333
+ char buffer[128];
334
+ int len = vsnprintf(buffer, 128, format, args);
335
+ if (len < 128) {
336
+ g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
337
+ } else {
338
+ char * buffer2 = (char *) calloc(len + 1, sizeof(char));
339
+ vsnprintf(buffer2, len + 1, format, args_copy);
340
+ buffer2[len] = 0;
341
+ g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
342
+ free(buffer2);
343
+ }
344
+ va_end(args_copy);
345
+ }
346
+
347
+ void ggml_log_internal(enum ggml_log_level level, const char * format, ...) {
348
+ va_list args;
349
+ va_start(args, format);
350
+ ggml_log_internal_v(level, format, args);
351
+ va_end(args);
352
+ }
353
+
354
+ void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
355
+ (void) level;
356
+ (void) user_data;
357
+ fputs(text, stderr);
358
+ fflush(stderr);
359
+ }
360
+
361
  #if (GGML_DEBUG >= 1)
362
+ #define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__)
363
  #else
364
  #define GGML_PRINT_DEBUG(...)
365
  #endif
366
 
367
  #if (GGML_DEBUG >= 5)
368
+ #define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__)
369
  #else
370
  #define GGML_PRINT_DEBUG_5(...)
371
  #endif
372
 
373
  #if (GGML_DEBUG >= 10)
374
+ #define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__)
375
  #else
376
  #define GGML_PRINT_DEBUG_10(...)
377
  #endif
378
 
 
 
379
  //
380
  // end of logging block
381
  //
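Note (editor's addition, not part of the diff): the GGML_LOG_INFO / GGML_LOG_WARN / GGML_LOG_ERROR calls used throughout ggml-metal.m above are assumed to be thin wrappers that forward to ggml_log_internal() with the matching level; their definitions are not part of this hunk. A sketch of such wrappers, with illustrative names:

    // illustrative names; the real helpers used above are GGML_LOG_INFO, GGML_LOG_WARN, ...
    #define EXAMPLE_LOG_INFO(...)  ggml_log_internal(GGML_LOG_LEVEL_INFO,  __VA_ARGS__)
    #define EXAMPLE_LOG_WARN(...)  ggml_log_internal(GGML_LOG_LEVEL_WARN,  __VA_ARGS__)
    #define EXAMPLE_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)

ggml_log_internal_v() above formats into a 128-byte stack buffer and falls back to a heap allocation for longer messages before invoking the registered callback, so callers do not need to bound their message length.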
 
392
  #else
393
  inline static void * ggml_aligned_malloc(size_t size) {
394
  if (size == 0) {
395
+ GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
396
  return NULL;
397
  }
398
  void * aligned_memory = NULL;
 
414
  error_desc = "insufficient memory";
415
  break;
416
  }
417
+ GGML_LOG_ERROR("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
418
  GGML_ABORT("fatal error");
419
  return NULL;
420
  }
 
430
 
431
  inline static void * ggml_malloc(size_t size) {
432
  if (size == 0) {
433
+ GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
434
  return NULL;
435
  }
436
  void * result = malloc(size);
437
  if (result == NULL) {
438
+ GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
439
  GGML_ABORT("fatal error");
440
  }
441
  return result;
 
444
  // calloc
445
  inline static void * ggml_calloc(size_t num, size_t size) {
446
  if (num == 0 || size == 0) {
447
+ GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
448
  return NULL;
449
  }
450
  void * result = calloc(num, size);
451
  if (result == NULL) {
452
+ GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
453
  GGML_ABORT("fatal error");
454
  }
455
  return result;
 
3386
  if (fptr != NULL) {
3387
  char buf[42];
3388
  if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
3389
+ GGML_LOG_WARN("/proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
3390
  }
3391
  fclose(fptr);
3392
  }
 
3404
  ////////////////////////////////////////////////////////////////////////////////
3405
 
3406
  void ggml_print_object(const struct ggml_object * obj) {
3407
+ GGML_LOG_INFO(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
3408
  obj->type, obj->offs, obj->size, (const void *) obj->next);
3409
  }
3410
 
3411
  void ggml_print_objects(const struct ggml_context * ctx) {
3412
  struct ggml_object * obj = ctx->objects_begin;
3413
 
3414
+ GGML_LOG_INFO("%s: objects in context %p:\n", __func__, (const void *) ctx);
3415
 
3416
  while (obj != NULL) {
3417
  ggml_print_object(obj);
3418
  obj = obj->next;
3419
  }
3420
 
3421
+ GGML_LOG_INFO("%s: --- end ---\n", __func__);
3422
  }
3423
 
3424
  int64_t ggml_nelements(const struct ggml_tensor * tensor) {
 
4001
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
4002
 
4003
  if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4004
+ GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4005
  __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
4006
  assert(false);
4007
  return NULL;
 
4065
  if (ctx->scratch.data != NULL) {
4066
  // allocate tensor data in the scratch buffer
4067
  if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4068
+ GGML_LOG_WARN("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4069
  __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4070
  assert(false);
4071
  return NULL;
 
20173
  }
20174
  #else
20175
  if (n_threads > threadpool->n_threads_max) {
20176
+ GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
20177
  n_threads = threadpool->n_threads_max;
20178
  }
20179
 
 
20712
  }
20713
 
20714
  void ggml_graph_print(const struct ggml_cgraph * cgraph) {
20715
+ GGML_LOG_INFO("=== GRAPH ===\n");
20716
 
20717
+ GGML_LOG_INFO("n_nodes = %d\n", cgraph->n_nodes);
20718
  for (int i = 0; i < cgraph->n_nodes; i++) {
20719
  struct ggml_tensor * node = cgraph->nodes[i];
20720
 
20721
+ GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n",
20722
  i,
20723
  node->ne[0], node->ne[1], node->ne[2],
20724
  ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ");
20725
  }
20726
 
20727
+ GGML_LOG_INFO("n_leafs = %d\n", cgraph->n_leafs);
20728
  for (int i = 0; i < cgraph->n_leafs; i++) {
20729
  struct ggml_tensor * node = cgraph->leafs[i];
20730
 
20731
+ GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
20732
  i,
20733
  node->ne[0], node->ne[1],
20734
  ggml_op_name(node->op),
20735
  ggml_get_name(node));
20736
  }
20737
 
20738
+ GGML_LOG_INFO("========================================\n");
20739
  }
20740
 
20741
  // check if node is part of the graph
 
20906
 
20907
  fclose(fp);
20908
 
20909
+ GGML_LOG_INFO("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
20910
  }
20911
 
20912
  ////////////////////////////////////////////////////////////////////////////////
 
23401
  return 0;
23402
  #endif
23403
  }
23404
+
23405
+ void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
23406
+ g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
23407
+ g_logger_state.log_callback_user_data = user_data;
23408
+ }
23409
  ////////////////////////////////////////////////////////////////////////////////
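Usage note (editor's addition, not part of the diff): ggml_log_set() installs one callback for all ggml log output, replacing backend-specific setters such as the ggml_backend_metal_log_set_callback removed above, and passing NULL restores ggml_log_callback_default (see the ternary in its implementation). A minimal, self-contained C sketch; the callback, the file name, and the assumption that ggml_log_set() is declared in ggml.h are illustrative:

    #include <stdio.h>
    #include "ggml.h"

    static void log_to_file(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        fputs(text, (FILE *) user_data);     // write every message to the provided stream
    }

    int main(void) {
        FILE * fp = fopen("ggml.log", "a");  // illustrative file name
        if (fp) {
            ggml_log_set(log_to_file, fp);   // core and backend logging now goes to the file
        }
        // ... create contexts, initialize backends, run graphs ...
        ggml_log_set(NULL, NULL);            // NULL restores ggml_log_callback_default
        if (fp) {
            fclose(fp);
        }
        return 0;
    }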