Commit 98d1a6a by Diego Devesa
Parent: 4ac768e

ggml : move more prints to the ggml log system (llama/9839)

* ggml : move more prints to the ggml log system

* show BLAS OpenMP warnings in all builds using debug print
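
The messages touched by this commit now go through ggml's internal logging macros (GGML_LOG_DEBUG, GGML_LOG_ERROR, GGML_LOG_WARN) instead of writing to stderr directly, so a host application can route or filter them. As a rough sketch (not part of this diff, and assuming the ggml_log_set() / ggml_log_callback API exposed in ggml.h), a caller could drop debug-level output in release builds along these lines:

// Sketch only: assumes ggml.h provides ggml_log_set() and the ggml_log_callback
// typedef (enum ggml_log_level level, const char * text, void * user_data).
#include <stdio.h>
#include "ggml.h"

static void my_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
#ifdef NDEBUG
    // in release builds, silence debug-level output such as the BLAS OpenMP warnings
    if (level == GGML_LOG_LEVEL_DEBUG) {
        return;
    }
#endif
    fputs(text, stderr); // forward everything else to stderr as before
}

int main(void) {
    ggml_log_set(my_log_callback, NULL);
    // ... use ggml / ggml-backend as usual; allocator, scheduler and backend
    // messages now arrive through the callback instead of raw fprintf(stderr, ...)
    return 0;
}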

ggml/src/ggml-alloc.c CHANGED
@@ -14,7 +14,7 @@

 //#define GGML_ALLOCATOR_DEBUG

-//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
 #define AT_PRINTF(...)


@@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
     size = GGML_PAD(size, talloc->alignment);

     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
-        fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
         GGML_ABORT("not enough space in the buffer");
     }
@@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
             best_fit_block = alloc->n_free_blocks - 1;
         } else {
             // this should never happen
-            fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                     __func__, size, max_avail);
             GGML_ABORT("not enough space in the buffer");
         }
@@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
                 }
             }
         }
-        fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
         for (int i = 0; i < 1024; i++) {
             if (alloc->allocated_tensors[i].tensor) {
-                fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
                     alloc->allocated_tensors[i].offset,
                     alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
                     ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
             }
         }
-        fprintf(stderr, "\n");
+        GGML_LOG_DEBUG("\n");
     }
 #endif

@@ -766,13 +766,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
         if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+            GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif

             ggml_backend_buffer_free(galloc->buffers[i]);
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
-                fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }
             ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -823,14 +823,14 @@ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_t
 static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
     if (galloc->n_nodes != graph->n_nodes) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+        GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
 #endif
         return true;
     }

     if (galloc->n_leafs != graph->n_leafs) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+        GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
 #endif
         return true;
     }
@@ -841,7 +841,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph

         if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+            GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
 #endif
             return true;
         }
@@ -853,7 +853,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             }
             if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
-                fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+                GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
                 return true;
             }
@@ -867,14 +867,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     if (ggml_gallocr_needs_realloc(galloc, graph)) {
         if (galloc->n_buffers == 1) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+            GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
 #endif
             if (!ggml_gallocr_reserve(galloc, graph)) {
                 return false;
             }
         } else {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+            GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
 #endif
             return false;
         }
@@ -938,7 +938,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     if (buffer == NULL) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
+        GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
         for (size_t i = 0; i < *n_buffers; i++) {
             ggml_backend_buffer_free((*buffers)[i]);
@@ -988,7 +988,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
         }

         if (this_size > max_size) {
-            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+            GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
                     __func__, t->name,
                     ggml_backend_buft_name(buft),
                     this_size, max_size);
@@ -1020,7 +1020,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

     if (n_buffers == 0) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+        GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
         return NULL;
     }
ggml/src/ggml-backend.cpp CHANGED
@@ -379,7 +379,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
379
  ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
380
  } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
381
  #ifndef NDEBUG
382
- fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
383
  #endif
384
  size_t nbytes = ggml_nbytes(src);
385
  void * data = malloc(nbytes);
@@ -571,7 +571,7 @@ struct ggml_backend_registry {
571
 
572
  void register_backend(ggml_backend_reg_t reg) {
573
  #ifndef NDEBUG
574
- fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
575
  __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
576
  #endif
577
  backends.push_back(reg);
@@ -582,7 +582,7 @@ struct ggml_backend_registry {
582
 
583
  void register_device(ggml_backend_dev_t device) {
584
  #ifndef NDEBUG
585
- fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
586
  #endif
587
  devices.push_back(device);
588
  }
@@ -773,7 +773,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
773
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
774
  void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
775
  if (data == NULL) {
776
- fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
777
  return NULL;
778
  }
779
 
@@ -836,7 +836,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
836
  void * ptr;
837
  int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
838
  if (result != 0) {
839
- fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
840
  return NULL;
841
  }
842
 
@@ -1459,7 +1459,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
1459
  }
1460
 
1461
  #ifndef NDEBUG
1462
- fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
1463
  __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
1464
  #endif
1465
 
@@ -1548,13 +1548,13 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
1548
  for (int i = 0; i < graph->n_nodes; i++) {
1549
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1550
  ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
1551
- fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
1552
  sched->splits[cur_split].n_inputs);
1553
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
1554
- fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
1555
  fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
1556
  }
1557
- fprintf(stderr, "\n");
1558
  cur_split++;
1559
  }
1560
  struct ggml_tensor * node = graph->nodes[i];
@@ -1562,7 +1562,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
1562
  continue;
1563
  }
1564
  ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
1565
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
1566
  fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1567
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1568
  struct ggml_tensor * src = node->src[j];
@@ -1570,10 +1570,10 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
1570
  continue;
1571
  }
1572
  ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
1573
- fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1574
  fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1575
  }
1576
- fprintf(stderr, "\n");
1577
  }
1578
  }
1579
 
@@ -2087,11 +2087,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
2087
  // the re-allocation may cause the split inputs to be moved to a different address
2088
  ggml_backend_sched_synchronize(sched);
2089
  #ifndef NDEBUG
2090
- fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
2091
  #endif
2092
  ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
2093
  if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
2094
- fprintf(stderr, "%s: failed to allocate graph\n", __func__);
2095
  return false;
2096
  }
2097
  }
@@ -2485,7 +2485,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
2485
  struct ggml_context * ctx_unallocated = ggml_init(params);
2486
 
2487
  if (ctx_allocated == NULL || ctx_unallocated == NULL) {
2488
- fprintf(stderr, "failed to allocate context for graph copy\n");
2489
  ggml_hash_set_free(&hash_set);
2490
  free(node_copies);
2491
  free(node_init);
@@ -2508,7 +2508,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
2508
  // allocate nodes
2509
  ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
2510
  if (buffer == NULL) {
2511
- fprintf(stderr, "failed to allocate buffer for graph copy\n");
2512
  ggml_hash_set_free(&hash_set);
2513
  free(node_copies);
2514
  free(node_init);
 
379
  ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
380
  } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
381
  #ifndef NDEBUG
382
+ GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
383
  #endif
384
  size_t nbytes = ggml_nbytes(src);
385
  void * data = malloc(nbytes);
 
571
 
572
  void register_backend(ggml_backend_reg_t reg) {
573
  #ifndef NDEBUG
574
+ GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
575
  __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
576
  #endif
577
  backends.push_back(reg);
 
582
 
583
  void register_device(ggml_backend_dev_t device) {
584
  #ifndef NDEBUG
585
+ GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
586
  #endif
587
  devices.push_back(device);
588
  }
 
773
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
774
  void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
775
  if (data == NULL) {
776
+ GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
777
  return NULL;
778
  }
779
 
 
836
  void * ptr;
837
  int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
838
  if (result != 0) {
839
+ GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
840
  return NULL;
841
  }
842
 
 
1459
  }
1460
 
1461
  #ifndef NDEBUG
1462
+ GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
1463
  __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
1464
  #endif
1465
 
 
1548
  for (int i = 0; i < graph->n_nodes; i++) {
1549
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1550
  ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
1551
+ GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
1552
  sched->splits[cur_split].n_inputs);
1553
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
1554
+ GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
1555
  fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
1556
  }
1557
+ GGML_LOG_DEBUG("\n");
1558
  cur_split++;
1559
  }
1560
  struct ggml_tensor * node = graph->nodes[i];
 
1562
  continue;
1563
  }
1564
  ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
1565
+ GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
1566
  fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1567
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1568
  struct ggml_tensor * src = node->src[j];
 
1570
  continue;
1571
  }
1572
  ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
1573
+ GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1574
  fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1575
  }
1576
+ GGML_LOG_DEBUG("\n");
1577
  }
1578
  }
1579
 
 
2087
  // the re-allocation may cause the split inputs to be moved to a different address
2088
  ggml_backend_sched_synchronize(sched);
2089
  #ifndef NDEBUG
2090
+ GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
2091
  #endif
2092
  ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
2093
  if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
2094
+ GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
2095
  return false;
2096
  }
2097
  }
 
2485
  struct ggml_context * ctx_unallocated = ggml_init(params);
2486
 
2487
  if (ctx_allocated == NULL || ctx_unallocated == NULL) {
2488
+ GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
2489
  ggml_hash_set_free(&hash_set);
2490
  free(node_copies);
2491
  free(node_init);
 
2508
  // allocate nodes
2509
  ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
2510
  if (buffer == NULL) {
2511
+ GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
2512
  ggml_hash_set_free(&hash_set);
2513
  free(node_copies);
2514
  free(node_init);
ggml/src/ggml-blas.cpp CHANGED
@@ -297,14 +297,14 @@ ggml_backend_t ggml_backend_blas_init(void) {
         /* .context = */ ctx,
     };

-#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
+#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
     if (openblas_get_parallel() != OPENBLAS_OPENMP) {
-        fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
+        GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
     }
 #endif

-#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
-    fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
+#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
+    GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
 #endif

     return backend;
ggml/src/ggml-cuda.cu CHANGED
@@ -291,7 +291,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-        GGML_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -980,7 +980,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-        GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                       size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -2406,7 +2406,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_

     if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
 #ifndef NDEBUG
-        GGML_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
+        GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
 #endif
         return false;
     }
@@ -2524,7 +2524,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            GGML_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2575,14 +2575,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
             if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-                GGML_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
             }

             if (node->op == GGML_OP_MUL_MAT_ID) {
                 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-                GGML_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
             }

@@ -2591,7 +2591,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 // Changes in batch size or context size can cause changes to the grid size of some kernels.
                 use_cuda_graph = false;
 #ifndef NDEBUG
-                GGML_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
             }

@@ -2603,7 +2603,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 if (!ptr) {
                     use_cuda_graph = false;
 #ifndef NDEBUG
-                    GGML_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
+                    GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
 #endif
                 } else {
                     if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
@@ -2627,7 +2627,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            GGML_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2685,7 +2685,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 use_cuda_graph = false;
                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-                GGML_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
             } else {
                 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2854,7 +2854,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
         // clear the error
         cudaGetLastError();

-        GGML_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
                       size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }