Diego Devesa committed on
Commit · 98d1a6a
1 Parent(s): 4ac768e
ggml : move more prints to the ggml log system (llama/9839)
* ggml : move more prints to the ggml log system
* show BLAS OpenMP warnings in all builds using debug print
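
The diffs below apply one pattern throughout: direct stderr prints are replaced with the GGML_LOG_DEBUG / GGML_LOG_ERROR macros, and purely diagnostic messages stay wrapped in #ifndef NDEBUG so they compile out of release builds. The following is a minimal, self-contained sketch of that kind of macro-based logger with a pluggable callback; it is illustrative only, not ggml's implementation, and every name in it (log_level, log_callback, log_emit, LOG_DEBUG, LOG_ERROR) is made up for the example.

    /* Minimal sketch of a macro-based log system with a pluggable callback.
       Illustrative only -- not ggml's actual implementation. */
    #include <stdarg.h>
    #include <stdio.h>

    enum log_level { LOG_LEVEL_DEBUG, LOG_LEVEL_ERROR };

    typedef void (*log_callback)(enum log_level level, const char * text, void * user_data);

    static void default_cb(enum log_level level, const char * text, void * user_data) {
        (void) level; (void) user_data;
        fputs(text, stderr);               /* default sink: stderr */
    }

    static log_callback g_cb      = default_cb;
    static void *       g_cb_data = NULL;

    static void log_emit(enum log_level level, const char * fmt, ...) {
        char buf[1024];
        va_list args;
        va_start(args, fmt);
        vsnprintf(buf, sizeof(buf), fmt, args);
        va_end(args);
        g_cb(level, buf, g_cb_data);       /* route every message through one callback */
    }

    #define LOG_DEBUG(...) log_emit(LOG_LEVEL_DEBUG, __VA_ARGS__)
    #define LOG_ERROR(...) log_emit(LOG_LEVEL_ERROR, __VA_ARGS__)

    int main(void) {
        LOG_ERROR("%s: not enough space in the buffer\n", __func__);
    #ifndef NDEBUG
        /* debug-only diagnostics compile out entirely when NDEBUG is defined */
        LOG_DEBUG("%s: reallocating buffer\n", __func__);
    #endif
        return 0;
    }

Routing everything through a single callback is what lets an embedding application decide at runtime whether debug-level messages are shown, filtered, or redirected.
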
- ggml/src/ggml-alloc.c +17 -17
- ggml/src/ggml-backend.cpp +16 -16
- ggml/src/ggml-blas.cpp +4 -4
- ggml/src/ggml-cuda.cu +11 -11
ggml/src/ggml-alloc.c
CHANGED
@@ -14,7 +14,7 @@

 //#define GGML_ALLOCATOR_DEBUG

-//#define AT_PRINTF(...)
+//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
 #define AT_PRINTF(...)


@@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
 size = GGML_PAD(size, talloc->alignment);

 if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
-
+GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
 GGML_ABORT("not enough space in the buffer");
 }
@@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
 best_fit_block = alloc->n_free_blocks - 1;
 } else {
 // this should never happen
-
+GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
 __func__, size, max_avail);
 GGML_ABORT("not enough space in the buffer");
 }
@@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
 }
 }
 }
-
+GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
 for (int i = 0; i < 1024; i++) {
 if (alloc->allocated_tensors[i].tensor) {
-
+GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
 alloc->allocated_tensors[i].offset,
 alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
 ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
 }
 }
-
+GGML_LOG_DEBUG("\n");
 }
 #endif

@@ -766,13 +766,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
 if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif

 ggml_backend_buffer_free(galloc->buffers[i]);
 galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
 if (galloc->buffers[i] == NULL) {
-
+GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
 return false;
 }
 ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -823,14 +823,14 @@ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_t
 static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
 if (galloc->n_nodes != graph->n_nodes) {
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
 #endif
 return true;
 }

 if (galloc->n_leafs != graph->n_leafs) {
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
 #endif
 return true;
 }
@@ -841,7 +841,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph

 if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
 #endif
 return true;
 }
@@ -853,7 +853,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
 }
 if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
 return true;
 }
@@ -867,14 +867,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
 if (ggml_gallocr_needs_realloc(galloc, graph)) {
 if (galloc->n_buffers == 1) {
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
 #endif
 if (!ggml_gallocr_reserve(galloc, graph)) {
 return false;
 }
 } else {
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
 #endif
 return false;
 }
@@ -938,7 +938,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
 ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
 if (buffer == NULL) {
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
 for (size_t i = 0; i < *n_buffers; i++) {
 ggml_backend_buffer_free((*buffers)[i]);
@@ -988,7 +988,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 }

 if (this_size > max_size) {
-
+GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
 __func__, t->name,
 ggml_backend_buft_name(buft),
 this_size, max_size);
@@ -1020,7 +1020,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

 if (n_buffers == 0) {
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
 return NULL;
 }
ggml/src/ggml-backend.cpp
CHANGED
@@ -379,7 +379,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
 ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
 } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
 #endif
 size_t nbytes = ggml_nbytes(src);
 void * data = malloc(nbytes);
@@ -571,7 +571,7 @@ struct ggml_backend_registry {

 void register_backend(ggml_backend_reg_t reg) {
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
 __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
 #endif
 backends.push_back(reg);
@@ -582,7 +582,7 @@ struct ggml_backend_registry {

 void register_device(ggml_backend_dev_t device) {
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
 #endif
 devices.push_back(device);
 }
@@ -773,7 +773,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
 size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
 void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
 if (data == NULL) {
-
+GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
 return NULL;
 }

@@ -836,7 +836,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
 void * ptr;
 int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
 if (result != 0) {
-
+GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
 return NULL;
 }

@@ -1459,7 +1459,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
 }

 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
 __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
 #endif

@@ -1548,13 +1548,13 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 for (int i = 0; i < graph->n_nodes; i++) {
 if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
 ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-
+GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
 sched->splits[cur_split].n_inputs);
 for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
-
+GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
 fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
 }
-
+GGML_LOG_DEBUG("\n");
 cur_split++;
 }
 struct ggml_tensor * node = graph->nodes[i];
@@ -1562,7 +1562,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 continue;
 }
 ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-
+GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
 fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
 for (int j = 0; j < GGML_MAX_SRC; j++) {
 struct ggml_tensor * src = node->src[j];
@@ -1570,10 +1570,10 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 continue;
 }
 ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
-
+GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
 }
-
+GGML_LOG_DEBUG("\n");
 }
 }

@@ -2087,11 +2087,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
 // the re-allocation may cause the split inputs to be moved to a different address
 ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
 ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
 if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
-
+GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
 return false;
 }
 }
@@ -2485,7 +2485,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
 struct ggml_context * ctx_unallocated = ggml_init(params);

 if (ctx_allocated == NULL || ctx_unallocated == NULL) {
-
+GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
 ggml_hash_set_free(&hash_set);
 free(node_copies);
 free(node_init);
@@ -2508,7 +2508,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
 // allocate nodes
 ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
 if (buffer == NULL) {
-
+GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
 ggml_hash_set_free(&hash_set);
 free(node_copies);
 free(node_init);
ggml/src/ggml-blas.cpp
CHANGED
@@ -297,14 +297,14 @@ ggml_backend_t ggml_backend_blas_init(void) {
 /* .context = */ ctx,
 };

-#if
+#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
 if (openblas_get_parallel() != OPENBLAS_OPENMP) {
-
+GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
 }
 #endif

-#if
-
+#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
+GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
 #endif

 return backend;
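
Per the second commit bullet, the OpenBLAS/BLIS OpenMP mismatch warnings above are now present in all builds and emitted via GGML_LOG_DEBUG, so they surface whenever the application's log callback lets debug-level messages through. A minimal sketch of such a callback follows; it assumes the ggml_log_set() / ggml_log_callback API declared in ggml.h, and the function name log_to_stderr is illustrative.

    /* Hedged sketch: forward all ggml log output, including debug-level
       messages such as the BLAS/OpenMP warnings above, to stderr.
       Assumes the ggml_log_set()/ggml_log_callback API declared in ggml.h. */
    #include <stdio.h>
    #include "ggml.h"

    static void log_to_stderr(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        fputs(text, stderr);   /* do not filter: debug messages are printed too */
    }

    int main(void) {
        ggml_log_set(log_to_stderr, NULL);
        /* ... initialize the BLAS backend here; if OpenBLAS/BLIS was built without
           OpenMP support, the warning is delivered through the callback above ... */
        return 0;
    }
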
ggml/src/ggml-cuda.cu
CHANGED
@@ -291,7 +291,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 return;
 }
 }
-
+GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
 ggml_cuda_set_device(device);
 CUDA_CHECK(cudaFree(ptr));
 pool_size -= size;
@@ -980,7 +980,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
 if (err != cudaSuccess) {
 // clear the error
 cudaGetLastError();
-
+GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
 size / 1024.0 / 1024.0, cudaGetErrorString(err));
 return nullptr;
 }
@@ -2406,7 +2406,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_

 if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
 #endif
 return false;
 }
@@ -2524,7 +2524,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
 cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
 }
 }
@@ -2575,14 +2575,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
 }

 if (node->op == GGML_OP_MUL_MAT_ID) {
 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
 }

@@ -2591,7 +2591,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 // Changes in batch size or context size can cause changes to the grid size of some kernels.
 use_cuda_graph = false;
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
 }

@@ -2603,7 +2603,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 if (!ptr) {
 use_cuda_graph = false;
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
 #endif
 } else {
 if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
@@ -2627,7 +2627,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
 cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
 }
 }
@@ -2685,7 +2685,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 use_cuda_graph = false;
 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-
+GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
 } else {
 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2854,7 +2854,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
 // clear the error
 cudaGetLastError();

-
+GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
 size / 1024.0 / 1024.0, cudaGetErrorString(err));
 return false;
 }