ggml : use dynamic thread scheduling for matrix multiplication (llama/6915)
* Just reordering some structs.
* Adding in the calls to mm_pause
* Passing around the state
* Renaming and moving a bunch of variables around.
* Extracting the logic to its own function.
* Moving some variable definitions into the chunk function.
* Moving some variables around
* moving src1_cont inside
* Moving row_size
* adding the current_chunk
* Reorg the code.
* Formatting to match the orig patch
* starting to setup the chunking variables
* Starting the buildup of the loop
* The yield shouldn't be necessary.
* adding the looping structure based on the chunk configuration.
* Add in the re-chunking code.
* Making it much more likely to rechunk.
* disable resizing if numa is enabled.
* Updating comments with what we've learned.
* Fix formatting
* Couple more formatting fixes.
* More style fixes.
* Fix Warnings
* Going with unused because there's conditional logic that needs it.
* Update ggml.c
* Update ggml.c
---------
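The change replaces the fixed per-thread row split in ggml_compute_forward_mul_mat with dynamic scheduling: the result is cut into many small chunks and a shared atomic counter hands the next unclaimed chunk to whichever thread finishes first. The program below is a minimal standalone sketch of that pattern, not the ggml code itself; the names (worker, process_chunk) and the constants are made up for the example.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    #define N_THREADS 4
    #define N_CHUNKS  64

    // next chunk to hand out; chunks 0..N_THREADS-1 are pre-assigned to the threads themselves
    static atomic_int current_chunk = N_THREADS;

    static void process_chunk(int chunk) {
        // stand-in for the real per-chunk matrix multiplication work
        printf("processed chunk %d\n", chunk);
    }

    static void * worker(void * arg) {
        int chunk = (int)(intptr_t) arg; // first chunk == thread index, no coordination needed
        while (chunk < N_CHUNKS) {
            process_chunk(chunk);
            // claim the next unprocessed chunk; faster threads simply end up doing more chunks
            chunk = atomic_fetch_add(&current_chunk, 1);
        }
        return NULL;
    }

    int main(void) {
        pthread_t th[N_THREADS];
        for (int i = 0; i < N_THREADS; ++i) {
            pthread_create(&th[i], NULL, worker, (void *)(intptr_t) i);
        }
        for (int i = 0; i < N_THREADS; ++i) {
            pthread_join(th[i], NULL);
        }
        return 0;
    }

In the patch itself the counter lives in ggml_compute_state_shared::current_chunk, thread 0 resets it to nth at the start of the INIT phase, and each worker's first chunk is simply its thread index ith.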
@@ -112,6 +112,8 @@ typedef void * thread_ret_t;
 
 #endif
 
+typedef pthread_t ggml_thread_t;
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
@@ -1539,6 +1541,59 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
 #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
 #endif
 
+//
+// ggml context
+//
+
+struct ggml_context {
+    size_t mem_size;
+    void* mem_buffer;
+    bool mem_buffer_owned;
+    bool no_alloc;
+    bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
+
+    int n_objects;
+
+    struct ggml_object* objects_begin;
+    struct ggml_object* objects_end;
+
+    struct ggml_scratch scratch;
+    struct ggml_scratch scratch_save;
+};
+
+struct ggml_context_container {
+    bool used;
+
+    struct ggml_context context;
+};
+
+struct ggml_compute_state_shared {
+    const struct ggml_cgraph* cgraph;
+    const struct ggml_cplan* cplan;
+
+    int64_t perf_node_start_cycles;
+    int64_t perf_node_start_time_us;
+
+    const int n_threads;
+
+    // synchronization primitives
+    atomic_int n_active;  // num active threads
+    atomic_int node_n;    // active graph node
+    atomic_int node_task; // active graph node task phase
+
+    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
+    void* abort_callback_data;
+
+    atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
+};
+
+struct ggml_compute_state {
+    ggml_thread_t thrd;
+    int ith;
+    struct ggml_compute_state_shared* shared;
+    enum ggml_status ec;
+};
+
 //
 // fundamental operations
 //
@@ -2385,32 +2440,6 @@ static void ggml_setup_op_has_task_pass(void) {
     }
 }
 
-//
-// ggml context
-//
-
-struct ggml_context {
-    size_t mem_size;
-    void * mem_buffer;
-    bool mem_buffer_owned;
-    bool no_alloc;
-    bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
-
-    int n_objects;
-
-    struct ggml_object * objects_begin;
-    struct ggml_object * objects_end;
-
-    struct ggml_scratch scratch;
-    struct ggml_scratch scratch_save;
-};
-
-struct ggml_context_container {
-    bool used;
-
-    struct ggml_context context;
-};
-
 //
 // NUMA support
 //
@@ -11815,9 +11844,101 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
 }
 #endif
 
+static void ggml_compute_forward_mul_mat_one_chunk(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const int64_t num_rows_per_vec_dot,
+    const int64_t ir0_start,
+    const int64_t ir0_end,
+    const int64_t ir1_start,
+    const int64_t ir1_end) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const enum ggml_type type = src0->type;
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
+    enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
+
+    // broadcast factors
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
+    //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);
+
+    // threads with no work simply yield (not sure if it helps)
+    if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
+        return;
+    }
+
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
+
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
+    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
+
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    // 16 * 2, accounting for mmla kernels
+    float tmp[32];
+
+    for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
+        for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
+                const int64_t i13 = (ir1 / (ne12 * ne1));
+                const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
+                const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
+
+                // broadcast src0 into src1
+                const int64_t i03 = i13 / r3;
+                const int64_t i02 = i12 / r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                //       the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char*)wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                        ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
+                        : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+                float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
                }
+
+                for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
+                    memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
+                }
+            }
+        }
+    }
+}
+
 static void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
+              struct ggml_tensor * dst,
+              struct ggml_compute_state * state) {
 
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
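Inside ggml_compute_forward_mul_mat_one_chunk the assigned range is walked in blck_0 x blck_1 = 16 x 16 tiles rather than row by row. The toy program below is illustrative only (the chunk bounds are invented, it is not ggml code); it just prints the tiles those two outer loops would visit, which can help when reasoning about how a chunk is traversed.

    #include <stdio.h>

    int main(void) {
        const long ir0_start = 0, ir0_end = 40;  // example chunk bounds in dim 0 (made up)
        const long ir1_start = 0, ir1_end = 20;  // example chunk bounds in dim 1 (made up)
        const long blck_0 = 16, blck_1 = 16;     // tile sizes used by the patch

        for (long iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
            for (long iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
                // each tile is clipped to the chunk boundary, exactly like the inner loops do
                printf("tile: dim0 [%ld, %ld) x dim1 [%ld, %ld)\n",
                       iir0, iir0 + blck_0 < ir0_end ? iir0 + blck_0 : ir0_end,
                       iir1, iir1 + blck_1 < ir1_end ? iir1 + blck_1 : ir1_end);
            }
        }
        return 0;
    }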
@@ -11832,9 +11953,6 @@ static void ggml_compute_forward_mul_mat(
 
     const enum ggml_type type = src0->type;
 
-    const bool src1_cont = ggml_is_contiguous(src1);
-
-    ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
     enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
     int64_t const vec_dot_num_rows = type_traits[type].nrows;
@@ -11855,8 +11973,10 @@ static void ggml_compute_forward_mul_mat(
     GGML_ASSERT(nb2 <= nb3);
 
     // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+    UNUSED(r2);
+    UNUSED(r3);
 
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
@@ -11938,6 +12058,8 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
 #if GGML_USE_LLAMAFILE
+    const bool src1_cont = ggml_is_contiguous(src1);
+
     if (src1_cont) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
@@ -11963,6 +12085,8 @@ UseGgmlGemm1:;
         if (ith != 0) {
             return;
         }
+        // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
+        atomic_store(&state->shared->current_chunk, nth);
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -11987,11 +12111,11 @@ UseGgmlGemm1:;
         return;
     }
 
-    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
 #if GGML_USE_LLAMAFILE
     if (src1->type != vec_dot_type) {
+        const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
                 if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -12012,98 +12136,87 @@ UseGgmlGemm2:;
 UseGgmlGemm2:;
 #endif
 
-    const int64_t nr0 = ne01;          // src0 rows
-    const int64_t nr1 = ne1*ne2*ne3;   // src1 rows
-
-    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
-
-    // distribute the thread work across the inner or outer loop based on which one is larger
-
-    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
-    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
-
-    const int64_t ith0 = ith % nth0;
-    const int64_t ith1 = ith / nth0;
-
-    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
-    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
-
-    const int64_t ir010 = dr0*ith0;
-    const int64_t ir011 = MIN(ir010 + dr0, nr0);
-
-    const int64_t ir110 = dr1*ith1;
-    const int64_t ir111 = MIN(ir110 + dr1, nr1);
-
-    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
-    // threads with no work simply yield (not sure if it helps)
-    if (ir010 >= ir011 || ir110 >= ir111) {
-        sched_yield();
-        return;
-    }
+#ifdef GGML_PERF
+    int chunks_executed = 0;
+    UNUSED(chunks_executed);
+#endif
 
-    assert(ne12 % ne02 == 0);
-    assert(ne13 % ne03 == 0);
+    // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
+    const int64_t nr0 = ne0;
 
-    // block-tiling attempt
-    const int64_t blck_0 = 16;
-    const int64_t blck_1 = 16;
+    // This is the size of the rest of the dimensions of the result
+    const int64_t nr1 = ne1 * ne2 * ne3;
 
     // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
-    int64_t nrc = vec_dot_num_rows;
+    int64_t num_rows_per_vec_dot = vec_dot_num_rows;
     // TODO: currently the mmla kernels support only even numbered rows/cols.
     // this check can be removed once they are extended to support odd numbered rows/cols too
     if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
-        nrc = 1;
+        num_rows_per_vec_dot = 1;
     }
 
-    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
+    // Now select a reasonable chunk size.
+    int chunk_size = 16;
 
-    // attempt to reduce false-sharing (does not seem to make a difference)
-    // 16 * 2, accounting for mmla kernels
-    float tmp[32];
+    // We need to step up the size if it's small
+    if (nr0 == 1 || nr1 == 1) {
+        chunk_size = 64;
+    }
 
-    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
-        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
-            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
-                const int64_t i13 = (ir1/(ne12*ne1));
-                const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
-                const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
+    // distribute the work across the inner or outer loop based on which one is larger
+    // The number of chunks in the 0/1 dim.
+    // CEIL(nr0/chunk_size)
+    int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
+    int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
 
-                // broadcast src0 into src1
-                const int64_t i03 = i13/r3;
-                const int64_t i02 = i12/r2;
+    // If the chunking is poor for the number of threads on this setup, scrap the whole plan.  Re-chunk it by thread.
+    //   Also, chunking by thread was measured to have perform better on NUMA systems.  See https://github.com/ggerganov/llama.cpp/pull/6915
+    //   In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
+    if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
+        // distribute the thread work across the inner or outer loop based on which one is larger
+        nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+        nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+    }
 
-                const int64_t i1 = i11;
-                const int64_t i2 = i12;
-                const int64_t i3 = i13;
+    // The number of elements in each chunk
+    const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+    const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
 
-                const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
+    //if (ith == 0)
+    //    printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d.  Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);
 
-                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-                //       the original src1 data pointer, so we should index using the indices directly
-                // TODO: this is a bit of a hack, we should probably have a better way to handle this
-                const char * src1_col = (const char *) wdata +
-                    (src1_cont || src1->type != vec_dot_type
-                        ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
-                        : (i11*nb11 + i12*nb12 + i13*nb13));
-                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+    // The first chunk comes from our thread_id, the rest will get auto-assigned.
+    int current_chunk = ith;
 
-                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
-                //}
+    while (current_chunk < nchunk0 * nchunk1) {
+        const int64_t ith0 = current_chunk % nchunk0;
+        const int64_t ith1 = current_chunk / nchunk0;
 
-                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
-                    vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
-                }
+        const int64_t ir0_start = dr0 * ith0;
+        const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
 
-                for (int cn = 0; cn < nrc; ++cn) {
-                    memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
-                }
-            }
-        }
+        const int64_t ir1_start = dr1 * ith1;
+        const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
+
+        ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
+
+#ifdef GGML_PERF
+        chunks_executed++;
+#endif
+
+        if (nth >= nchunk0 * nchunk1) {
+            break;
         }
+
+        current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1);
     }
+
+#ifdef GGML_PERF
+    // These numbers are useful when trying to measure how well the threading scheduling works.
+    //int64_t workSize = (ne01 * ne11 * ne12 * ne13 * ne00) / nchunk0 / nchunk1;
+    //float time = (ggml_perf_time_us() - t0);
+    //printf("MUL_MAT = %f ms, [%d, %d, %d, %d] x [%d, %d, %d, %d] = %I64u, %f ops/usec in %d chunks.\n", time / 1000.0, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, workSize, (float)workSize/time, chunks_executed);
+#endif
 }
 
 // ggml_compute_forward_mul_mat_id
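To make the chunk arithmetic above concrete, here is a small worked example (the sizes are invented for illustration, they are not from the patch): with nr0 = 100, nr1 = 30 and chunk_size = 16 there are nchunk0 = 7 and nchunk1 = 2 chunks, i.e. 14 chunks to hand out, each covering at most dr0 = 15 rows in dim 0 and dr1 = 15 in dim 1.

    #include <stdint.h>
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
        const int64_t nr0 = 100, nr1 = 30;   // example result sizes (made up)
        const int64_t chunk_size = 16;

        const int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; // 7
        const int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; // 2

        const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; // elements per chunk in dim 0
        const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; // elements per chunk in dim 1

        // map every flat chunk index back to its 2D range, like the while loop in the patch
        for (int64_t chunk = 0; chunk < nchunk0 * nchunk1; ++chunk) {
            const int64_t ith0 = chunk % nchunk0;
            const int64_t ith1 = chunk / nchunk0;
            printf("chunk %2lld -> dim0 [%lld, %lld) x dim1 [%lld, %lld)\n",
                   (long long) chunk,
                   (long long) (dr0 * ith0), (long long) MIN(dr0 * ith0 + dr0, nr0),
                   (long long) (dr1 * ith1), (long long) MIN(dr1 * ith1 + dr1, nr1));
        }
        return 0;
    }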
@@ -17358,7 +17471,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 
 /////////////////////////////////
 
-static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state) {
     GGML_ASSERT(params);
 
     if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
@@ -17456,7 +17569,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
             } break;
         case GGML_OP_MUL_MAT:
             {
-                ggml_compute_forward_mul_mat(params, tensor);
+                ggml_compute_forward_mul_mat(params, tensor, state);
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
@@ -19072,8 +19185,6 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
-typedef pthread_t ggml_thread_t;
-
 #define ggml_thread_create pthread_create
 #define ggml_thread_join pthread_join
 
@@ -19099,8 +19210,6 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
-typedef pthread_t ggml_thread_t;
-
 #define ggml_thread_create pthread_create
 #define ggml_thread_join pthread_join
 
@@ -19180,31 +19289,6 @@ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
 #endif
 
-struct ggml_compute_state_shared {
-    const struct ggml_cgraph * cgraph;
-    const struct ggml_cplan * cplan;
-
-    int64_t perf_node_start_cycles;
-    int64_t perf_node_start_time_us;
-
-    const int n_threads;
-
-    // synchronization primitives
-    atomic_int n_active; // num active threads
-    atomic_int node_n; // active graph node
-    atomic_int node_task; // active graph node task phase
-
-    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
-    void * abort_callback_data;
-};
-
-struct ggml_compute_state {
-    ggml_thread_t thrd;
-    int ith;
-    struct ggml_compute_state_shared * shared;
-    enum ggml_status ec;
-};
-
 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
     int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
     int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
@@ -19477,6 +19561,10 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
 
         * node_n = atomic_load(&state->shared->node_n);
         if (* node_n != last_node_n) break;
+#if defined(__SSE3__)
+        // Tell the processor we're spinning.  It's a processor hint for spinlocks.
+        _mm_pause();
+#endif
     }
 }
 
@@ -19491,6 +19579,10 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
 
         * task_phase = atomic_load(&state->shared->node_task);
         if (* task_phase != last_task_phase) break;
+#if defined(__SSE3__)
+        // Tell the processor we're spinning.  It's a processor hint for spinlocks.
+        _mm_pause();
+#endif
     }
 }
 
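Both spin loops above now issue a PAUSE hint between polls. The snippet below is an illustrative, self-contained version of the same pattern (it assumes an x86 build; spin_wait_until_set and flag are invented names, not ggml symbols):

    #include <stdatomic.h>
    #if defined(__SSE3__)
    #include <immintrin.h> // _mm_pause
    #endif

    // Busy-wait until another thread sets *flag to a non-zero value.
    static void spin_wait_until_set(atomic_int * flag) {
        while (atomic_load(flag) == 0) {
    #if defined(__SSE3__)
            // Tell the processor we're spinning; cheaper for the sibling hyperthread than a tight load loop.
            _mm_pause();
    #endif
        }
    }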
@@ -19530,7 +19622,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 struct ggml_tensor * node = cgraph->nodes[node_n];
                 if (GGML_OP_HAS_FINALIZE[node->op]) {
                     params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-                    ggml_compute_forward(&params, node);
+                    ggml_compute_forward(&params, node, state);
                 }
                 ggml_graph_compute_perf_stats_node(node, state->shared);
             }
@@ -19550,17 +19642,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 /* INIT */
                 if (GGML_OP_HAS_INIT[node->op]) {
                     params.type = GGML_TASK_TYPE_INIT;
-                    ggml_compute_forward(&params, node);
+                    ggml_compute_forward(&params, node, state);
                 }
 
                 // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                 // they do something more efficient than spinning (?)
                 params.type = GGML_TASK_TYPE_COMPUTE;
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
 
                 if (GGML_OP_HAS_FINALIZE[node->op]) {
                     params.type = GGML_TASK_TYPE_FINALIZE;
-                    ggml_compute_forward(&params, node);
+                    ggml_compute_forward(&params, node, state);
                 }
 
                 ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -19599,7 +19691,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         if (state->ith < n_tasks) {
             if (GGML_OP_HAS_INIT[node->op]) {
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
         }
 
@@ -19620,7 +19712,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         if (state->ith < n_tasks) {
             params.type = GGML_TASK_TYPE_COMPUTE;
-            ggml_compute_forward(&params, node);
+            ggml_compute_forward(&params, node, state);
         }
 
         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@@ -19871,6 +19963,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
         /*.abort_callback =*/ NULL,
         /*.abort_callback_data =*/ NULL,
+        /*.current_chunk; =*/ 0,
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
 