ggerganov committed
Commit 7b501c1 · unverified · 1 Parent(s): 6fe850c

ggml : barrier refactor + static functions

Files changed (1):
  1. ggml.c +155 -134
ggml.c CHANGED
@@ -1217,6 +1217,24 @@ struct ggml_state {
 static struct ggml_state g_state;
 static atomic_int g_state_barrier = 0;
 
+// barrier via spin lock
+inline static void ggml_critical_section_start() {
+    int processing = atomic_fetch_add(&g_state_barrier, 1);
+
+    while (processing > 0) {
+        // wait for other threads to finish
+        atomic_fetch_sub(&g_state_barrier, 1);
+        sched_yield(); // TODO: reconsider this
+        processing = atomic_fetch_add(&g_state_barrier, 1);
+    }
+}
+
+// TODO: make this somehow automatically executed
+//       some sort of "sentry" mechanism
+inline static void ggml_critical_section_end() {
+    atomic_fetch_sub(&g_state_barrier, 1);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_print_object(const struct ggml_object * obj) {
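
Review note: the pair above implements a counting spin lock. Every thread increments g_state_barrier on entry; only a thread that observed a pre-increment value of 0 holds the section, and everyone else immediately decrements, yields, and retries. A minimal usage sketch, assuming only the two functions from this hunk (hypothetical_global_op is a placeholder, not code from this commit):

    // sketch: how callers are expected to wrap accesses to g_state
    static void hypothetical_global_op(void) {
        ggml_critical_section_start(); // spins until this thread is alone inside

        // ... read or mutate g_state here, single-threaded by construction ...

        ggml_critical_section_end();   // decrement so a waiting thread can enter
    }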
@@ -1346,32 +1364,45 @@ int ggml_up64(int n) {
 
 struct ggml_context * ggml_init(struct ggml_init_params params) {
     // make this function thread safe
-    {
-        int processing = atomic_fetch_add(&g_state_barrier, 1);
-        while (processing > 0) {
-            // wait for other threads to finish
-            atomic_fetch_sub(&g_state_barrier, 1);
-            sched_yield();
-            processing = atomic_fetch_add(&g_state_barrier, 1);
-        }
-    }
+    ggml_critical_section_start();
 
     static bool is_first_call = true;
+
     if (is_first_call) {
-        const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
-
-        ggml_fp16_t ii;
-        for (int i = 0; i < (1 << 16); ++i) {
-            uint16_t ui = i;
-            memcpy(&ii, &ui, sizeof(ii));
-            const float f = GGML_FP16_TO_FP32(ii);
-            table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
-            table_exp_f16[i]  = GGML_FP32_TO_FP16(exp(f));
+        // initialize GELU and EXP tables
+        {
+            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+
+            ggml_fp16_t ii;
+            for (int i = 0; i < (1 << 16); ++i) {
+                uint16_t ui = i;
+                memcpy(&ii, &ui, sizeof(ii));
+                const float f = GGML_FP16_TO_FP32(ii);
+                table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
+                table_exp_f16[i]  = GGML_FP32_TO_FP16(exp(f));
+            }
+
+            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+
+            GGML_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }
 
-        const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
-
-        GGML_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
+        // initialize g_state
+        {
+            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+
+            g_state = (struct ggml_state) {
+                /*.contexts =*/ { 0 },
+            };
+
+            for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
+                g_state.contexts[i].used = false;
+            }
+
+            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+
+            GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
+        }
 
         is_first_call = false;
     }
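
Review note: the table initialization kept in this hunk loops over all 1 << 16 bit patterns, reinterprets each as a ggml_fp16_t via memcpy, and precomputes GELU and exp once per possible half-precision value. The payoff is on the lookup side, where the bit pattern of the input doubles as the table index. A hedged sketch of such a lookup (only the table names come from the diff; the helper itself is illustrative, not the file's actual accessor):

    // illustrative lookup, assuming the tables filled by the loop above
    static inline ggml_fp16_t hypothetical_gelu_lookup(ggml_fp16_t x) {
        uint16_t bits;
        memcpy(&bits, &x, sizeof(bits)); // the half's bit pattern is the index
        return table_gelu_f16[bits];     // one load replaces a GELU evaluation
    }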
@@ -1379,14 +1410,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     // find non-used context in g_state
     struct ggml_context * ctx = NULL;
 
-    static bool first_time = true;
-    if (first_time) {
-        for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
-            g_state.contexts[i].used = false;
-        }
-        first_time = false;
-    }
-
     for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
         if (!g_state.contexts[i].used) {
             g_state.contexts[i].used = true;
@@ -1400,7 +1423,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     if (ctx == NULL) {
         GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
 
-        atomic_fetch_sub(&g_state_barrier, 1);
+        ggml_critical_section_end();
 
         return NULL;
     }
@@ -1418,22 +1441,16 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
-    atomic_fetch_sub(&g_state_barrier, 1);
+    ggml_critical_section_end();
 
     return ctx;
 }
 
 void ggml_free(struct ggml_context * ctx) {
     // make this function thread safe
-    {
-        int processing = atomic_fetch_add(&g_state_barrier, 1);
-        while (processing > 0) {
-            // wait for other threads to finish
-            atomic_fetch_sub(&g_state_barrier, 1);
-            sched_yield();
-            processing = atomic_fetch_add(&g_state_barrier, 1);
-        }
-    }
+    ggml_critical_section_start();
+
+    bool found = false;
 
     for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
         if (&g_state.contexts[i].context == ctx) {
@@ -1446,15 +1463,16 @@ void ggml_free(struct ggml_context * ctx) {
                 free(ctx->mem_buffer);
             }
 
-            atomic_fetch_sub(&g_state_barrier, 1);
-
-            return;
+            found = true;
+            break;
         }
     }
 
-    GGML_PRINT_DEBUG("%s: context not found\n", __func__);
+    if (!found) {
+        GGML_PRINT_DEBUG("%s: context not found\n", __func__);
+    }
 
-    atomic_fetch_sub(&g_state_barrier, 1);
+    ggml_critical_section_end();
 }
 
 size_t ggml_used_mem(const struct ggml_context * ctx) {
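
Review note: the TODO on ggml_critical_section_end asks for a "sentry" so the release cannot be missed on early returns, which is exactly the bug surface the found/break rewrite in ggml_free sidesteps. One hypothetical shape for such a sentry in GNU C is the cleanup attribute; this is a sketch of the idea only, not something this commit adds, and it would tie the code to GCC/Clang:

    // hypothetical sentry: releases the barrier when the variable leaves scope
    static void ggml_critical_section_sentry(int * unused) {
        (void) unused;
        ggml_critical_section_end();
    }

    #define GGML_CRITICAL_SCOPE \
        __attribute__((cleanup(ggml_critical_section_sentry))) \
        int ggml_sentry_ = (ggml_critical_section_start(), 0)

With such a macro, every return path out of ggml_init, including the early `return NULL` above, would release the barrier automatically.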
@@ -3035,7 +3053,7 @@ void ggml_set_param(
 
 // ggml_compute_forward_dup
 
-void ggml_compute_forward_dup_f16(
+static void ggml_compute_forward_dup_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3139,7 +3157,7 @@ void ggml_compute_forward_dup_f16(
     }
 }
 
-void ggml_compute_forward_dup_f32(
+static void ggml_compute_forward_dup_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3243,7 +3261,7 @@ void ggml_compute_forward_dup_f32(
     }
 }
 
-void ggml_compute_forward_dup(
+static void ggml_compute_forward_dup(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3268,7 +3286,7 @@ void ggml_compute_forward_dup(
 
 // ggml_compute_forward_add
 
-void ggml_compute_forward_add_f32(
+static void ggml_compute_forward_add_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -3321,7 +3339,7 @@ void ggml_compute_forward_add_f32(
     }
 }
 
-void ggml_compute_forward_add(
+static void ggml_compute_forward_add(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -3344,7 +3362,7 @@ void ggml_compute_forward_add(
 
 // ggml_compute_forward_sub
 
-void ggml_compute_forward_sub_f32(
+static void ggml_compute_forward_sub_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -3371,7 +3389,7 @@ void ggml_compute_forward_sub_f32(
     }
 }
 
-void ggml_compute_forward_sub(
+static void ggml_compute_forward_sub(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -3394,7 +3412,7 @@ void ggml_compute_forward_sub(
 
 // ggml_compute_forward_mul
 
-void ggml_compute_forward_mul_f32(
+static void ggml_compute_forward_mul_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -3421,7 +3439,7 @@ void ggml_compute_forward_mul_f32(
     }
 }
 
-void ggml_compute_forward_mul(
+static void ggml_compute_forward_mul(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -3444,7 +3462,7 @@ void ggml_compute_forward_mul(
 
 // ggml_compute_forward_div
 
-void ggml_compute_forward_div_f32(
+static void ggml_compute_forward_div_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -3471,7 +3489,7 @@ void ggml_compute_forward_div_f32(
     }
 }
 
-void ggml_compute_forward_div(
+static void ggml_compute_forward_div(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -3494,7 +3512,7 @@ void ggml_compute_forward_div(
 
 // ggml_compute_forward_sqr
 
-void ggml_compute_forward_sqr_f32(
+static void ggml_compute_forward_sqr_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3518,7 +3536,7 @@ void ggml_compute_forward_sqr_f32(
     }
 }
 
-void ggml_compute_forward_sqr(
+static void ggml_compute_forward_sqr(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3540,7 +3558,7 @@ void ggml_compute_forward_sqr(
 
 // ggml_compute_forward_sqrt
 
-void ggml_compute_forward_sqrt_f32(
+static void ggml_compute_forward_sqrt_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3564,7 +3582,7 @@ void ggml_compute_forward_sqrt_f32(
     }
 }
 
-void ggml_compute_forward_sqrt(
+static void ggml_compute_forward_sqrt(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3586,7 +3604,7 @@ void ggml_compute_forward_sqrt(
 
 // ggml_compute_forward_sum
 
-void ggml_compute_forward_sum_f32(
+static void ggml_compute_forward_sum_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3622,7 +3640,7 @@ void ggml_compute_forward_sum_f32(
     }
 }
 
-void ggml_compute_forward_sum(
+static void ggml_compute_forward_sum(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3644,7 +3662,7 @@ void ggml_compute_forward_sum(
 
 // ggml_compute_forward_mean
 
-void ggml_compute_forward_mean_f32(
+static void ggml_compute_forward_mean_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3699,7 +3717,7 @@ void ggml_compute_forward_mean_f32(
     }
 }
 
-void ggml_compute_forward_mean(
+static void ggml_compute_forward_mean(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        struct ggml_tensor * dst) {
@@ -3721,7 +3739,7 @@ void ggml_compute_forward_mean(
 
 // ggml_compute_forward_repeat
 
-void ggml_compute_forward_repeat_f32(
+static void ggml_compute_forward_repeat_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3761,7 +3779,7 @@ void ggml_compute_forward_repeat_f32(
     }
 }
 
-void ggml_compute_forward_repeat(
+static void ggml_compute_forward_repeat(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3783,7 +3801,7 @@ void ggml_compute_forward_repeat(
 
 // ggml_compute_forward_abs
 
-void ggml_compute_forward_abs_f32(
+static void ggml_compute_forward_abs_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3807,7 +3825,7 @@ void ggml_compute_forward_abs_f32(
     }
 }
 
-void ggml_compute_forward_abs(
+static void ggml_compute_forward_abs(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3829,7 +3847,7 @@ void ggml_compute_forward_abs(
 
 // ggml_compute_forward_sgn
 
-void ggml_compute_forward_sgn_f32(
+static void ggml_compute_forward_sgn_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3853,7 +3871,7 @@ void ggml_compute_forward_sgn_f32(
     }
 }
 
-void ggml_compute_forward_sgn(
+static void ggml_compute_forward_sgn(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3875,7 +3893,7 @@ void ggml_compute_forward_sgn(
 
 // ggml_compute_forward_neg
 
-void ggml_compute_forward_neg_f32(
+static void ggml_compute_forward_neg_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3899,7 +3917,7 @@ void ggml_compute_forward_neg_f32(
     }
 }
 
-void ggml_compute_forward_neg(
+static void ggml_compute_forward_neg(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3921,7 +3939,7 @@ void ggml_compute_forward_neg(
 
 // ggml_compute_forward_step
 
-void ggml_compute_forward_step_f32(
+static void ggml_compute_forward_step_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3945,7 +3963,7 @@ void ggml_compute_forward_step_f32(
     }
 }
 
-void ggml_compute_forward_step(
+static void ggml_compute_forward_step(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3967,7 +3985,7 @@ void ggml_compute_forward_step(
 
 // ggml_compute_forward_relu
 
-void ggml_compute_forward_relu_f32(
+static void ggml_compute_forward_relu_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -3991,7 +4009,7 @@ void ggml_compute_forward_relu_f32(
     }
 }
 
-void ggml_compute_forward_relu(
+static void ggml_compute_forward_relu(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -4013,7 +4031,7 @@ void ggml_compute_forward_relu(
 
 // ggml_compute_forward_gelu
 
-void ggml_compute_forward_gelu_f32(
+static void ggml_compute_forward_gelu_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -4054,7 +4072,7 @@ void ggml_compute_forward_gelu_f32(
     }
 }
 
-void ggml_compute_forward_gelu(
+static void ggml_compute_forward_gelu(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -4076,7 +4094,7 @@ void ggml_compute_forward_gelu(
 
 // ggml_compute_forward_norm
 
-void ggml_compute_forward_norm_f32(
+static void ggml_compute_forward_norm_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -4136,7 +4154,7 @@ void ggml_compute_forward_norm_f32(
     }
 }
 
-void ggml_compute_forward_norm(
+static void ggml_compute_forward_norm(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -4158,9 +4176,10 @@ void ggml_compute_forward_norm(
 
 // ggml_compute_forward_mul_mat
 
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
-bool ggml_compute_forward_mul_mat_use_blas(
+static bool ggml_compute_forward_mul_mat_use_blas(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
@@ -4179,8 +4198,9 @@ bool ggml_compute_forward_mul_mat_use_blas(
 
     return false;
 }
+#endif
 
-void ggml_compute_forward_mul_mat_f32(
+static void ggml_compute_forward_mul_mat_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
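
Review note: ggml_compute_forward_mul_mat_use_blas is now static, so it has internal linkage, and that is what makes the new #if guard matter: a static function with no caller in its translation unit draws -Wunused-function, so compiling it out when neither GGML_USE_ACCELERATE nor GGML_USE_OPENBLAS is defined keeps non-BLAS builds warning-clean. A tiny self-contained illustration of the same pattern (file and macro names are made up for the example):

    /* example.c: cc -c -Wunused-function example.c [-DUSE_FAST_PATH] */
    #if defined(USE_FAST_PATH)
    // defined only when a caller exists, so no unused-function warning otherwise
    static int fast_path_helper(int n) {
        return 2 * n;
    }
    #endif

    int entry(int n) {
    #if defined(USE_FAST_PATH)
        return fast_path_helper(n);
    #else
        return n + n;
    #endif
    }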
@@ -4423,7 +4443,7 @@ void ggml_compute_forward_mul_mat_f32(
     //}
 }
 
-void ggml_compute_forward_mul_mat_f16_f32(
+static void ggml_compute_forward_mul_mat_f16_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -4727,7 +4747,7 @@ void ggml_compute_forward_mul_mat_f16_f32(
     //}
 }
 
-void ggml_compute_forward_mul_mat(
+static void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -4753,7 +4773,7 @@ void ggml_compute_forward_mul_mat(
 
 // ggml_compute_forward_scale
 
-void ggml_compute_forward_scale_f32(
+static void ggml_compute_forward_scale_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -4788,7 +4808,7 @@ void ggml_compute_forward_scale_f32(
     }
 }
 
-void ggml_compute_forward_scale(
+static void ggml_compute_forward_scale(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -4811,7 +4831,7 @@ void ggml_compute_forward_scale(
 
 // ggml_compute_forward_cpy
 
-void ggml_compute_forward_cpy(
+static void ggml_compute_forward_cpy(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -4820,7 +4840,7 @@ void ggml_compute_forward_cpy(
 
 // ggml_compute_forward_reshape
 
-void ggml_compute_forward_reshape(
+static void ggml_compute_forward_reshape(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -4832,7 +4852,7 @@ void ggml_compute_forward_reshape(
 
 // ggml_compute_forward_view
 
-void ggml_compute_forward_view(
+static void ggml_compute_forward_view(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0) {
     // NOP
@@ -4842,7 +4862,7 @@ void ggml_compute_forward_view(
 
 // ggml_compute_forward_permute
 
-void ggml_compute_forward_permute(
+static void ggml_compute_forward_permute(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0) {
     // NOP
@@ -4852,7 +4872,7 @@ void ggml_compute_forward_permute(
 
 // ggml_compute_forward_transpose
 
-void ggml_compute_forward_transpose(
+static void ggml_compute_forward_transpose(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0) {
     // NOP
@@ -4862,7 +4882,7 @@ void ggml_compute_forward_transpose(
 
 // ggml_compute_forward_get_rows
 
-void ggml_compute_forward_get_rows_f16(
+static void ggml_compute_forward_get_rows_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -4890,7 +4910,7 @@ void ggml_compute_forward_get_rows_f16(
     }
 }
 
-void ggml_compute_forward_get_rows_f32(
+static void ggml_compute_forward_get_rows_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -4917,7 +4937,7 @@ void ggml_compute_forward_get_rows_f32(
     }
 }
 
-void ggml_compute_forward_get_rows(
+static void ggml_compute_forward_get_rows(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -4943,7 +4963,7 @@ void ggml_compute_forward_get_rows(
 
 // ggml_compute_forward_diag_mask_inf
 
-void ggml_compute_forward_diag_mask_inf_f32(
+static void ggml_compute_forward_diag_mask_inf_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -4979,7 +4999,7 @@ void ggml_compute_forward_diag_mask_inf_f32(
     }
 }
 
-void ggml_compute_forward_diag_mask_inf(
+static void ggml_compute_forward_diag_mask_inf(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -5002,7 +5022,7 @@ void ggml_compute_forward_diag_mask_inf(
 
 // ggml_compute_forward_soft_max
 
-void ggml_compute_forward_soft_max_f32(
+static void ggml_compute_forward_soft_max_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -5073,7 +5093,7 @@ void ggml_compute_forward_soft_max_f32(
     }
 }
 
-void ggml_compute_forward_soft_max(
+static void ggml_compute_forward_soft_max(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -5095,7 +5115,7 @@ void ggml_compute_forward_soft_max(
 
 // ggml_compute_forward_rope
 
-void ggml_compute_forward_rope_f32(
+static void ggml_compute_forward_rope_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -5152,7 +5172,7 @@ void ggml_compute_forward_rope_f32(
     }
 }
 
-void ggml_compute_forward_rope(
+static void ggml_compute_forward_rope(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -5175,7 +5195,7 @@ void ggml_compute_forward_rope(
 
 // ggml_compute_forward_conv_1d_1s
 
-void ggml_compute_forward_conv_1d_1s_f16_f32(
+static void ggml_compute_forward_conv_1d_1s_f16_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -5295,7 +5315,7 @@ void ggml_compute_forward_conv_1d_1s_f16_f32(
     }
 }
 
-void ggml_compute_forward_conv_1d_1s_f32(
+static void ggml_compute_forward_conv_1d_1s_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -5415,7 +5435,7 @@ void ggml_compute_forward_conv_1d_1s_f32(
     }
 }
 
-void ggml_compute_forward_conv_1d_1s(
+static void ggml_compute_forward_conv_1d_1s(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -5441,7 +5461,7 @@ void ggml_compute_forward_conv_1d_1s(
 
 // ggml_compute_forward_conv_1d_2s
 
-void ggml_compute_forward_conv_1d_2s_f16_f32(
+static void ggml_compute_forward_conv_1d_2s_f16_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -5561,7 +5581,7 @@ void ggml_compute_forward_conv_1d_2s_f16_f32(
     }
 }
 
-void ggml_compute_forward_conv_1d_2s_f32(
+static void ggml_compute_forward_conv_1d_2s_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -5681,7 +5701,7 @@ void ggml_compute_forward_conv_1d_2s_f32(
     }
 }
 
-void ggml_compute_forward_conv_1d_2s(
+static void ggml_compute_forward_conv_1d_2s(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -5707,7 +5727,7 @@ void ggml_compute_forward_conv_1d_2s(
 
 // ggml_compute_forward_flash_attn
 
-void ggml_compute_forward_flash_attn_f32(
+static void ggml_compute_forward_flash_attn_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * q,
         const struct ggml_tensor * k,
@@ -5888,7 +5908,7 @@ void ggml_compute_forward_flash_attn_f32(
     }
 }
 
-void ggml_compute_forward_flash_attn_f16(
+static void ggml_compute_forward_flash_attn_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * q,
         const struct ggml_tensor * k,
@@ -6075,7 +6095,7 @@ void ggml_compute_forward_flash_attn_f16(
     }
 }
 
-void ggml_compute_forward_flash_attn(
+static void ggml_compute_forward_flash_attn(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * q,
         const struct ggml_tensor * k,
@@ -6103,7 +6123,7 @@ void ggml_compute_forward_flash_attn(
 
 // ggml_compute_forward_flash_ff
 
-void ggml_compute_forward_flash_ff_f16(
+static void ggml_compute_forward_flash_ff_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * a,  // F16
         const struct ggml_tensor * b0, // F16 fc_w
@@ -6283,7 +6303,7 @@ void ggml_compute_forward_flash_ff_f16(
     }
 }
 
-void ggml_compute_forward_flash_ff(
+static void ggml_compute_forward_flash_ff(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * a,
         const struct ggml_tensor * b0,
@@ -6312,7 +6332,7 @@ void ggml_compute_forward_flash_ff(
 
 /////////////////////////////////
 
-void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     assert(params);
 
     switch (tensor->op) {
@@ -6460,7 +6480,7 @@ void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
+static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
     struct ggml_tensor * src0 = tensor->src0;
     struct ggml_tensor * src1 = tensor->src1;
 
@@ -6704,7 +6724,7 @@ void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
     }
 }
 
-void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
+static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
     if (node->grad == NULL) {
         // this usually happens when we generate intermediate nodes from constants in the backward pass
         // it can also happen during forward pass, if the user performs computations with constants
@@ -6755,7 +6775,7 @@ void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
     }
 }
 
-void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
+static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
    if (!expand) {
        cgraph->n_nodes = 0;
        cgraph->n_leafs = 0;
@@ -6866,6 +6886,11 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
+typedef pthread_t ggml_thread_t;
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join pthread_join
+
 #else
 
 //typedef pthread_spinlock_t ggml_lock_t;
@@ -6884,6 +6909,11 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
+typedef pthread_t ggml_thread_t;
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join pthread_join
+
 #endif
 
 struct ggml_compute_state_shared {
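
Review note: both preprocessor branches map the new ggml_thread_t seam to pthreads, so these two hunks change no behavior; they only funnel every thread call through one abstraction. The point of the seam is that a non-pthread backend can later be slotted in behind the same three names. A loudly hypothetical sketch of a Win32 mapping (not part of this commit; signatures simplified, and thread_ret_t would also have to match DWORD WINAPI for this to line up exactly):

    // hypothetical Win32 shim behind the ggml_thread_* seam
    #include <windows.h>

    typedef HANDLE ggml_thread_t;

    static int ggml_thread_create(ggml_thread_t * out, void * attr,
                                  LPTHREAD_START_ROUTINE fn, void * arg) {
        (void) attr; // pthread-style attribute parameter, unused here
        *out = CreateThread(NULL, 0, fn, arg, 0, NULL);
        return *out == NULL ? -1 : 0; // mimic pthread_create's 0-on-success
    }

    static int ggml_thread_join(ggml_thread_t thrd, void ** retval) {
        (void) retval;
        WaitForSingleObject(thrd, INFINITE);
        CloseHandle(thrd);
        return 0;
    }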
@@ -6898,7 +6928,7 @@ struct ggml_compute_state_shared {
 };
 
 struct ggml_compute_state {
-    pthread_t thrd;
+    ggml_thread_t thrd;
 
     struct ggml_compute_params params;
     struct ggml_tensor * node;
@@ -6906,16 +6936,7 @@ struct ggml_compute_state {
     struct ggml_compute_state_shared * shared;
 };
 
-// function used by each compute thread
-void * ggml_graph_compute_one(void * data) {
-    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-
-    ggml_compute_forward(&state->params, state->node);
-
-    return NULL;
-}
-
-thread_ret_t ggml_graph_compute_thread(void * data) {
+static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
     const int n_threads = state->shared->n_threads;
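
Review note: ggml_graph_compute_one (the old one-shot entry) is deleted rather than made static because ggml_graph_compute_thread subsumes it: both cast data to a ggml_compute_state and end up driving ggml_compute_forward. The surviving entry returns thread_ret_t, the per-platform return type that lets the ggml_thread_* macros above stay portable. A sketch of the posix-side assumption (this typedef is not shown in the diff; it is a hypothetical reconstruction):

    // assumed posix definition, matching pthread's void *(*)(void *) entry type
    typedef void * thread_ret_t;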
@@ -6995,7 +7016,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
                 .node = NULL,
                 .shared = &state_shared,
             };
-            int rc = pthread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
+            int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
             assert(rc == 0);
             UNUSED(rc);
         }
@@ -7339,7 +7360,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
         atomic_store(&state_shared.has_work, true);
 
         for (int j = 0; j < n_threads - 1; j++) {
-            int rc = pthread_join(workers[j].thrd, NULL);
+            int rc = ggml_thread_join(workers[j].thrd, NULL);
             assert(rc == 0);
             UNUSED(rc);
         }
@@ -7417,7 +7438,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
 }
 
 // check if node is part of the graph
-bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
+static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
     if (cgraph == NULL) {
         return true;
     }
@@ -7431,7 +7452,7 @@ bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
     return false;
 }
 
-struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
+static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * parent = cgraph->nodes[i];
 
@@ -7560,7 +7581,7 @@ label=\"<x>CONST %d [%d, %d]\"; ]\n",
 
 ////////////////////////////////////////////////////////////////////////////////
 
-void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
+static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
     int i = 0;
     for (int p = 0; p < np; ++p) {
         const int ne = ggml_nelements(ps[p]) ;
@@ -7571,7 +7592,7 @@ void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
     }
 }
 
-void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
+static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
     int i = 0;
     for (int p = 0; p < np; ++p) {
         const int ne = ggml_nelements(ps[p]) ;
@@ -7582,7 +7603,7 @@ void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
     }
 }
 
-void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
+static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
     int i = 0;
     for (int p = 0; p < np; ++p) {
         const int ne = ggml_nelements(ps[p]) ;
@@ -7599,7 +7620,7 @@ void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
 // ref: https://arxiv.org/pdf/1412.6980.pdf
 //
 
-enum ggml_opt_result ggml_opt_adam(
+static enum ggml_opt_result ggml_opt_adam(
         struct ggml_context * ctx,
         struct ggml_opt_params params,
         struct ggml_tensor * f,
@@ -7892,7 +7913,7 @@ static enum ggml_opt_result linesearch_backtracking(
     return GGML_LINESEARCH_FAIL;
 }
 
-enum ggml_opt_result ggml_opt_lbfgs(
+static enum ggml_opt_result ggml_opt_lbfgs(
         struct ggml_context * ctx,
         struct ggml_opt_params params,
        struct ggml_tensor * f,
 