rgerganov and OccamRazor committed · Commit 7d66a68 · Parent: 52069b8

vulkan : do not use tensor->extra (llama/9407)

* vulkan : do not use tensor->extra

This patch allows using the Vulkan backend with the RPC backend as
tensor->extra is no longer used.

Ref: #8536

* Adapt GGML_VULKAN_CHECK_RESULTS to extra removal (llama/2)

---------

Co-authored-by: 0cc4m <[email protected]>
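
In short, the per-tensor `ggml_tensor_extra_gpu` objects are gone: the Vulkan backend now takes the device buffer from the `ggml_backend_vk_buffer_context` attached to `tensor->buffer` and recomputes the byte offset from `tensor->data`, which the allocator hands out relative to a fixed base pointer. Below is a minimal sketch of that lookup pattern, using pared-down stand-ins for the real ggml/Vulkan types; only `vk_tensor_offset`, `vk_ptr_base` and the field names come from the diff, the rest (including the helper `lookup_device_buffer`) is illustrative.

```cpp
#include <cstddef>
#include <cstdint>
#include <string>

// Pared-down stand-ins for the real ggml / Vulkan backend types (illustration only).
struct vk_buffer_struct { size_t size; /* VkBuffer handle, device memory, ... */ };
using vk_buffer = vk_buffer_struct *;

struct ggml_backend_buffer { void * context; };

struct ggml_tensor {
    void *                data;      // placed at vk_ptr_base + offset by the allocator
    ggml_tensor *         view_src;  // non-null if this tensor is a view
    size_t                view_offs; // extra offset of the view inside view_src
    ggml_backend_buffer * buffer;    // owning backend buffer
};

// Per-buffer context kept by the Vulkan backend; it replaces the per-tensor extras.
struct ggml_backend_vk_buffer_context {
    vk_buffer   dev_buffer;
    std::string name;
};

// Tensor data pointers are offsets from this fake base, so the device offset can be
// recomputed on demand instead of being cached in tensor->extra.
static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT

static uint64_t vk_tensor_offset(const ggml_tensor * tensor) {
    if (tensor->view_src) {
        return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base;
    }
    return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
}

// The lookup the patch repeats in every kernel (hypothetical helper name):
//   before: extra->buffer_gpu.lock() and extra->offset + tensor->view_offs
//   after : buffer context           and vk_tensor_offset(tensor) + tensor->view_offs
static void lookup_device_buffer(const ggml_tensor * dst, vk_buffer & d_D, uint64_t & d_buf_offset) {
    ggml_backend_vk_buffer_context * dst_buf_ctx =
        (ggml_backend_vk_buffer_context *) dst->buffer->context;
    d_D          = dst_buf_ctx->dev_buffer;
    d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
}
```

Because the offset is derived purely from `tensor->data`, no backend-specific state has to travel with the tensor anymore, which is what makes the backend usable behind the RPC backend (ref #8536).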

Files changed (1)
  1. ggml/src/ggml-vulkan.cpp +148 -194
ggml/src/ggml-vulkan.cpp CHANGED
@@ -433,16 +433,6 @@ struct vk_context_struct {
 typedef std::shared_ptr<vk_context_struct> vk_context;
 typedef std::weak_ptr<vk_context_struct> vk_context_ref;
 
-struct ggml_tensor_extra_gpu {
-    vk_buffer_ref buffer_gpu;
-    uint64_t offset;
-
-    void reset() {
-        buffer_gpu.reset();
-        offset = 0;
-    }
-};
-
 struct ggml_vk_garbage_collector {
     std::vector<vk_semaphore> tl_semaphores;
     std::vector<vk_semaphore> semaphores;
@@ -553,6 +543,31 @@ struct ggml_backend_vk_context {
     std::vector<vk_context_ref> tensor_ctxs;
 };
 
+static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
+
+static uint64_t vk_tensor_offset(const ggml_tensor * tensor) {
+    if (tensor->view_src) {
+        return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base;
+    }
+    return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+}
+
+struct ggml_backend_vk_buffer_context {
+    vk_device_ref device;
+    vk_buffer dev_buffer;
+    std::string name;
+
+    ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) :
+        device(device),
+        dev_buffer(dev_buffer),
+        name(name) {
+    }
+
+    ~ggml_backend_vk_buffer_context() {
+        ggml_vk_destroy_buffer(dev_buffer);
+    }
+};
+
 #ifdef GGML_VULKAN_MEMORY_DEBUG
 void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
     std::lock_guard<std::mutex> guard(log_mutex);
@@ -3076,9 +3091,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     const uint64_t r2 = ne12 / ne02;
     const uint64_t r3 = ne13 / ne03;
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
     vk_buffer d_Qx;
     size_t qx_buf_offset = 0;
@@ -3180,8 +3195,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         return;
     }
 
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
     vk_buffer d_X;
@@ -3189,13 +3204,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
-        d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset + src0->view_offs;
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset + src1->view_offs;
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3276,9 +3291,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     const uint64_t r2 = ne12 / ne02;
     const uint64_t r3 = ne13 / ne03;
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
     vk_buffer d_Qx;
     size_t qx_buf_offset = 0;
@@ -3357,21 +3372,21 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
         return;
     }
 
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
-        d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset + src0->view_offs;
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
        GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset + src1->view_offs;
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3454,9 +3469,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
 
     GGML_ASSERT(ne11 == 1);
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
     vk_buffer d_Qy;
     size_t qy_buf_offset = 0;
@@ -3482,15 +3497,15 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
         return;
     }
 
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
-    vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
+    vk_buffer d_Qx = src0_buf_ctx->dev_buffer;
+    const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset + src1->view_offs;
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3532,9 +3547,9 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
 
     GGML_ASSERT(ne11 == 1);
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
     vk_buffer d_Qy = nullptr;
     size_t qy_buf_offset = 0;
@@ -3561,15 +3576,15 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
         return;
     }
 
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
-    vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
+    vk_buffer d_Qx = src0_buf_ctx->dev_buffer;
+    const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset + src1->view_offs;
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3631,10 +3646,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
 
     const uint64_t n_as = ne02;
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra;
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+    ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
 
     vk_buffer d_Qx;
     size_t qx_buf_offset = 0;
@@ -3731,26 +3746,26 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
         return;
     }
 
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
-        d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset + src0->view_offs;
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset + src1->view_offs;
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (!ids_uma) {
-        d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset + ids->view_offs;
+        d_ids = ids_buf_ctx->dev_buffer;
+        ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3836,10 +3851,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     const uint64_t ne22 = dst->ne[2];
     const uint64_t ne23 = dst->ne[3];
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra;
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+    ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
 
     vk_buffer d_Qx;
     size_t qx_buf_offset = 0;
@@ -3924,26 +3939,26 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
         return;
     }
 
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
-        d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset + src0->view_offs;
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset + src1->view_offs;
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if(!ids_uma) {
-        d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset + ids->view_offs;
+        d_ids = ids_buf_ctx->dev_buffer;
+        ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -4250,7 +4265,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")");
     GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
     GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
-    GGML_ASSERT(dst->extra != nullptr);
+    GGML_ASSERT(dst->buffer != nullptr);
     const uint64_t ne00 = src0->ne[0];
     const uint64_t ne01 = src0->ne[1];
     const uint64_t ne02 = src0->ne[2];
@@ -4296,10 +4311,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
 
     const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op);
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    ggml_tensor_extra_gpu * extra_src2 = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = use_src1 ? (ggml_backend_vk_buffer_context *)src1->buffer->context : nullptr;
+    ggml_backend_vk_buffer_context * src2_buf_ctx = use_src2 ? (ggml_backend_vk_buffer_context *)src2->buffer->context : nullptr;
 
     vk_buffer d_X = nullptr;
     size_t x_buf_offset = 0;
@@ -4330,7 +4345,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     uint64_t z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 : 0;
     uint64_t d_sz = ggml_type_size(dst->type) * ned;
 
-    vk_buffer d_D = extra->buffer_gpu.lock();
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
 
     // Workaround for tiny tensor inputs on ROPE
     if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) {
@@ -4338,21 +4353,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     }
 
     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
-    GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
+    uint64_t d_buf_offset = ((vk_tensor_offset(dst) + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY); // NOLINT
     if(!src0_uma) {
-        d_X = extra_src0->buffer_gpu.lock();
-        x_buf_offset = extra_src0->offset + src0->view_offs;
+        d_X = src0_buf_ctx->dev_buffer;
+        x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
         GGML_ASSERT(d_X != nullptr);
     }
     if (use_src1 && !src1_uma) {
-        d_Y = extra_src1->buffer_gpu.lock();
-        y_buf_offset = extra_src1->offset + src1->view_offs;
+        d_Y = src1_buf_ctx->dev_buffer;
+        y_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Y != nullptr);
     }
     if (use_src2 && !src2_uma) {
-        d_Z = extra_src2->buffer_gpu.lock();
-        z_buf_offset = extra_src2->offset + src2->view_offs;
+        d_Z = src2_buf_ctx->dev_buffer;
+        z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
         GGML_ASSERT(d_Z != nullptr);
     }
 
@@ -4531,11 +4546,10 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
 }
 
 static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t src1_type_size = ggml_type_size(src1->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+    const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
     int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
@@ -4724,10 +4738,9 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co
 }
 
 static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+    const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
@@ -5535,14 +5548,6 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
 }
 #endif
 
-static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
-    VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))");
-    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-    extra->reset();
-    tensor->extra = extra;
-    return extra;
-}
-
 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 #if defined(GGML_VULKAN_RUN_TESTS)
     ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
@@ -5711,9 +5716,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* t
 // Returns true if node has enqueued work into the queue, false otherwise
 // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
 static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
-    if (ggml_is_empty(node) || extra == nullptr) {
+    if (ggml_is_empty(node) || !node->buffer) {
         return false;
     }
 
@@ -5965,7 +5968,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 }
 
 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
-    ggml_tensor_extra_gpu * extra = nullptr;
+    ggml_backend_buffer * buf = nullptr;
 
     switch (tensor->op) {
     case GGML_OP_ADD:
@@ -6001,7 +6004,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
     case GGML_OP_TIMESTEP_EMBEDDING:
     case GGML_OP_LEAKY_RELU:
     case GGML_OP_REPEAT:
-        extra = (ggml_tensor_extra_gpu *) tensor->extra;
+        buf = tensor->buffer;
 
         break;
     case GGML_OP_UNARY:
@@ -6011,7 +6014,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
         case GGML_UNARY_OP_GELU_QUICK:
         case GGML_UNARY_OP_RELU:
         case GGML_UNARY_OP_TANH:
-            extra = (ggml_tensor_extra_gpu *) tensor->extra;
+            buf = tensor->buffer;
             break;
         default:
             return false;
@@ -6019,14 +6022,14 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
         break;
     case GGML_OP_MUL_MAT:
    case GGML_OP_MUL_MAT_ID:
-        extra = (ggml_tensor_extra_gpu *) tensor->extra;
+        buf = tensor->buffer;
 
         break;
     default:
        return false;
     }
 
-    if (extra == nullptr) {
+    if (buf == nullptr) {
        return false;
    }
 
@@ -6167,42 +6170,6 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
 
 // device backend
 
-static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
-
-struct ggml_backend_vk_buffer_context {
-    vk_device_ref device;
-    vk_buffer dev_buffer;
-    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
-    size_t temp_tensor_extra_index = 0;
-    std::string name;
-
-    ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) :
-        device(device),
-        dev_buffer(dev_buffer),
-        name(name) {
-    }
-
-    ~ggml_backend_vk_buffer_context() {
-        ggml_vk_destroy_buffer(dev_buffer);
-        if (temp_tensor_extras != nullptr) {
-            delete[] temp_tensor_extras;
-        }
-    }
-
-    ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
-        if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_VK_MAX_NODES];
-        }
-
-        size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_VK_MAX_NODES;
-        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
-        extra->reset();
-
-        return extra;
-    }
-};
-
 GGML_CALL static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
     return ctx->name.c_str();
@@ -6227,51 +6194,37 @@ GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t bu
 
 GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
-    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
-
     if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-        GGML_ASSERT(tensor->view_src->extra != nullptr);
-        tensor->extra = tensor->view_src->extra;
-    } else {
-        ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
-        extra->buffer_gpu = ctx->dev_buffer;
-        extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
-        tensor->extra = extra;
     }
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
-    vk_buffer buf = extra->buffer_gpu.lock();
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
+    vk_buffer buf = buf_ctx->dev_buffer;
 
-    ggml_vk_buffer_write(buf, extra->offset + tensor->view_offs + offset, data, size);
-
-    GGML_UNUSED(buffer);
+    ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
-    vk_buffer buf = extra->buffer_gpu.lock();
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
+
+    vk_buffer buf = buf_ctx->dev_buffer;
 
-    ggml_vk_buffer_read(buf, extra->offset + tensor->view_offs + offset, data, size);
-
-    GGML_UNUSED(buffer);
+    ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
     if (ggml_backend_buffer_is_vk(src->buffer)) {
-        ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
-        ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+        ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
+        ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
 
-        vk_buffer src_buf = src_extra->buffer_gpu.lock();
-        vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
+        vk_buffer src_buf = src_buf_ctx->dev_buffer;
+        vk_buffer dst_buf = dst_buf_ctx->dev_buffer;
 
-        ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
+        ggml_vk_buffer_copy(dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src));
 
         return true;
     }
@@ -6449,7 +6402,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
 
     vk_context transfer_ctx;
 
@@ -6462,9 +6415,9 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
         transfer_ctx = ctx->transfer_ctx.lock();
     }
 
-    vk_buffer buf = extra->buffer_gpu.lock();
+    vk_buffer buf = buf_ctx->dev_buffer;
 
-    ggml_vk_buffer_write_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
+    ggml_vk_buffer_write_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6472,7 +6425,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
 
     vk_context transfer_ctx;
 
@@ -6485,17 +6438,17 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
         transfer_ctx = ctx->transfer_ctx.lock();
     }
 
-    vk_buffer buf = extra->buffer_gpu.lock();
+    vk_buffer buf = buf_ctx->dev_buffer;
 
-    ggml_vk_buffer_read_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
+    ggml_vk_buffer_read_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
     VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
-        ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
-        ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+        ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
+        ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
 
         vk_context transfer_ctx;
 
@@ -6508,10 +6461,10 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
             transfer_ctx = ctx->transfer_ctx.lock();
         }
 
-        vk_buffer src_buf = src_extra->buffer_gpu.lock();
-        vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
+        vk_buffer src_buf = src_buf_ctx->dev_buffer;
+        vk_buffer dst_buf = dst_buf_ctx->dev_buffer;
 
-        ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
+        ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src));
         return true;
     }
 
@@ -6949,10 +6902,10 @@ static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name)
         const size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);
 
-        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+        ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
 
-        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        ggml_vk_buffer_read(buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
+        vk_buffer buffer_gpu = buf_ctx->dev_buffer;
+        ggml_vk_buffer_read(buffer_gpu, vk_tensor_offset(tensor) + tensor->view_offs, tensor_data, tensor_size);
     }
 
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -7026,9 +6979,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
             memcpy(src0_clone->data, src0->data, src0_size);
             memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
         } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
-            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
-            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset + src0->view_offs;
+            ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+            vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+            uint64_t offset = vk_tensor_offset(src0) + src0->view_offs;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                 for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -7068,9 +7021,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
            memcpy(src1_clone->data, src1->data, src1_size);
            memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
        } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
-            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
-            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset + src1->view_offs;
+            ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+            vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+            uint64_t offset = vk_tensor_offset(src1) + src1->view_offs;
            if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                    for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -7110,9 +7063,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
            memcpy(src2_clone->data, src2->data, src2_size);
            memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
        } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
-            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
-            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset + src2->view_offs;
+            ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src2->buffer->context;
+            vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+            uint64_t offset = vk_tensor_offset(src2) + src2->view_offs;
            if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                    for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7167,7 +7120,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
     } else if (tensor->op == GGML_OP_PAD) {
         tensor_clone = ggml_pad(ggml_ctx, src0_clone, tensor->ne[0] - src0_clone->ne[0], tensor->ne[1] - src0_clone->ne[1], tensor->ne[2] - src0_clone->ne[2], tensor->ne[3] - src0_clone->ne[3]);
     } else if (tensor->op == GGML_OP_REPEAT) {
-        tensor_clone = ggml_repeat(ggml_ctx, src0_clone, src1_clone);
+        tensor_clone = ggml_repeat(ggml_ctx, src0_clone, tensor);
     } else if (tensor->op == GGML_OP_ADD) {
         tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
     } else if (tensor->op == GGML_OP_ACC) {
@@ -7312,14 +7265,15 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
         size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);
 
-        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+        ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
 
-        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
-            tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
+        vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+        uint64_t offset = vk_tensor_offset(tensor) + tensor->view_offs;
+        if (offset + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - offset;
         }
 
-        ggml_vk_buffer_read(buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
+        ggml_vk_buffer_read(buffer_gpu, offset, tensor_data, tensor_size);
     }
 
     float first_error_result = -1.0f;
 