jeffbolznv committed on
Commit 04e729a · 1 Parent(s): 3bf5be1

vulkan: Use push constant offset to handle misaligned descriptors (llama/10987)

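Before the per-file diffs, a brief note on the scheme this commit introduces: instead of requiring tensor descriptors to land exactly on minStorageBufferOffsetAlignment, the host rounds each descriptor offset down to that alignment and passes the leftover, expressed in elements, to the shaders through push constants. The sketch below illustrates only the bit layout visible in the diff (a:16 | b:8 | d:8 for binary ops, a:16 | d:16 for unary ops); the helper names and the standalone main() are hypothetical and not part of ggml.

#include <cassert>
#include <cstdint>

// Binary ops pack three element offsets into one 32-bit push constant word.
static uint32_t pack_binary_offsets(uint32_t a, uint32_t b, uint32_t d) {
    assert(a < (1u << 16) && b < (1u << 8) && d < (1u << 8));
    return (a << 16) | (b << 8) | d;
}

// Unary ops only carry src0 and dst offsets, so each gets 16 bits.
static uint32_t pack_unary_offsets(uint32_t a, uint32_t d) {
    assert(a < (1u << 16) && d < (1u << 16));
    return (a << 16) | d;
}

int main() {
    uint32_t packed = pack_binary_offsets(17, 3, 42);
    assert((packed >> 16)         == 17); // mirrors get_aoffset() in the shaders
    assert(((packed >> 8) & 0xFF) == 3);  // mirrors get_boffset()
    assert((packed & 0xFF)        == 42); // mirrors get_doffset()

    uint32_t packed_u = pack_unary_offsets(17, 42);
    assert((packed_u >> 16)    == 17);
    assert((packed_u & 0xFFFF) == 42);
    return 0;
}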
ggml/src/ggml-vulkan/ggml-vulkan.cpp CHANGED
@@ -411,7 +411,7 @@ struct vk_op_unary_push_constants {
411
  uint32_t ne;
412
  uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
413
  uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
414
- uint32_t d_offset;
415
  float param1; float param2;
416
  uint32_t ne0_012mp; uint32_t ne0_012L;
417
  uint32_t ne0_01mp; uint32_t ne0_01L;
@@ -459,7 +459,7 @@ struct vk_op_binary_push_constants {
459
  uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
460
  uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
461
  uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
462
- uint32_t d_offset;
463
  float param1; float param2; int32_t param3;
464
  };
465
 
@@ -546,7 +546,7 @@ struct vk_staging_memcpy {
546
  };
547
 
548
  struct vk_op_upscale_push_constants {
549
- uint32_t ne; uint32_t d_offset;
550
  uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
551
  uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
552
  float sf0; float sf1; float sf2; float sf3;
@@ -5076,6 +5076,57 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
5076
  }
5077
  }
5078
 
5079
  template<typename PC>
5080
  static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
5081
  VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
@@ -5179,8 +5230,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
5179
  }
5180
 
5181
  GGML_ASSERT(d_D != nullptr);
5182
- uint64_t d_buf_offset = ((vk_tensor_offset(dst) + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
5183
- GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY); // NOLINT
5184
  if(!src0_uma) {
5185
  d_X = src0_buf_ctx->dev_buffer;
5186
  x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
@@ -5196,6 +5246,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
5196
  z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
5197
  GGML_ASSERT(d_Z != nullptr);
5198
  }
5199
 
5200
  if (op_supports_incontiguous) {
5201
  x_sz = ggml_nbytes(src0);
@@ -5383,7 +5439,6 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
5383
  const uint32_t src0_type_size = ggml_type_size(src0->type);
5384
  const uint32_t src1_type_size = ggml_type_size(src1->type);
5385
  const uint32_t dst_type_size = ggml_type_size(dst->type);
5386
- const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
5387
 
5388
  int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
5389
  int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
@@ -5395,7 +5450,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
5395
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
5396
  (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
5397
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
5398
- d_offset,
5399
  0.0f, 0.0f, offset,
5400
  }, dryrun);
5401
  }
@@ -5599,7 +5654,7 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
5599
  const float sf3 = (float)dst->ne[3] / src0->ne[3];
5600
 
5601
  ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
5602
- (uint32_t)ggml_nelements(dst), 0,
5603
  (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
5604
  (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
5605
  sf0, sf1, sf2, sf3,
@@ -5709,13 +5764,12 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co
5709
  static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
5710
  const uint32_t src0_type_size = ggml_type_size(src0->type);
5711
  const uint32_t dst_type_size = ggml_type_size(dst->type);
5712
- const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
5713
 
5714
  ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
5715
  (uint32_t)ggml_nelements(src0),
5716
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
5717
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
5718
- d_offset,
5719
  0.0f, 0.0f,
5720
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5721
  }, dryrun);
 
411
  uint32_t ne;
412
  uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
413
  uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
414
+ uint32_t misalign_offsets;
415
  float param1; float param2;
416
  uint32_t ne0_012mp; uint32_t ne0_012L;
417
  uint32_t ne0_01mp; uint32_t ne0_01L;
 
459
  uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
460
  uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
461
  uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
462
+ uint32_t misalign_offsets;
463
  float param1; float param2; int32_t param3;
464
  };
465
 
 
546
  };
547
 
548
  struct vk_op_upscale_push_constants {
549
+ uint32_t ne; uint32_t a_offset; uint32_t d_offset;
550
  uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
551
  uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
552
  float sf0; float sf1; float sf2; float sf3;
 
5076
  }
5077
  }
5078
 
5079
+ static uint32_t get_misalign_bytes(ggml_backend_vk_context * ctx, const ggml_tensor * t)
5080
+ {
5081
+ return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));
5082
+ }
5083
+
5084
+ template <typename T> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
5085
+ GGML_UNUSED(p);
5086
+ GGML_UNUSED(src0);
5087
+ GGML_UNUSED(src1);
5088
+ GGML_UNUSED(src2);
5089
+ GGML_UNUSED(dst);
5090
+ static_assert(!std::is_const<T>::value, "unexpected type");
5091
+ GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0);
5092
+ GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0);
5093
+ GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0);
5094
+ GGML_ASSERT(!dst || get_misalign_bytes(ctx, dst) == 0);
5095
+ }
5096
+
5097
+ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
5098
+ const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
5099
+ const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
5100
+
5101
+ p.misalign_offsets = (a_offset << 16) | d_offset;
5102
+
5103
+ GGML_UNUSED(src1);
5104
+ GGML_UNUSED(src2);
5105
+ }
5106
+
5107
+ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
5108
+ const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
5109
+ const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
5110
+ const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
5111
+
5112
+ GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0));
5113
+
5114
+ p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
5115
+
5116
+ GGML_UNUSED(src2);
5117
+ }
5118
+
5119
+ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
5120
+ const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
5121
+ const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
5122
+
5123
+ p.a_offset = a_offset;
5124
+ p.d_offset = d_offset;
5125
+
5126
+ GGML_UNUSED(src1);
5127
+ GGML_UNUSED(src2);
5128
+ }
5129
+
5130
  template<typename PC>
5131
  static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
5132
  VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
 
5230
  }
5231
 
5232
  GGML_ASSERT(d_D != nullptr);
5233
+ uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
 
5234
  if(!src0_uma) {
5235
  d_X = src0_buf_ctx->dev_buffer;
5236
  x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
 
5246
  z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
5247
  GGML_ASSERT(d_Z != nullptr);
5248
  }
5249
+ // Compute misalignment offset for descriptors and store it in push constants, then align the descriptor offsets.
5250
+ init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, dst);
5251
+ x_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
5252
+ y_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
5253
+ z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
5254
+ d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
5255
 
5256
  if (op_supports_incontiguous) {
5257
  x_sz = ggml_nbytes(src0);
 
5439
  const uint32_t src0_type_size = ggml_type_size(src0->type);
5440
  const uint32_t src1_type_size = ggml_type_size(src1->type);
5441
  const uint32_t dst_type_size = ggml_type_size(dst->type);
 
5442
 
5443
  int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
5444
  int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
 
5450
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
5451
  (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
5452
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
5453
+ 0,
5454
  0.0f, 0.0f, offset,
5455
  }, dryrun);
5456
  }
 
5654
  const float sf3 = (float)dst->ne[3] / src0->ne[3];
5655
 
5656
  ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
5657
+ (uint32_t)ggml_nelements(dst), 0, 0,
5658
  (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
5659
  (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
5660
  sf0, sf1, sf2, sf3,
 
5764
  static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
5765
  const uint32_t src0_type_size = ggml_type_size(src0->type);
5766
  const uint32_t dst_type_size = ggml_type_size(dst->type);
 
5767
 
5768
  ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
5769
  (uint32_t)ggml_nelements(src0),
5770
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
5771
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
5772
+ 0,
5773
  0.0f, 0.0f,
5774
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5775
  }, dryrun);
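The host-side change above boils down to: keep the full byte offset of each tensor, then split it into an aligned part consumed by the descriptor and a remainder consumed by the shader. The standalone sketch below reproduces that split under the assumption (guaranteed by Vulkan) that minStorageBufferOffsetAlignment is a power of two; split_tensor_offset is an illustrative name, not a function in this file.

#include <cassert>
#include <cstdint>

struct split_offset {
    uint64_t descriptor_offset;  // rounded down, used for the buffer binding
    uint32_t element_offset;     // remainder in elements, sent via push constants
};

// align is the device's minStorageBufferOffsetAlignment; Vulkan requires it to
// be a power of two, so masking is equivalent to rounding down / taking modulo.
static split_offset split_tensor_offset(uint64_t byte_offset, uint64_t align,
                                        uint32_t type_size) {
    split_offset s;
    s.descriptor_offset = byte_offset & ~(align - 1);
    s.element_offset    = (uint32_t)((byte_offset & (align - 1)) / type_size);
    return s;
}

int main() {
    // e.g. a float tensor view starting 16 bytes past a 256-byte boundary
    split_offset s = split_tensor_offset(4112, 256, sizeof(float));
    assert(s.descriptor_offset == 4096);  // what the descriptor now points at
    assert(s.element_offset == 4);        // what the shader adds per element index
    return 0;
}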
ggml/src/ggml-vulkan/vulkan-shaders/acc.comp CHANGED
@@ -21,9 +21,9 @@ void main() {
21
  get_indices(idx, i00, i01, i02, i03);
22
 
23
  if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
24
- data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
25
  } else {
26
- data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]));
27
  }
28
  }
29
 
 
21
  get_indices(idx, i00, i01, i02, i03);
22
 
23
  if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
24
+ data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
25
  } else {
26
+ data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]));
27
  }
28
  }
29
 
ggml/src/ggml-vulkan/vulkan-shaders/add.comp CHANGED
@@ -22,7 +22,7 @@ void main() {
22
  uint i00, i01, i02, i03;
23
  get_indices(idx, i00, i01, i02, i03);
24
 
25
- data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
26
 
27
  idx += num_threads;
28
  }
 
22
  uint i00, i01, i02, i03;
23
  get_indices(idx, i00, i01, i02, i03);
24
 
25
+ data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
26
 
27
  idx += num_threads;
28
  }
ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp CHANGED
@@ -12,6 +12,6 @@ void main() {
12
  return;
13
  }
14
 
15
- const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
16
- data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
17
  }
 
12
  return;
13
  }
14
 
15
+ const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
16
+ data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
17
  }
ggml/src/ggml-vulkan/vulkan-shaders/concat.comp CHANGED
@@ -30,12 +30,12 @@ void main() {
30
  const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
31
 
32
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
33
- data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
34
  #else
35
  if (is_src0) {
36
- data_d[p.d_offset + dst_idx] = data_a[src0_idx];
37
  } else {
38
- data_d[p.d_offset + dst_idx] = data_b[src1_idx];
39
  }
40
  #endif
41
  }
 
30
  const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
31
 
32
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
33
+ data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : data_b[get_boffset() + src1_idx]);
34
  #else
35
  if (is_src0) {
36
+ data_d[get_doffset() + dst_idx] = data_a[get_aoffset() + src0_idx];
37
  } else {
38
+ data_d[get_doffset() + dst_idx] = data_b[get_boffset() + src1_idx];
39
  }
40
  #endif
41
  }
ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp CHANGED
@@ -19,9 +19,9 @@ void main() {
19
  if (idx + (num_iter-1)*num_threads < p.ne) {
20
  [[unroll]] for (uint i = 0; i < num_iter; ++i) {
21
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
22
- data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
23
  #else
24
- data_d[p.d_offset + idx] = data_a[idx];
25
  #endif
26
  idx += num_threads;
27
  }
@@ -32,9 +32,9 @@ void main() {
32
  }
33
 
34
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
35
- data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
36
  #else
37
- data_d[p.d_offset + idx] = data_a[idx];
38
  #endif
39
  idx += num_threads;
40
  }
 
19
  if (idx + (num_iter-1)*num_threads < p.ne) {
20
  [[unroll]] for (uint i = 0; i < num_iter; ++i) {
21
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
22
+ data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
23
  #else
24
+ data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
25
  #endif
26
  idx += num_threads;
27
  }
 
32
  }
33
 
34
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
35
+ data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
36
  #else
37
+ data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
38
  #endif
39
  idx += num_threads;
40
  }
ggml/src/ggml-vulkan/vulkan-shaders/copy.comp CHANGED
@@ -13,8 +13,8 @@ void main() {
13
  }
14
 
15
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
16
- data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
17
  #else
18
- data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
19
  #endif
20
  }
 
13
  }
14
 
15
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
16
+ data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
17
  #else
18
+ data_d[get_doffset() + dst_idx(idx)] = data_a[get_aoffset() + src0_idx(idx)];
19
  #endif
20
  }
ggml/src/ggml-vulkan/vulkan-shaders/cos.comp CHANGED
@@ -12,6 +12,6 @@ void main() {
12
  return;
13
  }
14
 
15
- const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
16
- data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
17
  }
 
12
  return;
13
  }
14
 
15
+ const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
16
+ data_d[get_doffset() + dst_idx(idx)] = D_TYPE(cos(val));
17
  }
ggml/src/ggml-vulkan/vulkan-shaders/div.comp CHANGED
@@ -20,7 +20,7 @@ void main() {
20
  uint i00, i01, i02, i03;
21
  get_indices(idx, i00, i01, i02, i03);
22
 
23
- data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
24
 
25
  idx += num_threads;
26
  }
 
20
  uint i00, i01, i02, i03;
21
  get_indices(idx, i00, i01, i02, i03);
22
 
23
+ data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
24
 
25
  idx += num_threads;
26
  }
ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp CHANGED
@@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
7
  uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
8
  uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
9
  uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
10
- uint d_offset;
11
  float param1; float param2; int param3;
12
  } p;
13
 
@@ -22,6 +22,10 @@ uint get_idx() {
22
  return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
23
  }
24
 
25
  // mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1
26
  uint fastmod(uint a, uint b) {
27
  if ((b & (b-1)) == 0) {
 
7
  uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
8
  uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
9
  uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
10
+ uint misalign_offsets;
11
  float param1; float param2; int param3;
12
  } p;
13
 
 
22
  return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
23
  }
24
 
25
+ uint get_aoffset() { return p.misalign_offsets >> 16; }
26
+ uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; }
27
+ uint get_doffset() { return p.misalign_offsets & 0xFF; }
28
+
29
  // mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1
30
  uint fastmod(uint a, uint b) {
31
  if ((b & (b-1)) == 0) {
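A natural question about the generic_binary_head.comp packing is whether the 8-bit fields for the b and d offsets can overflow. They cannot: the Vulkan spec caps minStorageBufferOffsetAlignment at 256 bytes, so the misalignment is at most 255 bytes, and the element offset only shrinks further for multi-byte element types. The check below is a hypothetical standalone illustration of that bound, not code from the repository.

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
    const uint32_t max_alignment = 256;  // spec upper bound for minStorageBufferOffsetAlignment
    for (uint32_t type_size : {1u, 2u, 4u}) {
        uint32_t max_elem_offset = (max_alignment - 1) / type_size;
        assert(max_elem_offset <= 0xFF);  // fits the 8-bit get_boffset()/get_doffset() fields
    }
    return 0;
}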
ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp CHANGED
@@ -6,7 +6,7 @@ layout (push_constant) uniform parameter
6
  uint ne;
7
  uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
8
  uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
9
- uint d_offset;
10
  float param1; float param2;
11
 
12
  uint ne0_012mp; uint ne0_012L;
@@ -24,6 +24,9 @@ uint get_idx() {
24
  return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
25
  }
26
 
27
  // see init_fastdiv_values in ggml-vulkan.cpp
28
  uint fastdiv(uint n, uint mp, uint L) {
29
  uint msbs, lsbs;
 
6
  uint ne;
7
  uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
8
  uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
9
+ uint misalign_offsets;
10
  float param1; float param2;
11
 
12
  uint ne0_012mp; uint ne0_012L;
 
24
  return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
25
  }
26
 
27
+ uint get_aoffset() { return p.misalign_offsets >> 16; }
28
+ uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
29
+
30
  // see init_fastdiv_values in ggml-vulkan.cpp
31
  uint fastdiv(uint n, uint mp, uint L) {
32
  uint msbs, lsbs;
ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp CHANGED
@@ -15,10 +15,10 @@ void main() {
15
  return;
16
  }
17
 
18
- const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
19
 
20
- const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
21
- const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
22
 
23
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
24
  data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
 
15
  return;
16
  }
17
 
18
+ const uint i01 = data_b[get_boffset() + i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
19
 
20
+ const uint a_offset = get_aoffset() + i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
21
+ const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
22
 
23
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
24
  data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
ggml/src/ggml-vulkan/vulkan-shaders/mul.comp CHANGED
@@ -20,7 +20,7 @@ void main() {
20
  uint i00, i01, i02, i03;
21
  get_indices(idx, i00, i01, i02, i03);
22
 
23
- data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
24
 
25
  idx += num_threads;
26
  }
 
20
  uint i00, i01, i02, i03;
21
  get_indices(idx, i00, i01, i02, i03);
22
 
23
+ data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
24
 
25
  idx += num_threads;
26
  }
ggml/src/ggml-vulkan/vulkan-shaders/pad.comp CHANGED
@@ -24,5 +24,5 @@ void main() {
24
 
25
  const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
26
 
27
- data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
28
  }
 
24
 
25
  const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
26
 
27
+ data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
28
  }
ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp CHANGED
@@ -22,5 +22,5 @@ void main() {
22
  return;
23
  }
24
 
25
- data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]);
26
  }
 
22
  return;
23
  }
24
 
25
+ data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx_mod(idx)]);
26
  }
ggml/src/ggml-vulkan/vulkan-shaders/scale.comp CHANGED
@@ -18,7 +18,7 @@ void main() {
18
  continue;
19
  }
20
 
21
- data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1));
22
  idx += num_threads;
23
  }
24
  }
 
18
  continue;
19
  }
20
 
21
+ data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) * FLOAT_TYPE(p.param1));
22
  idx += num_threads;
23
  }
24
  }
ggml/src/ggml-vulkan/vulkan-shaders/sin.comp CHANGED
@@ -12,6 +12,6 @@ void main() {
12
  return;
13
  }
14
 
15
- const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
16
- data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
17
  }
 
12
  return;
13
  }
14
 
15
+ const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
16
+ data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sin(val));
17
  }
ggml/src/ggml-vulkan/vulkan-shaders/square.comp CHANGED
@@ -12,6 +12,6 @@ void main() {
12
  return;
13
  }
14
 
15
- const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
16
- data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val);
17
  }
 
12
  return;
13
  }
14
 
15
+ const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
16
+ data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val * val);
17
  }
ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp CHANGED
@@ -2,7 +2,7 @@
2
 
3
  layout (push_constant) uniform parameter
4
  {
5
- uint ne; uint d_offset;
6
  uint nb00; uint nb01; uint nb02; uint nb03;
7
  uint ne10; uint ne11; uint ne12; uint ne13;
8
  float sf0; float sf1; float sf2; float sf3;
@@ -32,5 +32,5 @@ void main() {
32
  const uint i02 = uint(i12 / p.sf2);
33
  const uint i03 = uint(i13 / p.sf3);
34
 
35
- data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
36
  }
 
2
 
3
  layout (push_constant) uniform parameter
4
  {
5
+ uint ne; uint a_offset; uint d_offset;
6
  uint nb00; uint nb01; uint nb02; uint nb03;
7
  uint ne10; uint ne11; uint ne12; uint ne13;
8
  float sf0; float sf1; float sf2; float sf3;
 
32
  const uint i02 = uint(i12 / p.sf2);
33
  const uint i03 = uint(i13 / p.sf3);
34
 
35
+ data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
36
  }