jeffbolznv committed on
Commit 04e729a · 1 Parent(s): 3bf5be1

vulkan: Use push constant offset to handle misaligned descriptors (llama/10987)

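Before the per-file diffs, a brief note on the scheme this commit introduces: instead of requiring tensor descriptors to land exactly on minStorageBufferOffsetAlignment, the host rounds each descriptor offset down to that alignment and passes the leftover, expressed in elements, to the shaders through push constants. The sketch below illustrates only the bit layout visible in the diff (a:16 | b:8 | d:8 for binary ops, a:16 | d:16 for unary ops); the helper names and the standalone main() are hypothetical and not part of ggml.

#include <cassert>
#include <cstdint>

// Binary ops pack three element offsets into one 32-bit push constant word.
static uint32_t pack_binary_offsets(uint32_t a, uint32_t b, uint32_t d) {
    assert(a < (1u << 16) && b < (1u << 8) && d < (1u << 8));
    return (a << 16) | (b << 8) | d;
}

// Unary ops only carry src0 and dst offsets, so each gets 16 bits.
static uint32_t pack_unary_offsets(uint32_t a, uint32_t d) {
    assert(a < (1u << 16) && d < (1u << 16));
    return (a << 16) | d;
}

int main() {
    uint32_t packed = pack_binary_offsets(17, 3, 42);
    assert((packed >> 16)         == 17); // mirrors get_aoffset() in the shaders
    assert(((packed >> 8) & 0xFF) == 3);  // mirrors get_boffset()
    assert((packed & 0xFF)        == 42); // mirrors get_doffset()

    uint32_t packed_u = pack_unary_offsets(17, 42);
    assert((packed_u >> 16)    == 17);
    assert((packed_u & 0xFFFF) == 42);
    return 0;
}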
ggml/src/ggml-vulkan/ggml-vulkan.cpp CHANGED
@@ -411,7 +411,7 @@ struct vk_op_unary_push_constants {
411
  uint32_t ne;
412
  uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
413
  uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
414
- uint32_t d_offset;
415
  float param1; float param2;
416
  uint32_t ne0_012mp; uint32_t ne0_012L;
417
  uint32_t ne0_01mp; uint32_t ne0_01L;
@@ -459,7 +459,7 @@ struct vk_op_binary_push_constants {
459
  uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
460
  uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
461
  uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
462
- uint32_t d_offset;
463
  float param1; float param2; int32_t param3;
464
  };
465
 
@@ -546,7 +546,7 @@ struct vk_staging_memcpy {
546
  };
547
 
548
  struct vk_op_upscale_push_constants {
549
- uint32_t ne; uint32_t d_offset;
550
  uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
551
  uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
552
  float sf0; float sf1; float sf2; float sf3;
@@ -5076,6 +5076,57 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
5076
  }
5077
  }
5078
 
5079
  template<typename PC>
5080
  static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
5081
  VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
@@ -5179,8 +5230,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
5179
  }
5180
 
5181
  GGML_ASSERT(d_D != nullptr);
5182
- uint64_t d_buf_offset = ((vk_tensor_offset(dst) + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
5183
- GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY); // NOLINT
5184
  if(!src0_uma) {
5185
  d_X = src0_buf_ctx->dev_buffer;
5186
  x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
@@ -5196,6 +5246,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
5196
  z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
5197
  GGML_ASSERT(d_Z != nullptr);
5198
  }
5199
 
5200
  if (op_supports_incontiguous) {
5201
  x_sz = ggml_nbytes(src0);
@@ -5383,7 +5439,6 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
5383
  const uint32_t src0_type_size = ggml_type_size(src0->type);
5384
  const uint32_t src1_type_size = ggml_type_size(src1->type);
5385
  const uint32_t dst_type_size = ggml_type_size(dst->type);
5386
- const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
5387
 
5388
  int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
5389
  int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
@@ -5395,7 +5450,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
5395
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
5396
  (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
5397
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
5398
- d_offset,
5399
  0.0f, 0.0f, offset,
5400
  }, dryrun);
5401
  }
@@ -5599,7 +5654,7 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
5599
  const float sf3 = (float)dst->ne[3] / src0->ne[3];
5600
 
5601
  ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
5602
- (uint32_t)ggml_nelements(dst), 0,
5603
  (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
5604
  (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
5605
  sf0, sf1, sf2, sf3,
@@ -5709,13 +5764,12 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co
5709
  static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
5710
  const uint32_t src0_type_size = ggml_type_size(src0->type);
5711
  const uint32_t dst_type_size = ggml_type_size(dst->type);
5712
- const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
5713
 
5714
  ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
5715
  (uint32_t)ggml_nelements(src0),
5716
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
5717
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
5718
- d_offset,
5719
  0.0f, 0.0f,
5720
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5721
  }, dryrun);
 
411
  uint32_t ne;
412
  uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
413
  uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
414
+ uint32_t misalign_offsets;
415
  float param1; float param2;
416
  uint32_t ne0_012mp; uint32_t ne0_012L;
417
  uint32_t ne0_01mp; uint32_t ne0_01L;
 
459
  uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
460
  uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
461
  uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
462
+ uint32_t misalign_offsets;
463
  float param1; float param2; int32_t param3;
464
  };
465
 
 
546
  };
547
 
548
  struct vk_op_upscale_push_constants {
549
+ uint32_t ne; uint32_t a_offset; uint32_t d_offset;
550
  uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
551
  uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
552
  float sf0; float sf1; float sf2; float sf3;
 
5076
  }
5077
  }
5078
 
5079
+ static uint32_t get_misalign_bytes(ggml_backend_vk_context * ctx, const ggml_tensor * t)
5080
+ {
5081
+ return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));
5082
+ }
5083
+
5084
+ template <typename T> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
5085
+ GGML_UNUSED(p);
5086
+ GGML_UNUSED(src0);
5087
+ GGML_UNUSED(src1);
5088
+ GGML_UNUSED(src2);
5089
+ GGML_UNUSED(dst);
5090
+ static_assert(!std::is_const<T>::value, "unexpected type");
5091
+ GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0);
5092
+ GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0);
5093
+ GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0);
5094
+ GGML_ASSERT(!dst || get_misalign_bytes(ctx, dst) == 0);
5095
+ }
5096
+
5097
+ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
5098
+ const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
5099
+ const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
5100
+
5101
+ p.misalign_offsets = (a_offset << 16) | d_offset;
5102
+
5103
+ GGML_UNUSED(src1);
5104
+ GGML_UNUSED(src2);
5105
+ }
5106
+
5107
+ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
5108
+ const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
5109
+ const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
5110
+ const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
5111
+
5112
+ GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0));
5113
+
5114
+ p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
5115
+
5116
+ GGML_UNUSED(src2);
5117
+ }
5118
+
5119
+ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
5120
+ const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
5121
+ const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
5122
+
5123
+ p.a_offset = a_offset;
5124
+ p.d_offset = d_offset;
5125
+
5126
+ GGML_UNUSED(src1);
5127
+ GGML_UNUSED(src2);
5128
+ }
5129
+
5130
  template<typename PC>
5131
  static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
5132
  VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
 
5230
  }
5231
 
5232
  GGML_ASSERT(d_D != nullptr);
5233
+ uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
 
5234
  if(!src0_uma) {
5235
  d_X = src0_buf_ctx->dev_buffer;
5236
  x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
 
5246
  z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
5247
  GGML_ASSERT(d_Z != nullptr);
5248
  }
5249
+ // Compute misalignment offset for descriptors and store it in push constants, then align the descriptor offsets.
5250
+ init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, dst);
5251
+ x_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
5252
+ y_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
5253
+ z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
5254
+ d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
5255
 
5256
  if (op_supports_incontiguous) {
5257
  x_sz = ggml_nbytes(src0);
 
5439
  const uint32_t src0_type_size = ggml_type_size(src0->type);
5440
  const uint32_t src1_type_size = ggml_type_size(src1->type);
5441
  const uint32_t dst_type_size = ggml_type_size(dst->type);
 
5442
 
5443
  int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
5444
  int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
 
5450
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
5451
  (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
5452
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
5453
+ 0,
5454
  0.0f, 0.0f, offset,
5455
  }, dryrun);
5456
  }
 
5654
  const float sf3 = (float)dst->ne[3] / src0->ne[3];
5655
 
5656
  ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
5657
+ (uint32_t)ggml_nelements(dst), 0, 0,
5658
  (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
5659
  (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
5660
  sf0, sf1, sf2, sf3,
 
5764
  static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
5765
  const uint32_t src0_type_size = ggml_type_size(src0->type);
5766
  const uint32_t dst_type_size = ggml_type_size(dst->type);
 
5767
 
5768
  ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
5769
  (uint32_t)ggml_nelements(src0),
5770
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
5771
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
5772
+ 0,
5773
  0.0f, 0.0f,
5774
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5775
  }, dryrun);
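The host-side change above boils down to: keep the full byte offset of each tensor, then split it into an aligned part consumed by the descriptor and a remainder consumed by the shader. The standalone sketch below reproduces that split under the assumption (guaranteed by Vulkan) that minStorageBufferOffsetAlignment is a power of two; split_tensor_offset is an illustrative name, not a function in this file.

#include <cassert>
#include <cstdint>

struct split_offset {
    uint64_t descriptor_offset;  // rounded down, used for the buffer binding
    uint32_t element_offset;     // remainder in elements, sent via push constants
};

// align is the device's minStorageBufferOffsetAlignment; Vulkan requires it to
// be a power of two, so masking is equivalent to rounding down / taking modulo.
static split_offset split_tensor_offset(uint64_t byte_offset, uint64_t align,
                                        uint32_t type_size) {
    split_offset s;
    s.descriptor_offset = byte_offset & ~(align - 1);
    s.element_offset    = (uint32_t)((byte_offset & (align - 1)) / type_size);
    return s;
}

int main() {
    // e.g. a float tensor view starting 16 bytes past a 256-byte boundary
    split_offset s = split_tensor_offset(4112, 256, sizeof(float));
    assert(s.descriptor_offset == 4096);  // what the descriptor now points at
    assert(s.element_offset == 4);        // what the shader adds per element index
    return 0;
}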
ggml/src/ggml-vulkan/vulkan-shaders/acc.comp CHANGED
@@ -21,9 +21,9 @@ void main() {
21
  get_indices(idx, i00, i01, i02, i03);
22
 
23
  if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
24
- data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
25
  } else {
26
- data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]));
27
  }
28
  }
29
 
 
21
  get_indices(idx, i00, i01, i02, i03);
22
 
23
  if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
24
+ data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
25
  } else {
26
+ data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]));
27
  }
28
  }
29
 
ggml/src/ggml-vulkan/vulkan-shaders/add.comp CHANGED
@@ -22,7 +22,7 @@ void main() {
22
  uint i00, i01, i02, i03;
23
  get_indices(idx, i00, i01, i02, i03);
24
 
25
- data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
26
 
27
  idx += num_threads;
28
  }
 
22
  uint i00, i01, i02, i03;
23
  get_indices(idx, i00, i01, i02, i03);
24
 
25
+ data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
26
 
27
  idx += num_threads;
28
  }
ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp CHANGED
@@ -12,6 +12,6 @@ void main() {
12
  return;
13
  }
14
 
15
- const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
16
- data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
17
  }
 
12
  return;
13
  }
14
 
15
+ const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
16
+ data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
17
  }
ggml/src/ggml-vulkan/vulkan-shaders/concat.comp CHANGED
@@ -30,12 +30,12 @@ void main() {
30
  const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
31
 
32
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
33
- data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
34
  #else
35
  if (is_src0) {
36
- data_d[p.d_offset + dst_idx] = data_a[src0_idx];
37
  } else {
38
- data_d[p.d_offset + dst_idx] = data_b[src1_idx];
39
  }
40
  #endif
41
  }
 
30
  const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
31
 
32
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
33
+ data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : data_b[get_boffset() + src1_idx]);
34
  #else
35
  if (is_src0) {
36
+ data_d[get_doffset() + dst_idx] = data_a[get_aoffset() + src0_idx];
37
  } else {
38
+ data_d[get_doffset() + dst_idx] = data_b[get_boffset() + src1_idx];
39
  }
40
  #endif
41
  }
ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp CHANGED
@@ -19,9 +19,9 @@ void main() {
19
  if (idx + (num_iter-1)*num_threads < p.ne) {
20
  [[unroll]] for (uint i = 0; i < num_iter; ++i) {
21
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
22
- data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
23
  #else
24
- data_d[p.d_offset + idx] = data_a[idx];
25
  #endif
26
  idx += num_threads;
27
  }
@@ -32,9 +32,9 @@ void main() {
32
  }
33
 
34
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
35
- data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
36
  #else
37
- data_d[p.d_offset + idx] = data_a[idx];
38
  #endif
39
  idx += num_threads;
40
  }
 
19
  if (idx + (num_iter-1)*num_threads < p.ne) {
20
  [[unroll]] for (uint i = 0; i < num_iter; ++i) {
21
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
22
+ data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
23
  #else
24
+ data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
25
  #endif
26
  idx += num_threads;
27
  }
 
32
  }
33
 
34
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
35
+ data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
36
  #else
37
+ data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
38
  #endif
39
  idx += num_threads;
40
  }
ggml/src/ggml-vulkan/vulkan-shaders/copy.comp CHANGED
@@ -13,8 +13,8 @@ void main() {
13
  }
14
 
15
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
16
- data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
17
  #else
18
- data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
19
  #endif
20
  }
 
13
  }
14
 
15
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
16
+ data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
17
  #else
18
+ data_d[get_doffset() + dst_idx(idx)] = data_a[get_aoffset() + src0_idx(idx)];
19
  #endif
20
  }
ggml/src/ggml-vulkan/vulkan-shaders/cos.comp CHANGED
@@ -12,6 +12,6 @@ void main() {
12
  return;
13
  }
14
 
15
- const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
16
- data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
17
  }
 
12
  return;
13
  }
14
 
15
+ const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
16
+ data_d[get_doffset() + dst_idx(idx)] = D_TYPE(cos(val));
17
  }
ggml/src/ggml-vulkan/vulkan-shaders/div.comp CHANGED
@@ -20,7 +20,7 @@ void main() {
20
  uint i00, i01, i02, i03;
21
  get_indices(idx, i00, i01, i02, i03);
22
 
23
- data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
24
 
25
  idx += num_threads;
26
  }
 
20
  uint i00, i01, i02, i03;
21
  get_indices(idx, i00, i01, i02, i03);
22
 
23
+ data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
24
 
25
  idx += num_threads;
26
  }
ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp CHANGED
@@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
7
  uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
8
  uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
9
  uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
10
- uint d_offset;
11
  float param1; float param2; int param3;
12
  } p;
13
 
@@ -22,6 +22,10 @@ uint get_idx() {
22
  return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
23
  }
24
 
25
  // mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1
26
  uint fastmod(uint a, uint b) {
27
  if ((b & (b-1)) == 0) {
 
7
  uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
8
  uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
9
  uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
10
+ uint misalign_offsets;
11
  float param1; float param2; int param3;
12
  } p;
13
 
 
22
  return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
23
  }
24
 
25
+ uint get_aoffset() { return p.misalign_offsets >> 16; }
26
+ uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; }
27
+ uint get_doffset() { return p.misalign_offsets & 0xFF; }
28
+
29
  // mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1
30
  uint fastmod(uint a, uint b) {
31
  if ((b & (b-1)) == 0) {
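A natural question about the generic_binary_head.comp packing is whether the 8-bit fields for the b and d offsets can overflow. They cannot: the Vulkan spec caps minStorageBufferOffsetAlignment at 256 bytes, so the misalignment is at most 255 bytes, and the element offset only shrinks further for multi-byte element types. The check below is a hypothetical standalone illustration of that bound, not code from the repository.

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
    const uint32_t max_alignment = 256;  // spec upper bound for minStorageBufferOffsetAlignment
    for (uint32_t type_size : {1u, 2u, 4u}) {
        uint32_t max_elem_offset = (max_alignment - 1) / type_size;
        assert(max_elem_offset <= 0xFF);  // fits the 8-bit get_boffset()/get_doffset() fields
    }
    return 0;
}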
ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp CHANGED
@@ -6,7 +6,7 @@ layout (push_constant) uniform parameter
6
  uint ne;
7
  uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
8
  uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
9
- uint d_offset;
10
  float param1; float param2;
11
 
12
  uint ne0_012mp; uint ne0_012L;
@@ -24,6 +24,9 @@ uint get_idx() {
24
  return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
25
  }
26
 
27
  // see init_fastdiv_values in ggml-vulkan.cpp
28
  uint fastdiv(uint n, uint mp, uint L) {
29
  uint msbs, lsbs;
 
6
  uint ne;
7
  uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
8
  uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
9
+ uint misalign_offsets;
10
  float param1; float param2;
11
 
12
  uint ne0_012mp; uint ne0_012L;
 
24
  return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
25
  }
26
 
27
+ uint get_aoffset() { return p.misalign_offsets >> 16; }
28
+ uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
29
+
30
  // see init_fastdiv_values in ggml-vulkan.cpp
31
  uint fastdiv(uint n, uint mp, uint L) {
32
  uint msbs, lsbs;
ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp CHANGED
@@ -15,10 +15,10 @@ void main() {
15
  return;
16
  }
17
 
18
- const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
19
 
20
- const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
21
- const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
22
 
23
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
24
  data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
 
15
  return;
16
  }
17
 
18
+ const uint i01 = data_b[get_boffset() + i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
19
 
20
+ const uint a_offset = get_aoffset() + i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
21
+ const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
22
 
23
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
24
  data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
ggml/src/ggml-vulkan/vulkan-shaders/mul.comp CHANGED
@@ -20,7 +20,7 @@ void main() {
20
  uint i00, i01, i02, i03;
21
  get_indices(idx, i00, i01, i02, i03);
22
 
23
- data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
24
 
25
  idx += num_threads;
26
  }
 
20
  uint i00, i01, i02, i03;
21
  get_indices(idx, i00, i01, i02, i03);
22
 
23
+ data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
24
 
25
  idx += num_threads;
26
  }
ggml/src/ggml-vulkan/vulkan-shaders/pad.comp CHANGED
@@ -24,5 +24,5 @@ void main() {
24
 
25
  const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
26
 
27
- data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
28
  }
 
24
 
25
  const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
26
 
27
+ data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
28
  }
ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp CHANGED
@@ -22,5 +22,5 @@ void main() {
22
  return;
23
  }
24
 
25
- data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]);
26
  }
 
22
  return;
23
  }
24
 
25
+ data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx_mod(idx)]);
26
  }
ggml/src/ggml-vulkan/vulkan-shaders/scale.comp CHANGED
@@ -18,7 +18,7 @@ void main() {
18
  continue;
19
  }
20
 
21
- data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1));
22
  idx += num_threads;
23
  }
24
  }
 
18
  continue;
19
  }
20
 
21
+ data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) * FLOAT_TYPE(p.param1));
22
  idx += num_threads;
23
  }
24
  }
ggml/src/ggml-vulkan/vulkan-shaders/sin.comp CHANGED
@@ -12,6 +12,6 @@ void main() {
12
  return;
13
  }
14
 
15
- const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
16
- data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
17
  }
 
12
  return;
13
  }
14
 
15
+ const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
16
+ data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sin(val));
17
  }
ggml/src/ggml-vulkan/vulkan-shaders/square.comp CHANGED
@@ -12,6 +12,6 @@ void main() {
12
  return;
13
  }
14
 
15
- const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
16
- data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val);
17
  }
 
12
  return;
13
  }
14
 
15
+ const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
16
+ data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val * val);
17
  }
ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp CHANGED
@@ -2,7 +2,7 @@
2
 
3
  layout (push_constant) uniform parameter
4
  {
5
- uint ne; uint d_offset;
6
  uint nb00; uint nb01; uint nb02; uint nb03;
7
  uint ne10; uint ne11; uint ne12; uint ne13;
8
  float sf0; float sf1; float sf2; float sf3;
@@ -32,5 +32,5 @@ void main() {
32
  const uint i02 = uint(i12 / p.sf2);
33
  const uint i03 = uint(i13 / p.sf3);
34
 
35
- data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
36
  }
 
2
 
3
  layout (push_constant) uniform parameter
4
  {
5
+ uint ne; uint a_offset; uint d_offset;
6
  uint nb00; uint nb01; uint nb02; uint nb03;
7
  uint ne10; uint ne11; uint ne12; uint ne13;
8
  float sf0; float sf1; float sf2; float sf3;
 
32
  const uint i02 = uint(i12 / p.sf2);
33
  const uint i03 = uint(i13 / p.sf3);
34
 
35
+ data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
36
  }