Commit: 04e729a
Parent: 3bf5be1
vulkan: Use push constant offset to handle misaligned descriptors (llama/10987)
Changed files:
- ggml/src/ggml-vulkan/ggml-vulkan.cpp +64 -10
- ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- ggml/src/ggml-vulkan/vulkan-shaders/add.comp +1 -1
- ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +3 -3
- ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +4 -4
- ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- ggml/src/ggml-vulkan/vulkan-shaders/div.comp +1 -1
- ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +5 -1
- ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +4 -1
- ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +3 -3
- ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +1 -1
- ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +1 -1
- ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +1 -1
- ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +2 -2
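Background for the diff below: Vulkan storage-buffer descriptors must be bound at offsets that are a multiple of the device's minStorageBufferOffsetAlignment, but ggml tensor and view offsets are not always that aligned. Before this change only the destination of CPY/ACC had a workaround (a d_offset push constant computed from the misalignment); now every descriptor offset is rounded down to the alignment and the remainder is passed to the shader as a per-tensor element offset in the push constants. The following is a standalone sketch of that arithmetic with made-up values, not the backend's actual API:

    // Illustrative sketch only (standalone C++; values are invented, not ggml code).
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint64_t min_align   = 64;   // device minStorageBufferOffsetAlignment (a power of two)
        const uint64_t type_size   = 4;    // element size, e.g. sizeof(float)
        const uint64_t tensor_offs = 4260; // tensor byte offset inside its buffer, not 64-byte aligned

        // Bind the descriptor at the aligned base below the tensor...
        const uint64_t aligned_base = tensor_offs & ~(min_align - 1);
        // ...and tell the shader how many elements to skip from that base.
        const uint32_t misalign_elems = (uint32_t)((tensor_offs & (min_align - 1)) / type_size);

        // For unary ops the commit packs the src (a) and dst (d) element offsets into one 32-bit push constant.
        const uint32_t a_off = misalign_elems, d_off = 0;
        const uint32_t misalign_offsets = (a_off << 16) | d_off;

        printf("base=%llu a_off=%u packed=0x%08x\n",
               (unsigned long long)aligned_base, a_off, misalign_offsets);
        return 0;
    }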
ggml/src/ggml-vulkan/ggml-vulkan.cpp

@@ -411,7 +411,7 @@ struct vk_op_unary_push_constants {
     uint32_t ne;
     uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
     uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
-    uint32_t d_offset;
+    uint32_t misalign_offsets;
     float param1; float param2;
     uint32_t ne0_012mp; uint32_t ne0_012L;
     uint32_t ne0_01mp; uint32_t ne0_01L;
@@ -459,7 +459,7 @@ struct vk_op_binary_push_constants {
     uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
     uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
     uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
-    uint32_t d_offset;
+    uint32_t misalign_offsets;
     float param1; float param2; int32_t param3;
 };
 
@@ -546,7 +546,7 @@ struct vk_staging_memcpy {
 };
 
 struct vk_op_upscale_push_constants {
-    uint32_t ne; uint32_t d_offset;
+    uint32_t ne; uint32_t a_offset; uint32_t d_offset;
     uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
     uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
     float sf0; float sf1; float sf2; float sf3;
@@ -5076,6 +5076,57 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     }
 }
 
+static uint32_t get_misalign_bytes(ggml_backend_vk_context * ctx, const ggml_tensor * t)
+{
+    return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));;
+}
+
+template <typename T> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+    GGML_UNUSED(p);
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src2);
+    GGML_UNUSED(dst);
+    static_assert(!std::is_const<T>::value, "unexpected type");
+    GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0);
+    GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0);
+    GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0);
+    GGML_ASSERT(!dst || get_misalign_bytes(ctx, dst) == 0);
+}
+
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
+    const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
+
+    p.misalign_offsets = (a_offset << 16) | d_offset;
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src2);
+}
+
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
+    const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
+    const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
+
+    GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0));
+
+    p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
+
+    GGML_UNUSED(src2);
+}
+
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
+    const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
+
+    p.a_offset = a_offset;
+    p.d_offset = d_offset;
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src2);
+}
+
 template<typename PC>
 static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
@@ -5179,8 +5230,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     }
 
     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset =
-    GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY); // NOLINT
+    uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     if(!src0_uma) {
         d_X = src0_buf_ctx->dev_buffer;
         x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
@@ -5196,6 +5246,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
         GGML_ASSERT(d_Z != nullptr);
     }
+    // Compute misalignment offset for descriptors and store it in in push constants, then align the descriptor offsets.
+    init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, dst);
+    x_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
+    y_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
+    z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
+    d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
 
     if (op_supports_incontiguous) {
         x_sz = ggml_nbytes(src0);
@@ -5383,7 +5439,6 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
     const uint32_t src0_type_size = ggml_type_size(src0->type);
    const uint32_t src1_type_size = ggml_type_size(src1->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
     int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
@@ -5395,7 +5450,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
         (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
         (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
-        d_offset,
+        0,
         0.0f, 0.0f, offset,
     }, dryrun);
 }
@@ -5599,7 +5654,7 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
     const float sf3 = (float)dst->ne[3] / src0->ne[3];
 
     ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
-        (uint32_t)ggml_nelements(dst), 0,
+        (uint32_t)ggml_nelements(dst), 0, 0,
         (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
         (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
         sf0, sf1, sf2, sf3,
@@ -5709,13 +5764,12 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co
 static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
         (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
-        d_offset,
+        0,
         0.0f, 0.0f,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     }, dryrun);
ggml/src/ggml-vulkan/vulkan-shaders/acc.comp

@@ -21,9 +21,9 @@ void main() {
     get_indices(idx, i00, i01, i02, i03);
 
     if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
-        data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
+        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
     } else {
-        data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]));
+        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]));
     }
 }
 
ggml/src/ggml-vulkan/vulkan-shaders/add.comp

@@ -22,7 +22,7 @@ void main() {
         uint i00, i01, i02, i03;
         get_indices(idx, i00, i01, i02, i03);
 
-        data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
+        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
 
         idx += num_threads;
     }
ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp

@@ -12,6 +12,6 @@ void main() {
         return;
     }
 
-    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
-    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
+    const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
+    data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
 }
ggml/src/ggml-vulkan/vulkan-shaders/concat.comp

@@ -30,12 +30,12 @@ void main() {
     const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
 
 #ifndef OPTIMIZATION_ERROR_WORKAROUND
-    data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
+    data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : data_b[get_boffset() + src1_idx]);
 #else
     if (is_src0) {
-        data_d[p.d_offset + dst_idx] = data_a[src0_idx];
+        data_d[get_doffset() + dst_idx] = data_a[get_aoffset() + src0_idx];
     } else {
-        data_d[p.d_offset + dst_idx] = data_b[src1_idx];
+        data_d[get_doffset() + dst_idx] = data_b[get_boffset() + src1_idx];
     }
 #endif
 }
ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp

@@ -19,9 +19,9 @@ void main() {
     if (idx + (num_iter-1)*num_threads < p.ne) {
         [[unroll]] for (uint i = 0; i < num_iter; ++i) {
 #ifndef OPTIMIZATION_ERROR_WORKAROUND
-            data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
+            data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
 #else
-            data_d[p.d_offset + idx] = data_a[idx];
+            data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
 #endif
             idx += num_threads;
         }
@@ -32,9 +32,9 @@ void main() {
             }
 
 #ifndef OPTIMIZATION_ERROR_WORKAROUND
-            data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
+            data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
 #else
-            data_d[p.d_offset + idx] = data_a[idx];
+            data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
 #endif
             idx += num_threads;
         }
ggml/src/ggml-vulkan/vulkan-shaders/copy.comp

@@ -13,8 +13,8 @@ void main() {
     }
 
 #ifndef OPTIMIZATION_ERROR_WORKAROUND
-    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
+    data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
 #else
-    data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
+    data_d[get_doffset() + dst_idx(idx)] = data_a[get_aoffset() + src0_idx(idx)];
 #endif
 }
ggml/src/ggml-vulkan/vulkan-shaders/cos.comp

@@ -12,6 +12,6 @@ void main() {
         return;
     }
 
-    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
-    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
+    const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
+    data_d[get_doffset() + dst_idx(idx)] = D_TYPE(cos(val));
 }
ggml/src/ggml-vulkan/vulkan-shaders/div.comp

@@ -20,7 +20,7 @@ void main() {
         uint i00, i01, i02, i03;
         get_indices(idx, i00, i01, i02, i03);
 
-        data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
+        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
 
         idx += num_threads;
     }
ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp

@@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
     uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
     uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
-    uint d_offset;
+    uint misalign_offsets;
     float param1; float param2; int param3;
 } p;
 
@@ -22,6 +22,10 @@ uint get_idx() {
     return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
 }
 
+uint get_aoffset() { return p.misalign_offsets >> 16; }
+uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; }
+uint get_doffset() { return p.misalign_offsets & 0xFF; }
+
 // mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1
 uint fastmod(uint a, uint b) {
     if ((b & (b-1)) == 0) {
ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp

@@ -6,7 +6,7 @@ layout (push_constant) uniform parameter
     uint ne;
     uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
-    uint d_offset;
+    uint misalign_offsets;
     float param1; float param2;
 
     uint ne0_012mp; uint ne0_012L;
@@ -24,6 +24,9 @@ uint get_idx() {
     return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
 }
 
+uint get_aoffset() { return p.misalign_offsets >> 16; }
+uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
+
 // see init_fastdiv_values in ggml-vulkan.cpp
 uint fastdiv(uint n, uint mp, uint L) {
     uint msbs, lsbs;
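Note that the two generic headers decode the packed value differently: the unary header directly above gives 16 bits each to the a and d element offsets, while the binary header (the previous file) gives 16 bits to a and 8 bits each to b and d. Below is a standalone C++ mirror of those getters, for illustration only; the actual decode lives in the GLSL headers above:

    // Plain C++ re-expression of the shader-side decode, for illustration only.
    #include <cassert>
    #include <cstdint>

    // unary layout: [ a:16 | d:16 ]
    uint32_t unary_aoffset(uint32_t m)  { return m >> 16; }
    uint32_t unary_doffset(uint32_t m)  { return m & 0xFFFF; }

    // binary layout: [ a:16 | b:8 | d:8 ]
    uint32_t binary_aoffset(uint32_t m) { return m >> 16; }
    uint32_t binary_boffset(uint32_t m) { return (m >> 8) & 0xFF; }
    uint32_t binary_doffset(uint32_t m) { return m & 0xFF; }

    int main() {
        const uint32_t packed = (3u << 16) | (2u << 8) | 1u; // a=3, b=2, d=1 elements
        assert(binary_aoffset(packed) == 3);
        assert(binary_boffset(packed) == 2);
        assert(binary_doffset(packed) == 1);
        assert(unary_aoffset((5u << 16) | 7u) == 5 && unary_doffset((5u << 16) | 7u) == 7);
        return 0;
    }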
ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp

@@ -15,10 +15,10 @@ void main() {
         return;
     }
 
-    const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
+    const uint i01 = data_b[get_boffset() + i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
 
-    const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
-    const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
+    const uint a_offset = get_aoffset() + i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
+    const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
 
 #ifndef OPTIMIZATION_ERROR_WORKAROUND
     data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
ggml/src/ggml-vulkan/vulkan-shaders/mul.comp

@@ -20,7 +20,7 @@ void main() {
         uint i00, i01, i02, i03;
         get_indices(idx, i00, i01, i02, i03);
 
-        data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
+        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
 
         idx += num_threads;
     }
ggml/src/ggml-vulkan/vulkan-shaders/pad.comp

@@ -24,5 +24,5 @@ void main() {
 
     const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
 
-    data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
+    data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
 }
ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp

@@ -22,5 +22,5 @@ void main() {
         return;
     }
 
-    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]);
+    data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx_mod(idx)]);
 }
ggml/src/ggml-vulkan/vulkan-shaders/scale.comp

@@ -18,7 +18,7 @@ void main() {
            continue;
         }
 
-        data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1));
+        data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) * FLOAT_TYPE(p.param1));
         idx += num_threads;
     }
 }
ggml/src/ggml-vulkan/vulkan-shaders/sin.comp

@@ -12,6 +12,6 @@ void main() {
         return;
     }
 
-    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
-    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
+    const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
+    data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sin(val));
 }
|
ggml/src/ggml-vulkan/vulkan-shaders/square.comp
CHANGED
|
@@ -12,6 +12,6 @@ void main() {
|
|
| 12 |
return;
|
| 13 |
}
|
| 14 |
|
| 15 |
-
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
| 16 |
-
data_d[
|
| 17 |
}
|
|
|
|
| 12 |
return;
|
| 13 |
}
|
| 14 |
|
| 15 |
+
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
| 16 |
+
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val * val);
|
| 17 |
}
|
ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp

@@ -2,7 +2,7 @@
 
 layout (push_constant) uniform parameter
 {
-    uint ne; uint d_offset;
+    uint ne; uint a_offset; uint d_offset;
     uint nb00; uint nb01; uint nb02; uint nb03;
     uint ne10; uint ne11; uint ne12; uint ne13;
     float sf0; float sf1; float sf2; float sf3;
@@ -32,5 +32,5 @@ void main() {
     const uint i02 = uint(i12 / p.sf2);
     const uint i03 = uint(i13 / p.sf3);
 
-    data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
+    data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
 }