Commit d7d82b9 by Rémy O
Parent(s): 0b52fcc

vulkan: add specific MMV kernels for IQ2 and IQ3 quants + optimizations (llama/11595)


* vulkan: implement specialized MMV kernels for IQ2 quantizations

* vulkan: add MMV kernels for IQ3 quants

* vulkan: Increase MMV batch size and unroll IQ LUT setup

* vulkan: fix init_iq_shmem for WG sizes larger than tables

* vulkan: common batch size for all I-quants
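The last two bullets are the subtle part: every I-quant shader copies its codebook lookup table from a constant array into shared memory before use, and to let [[unroll]] apply, the copy loop was rewritten with a compile-time trip count, which in turn needs a bounds guard for workgroups with more invocations than the table has entries. A minimal sketch of the resulting pattern (GLSL; `lut` and `lut_const` are illustrative names — the real per-table copies are in the types.comp hunks below):

    // strided copy of a constant LUT into shared memory; the guard is
    // needed when wgsize.x does not divide the table length, and folds
    // away at pipeline-compile time when it does
    void init_iq_shmem(uvec3 wgsize)
    {
        [[unroll]] for (uint i = 0; i < lut_const.length(); i += wgsize.x) {
            const uint idx = i + gl_LocalInvocationIndex.x;
            if (lut_const.length() % wgsize.x == 0 || idx < lut_const.length()) {
                lut[idx] = lut_const[idx];
            }
        }
        barrier(); // make the table visible to all invocations before use
    }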

ggml/src/ggml-vulkan/ggml-vulkan.cpp CHANGED
@@ -1987,6 +1987,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         }
     } else if (device->vendor_id == VK_VENDOR_ID_INTEL)
         rm_stdq = 2;
+    uint32_t rm_iq = 2 * rm_kq;
 
     for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) {
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32_"+std::to_string(i+1), mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
@@ -2001,15 +2002,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq1_s_f32_f32_len, mul_mat_vec_iq1_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq1_m_f32_f32_len, mul_mat_vec_iq1_m_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xxs_f32_f32_len, mul_mat_vec_iq2_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xs_f32_f32_len, mul_mat_vec_iq2_xs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f32_f32_len, mul_mat_vec_iq2_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f32_f32_len, mul_mat_vec_iq3_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f32_f32_len, mul_mat_vec_iq3_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_xs_f32_f32_len, mul_mat_vec_iq4_xs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq1_s_f32_f32_len, mul_mat_vec_iq1_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq1_m_f32_f32_len, mul_mat_vec_iq1_m_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xxs_f32_f32_len, mul_mat_vec_iq2_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xs_f32_f32_len, mul_mat_vec_iq2_xs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f32_f32_len, mul_mat_vec_iq2_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f32_f32_len, mul_mat_vec_iq3_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f32_f32_len, mul_mat_vec_iq3_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_xs_f32_f32_len, mul_mat_vec_iq4_xs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
 
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32_"+std::to_string(i+1), mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32_"+std::to_string(i+1), mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
@@ -2023,15 +2024,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq1_s_f16_f32_len, mul_mat_vec_iq1_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq1_m_f16_f32_len, mul_mat_vec_iq1_m_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xxs_f16_f32_len, mul_mat_vec_iq2_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xs_f16_f32_len, mul_mat_vec_iq2_xs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f16_f32_len, mul_mat_vec_iq2_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f16_f32_len, mul_mat_vec_iq3_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f16_f32_len, mul_mat_vec_iq3_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_xs_f16_f32_len, mul_mat_vec_iq4_xs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq1_s_f16_f32_len, mul_mat_vec_iq1_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq1_m_f16_f32_len, mul_mat_vec_iq1_m_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xxs_f16_f32_len, mul_mat_vec_iq2_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xs_f16_f32_len, mul_mat_vec_iq2_xs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f16_f32_len, mul_mat_vec_iq2_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f16_f32_len, mul_mat_vec_iq3_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f16_f32_len, mul_mat_vec_iq3_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_xs_f16_f32_len, mul_mat_vec_iq4_xs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
     }
 
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
@@ -2046,15 +2047,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_f32", mul_mat_vec_id_iq1_s_f32_len, mul_mat_vec_id_iq1_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_f32", mul_mat_vec_id_iq1_m_f32_len, mul_mat_vec_id_iq1_m_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XXS], "mul_mat_vec_id_iq2_xxs_f32", mul_mat_vec_id_iq2_xxs_f32_len, mul_mat_vec_id_iq2_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XS], "mul_mat_vec_id_iq2_xs_f32", mul_mat_vec_id_iq2_xs_f32_len, mul_mat_vec_id_iq2_xs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_S], "mul_mat_vec_id_iq2_s_f32", mul_mat_vec_id_iq2_s_f32_len, mul_mat_vec_id_iq2_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_XXS], "mul_mat_vec_id_iq3_xxs_f32", mul_mat_vec_id_iq3_xxs_f32_len, mul_mat_vec_id_iq3_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_S], "mul_mat_vec_id_iq3_s_f32", mul_mat_vec_id_iq3_s_f32_len, mul_mat_vec_id_iq3_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_XS], "mul_mat_vec_id_iq4_xs_f32", mul_mat_vec_id_iq4_xs_f32_len, mul_mat_vec_id_iq4_xs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_f32", mul_mat_vec_id_iq1_s_f32_len, mul_mat_vec_id_iq1_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_f32", mul_mat_vec_id_iq1_m_f32_len, mul_mat_vec_id_iq1_m_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XXS], "mul_mat_vec_id_iq2_xxs_f32", mul_mat_vec_id_iq2_xxs_f32_len, mul_mat_vec_id_iq2_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XS], "mul_mat_vec_id_iq2_xs_f32", mul_mat_vec_id_iq2_xs_f32_len, mul_mat_vec_id_iq2_xs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_S], "mul_mat_vec_id_iq2_s_f32", mul_mat_vec_id_iq2_s_f32_len, mul_mat_vec_id_iq2_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_XXS], "mul_mat_vec_id_iq3_xxs_f32", mul_mat_vec_id_iq3_xxs_f32_len, mul_mat_vec_id_iq3_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_S], "mul_mat_vec_id_iq3_s_f32", mul_mat_vec_id_iq3_s_f32_len, mul_mat_vec_id_iq3_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_XS], "mul_mat_vec_id_iq4_xs_f32", mul_mat_vec_id_iq4_xs_f32_len, mul_mat_vec_id_iq4_xs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
 
     // dequant shaders
     ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
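The net effect of these hunks: every I-quant mul_mat_vec pipeline — including the mul_mat_vec_id variants — now uses the common row-batch size rm_iq = 2 * rm_kq in its workgroup dimensions and specialization constants, where the I-quants previously borrowed the k-quant batch size rm_kq (or 2*rm_stdq for IQ4_NL). Each workgroup therefore covers twice as many output rows per dispatch for these types.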
ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp CHANGED
@@ -1,5 +1,7 @@
 #version 450
 
+#extension GL_EXT_control_flow_attributes : enable
+
 #include "types.comp"
 #include "generic_binary_head.comp"
 #include "dequant_funcs.comp"
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp ADDED
@@ -0,0 +1,90 @@
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#include "mul_mat_vec_base.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+
+void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
+    const uint y_idx = i * QUANT_K + 16 * itid;
+    const uint nibble_shift = 4 * (itid & 1);
+    const uint ib32 = itid / 2; // 0..7
+
+    uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        const float d = float(data_a[ibi].d);
+        const uint scale = (data_a[ibi].scales[ib32] >> nibble_shift) & 0xF;
+        const float db = d * (0.5 + scale) * 0.25;
+
+        const uint qh = data_a[ibi].qh[ib32];
+        const u8vec2 qs16 = unpack8(data_a_packed16[ibi].qs[itid]);
+        const u8vec2 sign16 = unpack8(data_a_packed16[ibi].qs[QUANT_K / 16 + itid]);
+        [[unroll]] for (uint l = 0; l < 2; ++l) {
+            const uint8_t sign = sign16[l];
+            const uint qs = qs16[l] | ((qh << (8 - nibble_shift - 2 * l)) & 0x300);
+            const uvec2 grid = iq2s_grid[qs];
+            const vec4 grid0 = vec4(unpack8(grid.x));
+            const vec4 grid1 = vec4(unpack8(grid.y));
+
+            [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+                vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
+                vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
+
+                FLOAT_TYPE sum =
+                    fma(FLOAT_TYPE(b0.x), FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x),
+                    fma(FLOAT_TYPE(b0.y), FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y),
+                    fma(FLOAT_TYPE(b0.z), FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z),
+                    fma(FLOAT_TYPE(b0.w), FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w),
+                    fma(FLOAT_TYPE(b4.x), FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x),
+                    fma(FLOAT_TYPE(b4.y), FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y),
+                    fma(FLOAT_TYPE(b4.z), FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z),
+                    fma(FLOAT_TYPE(b4.w), FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w),
+                    FLOAT_TYPE(0.0)))))))));
+                temp[j][n] = fma(db, sum, temp[j][n]);
+            }
+        }
+        ibi += num_blocks_per_row;
+    }
+}
+
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
+    uint a_offset, b_offset, d_offset;
+    get_offsets(a_offset, b_offset, d_offset);
+
+    const uint num_blocks_per_row = p.ncols / QUANT_K;
+
+    // 16 threads are used to process each block
+    const uint blocks_per_wg = gl_WorkGroupSize.x/16;
+    const uint tid = gl_LocalInvocationID.x;
+    const uint itid = tid % 16; // 0...15
+    const uint ix = tid / 16;
+
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+            temp[j][i] = FLOAT_TYPE(0);
+        }
+    }
+
+    [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += blocks_per_wg)
+        calc_superblock(a_offset, b_offset, itid, i, num_blocks_per_row, first_row, num_rows);
+
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    init_iq_shmem(gl_WorkGroupSize);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
+    }
+}
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp ADDED
@@ -0,0 +1,87 @@
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#include "mul_mat_vec_base.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+
+void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
+    const uint y_idx = i * QUANT_K + 16 * itid;
+    const uint nibble_shift = 4 * (itid & 1);
+    const uint ib32 = itid / 2; // 0..7
+
+    uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        const float d = float(data_a[ibi].d);
+        const uint scale = (data_a[ibi].scales[ib32] >> nibble_shift) & 0xF;
+        const float db = d * (0.5 + scale) * 0.25;
+
+        [[unroll]] for (uint l = 0; l < 2; ++l) {
+            const uint qs = data_a[ibi].qs[2 * itid + l];
+            const uint sign = qs >> 9;
+            const uint sign7 = bitCount(sign);
+            const vec4 grid0 = vec4(unpack8(iq2xs_grid[qs & 511].x));
+            const vec4 grid1 = vec4(unpack8(iq2xs_grid[qs & 511].y));
+
+            [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+                vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
+                vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
+
+                FLOAT_TYPE sum =
+                    fma(FLOAT_TYPE(b0.x), FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x),
+                    fma(FLOAT_TYPE(b0.y), FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y),
+                    fma(FLOAT_TYPE(b0.z), FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z),
+                    fma(FLOAT_TYPE(b0.w), FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w),
+                    fma(FLOAT_TYPE(b4.x), FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x),
+                    fma(FLOAT_TYPE(b4.y), FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y),
+                    fma(FLOAT_TYPE(b4.z), FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z),
+                    fma(FLOAT_TYPE(b4.w), FLOAT_TYPE((sign7 & 1) != 0 ? -grid1.w : grid1.w),
+                    FLOAT_TYPE(0.0)))))))));
+                temp[j][n] = fma(db, sum, temp[j][n]);
+            }
+        }
+        ibi += num_blocks_per_row;
+    }
+}
+
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
+    uint a_offset, b_offset, d_offset;
+    get_offsets(a_offset, b_offset, d_offset);
+
+    const uint num_blocks_per_row = p.ncols / QUANT_K;
+
+    // 16 threads are used to process each block
+    const uint blocks_per_wg = gl_WorkGroupSize.x/16;
+    const uint tid = gl_LocalInvocationID.x;
+    const uint itid = tid % 16; // 0...15
+    const uint ix = tid / 16;
+
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+            temp[j][i] = FLOAT_TYPE(0);
+        }
+    }
+
+    [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += blocks_per_wg)
+        calc_superblock(a_offset, b_offset, itid, i, num_blocks_per_row, first_row, num_rows);
+
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    init_iq_shmem(gl_WorkGroupSize);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
+    }
+}
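A non-obvious detail in this kernel: each 16-bit qs word packs a 9-bit grid index in the low bits and only 7 explicit sign bits in `sign = qs >> 9`; the 8th sign is implicit. The IQ2 codebooks choose that last bit as the parity of the stored seven, which is what `sign7 = bitCount(sign)` recovers and the `(sign7 & 1)` test on b4.w applies. As a hypothetical standalone helper (illustrative only; the shader inlines this via bitCount), the trick amounts to:

    // the implicit 8th sign equals the parity of the 7 stored sign bits
    bool eighth_value_negative(uint sign7bits) {
        return (bitCount(sign7bits) & 1) != 0;
    }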
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp ADDED
@@ -0,0 +1,87 @@
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#include "mul_mat_vec_base.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+
+void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
+    const uint y_idx = i * QUANT_K + 16 * itid;
+    const uint ib32 = itid / 2; // 0..7
+
+    uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        const float d = float(data_a[ibi].d);
+        const uint signscale = pack32(u16vec2(
+            data_a_packed16[ibi].qs[4 * ib32 + 2],
+            data_a_packed16[ibi].qs[4 * ib32 + 3]));
+        const float db = d * 0.25 * (0.5 + (signscale >> 28));
+        [[unroll]] for (uint l = 0; l < 2; ++l) {
+            const uint qs = data_a[ibi].qs[8 * ib32 + 2 * (itid & 1) + l];
+            const uint sign = bitfieldExtract(signscale, 7 * int(2 * (itid & 1) + l), 7);
+            const uint sign7 = bitCount(sign);
+            const vec4 grid0 = vec4(unpack8(iq2xxs_grid[qs].x));
+            const vec4 grid1 = vec4(unpack8(iq2xxs_grid[qs].y));
+
+            [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+                const vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
+                const vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
+
+                FLOAT_TYPE sum =
+                    fma(FLOAT_TYPE(b0.x), FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x),
+                    fma(FLOAT_TYPE(b0.y), FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y),
+                    fma(FLOAT_TYPE(b0.z), FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z),
+                    fma(FLOAT_TYPE(b0.w), FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w),
+                    fma(FLOAT_TYPE(b4.x), FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x),
+                    fma(FLOAT_TYPE(b4.y), FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y),
+                    fma(FLOAT_TYPE(b4.z), FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z),
+                    fma(FLOAT_TYPE(b4.w), FLOAT_TYPE((sign7 & 1) != 0 ? -grid1.w : grid1.w),
+                    FLOAT_TYPE(0.0)))))))));
+                temp[j][n] = fma(db, sum, temp[j][n]);
+            }
+        }
+        ibi += num_blocks_per_row;
+    }
+}
+
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
+    uint a_offset, b_offset, d_offset;
+    get_offsets(a_offset, b_offset, d_offset);
+
+    const uint num_blocks_per_row = p.ncols / QUANT_K;
+
+    // 16 threads are used to process each block
+    const uint blocks_per_wg = gl_WorkGroupSize.x/16;
+    const uint tid = gl_LocalInvocationID.x;
+    const uint itid = tid % 16; // 0...15
+    const uint ix = tid / 16;
+
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+            temp[j][i] = FLOAT_TYPE(0);
+        }
+    }
+
+    [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += blocks_per_wg)
+        calc_superblock(a_offset, b_offset, itid, i, num_blocks_per_row, first_row, num_rows);
+
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    init_iq_shmem(gl_WorkGroupSize);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
+    }
+}
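For IQ2_XXS the per-32-value metadata is a single 32-bit word, reassembled here from two 16-bit halves of the qs array: the top 4 bits are the block scale — hence db = d * 0.25 * (0.5 + (signscale >> 28)) — and the low 28 bits hold four 7-bit sign groups, one per 8 values, pulled out with bitfieldExtract(signscale, 7 * group, 7). The implicit 8th sign per group is again the parity of the stored seven, as in the IQ2_XS kernel above.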
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp ADDED
@@ -0,0 +1,90 @@
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#include "mul_mat_vec_base.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+
+void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
+    const uint y_idx = i * QUANT_K + 32 * ib32;
+
+    uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        const float d = float(data_a[ibi].d);
+        const uint scale = (data_a[ibi].scales[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
+        const float dscale = d * (1 + 2 * scale);
+        const uint qh = data_a[ibi].qh[ib32];
+        FLOAT_TYPE sum[NUM_COLS];
+        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+            sum[j] = 0.0;
+        }
+        [[unroll]] for (uint l = 0; l < 4; ++l) {
+            const u8vec2 qs = unpack8(data_a_packed16[ibi].qs[4 * ib32 + l]);
+            const uint sign = data_a[ibi].signs[4 * ib32 + l];
+            const vec4 grid0 = vec4(unpack8(iq3s_grid[qs.x | ((qh << (8 - 2*l)) & 0x100)]));
+            const vec4 grid1 = vec4(unpack8(iq3s_grid[qs.y | ((qh << (7 - 2*l)) & 0x100)]));
+
+            [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+                const vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
+                const vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
+
+                sum[j] =
+                    fma(FLOAT_TYPE(b0.x), FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x),
+                    fma(FLOAT_TYPE(b0.y), FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y),
+                    fma(FLOAT_TYPE(b0.z), FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z),
+                    fma(FLOAT_TYPE(b0.w), FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w),
+                    fma(FLOAT_TYPE(b4.x), FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x),
+                    fma(FLOAT_TYPE(b4.y), FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y),
+                    fma(FLOAT_TYPE(b4.z), FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z),
+                    fma(FLOAT_TYPE(b4.w), FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w),
+                    sum[j]))))))));
+            }
+        }
+        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+            temp[j][n] = fma(dscale, sum[j], temp[j][n]);
+        }
+        ibi += num_blocks_per_row;
+    }
+}
+
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
+    uint a_offset, b_offset, d_offset;
+    get_offsets(a_offset, b_offset, d_offset);
+
+    const uint num_blocks_per_row = p.ncols / QUANT_K;
+
+    // 8 threads are used to process each block
+    const uint blocks_per_wg = gl_WorkGroupSize.x/8;
+    const uint tid = gl_LocalInvocationID.x;
+    const uint itid = tid % 8; // 0...7
+    const uint ix = tid / 8;
+
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+            temp[j][i] = FLOAT_TYPE(0);
+        }
+    }
+
+    [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += blocks_per_wg)
+        calc_superblock(a_offset, b_offset, itid, i, num_blocks_per_row, first_row, num_rows);
+
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    init_iq_shmem(gl_WorkGroupSize);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
+    }
+}
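IQ3_S indexes a 512-entry grid, so each codeword needs 9 bits: the low 8 come from a qs byte and the 9th is spliced in from the per-group qh byte via `(qh << (8 - 2*l)) & 0x100`. Its signs are stored as full bytes (one explicit bit per value), so no parity reconstruction is needed, and the kernel runs 8 threads per superblock — one per 32-value group — where the other new kernels use 16.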
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp ADDED
@@ -0,0 +1,88 @@
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#include "mul_mat_vec_base.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+
+void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
+    const uint y_idx = i * QUANT_K + 16 * itid;
+    const uint ib32 = itid / 2; // 0..7
+
+    uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        const float d = float(data_a[ibi].d);
+        const uint signscale = pack32(u16vec2(
+            data_a_packed16[ibi].qs[QUANT_K / 8 + 2 * ib32],
+            data_a_packed16[ibi].qs[QUANT_K / 8 + 2 * ib32 + 1]));
+        const float db = d * 0.5 * (0.5 + (signscale >> 28));
+        [[unroll]] for (uint l = 0; l < 2; ++l) {
+            const uint qs0 = data_a[ibi].qs[8 * ib32 + 4 * (itid & 1) + 2 * l];
+            const uint qs1 = data_a[ibi].qs[8 * ib32 + 4 * (itid & 1) + 2 * l + 1];
+            const uint sign = bitfieldExtract(signscale, 7 * int(2 * (itid & 1) + l), 7);
+            const uint sign7 = bitCount(sign);
+            const vec4 grid0 = vec4(unpack8(iq3xxs_grid[qs0]));
+            const vec4 grid1 = vec4(unpack8(iq3xxs_grid[qs1]));
+
+            [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+                const vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
+                const vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
+
+                FLOAT_TYPE sum =
+                    fma(FLOAT_TYPE(b0.x), FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x),
+                    fma(FLOAT_TYPE(b0.y), FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y),
+                    fma(FLOAT_TYPE(b0.z), FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z),
+                    fma(FLOAT_TYPE(b0.w), FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w),
+                    fma(FLOAT_TYPE(b4.x), FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x),
+                    fma(FLOAT_TYPE(b4.y), FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y),
+                    fma(FLOAT_TYPE(b4.z), FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z),
+                    fma(FLOAT_TYPE(b4.w), FLOAT_TYPE((sign7 & 1) != 0 ? -grid1.w : grid1.w),
+                    FLOAT_TYPE(0.0)))))))));
+                temp[j][n] = fma(db, sum, temp[j][n]);
+            }
+        }
+        ibi += num_blocks_per_row;
+    }
+}
+
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
+    uint a_offset, b_offset, d_offset;
+    get_offsets(a_offset, b_offset, d_offset);
+
+    const uint num_blocks_per_row = p.ncols / QUANT_K;
+
+    // 16 threads are used to process each block
+    const uint blocks_per_wg = gl_WorkGroupSize.x/16;
+    const uint tid = gl_LocalInvocationID.x;
+    const uint itid = tid % 16; // 0...15
+    const uint ix = tid / 16;
+
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+            temp[j][i] = FLOAT_TYPE(0);
+        }
+    }
+
+    [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += blocks_per_wg)
+        calc_superblock(a_offset, b_offset, itid, i, num_blocks_per_row, first_row, num_rows);
+
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    init_iq_shmem(gl_WorkGroupSize);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
+    }
+}
ggml/src/ggml-vulkan/vulkan-shaders/types.comp CHANGED
@@ -466,10 +466,13 @@ shared uint16_t iq1s_grid[2048];
 void init_iq_shmem(uvec3 wgsize)
 {
     // copy the table into shared memory and sync
-    for (uint i = gl_LocalInvocationIndex.x; i < iq1s_grid_const.length(); i += wgsize.x) {
-        u16vec2 g = unpack16(iq1s_grid_const[i]);
-        iq1s_grid[2*i+0] = g.x;
-        iq1s_grid[2*i+1] = g.y;
+    [[unroll]] for (uint i = 0; i < iq1s_grid_const.length(); i += wgsize.x) {
+        uint idx = i + gl_LocalInvocationIndex.x;
+        if (iq1s_grid_const.length() % wgsize.x == 0 || idx < iq1s_grid_const.length()) {
+            u16vec2 g = unpack16(iq1s_grid_const[idx]);
+            iq1s_grid[2*idx+0] = g.x;
+            iq1s_grid[2*idx+1] = g.y;
+        }
     }
     barrier();
 }
@@ -565,8 +568,10 @@ shared uvec2 iq2xxs_grid[256];
 void init_iq_shmem(uvec3 wgsize)
 {
     // copy the table into shared memory and sync
-    for (uint i = gl_LocalInvocationIndex.x; i < iq2xxs_grid.length(); i += wgsize.x) {
-        iq2xxs_grid[i] = iq2xxs_grid_const[i];
+    [[unroll]] for (uint i = 0; i < iq2xxs_grid.length(); i += wgsize.x) {
+        if (iq2xxs_grid_const.length() % wgsize.x == 0 || i + gl_LocalInvocationIndex.x < iq2xxs_grid_const.length()) {
+            iq2xxs_grid[i + gl_LocalInvocationIndex.x] = iq2xxs_grid_const[i + gl_LocalInvocationIndex.x];
+        }
     }
     barrier();
 }
@@ -733,8 +738,10 @@ shared uvec2 iq2xs_grid[512];
 void init_iq_shmem(uvec3 wgsize)
 {
     // copy the table into shared memory and sync
-    for (uint i = gl_LocalInvocationIndex.x; i < iq2xs_grid.length(); i += wgsize.x) {
-        iq2xs_grid[i] = iq2xs_grid_const[i];
+    [[unroll]] for (uint i = 0; i < iq2xs_grid.length(); i += wgsize.x) {
+        if (iq2xs_grid.length() % wgsize.x == 0 || i + gl_LocalInvocationIndex.x < iq2xs_grid_const.length()) {
+            iq2xs_grid[i + gl_LocalInvocationIndex.x] = iq2xs_grid_const[i + gl_LocalInvocationIndex.x];
+        }
     }
     barrier();
 }
@@ -756,6 +763,14 @@ struct block_iq2_s
     uint8_t scales[QUANT_K_IQ2_S/32];
 };
 
+struct block_iq2_s_packed16
+{
+    float16_t d;
+    uint16_t qs[QUANT_K_IQ2_S/8];
+    uint16_t qh[QUANT_K_IQ2_S/64];
+    uint16_t scales[QUANT_K_IQ2_S/64];
+};
+
 #if defined(DATA_A_IQ2_S)
 
 const uvec2 iq2s_grid_const[1024] = {
@@ -1023,8 +1038,10 @@ shared uvec2 iq2s_grid[1024];
 void init_iq_shmem(uvec3 wgsize)
 {
     // copy the table into shared memory and sync
-    for (uint i = gl_LocalInvocationIndex.x; i < iq2s_grid.length(); i += wgsize.x) {
-        iq2s_grid[i] = iq2s_grid_const[i];
+    [[unroll]] for (uint i = 0; i < iq2s_grid.length(); i += wgsize.x) {
+        if (iq2s_grid.length() % wgsize.x == 0 || i + gl_LocalInvocationIndex.x < iq2s_grid_const.length()) {
+            iq2s_grid[i + gl_LocalInvocationIndex.x] = iq2s_grid_const[i + gl_LocalInvocationIndex.x];
+        }
     }
     barrier();
 }
@@ -1032,6 +1049,7 @@ void init_iq_shmem(uvec3 wgsize)
 #define QUANT_K QUANT_K_IQ2_S
 #define QUANT_R QUANT_R_IQ2_S
 #define A_TYPE block_iq2_s
+#define A_TYPE_PACKED16 block_iq2_s_packed16
 #endif
 
 #define QUANT_K_IQ3_XXS 256
@@ -1092,8 +1110,10 @@ shared uint32_t iq3xxs_grid[256];
 void init_iq_shmem(uvec3 wgsize)
 {
     // copy the table into shared memory and sync
-    for (uint i = gl_LocalInvocationIndex.x; i < iq3xxs_grid.length(); i += wgsize.x) {
-        iq3xxs_grid[i] = iq3xxs_grid_const[i];
+    [[unroll]] for (uint i = 0; i < iq3xxs_grid.length(); i += wgsize.x) {
+        if (iq3xxs_grid.length() % wgsize.x == 0 || i + gl_LocalInvocationIndex.x < iq3xxs_grid.length()) {
+            iq3xxs_grid[i + gl_LocalInvocationIndex.x] = iq3xxs_grid_const[i + gl_LocalInvocationIndex.x];
+        }
     }
     barrier();
 }
@@ -1200,8 +1220,10 @@ shared uint32_t iq3s_grid[512];
 void init_iq_shmem(uvec3 wgsize)
 {
     // copy the table into shared memory and sync
-    for (uint i = gl_LocalInvocationIndex.x; i < iq3s_grid.length(); i += wgsize.x) {
-        iq3s_grid[i] = iq3s_grid_const[i];
+    [[unroll]] for (uint i = 0; i < iq3s_grid.length(); i += wgsize.x) {
+        if (iq3s_grid.length() % wgsize.x == 0 || i + gl_LocalInvocationIndex.x < iq3s_grid.length()) {
+            iq3s_grid[i + gl_LocalInvocationIndex.x] = iq3s_grid_const[i + gl_LocalInvocationIndex.x];
+        }
    }
     barrier();
 }
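The guard in these copy loops is written so it is free in the common case: the table length is a compile-time constant and wgsize.x arrives via gl_WorkGroupSize, whose x component is a specialization constant here (local_size_x_id = 0), so `length() % wgsize.x == 0` should fold when the pipeline is compiled. Workgroup sizes that divide the table length get a branch-free, fully unrolled copy; only sizes larger than (or not dividing) the table pay for the per-element bounds check.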
ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp CHANGED
@@ -402,7 +402,7 @@ void process_shaders() {
     for (const auto& tname : type_names) {
         // mul mat vec
        std::string data_a_key = "DATA_A_" + to_uppercase(tname);
-        std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
+        std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
 
         string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
         string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));