Spaces:
Running
Running
bench : improve benchmarks
Browse files
- extra/bench-all.sh +33 -15
- whisper.cpp +57 -33
extra/bench-all.sh
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
|
| 3 |
# Helper script to run the bench tool on all models and print the results in share-able format
|
| 4 |
|
| 5 |
-
printf "Usage: ./bench.sh [n_threads]\n"
|
| 6 |
|
| 7 |
if [ -z "$1" ]; then
|
| 8 |
n_threads=4
|
|
@@ -10,24 +10,39 @@ else
|
|
| 10 |
n_threads=$1
|
| 11 |
fi
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
|
| 22 |
-
printf "Running ggml_mul_mat benchmark with $n_threads threads\n"
|
| 23 |
-
printf "\n"
|
| 24 |
|
| 25 |
-
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
printf "
|
| 30 |
-
printf "\n"
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
printf "| CPU | OS | Config | Model | Th | Load | Enc. | Commit |\n"
|
| 33 |
printf "| --- | -- | ------ | ----- | -- | ---- | ---- | ------ |\n"
|
|
@@ -39,6 +54,7 @@ for model in "${models[@]}"; do
|
|
| 39 |
# actual run
|
| 40 |
# store stderr output in a variable in order to parse it later
|
| 41 |
output=$(./bench -m ./models/ggml-$model.bin -t $n_threads 2>&1)
|
|
|
|
| 42 |
|
| 43 |
# parse the output:
|
| 44 |
load_time=$(echo "$output" | grep "load time" | awk '{print $5}')
|
|
@@ -70,5 +86,7 @@ for model in "${models[@]}"; do
|
|
| 70 |
|
| 71 |
commit=$(git rev-parse --short HEAD)
|
| 72 |
|
| 73 |
-
|
|
|
|
|
|
|
| 74 |
done
|
|
|
|
| 2 |
|
| 3 |
# Helper script to run the bench tool on all models and print the results in share-able format
|
| 4 |
|
| 5 |
+
printf "Usage: ./bench.sh [n_threads] [encoder-only]\n"
|
| 6 |
|
| 7 |
if [ -z "$1" ]; then
|
| 8 |
n_threads=4
|
|
|
|
| 10 |
n_threads=$1
|
| 11 |
fi
|
| 12 |
|
| 13 |
+
encoder_only=0
|
| 14 |
+
if [ -z "$2" ]; then
|
| 15 |
+
encoder_only=0
|
| 16 |
+
else
|
| 17 |
+
encoder_only=$2
|
| 18 |
+
fi
|
| 19 |
|
| 20 |
+
models=( \
|
| 21 |
+
"tiny" "tiny-q5_0" "tiny-q5_1" "tiny-q8_0" \
|
| 22 |
+
"base" "base-q5_0" "base-q5_1" "base-q8_0" \
|
| 23 |
+
"small" "small-q5_0" "small-q5_1" "small-q8_0" \
|
| 24 |
+
"medium" "medium-q5_0" "medium-q5_1" "medium-q8_0" \
|
| 25 |
+
"large" "large-q5_0" "large-q5_1" "large-q8_0" \
|
| 26 |
+
)
|
| 27 |
|
| 28 |
+
if [ "$encoder_only" -eq 0 ]; then
|
| 29 |
+
printf "\n"
|
| 30 |
+
printf "Running memcpy benchmark\n"
|
| 31 |
+
printf "\n"
|
| 32 |
|
| 33 |
+
./bench -w 1 -t $n_threads 2>&1
|
|
|
|
|
|
|
| 34 |
|
| 35 |
+
printf "\n"
|
| 36 |
+
printf "Running ggml_mul_mat benchmark with $n_threads threads\n"
|
| 37 |
+
printf "\n"
|
| 38 |
|
| 39 |
+
./bench -w 2 -t $n_threads 2>&1
|
| 40 |
+
|
| 41 |
+
printf "\n"
|
| 42 |
+
printf "Running benchmark for all models\n"
|
| 43 |
+
printf "This can take a while!\n"
|
| 44 |
+
printf "\n"
|
| 45 |
+
fi
|
| 46 |
|
| 47 |
printf "| CPU | OS | Config | Model | Th | Load | Enc. | Commit |\n"
|
| 48 |
printf "| --- | -- | ------ | ----- | -- | ---- | ---- | ------ |\n"
|
|
|
|
| 54 |
# actual run
|
| 55 |
# store stderr output in a variable in order to parse it later
|
| 56 |
output=$(./bench -m ./models/ggml-$model.bin -t $n_threads 2>&1)
|
| 57 |
+
ret=$?
|
| 58 |
|
| 59 |
# parse the output:
|
| 60 |
load_time=$(echo "$output" | grep "load time" | awk '{print $5}')
|
|
|
|
| 86 |
|
| 87 |
commit=$(git rev-parse --short HEAD)
|
| 88 |
|
| 89 |
+
if [ $ret -eq 0 ]; then
|
| 90 |
+
printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
|
| 91 |
+
fi
|
| 92 |
done
|
whisper.cpp
CHANGED
|
@@ -4827,49 +4827,51 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
|
|
| 4827 |
|
| 4828 |
ggml_time_init();
|
| 4829 |
|
| 4830 |
-
size_t n =
|
| 4831 |
-
size_t arr = n_threads > 0 ?
|
| 4832 |
|
| 4833 |
-
//
|
| 4834 |
const size_t size = arr*1024llu*1024llu;
|
| 4835 |
|
| 4836 |
-
|
| 4837 |
-
|
|
|
|
|
|
|
| 4838 |
|
| 4839 |
-
|
| 4840 |
|
| 4841 |
-
|
| 4842 |
|
| 4843 |
-
|
|
|
|
| 4844 |
|
| 4845 |
-
|
| 4846 |
-
|
| 4847 |
|
| 4848 |
-
|
| 4849 |
|
| 4850 |
-
|
| 4851 |
|
| 4852 |
-
|
| 4853 |
|
| 4854 |
-
|
| 4855 |
-
|
| 4856 |
|
| 4857 |
-
|
| 4858 |
-
|
| 4859 |
|
| 4860 |
-
|
| 4861 |
-
|
| 4862 |
-
|
| 4863 |
|
| 4864 |
-
|
|
|
|
|
|
|
| 4865 |
|
| 4866 |
-
|
| 4867 |
-
|
| 4868 |
}
|
| 4869 |
|
| 4870 |
-
free(src);
|
| 4871 |
-
free(dst);
|
| 4872 |
-
|
| 4873 |
return s.c_str();
|
| 4874 |
}
|
| 4875 |
|
|
@@ -4905,26 +4907,37 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
| 4905 |
for (int j = 0; j < (int) sizes.size(); j++) {
|
| 4906 |
int n_q4_0 = 0;
|
| 4907 |
int n_q4_1 = 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4908 |
int n_fp16 = 0;
|
| 4909 |
int n_fp32 = 0;
|
| 4910 |
|
| 4911 |
// GFLOPS/s
|
| 4912 |
double s_q4_0 = 0.0;
|
| 4913 |
double s_q4_1 = 0.0;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4914 |
double s_fp16 = 0.0;
|
| 4915 |
double s_fp32 = 0.0;
|
| 4916 |
|
| 4917 |
const size_t N = sizes[j];
|
| 4918 |
|
| 4919 |
-
for (int k = 0; k <
|
| 4920 |
const ggml_type wtype =
|
| 4921 |
k == 0 ? GGML_TYPE_Q4_0 :
|
| 4922 |
k == 1 ? GGML_TYPE_Q4_1 :
|
| 4923 |
-
k == 2 ?
|
| 4924 |
-
|
|
|
|
|
|
|
|
|
|
| 4925 |
|
| 4926 |
-
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_fp16 : s_fp32;
|
| 4927 |
-
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_fp16 : n_fp32;
|
| 4928 |
|
| 4929 |
struct ggml_init_params gparams = {
|
| 4930 |
/*.mem_size =*/ buf.size(),
|
|
@@ -4968,8 +4981,19 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
| 4968 |
s = ((2.0*N*N*N*n)/tsum)*1e-9;
|
| 4969 |
}
|
| 4970 |
|
| 4971 |
-
|
| 4972 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4973 |
s += strbuf;
|
| 4974 |
}
|
| 4975 |
|
|
|
|
| 4827 |
|
| 4828 |
ggml_time_init();
|
| 4829 |
|
| 4830 |
+
size_t n = 20;
|
| 4831 |
+
size_t arr = n_threads > 0 ? 1024llu : n_threads; // trick to avoid compiler optimizations
|
| 4832 |
|
| 4833 |
+
// 1GB MB array
|
| 4834 |
const size_t size = arr*1024llu*1024llu;
|
| 4835 |
|
| 4836 |
+
// single-thread
|
| 4837 |
+
{
|
| 4838 |
+
char * src = (char *) malloc(size);
|
| 4839 |
+
char * dst = (char *) malloc(size);
|
| 4840 |
|
| 4841 |
+
for (size_t i = 0; i < size; i++) src[i] = i;
|
| 4842 |
|
| 4843 |
+
memcpy(dst, src, size); // heat-up
|
| 4844 |
|
| 4845 |
+
double tsum = 0.0;
|
| 4846 |
+
double sum = 0.0;
|
| 4847 |
|
| 4848 |
+
for (size_t i = 0; i < n; i++) {
|
| 4849 |
+
const int64_t t0 = ggml_time_us();
|
| 4850 |
|
| 4851 |
+
memcpy(dst, src, size);
|
| 4852 |
|
| 4853 |
+
const int64_t t1 = ggml_time_us();
|
| 4854 |
|
| 4855 |
+
tsum += (t1 - t0)*1e-6;
|
| 4856 |
|
| 4857 |
+
src[rand() % size] = rand() % 256;
|
| 4858 |
+
}
|
| 4859 |
|
| 4860 |
+
snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
|
| 4861 |
+
s += strbuf;
|
| 4862 |
|
| 4863 |
+
// needed to prevent the compiler from optimizing the memcpy away
|
| 4864 |
+
{
|
| 4865 |
+
for (size_t i = 0; i < size; i++) sum += dst[i];
|
| 4866 |
|
| 4867 |
+
snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum);
|
| 4868 |
+
s += strbuf;
|
| 4869 |
+
}
|
| 4870 |
|
| 4871 |
+
free(src);
|
| 4872 |
+
free(dst);
|
| 4873 |
}
|
| 4874 |
|
|
|
|
|
|
|
|
|
|
| 4875 |
return s.c_str();
|
| 4876 |
}
|
| 4877 |
|
|
|
|
| 4907 |
for (int j = 0; j < (int) sizes.size(); j++) {
|
| 4908 |
int n_q4_0 = 0;
|
| 4909 |
int n_q4_1 = 0;
|
| 4910 |
+
int n_q4_2 = 0;
|
| 4911 |
+
int n_q5_0 = 0;
|
| 4912 |
+
int n_q5_1 = 0;
|
| 4913 |
+
int n_q8_0 = 0;
|
| 4914 |
int n_fp16 = 0;
|
| 4915 |
int n_fp32 = 0;
|
| 4916 |
|
| 4917 |
// GFLOPS/s
|
| 4918 |
double s_q4_0 = 0.0;
|
| 4919 |
double s_q4_1 = 0.0;
|
| 4920 |
+
double s_q4_2 = 0.0;
|
| 4921 |
+
double s_q5_0 = 0.0;
|
| 4922 |
+
double s_q5_1 = 0.0;
|
| 4923 |
+
double s_q8_0 = 0.0;
|
| 4924 |
double s_fp16 = 0.0;
|
| 4925 |
double s_fp32 = 0.0;
|
| 4926 |
|
| 4927 |
const size_t N = sizes[j];
|
| 4928 |
|
| 4929 |
+
for (int k = 0; k < 8; ++k) {
|
| 4930 |
const ggml_type wtype =
|
| 4931 |
k == 0 ? GGML_TYPE_Q4_0 :
|
| 4932 |
k == 1 ? GGML_TYPE_Q4_1 :
|
| 4933 |
+
k == 2 ? GGML_TYPE_Q4_2 :
|
| 4934 |
+
k == 3 ? GGML_TYPE_Q5_0 :
|
| 4935 |
+
k == 4 ? GGML_TYPE_Q5_1 :
|
| 4936 |
+
k == 5 ? GGML_TYPE_Q8_0 :
|
| 4937 |
+
k == 6 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
| 4938 |
|
| 4939 |
+
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q4_2 : k == 3 ? s_q5_0 : k == 4 ? s_q5_1 : k == 5 ? s_q8_0 : k == 6 ? s_fp16 : /*k == 7*/ s_fp32;
|
| 4940 |
+
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q4_2 : k == 3 ? n_q5_0 : k == 4 ? n_q5_1 : k == 5 ? n_q8_0 : k == 6 ? n_fp16 : /*k == 7*/ n_fp32;
|
| 4941 |
|
| 4942 |
struct ggml_init_params gparams = {
|
| 4943 |
/*.mem_size =*/ buf.size(),
|
|
|
|
| 4981 |
s = ((2.0*N*N*N*n)/tsum)*1e-9;
|
| 4982 |
}
|
| 4983 |
|
| 4984 |
+
// Q4_0 | Q4_1 | Q4_2
|
| 4985 |
+
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs) | Q4_2 %7.1f GFLOPS (%3d runs)\n",
|
| 4986 |
+
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_q4_2, n_q4_2);
|
| 4987 |
+
s += strbuf;
|
| 4988 |
+
|
| 4989 |
+
// Q5_0 | Q5_1 | Q8_0
|
| 4990 |
+
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q5_0 %7.1f GFLOPS (%3d runs) | Q5_1 %7.1f GFLOPS (%3d runs) | Q8_0 %7.1f GFLOPS (%3d runs)\n",
|
| 4991 |
+
N, N, s_q5_0, n_q5_0, s_q5_1, n_q5_1, s_q8_0, n_q8_0);
|
| 4992 |
+
s += strbuf;
|
| 4993 |
+
|
| 4994 |
+
// F16 | F32
|
| 4995 |
+
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: F16 %7.1f GFLOPS (%3d runs) | F32 %7.1f GFLOPS (%3d runs)\n",
|
| 4996 |
+
N, N, s_fp16, n_fp16, s_fp32, n_fp32);
|
| 4997 |
s += strbuf;
|
| 4998 |
}
|
| 4999 |
|