ggerganov committed on
Commit
d75ae65
·
unverified ·
1 Parent(s): 648dff8

bench : improve benchmarks

Browse files
Files changed (2) hide show
  1. extra/bench-all.sh +33 -15
  2. whisper.cpp +57 -33
extra/bench-all.sh CHANGED
@@ -2,7 +2,7 @@
2
 
3
  # Helper script to run the bench tool on all models and print the results in share-able format
4
 
5
- printf "Usage: ./bench.sh [n_threads]\n"
6
 
7
  if [ -z "$1" ]; then
8
  n_threads=4
@@ -10,24 +10,39 @@ else
10
  n_threads=$1
11
  fi
12
 
13
- models=( "tiny" "base" "small" "medium" "large" )
 
 
 
 
 
14
 
15
- printf "\n"
16
- printf "Running memcpy benchmark with 1 thread\n"
17
- printf "\n"
 
 
 
 
18
 
19
- ./bench -w 1 -t 1 2>&1
 
 
 
20
 
21
- printf "\n"
22
- printf "Running ggml_mul_mat benchmark with $n_threads threads\n"
23
- printf "\n"
24
 
25
- ./bench -w 2 -t $n_threads 2>&1
 
 
26
 
27
- printf "\n"
28
- printf "Running benchmark for all models\n"
29
- printf "This can take a while!\n"
30
- printf "\n"
 
 
 
31
 
32
  printf "| CPU | OS | Config | Model | Th | Load | Enc. | Commit |\n"
33
  printf "| --- | -- | ------ | ----- | -- | ---- | ---- | ------ |\n"
@@ -39,6 +54,7 @@ for model in "${models[@]}"; do
39
  # actual run
40
  # store stderr output in a variable in order to parse it later
41
  output=$(./bench -m ./models/ggml-$model.bin -t $n_threads 2>&1)
 
42
 
43
  # parse the output:
44
  load_time=$(echo "$output" | grep "load time" | awk '{print $5}')
@@ -70,5 +86,7 @@ for model in "${models[@]}"; do
70
 
71
  commit=$(git rev-parse --short HEAD)
72
 
73
- printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
 
 
74
  done
 
2
 
3
  # Helper script to run the bench tool on all models and print the results in share-able format
4
 
5
+ printf "Usage: ./bench.sh [n_threads] [encoder-only]\n"
6
 
7
  if [ -z "$1" ]; then
8
  n_threads=4
 
10
  n_threads=$1
11
  fi
12
 
13
+ encoder_only=0
14
+ if [ -z "$2" ]; then
15
+ encoder_only=0
16
+ else
17
+ encoder_only=$2
18
+ fi
19
 
20
+ models=( \
21
+ "tiny" "tiny-q5_0" "tiny-q5_1" "tiny-q8_0" \
22
+ "base" "base-q5_0" "base-q5_1" "base-q8_0" \
23
+ "small" "small-q5_0" "small-q5_1" "small-q8_0" \
24
+ "medium" "medium-q5_0" "medium-q5_1" "medium-q8_0" \
25
+ "large" "large-q5_0" "large-q5_1" "large-q8_0" \
26
+ )
27
 
28
+ if [ "$encoder_only" -eq 0 ]; then
29
+ printf "\n"
30
+ printf "Running memcpy benchmark\n"
31
+ printf "\n"
32
 
33
+ ./bench -w 1 -t $n_threads 2>&1
 
 
34
 
35
+ printf "\n"
36
+ printf "Running ggml_mul_mat benchmark with $n_threads threads\n"
37
+ printf "\n"
38
 
39
+ ./bench -w 2 -t $n_threads 2>&1
40
+
41
+ printf "\n"
42
+ printf "Running benchmark for all models\n"
43
+ printf "This can take a while!\n"
44
+ printf "\n"
45
+ fi
46
 
47
  printf "| CPU | OS | Config | Model | Th | Load | Enc. | Commit |\n"
48
  printf "| --- | -- | ------ | ----- | -- | ---- | ---- | ------ |\n"
 
54
  # actual run
55
  # store stderr output in a variable in order to parse it later
56
  output=$(./bench -m ./models/ggml-$model.bin -t $n_threads 2>&1)
57
+ ret=$?
58
 
59
  # parse the output:
60
  load_time=$(echo "$output" | grep "load time" | awk '{print $5}')
 
86
 
87
  commit=$(git rev-parse --short HEAD)
88
 
89
+ if [ $ret -eq 0 ]; then
90
+ printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
91
+ fi
92
  done
whisper.cpp CHANGED
@@ -4827,49 +4827,51 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
4827
 
4828
  ggml_time_init();
4829
 
4830
- size_t n = 50;
4831
- size_t arr = n_threads > 0 ? 1024 : n_threads; // trick to avoid compiler optimizations
4832
 
4833
- // 1 GB array
4834
  const size_t size = arr*1024llu*1024llu;
4835
 
4836
- char * src = (char *) malloc(size);
4837
- char * dst = (char *) malloc(size);
 
 
4838
 
4839
- for (size_t i = 0; i < size; i++) src[i] = i;
4840
 
4841
- memcpy(dst, src, size); // heat-up
4842
 
4843
- double tsum = 0.0;
 
4844
 
4845
- for (size_t i = 0; i < n; i++) {
4846
- const int64_t t0 = ggml_time_us();
4847
 
4848
- memcpy(dst, src, size);
4849
 
4850
- const int64_t t1 = ggml_time_us();
4851
 
4852
- tsum += (t1 - t0)*1e-6;
4853
 
4854
- src[0] = rand();
4855
- }
4856
 
4857
- snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
4858
- s += strbuf;
4859
 
4860
- // needed to prevent the compile from optimizing the memcpy away
4861
- {
4862
- double sum = 0.0;
4863
 
4864
- for (size_t i = 0; i < size; i++) sum += dst[i];
 
 
4865
 
4866
- snprintf(strbuf, sizeof(strbuf), "sum: %s %f\n", sum == -536870910.00 ? "ok" : "error", sum);
4867
- s += strbuf;
4868
  }
4869
 
4870
- free(src);
4871
- free(dst);
4872
-
4873
  return s.c_str();
4874
  }
4875
 
@@ -4905,26 +4907,37 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
4905
  for (int j = 0; j < (int) sizes.size(); j++) {
4906
  int n_q4_0 = 0;
4907
  int n_q4_1 = 0;
 
 
 
 
4908
  int n_fp16 = 0;
4909
  int n_fp32 = 0;
4910
 
4911
  // GFLOPS/s
4912
  double s_q4_0 = 0.0;
4913
  double s_q4_1 = 0.0;
 
 
 
 
4914
  double s_fp16 = 0.0;
4915
  double s_fp32 = 0.0;
4916
 
4917
  const size_t N = sizes[j];
4918
 
4919
- for (int k = 0; k < 4; ++k) {
4920
  const ggml_type wtype =
4921
  k == 0 ? GGML_TYPE_Q4_0 :
4922
  k == 1 ? GGML_TYPE_Q4_1 :
4923
- k == 2 ? GGML_TYPE_F16 :
4924
- GGML_TYPE_F32;
 
 
 
4925
 
4926
- double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_fp16 : s_fp32;
4927
- int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_fp16 : n_fp32;
4928
 
4929
  struct ggml_init_params gparams = {
4930
  /*.mem_size =*/ buf.size(),
@@ -4968,8 +4981,19 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
4968
  s = ((2.0*N*N*N*n)/tsum)*1e-9;
4969
  }
4970
 
4971
- snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) / Q4_1 %7.1f GFLOPS (%3d runs) / F16 %7.1f GFLOPS (%3d runs) / F32 %7.1f GFLOPS (%3d runs)\n",
4972
- N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_fp16, n_fp16, s_fp32, n_fp32);
 
 
 
 
 
 
 
 
 
 
 
4973
  s += strbuf;
4974
  }
4975
 
 
4827
 
4828
  ggml_time_init();
4829
 
4830
+ size_t n = 20;
4831
+ size_t arr = n_threads > 0 ? 1024llu : n_threads; // trick to avoid compiler optimizations
4832
 
4833
+ // 1GB MB array
4834
  const size_t size = arr*1024llu*1024llu;
4835
 
4836
+ // single-thread
4837
+ {
4838
+ char * src = (char *) malloc(size);
4839
+ char * dst = (char *) malloc(size);
4840
 
4841
+ for (size_t i = 0; i < size; i++) src[i] = i;
4842
 
4843
+ memcpy(dst, src, size); // heat-up
4844
 
4845
+ double tsum = 0.0;
4846
+ double sum = 0.0;
4847
 
4848
+ for (size_t i = 0; i < n; i++) {
4849
+ const int64_t t0 = ggml_time_us();
4850
 
4851
+ memcpy(dst, src, size);
4852
 
4853
+ const int64_t t1 = ggml_time_us();
4854
 
4855
+ tsum += (t1 - t0)*1e-6;
4856
 
4857
+ src[rand() % size] = rand() % 256;
4858
+ }
4859
 
4860
+ snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
4861
+ s += strbuf;
4862
 
4863
+ // needed to prevent the compiler from optimizing the memcpy away
4864
+ {
4865
+ for (size_t i = 0; i < size; i++) sum += dst[i];
4866
 
4867
+ snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum);
4868
+ s += strbuf;
4869
+ }
4870
 
4871
+ free(src);
4872
+ free(dst);
4873
  }
4874
 
 
 
 
4875
  return s.c_str();
4876
  }
4877
 
 
4907
  for (int j = 0; j < (int) sizes.size(); j++) {
4908
  int n_q4_0 = 0;
4909
  int n_q4_1 = 0;
4910
+ int n_q4_2 = 0;
4911
+ int n_q5_0 = 0;
4912
+ int n_q5_1 = 0;
4913
+ int n_q8_0 = 0;
4914
  int n_fp16 = 0;
4915
  int n_fp32 = 0;
4916
 
4917
  // GFLOPS/s
4918
  double s_q4_0 = 0.0;
4919
  double s_q4_1 = 0.0;
4920
+ double s_q4_2 = 0.0;
4921
+ double s_q5_0 = 0.0;
4922
+ double s_q5_1 = 0.0;
4923
+ double s_q8_0 = 0.0;
4924
  double s_fp16 = 0.0;
4925
  double s_fp32 = 0.0;
4926
 
4927
  const size_t N = sizes[j];
4928
 
4929
+ for (int k = 0; k < 8; ++k) {
4930
  const ggml_type wtype =
4931
  k == 0 ? GGML_TYPE_Q4_0 :
4932
  k == 1 ? GGML_TYPE_Q4_1 :
4933
+ k == 2 ? GGML_TYPE_Q4_2 :
4934
+ k == 3 ? GGML_TYPE_Q5_0 :
4935
+ k == 4 ? GGML_TYPE_Q5_1 :
4936
+ k == 5 ? GGML_TYPE_Q8_0 :
4937
+ k == 6 ? GGML_TYPE_F16 : GGML_TYPE_F32;
4938
 
4939
+ double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q4_2 : k == 3 ? s_q5_0 : k == 4 ? s_q5_1 : k == 5 ? s_q8_0 : k == 6 ? s_fp16 : /*k == 7*/ s_fp32;
4940
+ int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q4_2 : k == 3 ? n_q5_0 : k == 4 ? n_q5_1 : k == 5 ? n_q8_0 : k == 6 ? n_fp16 : /*k == 7*/ n_fp32;
4941
 
4942
  struct ggml_init_params gparams = {
4943
  /*.mem_size =*/ buf.size(),
 
4981
  s = ((2.0*N*N*N*n)/tsum)*1e-9;
4982
  }
4983
 
4984
+ // Q4_0 | Q4_1 | Q4_2
4985
+ snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs) | Q4_2 %7.1f GFLOPS (%3d runs)\n",
4986
+ N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_q4_2, n_q4_2);
4987
+ s += strbuf;
4988
+
4989
+ // Q5_0 | Q5_1 | Q8_0
4990
+ snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q5_0 %7.1f GFLOPS (%3d runs) | Q5_1 %7.1f GFLOPS (%3d runs) | Q8_0 %7.1f GFLOPS (%3d runs)\n",
4991
+ N, N, s_q5_0, n_q5_0, s_q5_1, n_q5_1, s_q8_0, n_q8_0);
4992
+ s += strbuf;
4993
+
4994
+ // F16 | F32
4995
+ snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: F16 %7.1f GFLOPS (%3d runs) | F32 %7.1f GFLOPS (%3d runs)\n",
4996
+ N, N, s_fp16, n_fp16, s_fp32, n_fp32);
4997
  s += strbuf;
4998
  }
4999