Commit 9908abb (unverified) by ggerganov · Parent: 9f860c0

whisper : add support for large v3 (#1444)


* whisper : add support for large v3

* bench : fix build + fix go bindings

* bench : fix n_mels

* models : update readme

Makefile CHANGED
@@ -417,9 +417,10 @@ samples:
 .PHONY: medium.en
 .PHONY: medium
 .PHONY: large-v1
+.PHONY: large-v2
 .PHONY: large
 
-tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
+tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large: main
 	bash ./models/download-ggml-model.sh $@
 	@echo ""
 	@echo "==============================================="
README.md CHANGED
@@ -234,6 +234,7 @@ make small
 make medium.en
 make medium
 make large-v1
+make large-v2
 make large
 ```
 
@@ -245,7 +246,7 @@ make large
 | base   | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
 | small  | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
 | medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
-| large  | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
+| large  | 2.9 GB | ~3.3 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
 
 ## Quantization
 
bindings/go/examples/go-model-download/main.go CHANGED
@@ -24,7 +24,7 @@ const (
 
 var (
 	// The models which will be downloaded, if no model is specified as an argument
-	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large"}
+	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large"}
 )
 
 var (
bindings/go/whisper.go CHANGED
@@ -83,7 +83,6 @@ const (
 	SampleRate = C.WHISPER_SAMPLE_RATE                 // Expected sample rate, samples per second
 	SampleBits = uint16(unsafe.Sizeof(C.float(0))) * 8 // Sample size in bits
 	NumFFT     = C.WHISPER_N_FFT
-	NumMEL     = C.WHISPER_N_MEL
 	HopLength  = C.WHISPER_HOP_LENGTH
 	ChunkSize  = C.WHISPER_CHUNK_SIZE
 )
examples/bench.wasm/emscripten.cpp CHANGED
@@ -23,7 +23,9 @@ void bench_main(size_t index) {
 
     fprintf(stderr, "%s: running benchmark with %d threads - please wait...\n", __func__, n_threads);
 
-    if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
+    const int n_mels = whisper_model_n_mels(ctx);
+
+    if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
         fprintf(stderr, "error: failed to set mel: %d\n", ret);
         return;
     }
examples/bench/bench.cpp CHANGED
@@ -73,7 +73,9 @@ int whisper_bench_full(const whisper_params & params) {
         return 2;
     }
 
-    if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
+    const int n_mels = whisper_model_n_mels(ctx);
+
+    if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
         fprintf(stderr, "error: failed to set mel: %d\n", ret);
         return 3;
     }
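
The two benchmark fixes above follow from the removal of the compile-time `WHISPER_N_MEL` constant (see the `whisper.h` hunk at the end of this commit): the number of mel bands is now a property of the loaded model, since large-v3 uses 128 mel bins instead of 80. A minimal standalone sketch of the resulting calling pattern, assuming a program linked against whisper.cpp at this commit (the model path is a placeholder):

```cpp
// Sketch only: query the model for its mel band count instead of relying on
// the removed WHISPER_N_MEL compile-time constant.
#include "whisper.h"

#include <cstdio>

int main() {
    // placeholder path - any ggml model works, e.g. a converted ggml-large-v3.bin
    struct whisper_context * ctx = whisper_init_from_file("models/ggml-large-v3.bin");
    if (ctx == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    const int n_mels = whisper_model_n_mels(ctx); // 80 for older models, 128 for large-v3

    // mirrors the benchmarks: nullptr/0 only validates the band count
    if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
        fprintf(stderr, "error: failed to set mel: %d\n", ret);
    }

    whisper_free(ctx);
    return 0;
}
```

Passing `nullptr`/`0` as the benchmarks do only checks the band count against the model's filters; real callers would pass an actual mel buffer of `n_len * n_mels` floats.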
examples/livestream.sh CHANGED
@@ -48,7 +48,7 @@ if [ -n "$3" ]; then
 fi
 
 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
 
 # list available models
 function list_models {
examples/twitch.sh CHANGED
@@ -21,7 +21,7 @@ help()
     echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
     echo "options:"
     echo "-s Step in seconds (default is $step)."
-    echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large' (default is '$model')."
+    echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large' (default is '$model')."
     echo "-t Number of threads to use."
     echo "-h Print this help page."
     echo
extra/convert-all.sh CHANGED
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
 
 for model in "${models[@]}"; do
     python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
models/README.md CHANGED
@@ -50,7 +50,8 @@ https://huggingface.co/ggerganov/whisper.cpp/tree/main
 | medium    | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
 | medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
 | large-v1  | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
-| large     | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
+| large-v2  | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
+| large     | 2.9 GB | ~4.7 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
 
 ## Model files for testing purposes
 
models/convert-h5-to-coreml.py CHANGED
@@ -78,14 +78,14 @@ def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
 # Ported from models/convert-whisper-to-coreml.py
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
+    parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
     parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
     parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
     parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
     parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
     args = parser.parse_args()
 
-    if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
+    if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
         raise ValueError("Invalid model name")
 
     pt_target_path = f"models/hf-{args.model_name}.pt"
models/convert-pt-to-ggml.py CHANGED
@@ -228,7 +228,7 @@ with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f:
 # for backwards compatibility, also check for older hf_transformers format tokenizer files
 # old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json
 # new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken
-multilingual = hparams["n_vocab"] == 51865
+multilingual = hparams["n_vocab"] >= 51865
 tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
 tokenizer_type = "tiktoken"
 if not tokenizer.is_file():
models/convert-whisper-to-coreml.py CHANGED
@@ -194,7 +194,7 @@ class TextDecoderANE(TextDecoder):
         x = x.permute(0,2,3,1).squeeze(0)
 
         # ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks
-        if self.token_embedding.weight.shape[0] == 51865:
+        if self.token_embedding.weight.shape[0] >= 51865:
             # split in 11 chunks - 4715 each
             splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//11, dim=0)
             logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)
@@ -296,13 +296,13 @@ def convert_decoder(hparams, model, quantize=False):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
+    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
     parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
     parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
     parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
    args = parser.parse_args()
 
-    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
+    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
         raise ValueError("Invalid model name")
 
     whisper = load_model(args.model).cpu()
models/convert-whisper-to-openvino.py CHANGED
@@ -38,10 +38,10 @@ def convert_encoder(hparams, encoder, mname):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
+    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
     args = parser.parse_args()
 
-    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
+    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
         raise ValueError("Invalid model name")
 
     whisper = load_model(args.model).cpu()
models/download-coreml-model.sh CHANGED
@@ -19,7 +19,7 @@ function get_script_path() {
 models_path="$(get_script_path)"
 
 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
 
 # list available models
 function list_models {
models/download-ggml-model.cmd CHANGED
@@ -8,7 +8,7 @@ popd
 set argc=0
 for %%x in (%*) do set /A argc+=1
 
-set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large
+set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large
 
 if %argc% neq 1 (
   echo.
@@ -57,8 +57,8 @@ goto :eof
 :list_models
   echo.
   echo Available models:
-  (for %%a in (%models%) do (
-      echo %%a
+  (for %%a in (%models%) do (
+      echo %%a
   ))
   echo.
   exit /b
models/download-ggml-model.sh CHANGED
@@ -41,6 +41,7 @@ models=(
     "medium-q5_0"
     "medium.en-q5_0"
     "large-v1"
+    "large-v2"
     "large"
     "large-q5_0"
 )
tests/run-tests.sh CHANGED
@@ -19,7 +19,7 @@
 cd `dirname $0`
 
 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
 
 # list available models
 function list_models {
whisper.cpp CHANGED
@@ -193,6 +193,15 @@ enum e_model {
     MODEL_LARGE,
 };
 
+static const std::map<e_model, std::string> g_model_name = {
+    { MODEL_UNKNOWN, "unknown" },
+    { MODEL_TINY,    "tiny"    },
+    { MODEL_BASE,    "base"    },
+    { MODEL_SMALL,   "small"   },
+    { MODEL_MEDIUM,  "medium"  },
+    { MODEL_LARGE,   "large"   },
+};
+
 static const std::map<std::string, std::pair<int, std::string>> g_lang = {
     { "en", { 0, "english", } },
     { "zh", { 1, "chinese", } },
@@ -293,6 +302,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
     { "ba", { 96, "bashkir", } },
     { "jw", { 97, "javanese", } },
     { "su", { 98, "sundanese", } },
+    { "yue", { 99, "cantonese", } },
 };
 
 static const size_t MB = 1ull*1024*1024;
@@ -402,7 +412,11 @@ struct whisper_vocab {
     id token_beg = 50363; // begin timestamps
 
     bool is_multilingual() const {
-        return n_vocab == 51865;
+        return n_vocab >= 51865;
+    }
+
+    int num_languages() const {
+        return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
     }
 };
 
@@ -922,6 +936,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
     assert(hparams.n_text_state == hparams.n_audio_state);
 
+    std::string mver = "";
+
     if (hparams.n_audio_layer == 4) {
         model.type = e_model::MODEL_TINY;
     }
@@ -940,6 +956,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
     if (hparams.n_audio_layer == 32) {
         model.type = e_model::MODEL_LARGE;
+
+        if (hparams.n_vocab == 51866) {
+            mver = " v3";
+        }
     }
 
     const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
@@ -968,7 +988,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
     log("%s: n_mels        = %d\n", __func__, hparams.n_mels);
    log("%s: ftype         = %d\n", __func__, model.hparams.ftype);
     log("%s: qntvr         = %d\n", __func__, qntvr);
-    log("%s: type          = %d\n", __func__, model.type);
+    log("%s: type          = %d (%s%s)\n", __func__, model.type, g_model_name.at(model.type).c_str(), mver.c_str());
 
     // print memory requirements
     {
@@ -1039,13 +1059,17 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         if (vocab.is_multilingual()) {
             vocab.token_eot++;
             vocab.token_sot++;
-            vocab.token_translate++;
-            vocab.token_transcribe++;
-            vocab.token_solm++;
-            vocab.token_prev++;
-            vocab.token_nosp++;
-            vocab.token_not++;
-            vocab.token_beg++;
+
+            // account for variable number of language tokens
+            const int dt = vocab.num_languages() - 98;
+
+            vocab.token_translate  += dt;
+            vocab.token_transcribe += dt;
+            vocab.token_solm       += dt;
+            vocab.token_prev       += dt;
+            vocab.token_nosp       += dt;
+            vocab.token_not        += dt;
+            vocab.token_beg        += dt;
         }
 
         if (n_vocab < model.hparams.n_vocab) {
@@ -1074,6 +1098,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                 vocab.id_to_token[i] = word;
             }
         }
+
+        log("%s: n_langs       = %d\n", __func__, vocab.num_languages());
     }
 
     size_t ctx_size = 0;
@@ -3281,7 +3307,7 @@ void whisper_free_params(struct whisper_full_params * params) {
 }
 
 int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
         log("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
@@ -3295,7 +3321,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
 
 // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
 int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
         log("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
@@ -3318,13 +3344,13 @@ int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float *
     // TODO
 
 int whisper_set_mel_with_state(
-        struct whisper_context * /*ctx*/,
+        struct whisper_context * ctx,
         struct whisper_state * state,
         const float * data,
         int n_len,
        int n_mel) {
-    if (n_mel != WHISPER_N_MEL) {
-        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
+    if (n_mel != ctx->model.filters.n_mel) {
+        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, ctx->model.filters.n_mel);
         return -1;
     }
 
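
The vocabulary hunks above are the subtle part of large-v3 support: the special-token ids are no longer shifted by a fixed `1` for multilingual models but by the number of language tokens, since v3 adds a 100th language ("yue"). A small standalone sketch of the arithmetic, mirroring the `num_languages()` helper added to `whisper_vocab` (the printed values are computed here, not taken from the commit):

```cpp
// Standalone sketch of the vocab arithmetic introduced in this commit.
// num_languages() mirrors the helper added to whisper_vocab above.
#include <cstdio>

static int num_languages(int n_vocab) {
    const bool multilingual = n_vocab >= 51865; // same test as whisper_vocab::is_multilingual()
    return n_vocab - 51765 - (multilingual ? 1 : 0);
}

int main() {
    const int vocabs[] = { 51865 /* large-v2 and earlier multilingual */, 51866 /* large-v3 */ };

    for (const int n_vocab : vocabs) {
        // dt is the offset added to token_translate, token_transcribe, ..., token_beg
        const int dt = num_languages(n_vocab) - 98;
        printf("n_vocab = %d -> n_langs = %d, special-token shift = %d\n",
               n_vocab, num_languages(n_vocab), dt);
    }

    // expected output:
    //   n_vocab = 51865 -> n_langs = 99, special-token shift = 1
    //   n_vocab = 51866 -> n_langs = 100, special-token shift = 2
    return 0;
}
```

For a v2-sized vocabulary the shift stays 1, so existing models load exactly as before; only the 51866-token v3 vocabulary gets the extra offset for the new "yue" language token.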
 
whisper.h CHANGED
@@ -29,7 +29,6 @@
 
 #define WHISPER_SAMPLE_RATE 16000
 #define WHISPER_N_FFT       400
-#define WHISPER_N_MEL       80
 #define WHISPER_HOP_LENGTH  160
 #define WHISPER_CHUNK_SIZE  30
 