Spaces:
Sleeping
Sleeping
whisper : add support for large v3 (#1444)
Browse files
* whisper : add support for large v3
* bench : fix build + fix go bindings
* bench : fix n_mels
* models : update readme
- Makefile +2 -1
- README.md +2 -1
- bindings/go/examples/go-model-download/main.go +1 -1
- bindings/go/whisper.go +0 -1
- examples/bench.wasm/emscripten.cpp +3 -1
- examples/bench/bench.cpp +3 -1
- examples/livestream.sh +1 -1
- examples/twitch.sh +1 -1
- extra/convert-all.sh +1 -1
- models/README.md +2 -1
- models/convert-h5-to-coreml.py +2 -2
- models/convert-pt-to-ggml.py +1 -1
- models/convert-whisper-to-coreml.py +3 -3
- models/convert-whisper-to-openvino.py +2 -2
- models/download-coreml-model.sh +1 -1
- models/download-ggml-model.cmd +3 -3
- models/download-ggml-model.sh +1 -0
- tests/run-tests.sh +1 -1
- whisper.cpp +40 -14
- whisper.h +0 -1
Makefile
CHANGED
|
@@ -417,9 +417,10 @@ samples:
|
|
| 417 |
.PHONY: medium.en
|
| 418 |
.PHONY: medium
|
| 419 |
.PHONY: large-v1
|
|
|
|
| 420 |
.PHONY: large
|
| 421 |
|
| 422 |
-
tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
|
| 423 |
bash ./models/download-ggml-model.sh $@
|
| 424 |
@echo ""
|
| 425 |
@echo "==============================================="
|
|
|
|
| 417 |
.PHONY: medium.en
|
| 418 |
.PHONY: medium
|
| 419 |
.PHONY: large-v1
|
| 420 |
+
.PHONY: large-v2
|
| 421 |
.PHONY: large
|
| 422 |
|
| 423 |
+
tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large: main
|
| 424 |
bash ./models/download-ggml-model.sh $@
|
| 425 |
@echo ""
|
| 426 |
@echo "==============================================="
|
README.md
CHANGED
|
@@ -234,6 +234,7 @@ make small
|
|
| 234 |
make medium.en
|
| 235 |
make medium
|
| 236 |
make large-v1
|
|
|
|
| 237 |
make large
|
| 238 |
```
|
| 239 |
|
|
@@ -245,7 +246,7 @@ make large
|
|
| 245 |
| base | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
|
| 246 |
| small | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
|
| 247 |
| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
|
| 248 |
-
| large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
|
| 249 |
|
| 250 |
## Quantization
|
| 251 |
|
|
|
|
| 234 |
make medium.en
|
| 235 |
make medium
|
| 236 |
make large-v1
|
| 237 |
+
make large-v2
|
| 238 |
make large
|
| 239 |
```
|
| 240 |
|
|
|
|
| 246 |
| base | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
|
| 247 |
| small | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
|
| 248 |
| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
|
| 249 |
+
| large | 2.9 GB | ~3.3 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
|
| 250 |
|
| 251 |
## Quantization
|
| 252 |
|
bindings/go/examples/go-model-download/main.go
CHANGED
|
@@ -24,7 +24,7 @@ const (
|
|
| 24 |
|
| 25 |
var (
|
| 26 |
// The models which will be downloaded, if no model is specified as an argument
|
| 27 |
-
modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large"}
|
| 28 |
)
|
| 29 |
|
| 30 |
var (
|
|
|
|
| 24 |
|
| 25 |
var (
|
| 26 |
// The models which will be downloaded, if no model is specified as an argument
|
| 27 |
+
modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large"}
|
| 28 |
)
|
| 29 |
|
| 30 |
var (
|
bindings/go/whisper.go
CHANGED
|
@@ -83,7 +83,6 @@ const (
|
|
| 83 |
SampleRate = C.WHISPER_SAMPLE_RATE // Expected sample rate, samples per second
|
| 84 |
SampleBits = uint16(unsafe.Sizeof(C.float(0))) * 8 // Sample size in bits
|
| 85 |
NumFFT = C.WHISPER_N_FFT
|
| 86 |
-
NumMEL = C.WHISPER_N_MEL
|
| 87 |
HopLength = C.WHISPER_HOP_LENGTH
|
| 88 |
ChunkSize = C.WHISPER_CHUNK_SIZE
|
| 89 |
)
|
|
|
|
| 83 |
SampleRate = C.WHISPER_SAMPLE_RATE // Expected sample rate, samples per second
|
| 84 |
SampleBits = uint16(unsafe.Sizeof(C.float(0))) * 8 // Sample size in bits
|
| 85 |
NumFFT = C.WHISPER_N_FFT
|
|
|
|
| 86 |
HopLength = C.WHISPER_HOP_LENGTH
|
| 87 |
ChunkSize = C.WHISPER_CHUNK_SIZE
|
| 88 |
)
|
examples/bench.wasm/emscripten.cpp
CHANGED
|
@@ -23,7 +23,9 @@ void bench_main(size_t index) {
|
|
| 23 |
|
| 24 |
fprintf(stderr, "%s: running benchmark with %d threads - please wait...\n", __func__, n_threads);
|
| 25 |
|
| 26 |
-
if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
|
|
|
|
|
|
|
| 27 |
fprintf(stderr, "error: failed to set mel: %d\n", ret);
|
| 28 |
return;
|
| 29 |
}
|
|
|
|
| 23 |
|
| 24 |
fprintf(stderr, "%s: running benchmark with %d threads - please wait...\n", __func__, n_threads);
|
| 25 |
|
| 26 |
+
const int n_mels = whisper_model_n_mels(ctx);
|
| 27 |
+
|
| 28 |
+
if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
|
| 29 |
fprintf(stderr, "error: failed to set mel: %d\n", ret);
|
| 30 |
return;
|
| 31 |
}
|
examples/bench/bench.cpp
CHANGED
|
@@ -73,7 +73,9 @@ int whisper_bench_full(const whisper_params & params) {
|
|
| 73 |
return 2;
|
| 74 |
}
|
| 75 |
|
| 76 |
-
if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
|
|
|
|
|
|
|
| 77 |
fprintf(stderr, "error: failed to set mel: %d\n", ret);
|
| 78 |
return 3;
|
| 79 |
}
|
|
|
|
| 73 |
return 2;
|
| 74 |
}
|
| 75 |
|
| 76 |
+
const int n_mels = whisper_model_n_mels(ctx);
|
| 77 |
+
|
| 78 |
+
if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
|
| 79 |
fprintf(stderr, "error: failed to set mel: %d\n", ret);
|
| 80 |
return 3;
|
| 81 |
}
|
examples/livestream.sh
CHANGED
|
@@ -48,7 +48,7 @@ if [ -n "$3" ]; then
|
|
| 48 |
fi
|
| 49 |
|
| 50 |
# Whisper models
|
| 51 |
-
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
|
| 52 |
|
| 53 |
# list available models
|
| 54 |
function list_models {
|
|
|
|
| 48 |
fi
|
| 49 |
|
| 50 |
# Whisper models
|
| 51 |
+
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
|
| 52 |
|
| 53 |
# list available models
|
| 54 |
function list_models {
|
examples/twitch.sh
CHANGED
|
@@ -21,7 +21,7 @@ help()
|
|
| 21 |
echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
|
| 22 |
echo "options:"
|
| 23 |
echo "-s Step in seconds (default is $step)."
|
| 24 |
-
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large' (default is '$model')."
|
| 25 |
echo "-t Number of threads to use."
|
| 26 |
echo "-h Print this help page."
|
| 27 |
echo
|
|
|
|
| 21 |
echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
|
| 22 |
echo "options:"
|
| 23 |
echo "-s Step in seconds (default is $step)."
|
| 24 |
+
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large' (default is '$model')."
|
| 25 |
echo "-t Number of threads to use."
|
| 26 |
echo "-h Print this help page."
|
| 27 |
echo
|
extra/convert-all.sh
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
-
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
|
| 4 |
|
| 5 |
for model in "${models[@]}"; do
|
| 6 |
python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
+
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
|
| 4 |
|
| 5 |
for model in "${models[@]}"; do
|
| 6 |
python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
|
models/README.md
CHANGED
|
@@ -50,7 +50,8 @@ https://huggingface.co/ggerganov/whisper.cpp/tree/main
|
|
| 50 |
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
|
| 51 |
| medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
|
| 52 |
| large-v1 | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
|
| 53 |
-
| large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
|
|
|
|
| 54 |
|
| 55 |
## Model files for testing purposes
|
| 56 |
|
|
|
|
| 50 |
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
|
| 51 |
| medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
|
| 52 |
| large-v1 | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
|
| 53 |
+
| large-v2 | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
|
| 54 |
+
| large | 2.9 GB | ~4.7 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
|
| 55 |
|
| 56 |
## Model files for testing purposes
|
| 57 |
|
models/convert-h5-to-coreml.py
CHANGED
|
@@ -78,14 +78,14 @@ def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
|
|
| 78 |
# Ported from models/convert-whisper-to-coreml.py
|
| 79 |
if __name__ == "__main__":
|
| 80 |
parser = argparse.ArgumentParser()
|
| 81 |
-
parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
|
| 82 |
parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
|
| 83 |
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
|
| 84 |
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
|
| 85 |
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
|
| 86 |
args = parser.parse_args()
|
| 87 |
|
| 88 |
-
if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
|
| 89 |
raise ValueError("Invalid model name")
|
| 90 |
|
| 91 |
pt_target_path = f"models/hf-{args.model_name}.pt"
|
|
|
|
| 78 |
# Ported from models/convert-whisper-to-coreml.py
|
| 79 |
if __name__ == "__main__":
|
| 80 |
parser = argparse.ArgumentParser()
|
| 81 |
+
parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
|
| 82 |
parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
|
| 83 |
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
|
| 84 |
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
|
| 85 |
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
|
| 86 |
args = parser.parse_args()
|
| 87 |
|
| 88 |
+
if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
|
| 89 |
raise ValueError("Invalid model name")
|
| 90 |
|
| 91 |
pt_target_path = f"models/hf-{args.model_name}.pt"
|
models/convert-pt-to-ggml.py
CHANGED
|
@@ -228,7 +228,7 @@ with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f:
|
|
| 228 |
# for backwards compatibility, also check for older hf_transformers format tokenizer files
|
| 229 |
# old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json
|
| 230 |
# new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken
|
| 231 |
-
multilingual = hparams["n_vocab"] == 51865
|
| 232 |
tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
|
| 233 |
tokenizer_type = "tiktoken"
|
| 234 |
if not tokenizer.is_file():
|
|
|
|
| 228 |
# for backwards compatibility, also check for older hf_transformers format tokenizer files
|
| 229 |
# old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json
|
| 230 |
# new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken
|
| 231 |
+
multilingual = hparams["n_vocab"] >= 51865
|
| 232 |
tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
|
| 233 |
tokenizer_type = "tiktoken"
|
| 234 |
if not tokenizer.is_file():
|
models/convert-whisper-to-coreml.py
CHANGED
|
@@ -194,7 +194,7 @@ class TextDecoderANE(TextDecoder):
|
|
| 194 |
x = x.permute(0,2,3,1).squeeze(0)
|
| 195 |
|
| 196 |
# ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks
|
| 197 |
-
if self.token_embedding.weight.shape[0] == 51865:
|
| 198 |
# split in 11 chunks - 4715 each
|
| 199 |
splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//11, dim=0)
|
| 200 |
logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)
|
|
@@ -296,13 +296,13 @@ def convert_decoder(hparams, model, quantize=False):
|
|
| 296 |
|
| 297 |
if __name__ == "__main__":
|
| 298 |
parser = argparse.ArgumentParser()
|
| 299 |
-
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
|
| 300 |
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
|
| 301 |
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
|
| 302 |
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
|
| 303 |
args = parser.parse_args()
|
| 304 |
|
| 305 |
-
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
|
| 306 |
raise ValueError("Invalid model name")
|
| 307 |
|
| 308 |
whisper = load_model(args.model).cpu()
|
|
|
|
| 194 |
x = x.permute(0,2,3,1).squeeze(0)
|
| 195 |
|
| 196 |
# ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks
|
| 197 |
+
if self.token_embedding.weight.shape[0] >= 51865:
|
| 198 |
# split in 11 chunks - 4715 each
|
| 199 |
splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//11, dim=0)
|
| 200 |
logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)
|
|
|
|
| 296 |
|
| 297 |
if __name__ == "__main__":
|
| 298 |
parser = argparse.ArgumentParser()
|
| 299 |
+
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
|
| 300 |
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
|
| 301 |
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
|
| 302 |
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
|
| 303 |
args = parser.parse_args()
|
| 304 |
|
| 305 |
+
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
|
| 306 |
raise ValueError("Invalid model name")
|
| 307 |
|
| 308 |
whisper = load_model(args.model).cpu()
|
models/convert-whisper-to-openvino.py
CHANGED
|
@@ -38,10 +38,10 @@ def convert_encoder(hparams, encoder, mname):
|
|
| 38 |
|
| 39 |
if __name__ == "__main__":
|
| 40 |
parser = argparse.ArgumentParser()
|
| 41 |
-
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
|
| 42 |
args = parser.parse_args()
|
| 43 |
|
| 44 |
-
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
|
| 45 |
raise ValueError("Invalid model name")
|
| 46 |
|
| 47 |
whisper = load_model(args.model).cpu()
|
|
|
|
| 38 |
|
| 39 |
if __name__ == "__main__":
|
| 40 |
parser = argparse.ArgumentParser()
|
| 41 |
+
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
|
| 42 |
args = parser.parse_args()
|
| 43 |
|
| 44 |
+
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
|
| 45 |
raise ValueError("Invalid model name")
|
| 46 |
|
| 47 |
whisper = load_model(args.model).cpu()
|
models/download-coreml-model.sh
CHANGED
|
@@ -19,7 +19,7 @@ function get_script_path() {
|
|
| 19 |
models_path="$(get_script_path)"
|
| 20 |
|
| 21 |
# Whisper models
|
| 22 |
-
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
|
| 23 |
|
| 24 |
# list available models
|
| 25 |
function list_models {
|
|
|
|
| 19 |
models_path="$(get_script_path)"
|
| 20 |
|
| 21 |
# Whisper models
|
| 22 |
+
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
|
| 23 |
|
| 24 |
# list available models
|
| 25 |
function list_models {
|
models/download-ggml-model.cmd
CHANGED
|
@@ -8,7 +8,7 @@ popd
|
|
| 8 |
set argc=0
|
| 9 |
for %%x in (%*) do set /A argc+=1
|
| 10 |
|
| 11 |
-
set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large
|
| 12 |
|
| 13 |
if %argc% neq 1 (
|
| 14 |
echo.
|
|
@@ -57,8 +57,8 @@ goto :eof
|
|
| 57 |
:list_models
|
| 58 |
echo.
|
| 59 |
echo Available models:
|
| 60 |
-
(for %%a in (%models%) do (
|
| 61 |
-
echo %%a
|
| 62 |
))
|
| 63 |
echo.
|
| 64 |
exit /b
|
|
|
|
| 8 |
set argc=0
|
| 9 |
for %%x in (%*) do set /A argc+=1
|
| 10 |
|
| 11 |
+
set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large
|
| 12 |
|
| 13 |
if %argc% neq 1 (
|
| 14 |
echo.
|
|
|
|
| 57 |
:list_models
|
| 58 |
echo.
|
| 59 |
echo Available models:
|
| 60 |
+
(for %%a in (%models%) do (
|
| 61 |
+
echo %%a
|
| 62 |
))
|
| 63 |
echo.
|
| 64 |
exit /b
|
models/download-ggml-model.sh
CHANGED
|
@@ -41,6 +41,7 @@ models=(
|
|
| 41 |
"medium-q5_0"
|
| 42 |
"medium.en-q5_0"
|
| 43 |
"large-v1"
|
|
|
|
| 44 |
"large"
|
| 45 |
"large-q5_0"
|
| 46 |
)
|
|
|
|
| 41 |
"medium-q5_0"
|
| 42 |
"medium.en-q5_0"
|
| 43 |
"large-v1"
|
| 44 |
+
"large-v2"
|
| 45 |
"large"
|
| 46 |
"large-q5_0"
|
| 47 |
)
|
tests/run-tests.sh
CHANGED
|
@@ -19,7 +19,7 @@
|
|
| 19 |
cd `dirname $0`
|
| 20 |
|
| 21 |
# Whisper models
|
| 22 |
-
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
|
| 23 |
|
| 24 |
# list available models
|
| 25 |
function list_models {
|
|
|
|
| 19 |
cd `dirname $0`
|
| 20 |
|
| 21 |
# Whisper models
|
| 22 |
+
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
|
| 23 |
|
| 24 |
# list available models
|
| 25 |
function list_models {
|
whisper.cpp
CHANGED
|
@@ -193,6 +193,15 @@ enum e_model {
|
|
| 193 |
MODEL_LARGE,
|
| 194 |
};
|
| 195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
static const std::map<std::string, std::pair<int, std::string>> g_lang = {
|
| 197 |
{ "en", { 0, "english", } },
|
| 198 |
{ "zh", { 1, "chinese", } },
|
|
@@ -293,6 +302,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
|
|
| 293 |
{ "ba", { 96, "bashkir", } },
|
| 294 |
{ "jw", { 97, "javanese", } },
|
| 295 |
{ "su", { 98, "sundanese", } },
|
|
|
|
| 296 |
};
|
| 297 |
|
| 298 |
static const size_t MB = 1ull*1024*1024;
|
|
@@ -402,7 +412,11 @@ struct whisper_vocab {
|
|
| 402 |
id token_beg = 50363; // begin timestamps
|
| 403 |
|
| 404 |
bool is_multilingual() const {
|
| 405 |
-
return n_vocab == 51865;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
}
|
| 407 |
};
|
| 408 |
|
|
@@ -922,6 +936,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 922 |
|
| 923 |
assert(hparams.n_text_state == hparams.n_audio_state);
|
| 924 |
|
|
|
|
|
|
|
| 925 |
if (hparams.n_audio_layer == 4) {
|
| 926 |
model.type = e_model::MODEL_TINY;
|
| 927 |
}
|
|
@@ -940,6 +956,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 940 |
|
| 941 |
if (hparams.n_audio_layer == 32) {
|
| 942 |
model.type = e_model::MODEL_LARGE;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 943 |
}
|
| 944 |
|
| 945 |
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
|
@@ -968,7 +988,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 968 |
log("%s: n_mels = %d\n", __func__, hparams.n_mels);
|
| 969 |
log("%s: ftype = %d\n", __func__, model.hparams.ftype);
|
| 970 |
log("%s: qntvr = %d\n", __func__, qntvr);
|
| 971 |
-
log("%s: type = %d\n", __func__, model.type);
|
| 972 |
|
| 973 |
// print memory requirements
|
| 974 |
{
|
|
@@ -1039,13 +1059,17 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 1039 |
if (vocab.is_multilingual()) {
|
| 1040 |
vocab.token_eot++;
|
| 1041 |
vocab.token_sot++;
|
| 1042 |
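-
vocab.token_translate++;
|
| 1043 |
-
vocab.token_transcribe++;
|
| 1044 |
-
vocab.token_solm++;
|
| 1045 |
-
vocab.token_prev++;
|
| 1046 |
-
vocab.token_nosp++;
|
| 1047 |
-
vocab.token_not++;
|
| 1048 |
-
vocab.token_beg++;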
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1049 |
}
|
| 1050 |
|
| 1051 |
if (n_vocab < model.hparams.n_vocab) {
|
|
@@ -1074,6 +1098,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 1074 |
vocab.id_to_token[i] = word;
|
| 1075 |
}
|
| 1076 |
}
|
|
|
|
|
|
|
| 1077 |
}
|
| 1078 |
|
| 1079 |
size_t ctx_size = 0;
|
|
@@ -3281,7 +3307,7 @@ void whisper_free_params(struct whisper_full_params * params) {
|
|
| 3281 |
}
|
| 3282 |
|
| 3283 |
int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
|
| 3284 |
-
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
|
| 3285 |
log("%s: failed to compute mel spectrogram\n", __func__);
|
| 3286 |
return -1;
|
| 3287 |
}
|
|
@@ -3295,7 +3321,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
|
|
| 3295 |
|
| 3296 |
// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
|
| 3297 |
int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
|
| 3298 |
-
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
|
| 3299 |
log("%s: failed to compute mel spectrogram\n", __func__);
|
| 3300 |
return -1;
|
| 3301 |
}
|
|
@@ -3318,13 +3344,13 @@ int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float *
|
|
| 3318 |
// TODO
|
| 3319 |
|
| 3320 |
int whisper_set_mel_with_state(
|
| 3321 |
-
struct whisper_context * /*ctx*/,
|
| 3322 |
struct whisper_state * state,
|
| 3323 |
const float * data,
|
| 3324 |
int n_len,
|
| 3325 |
int n_mel) {
|
| 3326 |
-
if (n_mel != WHISPER_N_MEL) {
|
| 3327 |
-
log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
|
| 3328 |
return -1;
|
| 3329 |
}
|
| 3330 |
|
|
|
|
| 193 |
MODEL_LARGE,
|
| 194 |
};
|
| 195 |
|
| 196 |
+
static const std::map<e_model, std::string> g_model_name = {
|
| 197 |
+
{ MODEL_UNKNOWN, "unknown" },
|
| 198 |
+
{ MODEL_TINY, "tiny" },
|
| 199 |
+
{ MODEL_BASE, "base" },
|
| 200 |
+
{ MODEL_SMALL, "small" },
|
| 201 |
+
{ MODEL_MEDIUM, "medium" },
|
| 202 |
+
{ MODEL_LARGE, "large" },
|
| 203 |
+
};
|
| 204 |
+
|
| 205 |
static const std::map<std::string, std::pair<int, std::string>> g_lang = {
|
| 206 |
{ "en", { 0, "english", } },
|
| 207 |
{ "zh", { 1, "chinese", } },
|
|
|
|
| 302 |
{ "ba", { 96, "bashkir", } },
|
| 303 |
{ "jw", { 97, "javanese", } },
|
| 304 |
{ "su", { 98, "sundanese", } },
|
| 305 |
+
{ "yue", { 99, "cantonese", } },
|
| 306 |
};
|
| 307 |
|
| 308 |
static const size_t MB = 1ull*1024*1024;
|
|
|
|
| 412 |
id token_beg = 50363; // begin timestamps
|
| 413 |
|
| 414 |
bool is_multilingual() const {
|
| 415 |
+
return n_vocab >= 51865;
|
| 416 |
+
}
|
| 417 |
+
|
| 418 |
+
int num_languages() const {
|
| 419 |
+
return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
|
| 420 |
}
|
| 421 |
};
|
| 422 |
|
|
|
|
| 936 |
|
| 937 |
assert(hparams.n_text_state == hparams.n_audio_state);
|
| 938 |
|
| 939 |
+
std::string mver = "";
|
| 940 |
+
|
| 941 |
if (hparams.n_audio_layer == 4) {
|
| 942 |
model.type = e_model::MODEL_TINY;
|
| 943 |
}
|
|
|
|
| 956 |
|
| 957 |
if (hparams.n_audio_layer == 32) {
|
| 958 |
model.type = e_model::MODEL_LARGE;
|
| 959 |
+
|
| 960 |
+
if (hparams.n_vocab == 51866) {
|
| 961 |
+
mver = " v3";
|
| 962 |
+
}
|
| 963 |
}
|
| 964 |
|
| 965 |
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
|
|
|
| 988 |
log("%s: n_mels = %d\n", __func__, hparams.n_mels);
|
| 989 |
log("%s: ftype = %d\n", __func__, model.hparams.ftype);
|
| 990 |
log("%s: qntvr = %d\n", __func__, qntvr);
|
| 991 |
+
log("%s: type = %d (%s%s)\n", __func__, model.type, g_model_name.at(model.type).c_str(), mver.c_str());
|
| 992 |
|
| 993 |
// print memory requirements
|
| 994 |
{
|
|
|
|
| 1059 |
if (vocab.is_multilingual()) {
|
| 1060 |
vocab.token_eot++;
|
| 1061 |
vocab.token_sot++;
|
| 1062 |
+
|
| 1063 |
+
// account for variable number of language tokens
|
| 1064 |
+
const int dt = vocab.num_languages() - 98;
|
| 1065 |
+
|
| 1066 |
+
vocab.token_translate += dt;
|
| 1067 |
+
vocab.token_transcribe += dt;
|
| 1068 |
+
vocab.token_solm += dt;
|
| 1069 |
+
vocab.token_prev += dt;
|
| 1070 |
+
vocab.token_nosp += dt;
|
| 1071 |
+
vocab.token_not += dt;
|
| 1072 |
+
vocab.token_beg += dt;
|
| 1073 |
}
|
| 1074 |
|
| 1075 |
if (n_vocab < model.hparams.n_vocab) {
|
|
|
|
| 1098 |
vocab.id_to_token[i] = word;
|
| 1099 |
}
|
| 1100 |
}
|
| 1101 |
+
|
| 1102 |
+
log("%s: n_langs = %d\n", __func__, vocab.num_languages());
|
| 1103 |
}
|
| 1104 |
|
| 1105 |
size_t ctx_size = 0;
|
|
|
|
| 3307 |
}
|
| 3308 |
|
| 3309 |
int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
|
| 3310 |
+
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
|
| 3311 |
log("%s: failed to compute mel spectrogram\n", __func__);
|
| 3312 |
return -1;
|
| 3313 |
}
|
|
|
|
| 3321 |
|
| 3322 |
// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
|
| 3323 |
int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
|
| 3324 |
+
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
|
| 3325 |
log("%s: failed to compute mel spectrogram\n", __func__);
|
| 3326 |
return -1;
|
| 3327 |
}
|
|
|
|
| 3344 |
// TODO
|
| 3345 |
|
| 3346 |
int whisper_set_mel_with_state(
|
| 3347 |
+
struct whisper_context * ctx,
|
| 3348 |
struct whisper_state * state,
|
| 3349 |
const float * data,
|
| 3350 |
int n_len,
|
| 3351 |
int n_mel) {
|
| 3352 |
+
if (n_mel != ctx->model.filters.n_mel) {
|
| 3353 |
+
log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, ctx->model.filters.n_mel);
|
| 3354 |
return -1;
|
| 3355 |
}
|
| 3356 |
|
whisper.h
CHANGED
|
@@ -29,7 +29,6 @@
|
|
| 29 |
|
| 30 |
#define WHISPER_SAMPLE_RATE 16000
|
| 31 |
#define WHISPER_N_FFT 400
|
| 32 |
-
#define WHISPER_N_MEL 80
|
| 33 |
#define WHISPER_HOP_LENGTH 160
|
| 34 |
#define WHISPER_CHUNK_SIZE 30
|
| 35 |
|
|
|
|
| 29 |
|
| 30 |
#define WHISPER_SAMPLE_RATE 16000
|
| 31 |
#define WHISPER_N_FFT 400
|
|
|
|
| 32 |
#define WHISPER_HOP_LENGTH 160
|
| 33 |
#define WHISPER_CHUNK_SIZE 30
|
| 34 |
|