ms180 committed
Commit cd5c512 · verified · 1 Parent(s): 00de4b3

Update app.py

Updated description from v3.1 to v4

Files changed (1):
  1. app.py +25 -3
app.py CHANGED
@@ -7,7 +7,7 @@ from espnet2.bin.s2t_inference import Speech2Text as ARSpeech2Text
 from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch as CTCSpeech2Text
 
 
-TITLE="Open Whisper-style Speech Model from CMU WAVLab"
+TITLE="Open Whisper-style Speech Model V4 from CMU WAVLab"
 
 DESCRIPTION='''
 OWSM (pronounced as "awesome") is a series of Open Whisper-style Speech Models from [CMU WAVLab](https://www.wavlab.org/).
@@ -16,14 +16,18 @@ For more details, please check our [website](https://www.wavlab.org/activities/2
 '''
 
 ARTICLE = '''
-The latest demo uses OWSM v3.1 based on [E-Branchformer](https://arxiv.org/abs/2210.00077).
-OWSM v3.1 has 1.02B parameters and is trained on 180k hours of labelled data. It supports various speech-to-text tasks:
+The latest demo uses OWSM v4 based on [E-Branchformer](https://arxiv.org/abs/2210.00077).
+The OWSM v4 medium model has 1.02B parameters and is trained on 320k hours of labelled data (290k for ASR, 30k for ST).
+The OWSM v4 CTC model has 1.01B parameters and is trained on the same dataset as the medium model.
+They support various speech-to-text tasks:
 - Speech recognition in 151 languages
 - Any-to-any language speech translation
 - Utterance-level timestamp prediction
 - Long-form transcription
 - Language identification
 
+Additionally, OWSM v4 applies 8 times subsampling (instead of 4 times in OWSM v3.1) to the log Mel features, leading to a final resolution of 80 ms in the encoder. When running inference, we recommend setting maxlenratio=1.0 (default) instead of smaller values.
+
 As a demo, the input speech should not exceed 2 minutes. We also limit the maximum number of tokens to be generated.
 Please try our [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd6YmVeED6Cxy1QwT1mqv9O?usp=sharing) if you want to explore more features.
 
@@ -32,6 +36,12 @@ Please try our [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd
 Please consider citing the following papers if you find our work helpful.
 
 ```
+@inproceedings{owsm-v4,
+title={{OWSM} v4: Improving Open Whisper-Style Speech Models via Data Scaling and Cleaning},
+author={Yifan Peng and Shakeel Muhammad and Yui Sudo and William Chen and Jinchuan Tian and Chyi-Jiunn Lin and Shinji Watanabe},
+booktitle={Proceedings of the Annual Conference of the International Speech Communication Association (INTERSPEECH)},
+year={2025},
+}
 @inproceedings{peng2024owsm31,
 title={OWSM v3.1: Better and Faster Open Whisper-Style Speech Models based on E-Branchformer},
 author={Yifan Peng and Jinchuan Tian and William Chen and Siddhant Arora and Brian Yan and Yui Sudo and Muhammad Shakeel and Kwanghee Choi and Jiatong Shi and Xuankai Chang and Jee-weon Jung and Shinji Watanabe},
@@ -44,6 +54,18 @@ Please consider citing the following papers if you find our work helpful.
 booktitle={Proc. ASRU},
 year={2023}
 }
+@inproceedings{owsm-ctc,
+title = "{OWSM}-{CTC}: An Open Encoder-Only Speech Foundation Model for Speech Recognition, Translation, and Language Identification",
+author = "Peng, Yifan and
+Sudo, Yui and
+Shakeel, Muhammad and
+Watanabe, Shinji",
+booktitle = "Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL)",
+year = "2024",
+month= {8},
+url = "https://aclanthology.org/2024.acl-long.549",
+}
+
 ```
 '''
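For context, here is a minimal sketch of how a demo like this one might load an OWSM checkpoint through the `Speech2Text` class imported in app.py and follow the `maxlenratio=1.0` recommendation above. The model tag `espnet/owsm_v4_medium_1B` and the exact hypothesis tuple layout are assumptions, not taken from this commit; consult the released model cards for the actual identifiers.

```python
# Minimal sketch, not the demo's actual app.py; names flagged below are assumptions.
import soundfile as sf
from espnet2.bin.s2t_inference import Speech2Text as ARSpeech2Text

s2t = ARSpeech2Text.from_pretrained(
    "espnet/owsm_v4_medium_1B",  # hypothetical model tag; check the official model card
    device="cpu",
    beam_size=5,
    lang_sym="<eng>",   # language token (English here)
    task_sym="<asr>",   # task token: speech recognition
    maxlenratio=1.0,    # keep the default, as the updated description recommends for v4
)

speech, rate = sf.read("example.wav")  # 16 kHz mono audio, under 2 minutes for the demo
results = s2t(speech)                  # list of hypotheses, best first
print(results[0][0])                   # first field of the top hypothesis is the decoded text
```

The 80 ms encoder resolution quoted in the description follows from the usual 10 ms log Mel frame shift times the 8x subsampling (10 ms × 8 = 80 ms). Since maxlenratio caps the output length relative to the encoder output, and the v4 encoder emits half as many frames as v3.1's, a small maxlenratio would truncate generation too aggressively, hence the recommendation to keep the default of 1.0.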