jaeikkim commited on
Commit
7bfbdc3
·
0 Parent(s):

Reinit Space without binary assets

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +35 -0
  2. MMaDA/.cursor/rules/python-env.mdc +4 -0
  3. MMaDA/.gitignore +2 -0
  4. MMaDA/AIDAS-Omni-Modal-Diffusion/app.py +16 -0
  5. MMaDA/LICENSE +21 -0
  6. MMaDA/README.md +209 -0
  7. MMaDA/accelerate_configs/1_gpu.yaml +15 -0
  8. MMaDA/accelerate_configs/1_node_8_gpus_deepspeed_zero2.yaml +21 -0
  9. MMaDA/accelerate_configs/1_node_8_gpus_deepspeed_zero3.yaml +24 -0
  10. MMaDA/accelerate_configs/1_node_8_gpus_deepspeed_zero4.yaml +24 -0
  11. MMaDA/accelerate_configs/2_node_8_gpus_deepspeed_zero2_aidas.yaml +25 -0
  12. MMaDA/accelerate_configs/2_node_8_gpus_deepspeed_zero2_aidas2.yaml +25 -0
  13. MMaDA/accelerate_configs/2_node_8_gpus_deepspeed_zero4.yaml +26 -0
  14. MMaDA/accelerate_configs/3_node_8_gpus_deepspeed_zero1.yaml +25 -0
  15. MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2.yaml +21 -0
  16. MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2_aidas.yaml +25 -0
  17. MMaDA/accelerate_configs/8_node_8_gpus_deepspeed_zero2.yaml +21 -0
  18. MMaDA/app.py +894 -0
  19. MMaDA/check_lr.py +27 -0
  20. MMaDA/check_tokens.py +191 -0
  21. MMaDA/configs/mmada_demo.yaml +95 -0
  22. MMaDA/configs/mmada_demo_s2t.yaml +131 -0
  23. MMaDA/configs/mmada_demo_speech.yaml +101 -0
  24. MMaDA/configs/mmada_demo_video.yaml +95 -0
  25. MMaDA/configs/mmada_demo_video_temp.yaml +95 -0
  26. MMaDA/configs/mmada_pretraining_i2i.yaml +86 -0
  27. MMaDA/configs/mmada_pretraining_s2t.yaml +96 -0
  28. MMaDA/configs/mmada_pretraining_stage1_llada_instruct.yaml +100 -0
  29. MMaDA/configs/mmada_pretraining_stage2_llada_instruct.yaml +109 -0
  30. MMaDA/configs/mmada_pretraining_stage3_llada_instruct.yaml +112 -0
  31. MMaDA/configs/mmada_pretraining_stage3_llada_instruct_512_cot.yaml +123 -0
  32. MMaDA/configs/mmada_pretraining_stage4_llada_instruct.yaml +134 -0
  33. MMaDA/configs/mmada_pretraining_t2s.yaml +96 -0
  34. MMaDA/configs/mmada_pretraining_v2s.yaml +133 -0
  35. MMaDA/configs/mmada_pretraining_v2t.yaml +88 -0
  36. MMaDA/configs/omada_instruction_tuning.yaml +200 -0
  37. MMaDA/configs/omada_pretraining_stage1-2.yaml +131 -0
  38. MMaDA/configs/omada_pretraining_stage1-3.yaml +132 -0
  39. MMaDA/configs/omada_pretraining_stage1-4.yaml +132 -0
  40. MMaDA/configs/omada_pretraining_stage1.yaml +131 -0
  41. MMaDA/configs/omada_pretraining_v2t_inst.yaml +132 -0
  42. MMaDA/debug_speech_dataloader.py +222 -0
  43. MMaDA/eval_ASR_TTS/test.py +266 -0
  44. MMaDA/eval_ASR_TTS/whisper_asr/normalizers/__init__.py +2 -0
  45. MMaDA/eval_ASR_TTS/whisper_asr/normalizers/basic.py +76 -0
  46. MMaDA/eval_ASR_TTS/whisper_asr/normalizers/english.json +1741 -0
  47. MMaDA/eval_ASR_TTS/whisper_asr/normalizers/english.py +550 -0
  48. MMaDA/eval_ASR_TTS/whisper_asr/whisper_asr.py +0 -0
  49. MMaDA/eval_emova.py +249 -0
  50. MMaDA/generate.py +146 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
MMaDA/.cursor/rules/python-env.mdc ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ ---
2
+ alwaysApply: true
3
+ ---
4
+ When running python script, use conda env `mmada`.
MMaDA/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ exp
2
+ wandb
MMaDA/AIDAS-Omni-Modal-Diffusion/app.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spaces
3
+ import torch
4
+
5
+ zero = torch.Tensor([0]).cuda()
6
+ print(zero.device) # should print 'cpu' until GPU context is enabled
7
+
8
+
9
+ @spaces.GPU
10
+ def greet(n):
11
+ print(zero.device) # now this should print 'cuda:0'
12
+ return f"Hello {zero + n} Tensor"
13
+
14
+
15
+ demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
16
+ demo.launch()
MMaDA/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Ling Yang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
MMaDA/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+ <br>
3
+ <img src="assets/title.png" width="166">
4
+ <h3>Multimodal Large Diffusion Language Models</h3></div>
5
+
6
+ <p align="center">
7
+ <a href="https://arxiv.org/abs/2505.15809">
8
+ <img
9
+ src="https://img.shields.io/badge/MMaDA-Paper-red?logo=arxiv&logoColor=red"
10
+ alt="MMaDA Paper on arXiv"
11
+ />
12
+ </a>
13
+ <a href="https://huggingface.co/spaces/Gen-Verse/MMaDA">
14
+ <img
15
+ src="https://img.shields.io/badge/MMaDA%20Demo-Hugging%20Face%20Space-blue?logo=huggingface&logoColor=blue"
16
+ alt="MMaDA on Hugging Face"
17
+ />
18
+ </a>
19
+ <a href="https://huggingface.co/Gen-Verse/MMaDA-8B-Base">
20
+ <img
21
+ src="https://img.shields.io/badge/MMaDA--8B--Base-Hugging%20Face%20Model-orange?logo=huggingface&logoColor=yellow"
22
+ alt="MMaDA on Hugging Face"
23
+ />
24
+ </a>
25
+ <a href="https://huggingface.co/Gen-Verse/MMaDA-8B-MixCoT">
26
+ <img
27
+ src="https://img.shields.io/badge/MMaDA--8B--MixCoT-Hugging%20Face%20Model-orange?logo=huggingface&logoColor=yellow"
28
+ alt="MMaDA on Hugging Face"
29
+ />
30
+ </a>
31
+ <a href="https://github.com/Gen-Verse/MMaDA/blob/main/assets/wx-mmada-0613.jpeg">
32
+ <img
33
+ src="https://img.shields.io/badge/Wechat-Join-green?logo=wechat&amp"
34
+ alt="Wechat Group Link"
35
+ />
36
+ </a>
37
+
38
+ </p>
39
+
40
+
41
+ ## 🌌 Introduction
42
+ MMaDA is a new family of **multimodal diffusion foundation models** designed to achieve superior performance across diverse domains such as textual reasoning, multimodal understanding, and text-to-image generation. MMaDA is distinguished by three key innovations:
43
+ 1. MMaDA adopts a **unified diffusion architecture** with a shared probabilistic formulation and a modality-agnostic design, eliminating the need for modality-specific components.
44
+ 2. MMaDA introduces a **mixed long chain-of-thought (CoT) fine-tuning** strategy that curates a unified CoT format across modalities.
45
+ 3. MMaDA adopts a unified policy-gradient-based RL algorithm, which we call **UniGRPO**, tailored for diffusion foundation models. Utilizing diversified reward modeling, **UniGRPO** unifies post-training across both reasoning and generation tasks, ensuring consistent performance improvements.
46
+
47
+ <div align="center" style="width: 600px; margin: auto;">
48
+ <img src="assets/showcase0.8.gif" alt="MMaDA decoding demo" width="550" />
49
+ <p style="font-style: italic; font-size: 14px; color: #555; margin-top: 6px;">
50
+ MMaDA's decoding demo. This video showcases how a diffusion foundation model generates text and image.<br>
51
+ The "Text Generation" part uses a semi-autoregressive sampling method, while the "Multimodal Generation" part adopts non-autoregressive diffusion denoising.
52
+ </p>
53
+ </div>
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+ <!--
62
+
63
+
64
+
65
+ ## Decoding Demo
66
+ We demonstrate the decoding process of MMaDA with a teaser video to show how a diffusion model generates text and image. The "Text Generation" part adopts a "semi-autoregressive" sampling method and the "MultiModal Generation" part adopts a non-autoregressive sampling method which is purely diffusion denoising.
67
+
68
+ <!-- <div style="display: flex; justify-content: center; flex-wrap: wrap;">
69
+ <img src="assets/showcase0.8.gif" style="width: 90%" />
70
+ </div> -->
71
+
72
+ ## 📰 Latest Updates
73
+ * **[2025-06-02]** We open source our **MMaDA-8B-MixCoT** at [Huggingface](https://huggingface.co/Gen-Verse/MMaDA-8B-MixCoT).
74
+ * **[2025-05-24]** We add support for MPS inference, tested on M4.
75
+ * **[2025-05-22]** We release the inference and training code of MMaDA for text generation, multimodal generation and image generation.
76
+ * **[2025-05-22]** We open source our **MMaDA-8B-Base** at [Huggingface](https://huggingface.co/Gen-Verse/MMaDA-8B-Base). **MMaDA-8B-MixCoT** and **MMaDA-8B-Max** will be released in the near future.
77
+ * **[2025-05-22]** We release our [research paper](https://arxiv.org/abs/2505.15809) and [demo](https://huggingface.co/spaces/Gen-Verse/MMaDA) for the first unified multimodal diffusion model: MMaDA.
78
+
79
+
80
+ ## 🧬 MMaDA Series Overview
81
+
82
+ MMaDA includes a series of checkpoints reflecting different training stages:
83
+ 1. **[MMaDA-8B-Base](https://huggingface.co/Gen-Verse/MMaDA-8B-Base)**: After pretraining and instruction tuning. Capable of basic text generation, image generation, image captioning and **thinking ablities**.
84
+ 2. **[MMaDA-8B-MixCoT](https://huggingface.co/Gen-Verse/MMaDA-8B-MixCoT)**: After mixed long chain-of-thought (CoT) fine-tuning. Capable of **complex** textual, multimodal and image generation reasoning.
85
+ 3. **MMaDA-8B-Max (coming soon)**: After UniGRPO reinforment learning. Excels at complex reasoning and awesome visual generation. Will be released in the future.
86
+ <div align="center">
87
+ <img src="assets/example_compare.png" width="800">
88
+ <p><i>Overview of MMaDA's capablities.</i></p>
89
+ </div>
90
+
91
+
92
+
93
+
94
+ ## ✅ TODO
95
+ - [x] Release [MMaDA-8B-MixCoT](https://huggingface.co/Gen-Verse/MMaDA-8B-MixCoT)
96
+ - [ ] Release MMaDA-8B-Max and OpenRLHF-based UniGRPO training code.
97
+
98
+ ## ⚙️ Quick Start
99
+ First, set up the enviroment:
100
+ ```
101
+ pip install -r requirements.txt
102
+ ```
103
+ Launch local Gradio demo:
104
+ ```
105
+ python app.py
106
+ ```
107
+ Or try it online via our [Huggingface Demo](https://huggingface.co/spaces/Gen-Verse/MMaDA).
108
+
109
+ ## 🚀 Inference
110
+ For batch-level inference, we provide our inference scripts here.
111
+ ### 1. Text Generation
112
+ For text generation, we follow LLaDA's configuration and generation script. Simple run:
113
+ ```bash
114
+ python generate.py
115
+ ```
116
+
117
+ ### 2. MultiModal Generation
118
+ For multimodal generation and text-to-image generation, first login your wandb account:
119
+ ```
120
+ wandb login
121
+ ```
122
+ Inference demo for MultiModal Generation and you can view the results on wandb:
123
+ ```
124
+ python3 inference_mmu.py config=configs/mmada_demo.yaml mmu_image_root=./mmu_validation question='Please describe this image in detail.'
125
+ ```
126
+
127
+ ### 3. Text-to-Image Genertion
128
+ For multimodal generation and text-to-image generation, first login your wandb account:
129
+ ```
130
+ wandb login
131
+ ```
132
+ Inference demo for Text-to-Image Genertion and you can view the results on wandb:
133
+ ```
134
+ python3 inference_t2i.py config=configs/mmada_demo.yaml batch_size=1 validation_prompts_file=validation_prompts/text2image_prompts.txt guidance_scale=3.5 generation_timesteps=15
135
+ mode='t2i'
136
+ ```
137
+
138
+ ## 🔧 Training
139
+ **Update your training data path in `configs/xx.yaml`.**
140
+
141
+ ### Stage 0. Prepare your accelerate configs
142
+ Please first prepare your accelerate configs. You can simple run
143
+ ```
144
+ accelerate config
145
+ ```
146
+
147
+ Or use our provided configs in `accelerate_configs`:
148
+ ```
149
+ ├── accelerate_configs/
150
+ | ├── 1_gpu.yaml
151
+ | └── 8_node_8_gpus_deepspeed_zero2.yaml (for 8 * 8 gpus)
152
+ ```
153
+
154
+ ### Stage 1.1: Pre-training on ImageNet
155
+ First we use LLaDA-8B-Instruct to initialize our model, and train on ImageNet for basic visual capbalities.
156
+ ```
157
+ accelerate launch --config_file path/to/your/accelerate_config --main_process_port=8888 training/train_mmada.py config=configs/mmada_pretraining_stage1_llada_instruct.yaml
158
+ ```
159
+
160
+ ### Stage 1.2 Pre-training on Image-Text Dataset
161
+ Then we replace the ImageNet dataset in Stage 1.1 with Image-Text Dataset. Please change the pretrained model path in `mmada_pretraining_stage2_llada_instruct.yaml` with your checkpoint in Stage 1.1
162
+ ```
163
+ accelerate launch --config_file path/to/your/accelerate_config --main_process_port=8888 training/train_mmada_stage2.py config=configs/mmada_pretraining_stage2_llada_instruct.yaml
164
+ ```
165
+
166
+ ### Stage 1.3 Pre-training on Text Instruction following
167
+ In this stage, we begin training on text instruction following and include corresponding validations. Please change the pretrained model path in `mmada_pretraining_stage3_llada_instruct.yaml` with your checkpoint in Stage 1.2
168
+ ```
169
+ accelerate launch --config_file path/to/your/accelerate_config --main_process_port=8888 training/train_mmada_stage3.py config=configs/mmada_pretraining_stage3_llada_instruct.yaml
170
+ ```
171
+
172
+ ### Stage 2.1 Mix-CoT Training (Text Only)
173
+ In this stage, we begin our Mix-CoT finetuning with text reasoning first, along with improved image quality. Please change the pretrained model path in `mmada_pretraining_stage3_llada_instruct.yaml` with your checkpoint in Stage 1.3 and prepare your CoT data.
174
+ ```
175
+ accelerate launch --config_file path/to/your/accelerate_config --main_process_port=8888 training/train_mmada_stage_cot_sft.py config=configs/mmada_pretraining_stage3_llada_instruct_512_cot.yaml
176
+ ```
177
+
178
+ ### Stage 2.2 Mix-CoT Training (with MultiModal Reasoning)
179
+ In this stage, we include multimodal reasoning, along with improved image quality. Please change the pretrained model path in `mmada_pretraining_stage3_llada_instruct.yaml` with your checkpoint in Stage 2.1 and prepare your CoT data.
180
+ ```
181
+ accelerate launch --config_file path/to/your/accelerate_config --main_process_port=8888 training/train_mmada_stage4.py config=configs/mmada_pretraining_stage4_llada_instruct.yaml
182
+ ```
183
+
184
+ ### Stage 3 UniGRPO RL
185
+ [Will be released once we finished our code transition to OpenRLHF]
186
+
187
+
188
+ ## 📖 Citation
189
+ ```
190
+ @article{yang2025mmada,
191
+ title={MMaDA: Multimodal Large Diffusion Language Models},
192
+ author={Yang, Ling and Tian, Ye and Li, Bowen and Zhang, Xinchen and Shen, Ke and Tong, Yunhai and Wang, Mengdi},
193
+ journal={arXiv preprint arXiv:2505.15809},
194
+ year={2025}
195
+ }
196
+ ```
197
+
198
+ ## 🤝 Acknowledgments
199
+ This work is heavily based on [Show-o](https://github.com/showlab/Show-o), [LLaDA](https://github.com/ML-GSAI/LLaDA), [maskgit](https://github.com/google-research/maskgit), [transformers](https://github.com/huggingface/transformers), [accelerate](https://github.com/huggingface/accelerate) and [webdataset](https://github.com/webdataset/webdataset). Thanks to all the authors for their great work.
200
+
201
+ ## 💬 Discussion and Collaboration
202
+
203
+ Welcome to discuss and collaborate with us for continuously improving MMaDA. If you have any bad cases, please kindly share them in the [Issue](https://github.com/Gen-Verse/MMaDA/issues/4#issue-3083196081).
204
+
205
+ Also, you can reach us with this WeChat QR code!
206
+ <p align="center">
207
+ <img src="assets/wx-mmada-0613.jpeg" width="256">
208
+ </p>
209
+
MMaDA/accelerate_configs/1_gpu.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ distributed_type: 'NO'
3
+ downcast_bf16: 'no'
4
+ gpu_ids: '0'
5
+ machine_rank: 0
6
+ main_training_function: main
7
+ mixed_precision: bf16
8
+ num_machines: 1
9
+ num_processes: 1
10
+ rdzv_backend: static
11
+ same_network: true
12
+ tpu_env: []
13
+ tpu_use_cluster: false
14
+ tpu_use_sudo: false
15
+ use_cpu: false
MMaDA/accelerate_configs/1_node_8_gpus_deepspeed_zero2.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 1
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero_stage: 2
10
+ distributed_type: DEEPSPEED
11
+ downcast_bf16: 'no'
12
+ main_training_function: main
13
+ mixed_precision: bf16
14
+ num_machines: 1
15
+ num_processes: 8
16
+ rdzv_backend: static
17
+ same_network: true
18
+ tpu_env: []
19
+ tpu_use_cluster: false
20
+ tpu_use_sudo: false
21
+ use_cpu: false
MMaDA/accelerate_configs/1_node_8_gpus_deepspeed_zero3.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 2
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero3_save_16bit_model: true
10
+ zero_stage: 3
11
+ zero_optimization:
12
+ overlap_comm: false
13
+ distributed_type: DEEPSPEED
14
+ downcast_bf16: 'no'
15
+ main_training_function: main
16
+ mixed_precision: bf16
17
+ num_machines: 1
18
+ num_processes: 8
19
+ rdzv_backend: static
20
+ same_network: true
21
+ tpu_env: []
22
+ tpu_use_cluster: false
23
+ tpu_use_sudo: false
24
+ use_cpu: false
MMaDA/accelerate_configs/1_node_8_gpus_deepspeed_zero4.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 1
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero3_save_16bit_model: true
10
+ zero_stage: 2
11
+ zero_optimization:
12
+ overlap_comm: false
13
+ distributed_type: DEEPSPEED
14
+ downcast_bf16: 'no'
15
+ main_training_function: main
16
+ mixed_precision: bf16
17
+ num_machines: 1
18
+ num_processes: 8
19
+ rdzv_backend: static
20
+ same_network: true
21
+ tpu_env: []
22
+ tpu_use_cluster: false
23
+ tpu_use_sudo: false
24
+ use_cpu: false
MMaDA/accelerate_configs/2_node_8_gpus_deepspeed_zero2_aidas.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 1
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero3_save_16bit_model: true
10
+ zero_stage: 2
11
+ zero_optimization:
12
+ overlap_comm: false
13
+ distributed_type: DEEPSPEED
14
+ downcast_bf16: 'no'
15
+ enable_cpu_affinity: false
16
+ main_process_ip: 172.51.80.134
17
+ main_training_function: main
18
+ num_machines: 2
19
+ num_processes: 16
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
MMaDA/accelerate_configs/2_node_8_gpus_deepspeed_zero2_aidas2.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 1
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero3_save_16bit_model: true
10
+ zero_stage: 2
11
+ zero_optimization:
12
+ overlap_comm: false
13
+ distributed_type: DEEPSPEED
14
+ downcast_bf16: 'no'
15
+ enable_cpu_affinity: false
16
+ main_process_ip: 172.51.80.136
17
+ main_training_function: main
18
+ num_machines: 4
19
+ num_processes: 32
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
MMaDA/accelerate_configs/2_node_8_gpus_deepspeed_zero4.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 4
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero3_save_16bit_model: true
10
+ zero_stage: 2
11
+ zero_optimization:
12
+ overlap_comm: false
13
+ distributed_type: DEEPSPEED
14
+ downcast_bf16: 'no'
15
+ enable_cpu_affinity: false
16
+ main_process_ip: 172.51.64.134
17
+ main_training_function: main
18
+ num_machines: 2
19
+ num_processes: 16
20
+ machine_rank: 1
21
+ rdzv_backend: static
22
+ same_network: true
23
+ tpu_env: []
24
+ tpu_use_cluster: false
25
+ tpu_use_sudo: false
26
+ use_cpu: false
MMaDA/accelerate_configs/3_node_8_gpus_deepspeed_zero1.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 4
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero3_save_16bit_model: true
10
+ zero_stage: 2
11
+ zero_optimization:
12
+ overlap_comm: false
13
+ distributed_type: DEEPSPEED
14
+ downcast_bf16: 'no'
15
+ enable_cpu_affinity: false
16
+ main_process_ip: 172.51.64.130
17
+ main_training_function: main
18
+ num_machines: 3
19
+ num_processes: 24
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 4
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero_stage: 2
10
+ distributed_type: DEEPSPEED
11
+ downcast_bf16: 'no'
12
+ main_training_function: main
13
+ mixed_precision: bf16
14
+ num_machines: 4
15
+ num_processes: 32
16
+ rdzv_backend: static
17
+ same_network: true
18
+ tpu_env: []
19
+ tpu_use_cluster: false
20
+ tpu_use_sudo: false
21
+ use_cpu: false
MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2_aidas.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 1
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: none #cpu
7
+ offload_param_device: none #cpu
8
+ zero3_init_flag: true
9
+ zero3_save_16bit_model: true
10
+ zero_stage: 2
11
+ zero_optimization:
12
+ overlap_comm: false
13
+ distributed_type: DEEPSPEED
14
+ downcast_bf16: 'no'
15
+ enable_cpu_affinity: true
16
+ main_process_ip: 172.51.133.6
17
+ main_training_function: main
18
+ num_machines: 4
19
+ num_processes: 32
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
MMaDA/accelerate_configs/8_node_8_gpus_deepspeed_zero2.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 1
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero_stage: 2
10
+ distributed_type: DEEPSPEED
11
+ downcast_bf16: 'no'
12
+ main_training_function: main
13
+ mixed_precision: bf16
14
+ num_machines: 8
15
+ num_processes: 64
16
+ rdzv_backend: static
17
+ same_network: true
18
+ tpu_env: []
19
+ tpu_use_cluster: false
20
+ tpu_use_sudo: false
21
+ use_cpu: false
MMaDA/app.py ADDED
@@ -0,0 +1,894 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import torch.nn.functional as F
5
+ from transformers import AutoTokenizer
6
+ from torchvision import transforms
7
+ from models import MAGVITv2, get_mask_schedule, MMadaModelLM
8
+ from training.prompting_utils import UniversalPrompting
9
+ from PIL import Image
10
+
11
+ def image_transform(image, resolution=256, normalize=True):
12
+ image = transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BICUBIC)(image)
13
+ image = transforms.CenterCrop((resolution, resolution))(image)
14
+ image = transforms.ToTensor()(image)
15
+ if normalize:
16
+ image = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)(image)
17
+ return image
18
+
19
+ def add_gumbel_noise(logits, temperature):
20
+ """
21
+ Adds Gumbel noise to logits for stochastic sampling.
22
+ Equivalent to argmax(logits + temperature * G) where G ~ Gumbel(0,1).
23
+ This version is more numerically stable than a version involving exp() and division.
24
+ """
25
+ if abs(temperature) < 1e-9: # Effectively zero temperature
26
+ return logits
27
+ # Ensure logits are float64 for precision with noise, as suggested by user context
28
+ if DEVICE == "mps":
29
+ logits = logits.to(torch.float32)
30
+ else:
31
+ logits = logits.to(torch.float64)
32
+ # Standard Gumbel noise: -log(-log(U)), U ~ Uniform(0,1)
33
+ # Add small epsilon for numerical stability inside logs
34
+ if DEVICE == "mps":
35
+ noise = torch.rand_like(logits, dtype=torch.float32)
36
+ else:
37
+ noise = torch.rand_like(logits, dtype=torch.float64)
38
+ standard_gumbel_noise = -torch.log(-torch.log(noise + 1e-20) + 1e-20)
39
+ return logits + temperature * standard_gumbel_noise
40
+
41
+ def get_num_transfer_tokens(mask_index, steps):
42
+ mask_num = mask_index.sum(dim=1, keepdim=True)
43
+ # Ensure steps is at least 1 to avoid division by zero if mask_num is also 0 (though sum should be >=0)
44
+ steps = max(1, int(steps)) # Ensure steps is a positive integer
45
+ base = mask_num // steps
46
+ remainder = mask_num % steps
47
+ num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.long) + base
48
+ for i in range(mask_num.size(0)): # Iterate over batch
49
+ if remainder[i] > 0 : # Ensure remainder is positive before indexing
50
+ num_transfer_tokens[i, :remainder[i].item()] += 1 # .item() for single value tensor to int
51
+ return num_transfer_tokens
52
+
53
+ MODEL = None
54
+ TOKENIZER = None
55
+ DEVICE = (
56
+ "cuda"
57
+ if torch.cuda.is_available()
58
+ else "mps" if torch.backends.mps.is_available() else "cpu"
59
+ )
60
+ MASK_ID = None
61
+ uni_prompting = None
62
+ VQ_MODEL = MAGVITv2().from_pretrained("showlab/magvitv2").to(DEVICE)
63
+
64
+ DEFAULT_MODEL_PATH = "Gen-Verse/MMaDA-8B-Base" # Default
65
+ CURRENT_MODEL_PATH = None
66
+
67
+ MODEL_CHOICES = [
68
+ "MMaDA-8B-Base",
69
+ "MMaDA-8B-MixCoT (coming soon)",
70
+ "MMaDA-8B-Max (coming soon)"
71
+ ]
72
+ MODEL_ACTUAL_PATHS = {
73
+ "MMaDA-8B-Base": DEFAULT_MODEL_PATH,
74
+ }
75
+
76
+ def clear_outputs_action():
77
+ return None, None
78
+
79
+ def _load_model_and_tokenizer_core(model_path_to_load, model_display_name_for_status):
80
+ global MODEL, TOKENIZER, MASK_ID, CURRENT_MODEL_PATH, DEVICE, uni_prompting
81
+
82
+ if MODEL is not None and CURRENT_MODEL_PATH == model_path_to_load:
83
+ return f"Model '{model_display_name_for_status}' from '{model_path_to_load}' is already loaded. MASK_ID: {MASK_ID}"
84
+
85
+ CURRENT_MODEL_PATH = model_path_to_load
86
+
87
+ status_msg_parts = [f"Loading '{model_display_name_for_status}'..."]
88
+ try:
89
+ TOKENIZER = AutoTokenizer.from_pretrained(model_path_to_load, trust_remote_code=True)
90
+ status_msg_parts.append(f"Tokenizer for '{model_display_name_for_status}' loaded.")
91
+
92
+ MODEL = MMadaModelLM.from_pretrained(model_path_to_load, trust_remote_code=True, torch_dtype=torch.bfloat16).to(DEVICE).eval()
93
+ status_msg_parts.append(f"Model '{model_display_name_for_status}' loaded to {DEVICE}.")
94
+
95
+ uni_prompting = UniversalPrompting(TOKENIZER, max_text_len=512, special_tokens=("<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>", "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>"),ignore_id=-100, cond_dropout_prob=0.1, use_reserved_token=True)
96
+
97
+ if hasattr(TOKENIZER, 'mask_token_id') and TOKENIZER.mask_token_id is not None:
98
+ MASK_ID = TOKENIZER.mask_token_id
99
+ status_msg_parts.append(f"Using MASK_ID from tokenizer: {MASK_ID}.")
100
+ else:
101
+ MASK_ID = 126336
102
+ status_msg_parts.append(f"Using default MASK_ID: {MASK_ID}.")
103
+
104
+ if TOKENIZER.pad_token_id is None:
105
+ if TOKENIZER.eos_token_id is not None:
106
+ TOKENIZER.pad_token_id = TOKENIZER.eos_token_id
107
+ TOKENIZER.pad_token = TOKENIZER.eos_token
108
+ status_msg_parts.append(f"Set pad_token_id to eos_token_id ({TOKENIZER.eos_token_id}).")
109
+ else:
110
+ status_msg_parts.append("Warning: pad_token_id is None and no eos_token_id.")
111
+
112
+ if TOKENIZER.eos_token_id is None: # Important for cleaning up output in visualization
113
+ status_msg_parts.append("Warning: tokenizer.eos_token_id is None. EOS cleanup might not work.")
114
+
115
+ TOKENIZER.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}"
116
+
117
+ return " ".join(status_msg_parts)
118
+ except Exception as e:
119
+ MODEL = None
120
+ TOKENIZER = None
121
+ MASK_ID = None
122
+ CURRENT_MODEL_PATH = None
123
+ return f"Error loading model '{model_display_name_for_status}': {str(e)}"
124
+
125
+ def handle_model_selection_change(selected_model_name_ui):
126
+ if "coming soon" in selected_model_name_ui.lower():
127
+ global MODEL, TOKENIZER, MASK_ID, CURRENT_MODEL_PATH
128
+ MODEL = None
129
+ TOKENIZER = None
130
+ MASK_ID = None
131
+ CURRENT_MODEL_PATH = None
132
+ return f"'{selected_model_name_ui}' is not yet available. Please select 'Model A'."
133
+
134
+ actual_path = MODEL_ACTUAL_PATHS.get(selected_model_name_ui)
135
+ if not actual_path:
136
+ return f"Path for '{selected_model_name_ui}' is not defined. Cannot load."
137
+
138
+ return _load_model_and_tokenizer_core(actual_path, selected_model_name_ui)
139
+
140
+
141
+ def get_highlighted_text_tuples(current_x_ids_batch, prompt_input_ids, prompt_len, tk, current_mask_id, raw_prompt_attention_mask):
142
+ if current_x_ids_batch is None or current_x_ids_batch.ndim == 0 or current_x_ids_batch.shape[0] == 0:
143
+ return [("Error in sequence data for visualization.", "ERROR")]
144
+ # only answer part
145
+ current_x_ids_batch = current_x_ids_batch[:, prompt_len:]
146
+ seq_ids = current_x_ids_batch[0].tolist()
147
+ eos_token_id = tk.eos_token_id # Get EOS token ID
148
+
149
+ # Stage 1: Build initial list of tuples with (token_str, label, token_id_int)
150
+ # This helps in identifying EOS tokens later without re-checking the type.
151
+ intermediate_tuples = []
152
+ for j, token_id_int in enumerate(seq_ids):
153
+ try:
154
+ token_str = tk.decode([token_id_int], skip_special_tokens=True, clean_up_tokenization_spaces=False)
155
+ except Exception: # Handle cases where a token ID might be problematic (e.g. with mock)
156
+ token_str = f"[ID:{token_id_int}]"
157
+
158
+ label = "ERROR"
159
+ if token_id_int == current_mask_id:
160
+ token_str = "[MASK]"
161
+ label = "MASK"
162
+ else:
163
+ label = "GEN"
164
+ intermediate_tuples.append((token_str, label, token_id_int))
165
+
166
+ return intermediate_tuples
167
+
168
+ @torch.no_grad()
169
+ def generate_viz_wrapper_t2i(prompt_text, steps, guidance_scale, mask_schedule="cosine"):
170
+ global MODEL, TOKENIZER, MASK_ID, DEVICE, uni_prompting
171
+
172
+ if MODEL is None or TOKENIZER is None or MASK_ID is None:
173
+ yield [("Error: Model not loaded. Please load the model first.", "ERROR")], "Model not loaded."
174
+ return
175
+ steps = int(steps)
176
+ guidance_scale = float(guidance_scale)
177
+
178
+ image_tokens = torch.ones((1, 1024), dtype=torch.long, device=DEVICE) * MASK_ID
179
+ prompt_text = [prompt_text]
180
+ input_ids, attention_mask = uni_prompting((prompt_text, image_tokens), 't2i_gen')
181
+
182
+ if guidance_scale > 0:
183
+ uncond_input_ids, uncond_attention_mask = uni_prompting(([''], image_tokens), 't2i_gen')
184
+ else:
185
+ uncond_input_ids, uncond_attention_mask = None, None
186
+
187
+ mask_schedule = get_mask_schedule(mask_schedule)
188
+ blank_image = Image.new("RGB", (512, 512), (255, 255, 255))
189
+ yield blank_image, "Starting generation..."
190
+ for image_step, status_msg_step in MODEL.t2i_generate_decoding_stepwise(
191
+ input_ids = input_ids,
192
+ uncond_input_ids = uncond_input_ids,
193
+ attention_mask = attention_mask,
194
+ uncond_attention_mask = uncond_attention_mask,
195
+ temperature=1.0,
196
+ timesteps = steps,
197
+ guidance_scale = guidance_scale,
198
+ noise_schedule = mask_schedule,
199
+ noise_type = "mask",
200
+ seq_len = 1024,
201
+ vq_model = VQ_MODEL,
202
+ uni_prompting=uni_prompting):
203
+ yield image_step, status_msg_step
204
+
205
+
206
+
207
+
208
+ @torch.no_grad()
209
+ def generate_viz_wrapper_lm(prompt_text, steps, gen_length, block_length, temperature,
210
+ cfg_scale, remasking_strategy, thinking_mode_lm):
211
+ global MODEL, TOKENIZER, MASK_ID, DEVICE
212
+ print(f"thinking_mode_lm: {thinking_mode_lm}")
213
+ if MODEL is None or TOKENIZER is None or MASK_ID is None:
214
+ yield [("Error: Model not loaded. Please load the model first.", "ERROR")], "Model not loaded."
215
+ return
216
+
217
+ steps = int(steps)
218
+ gen_length = int(gen_length)
219
+ block_length = int(block_length)
220
+
221
+ if thinking_mode_lm:
222
+ prompt_text = "You should first think about the reasoning process in the mind and then provide the user with the answer. The reasoning process is enclosed within <think> </think> tags, i.e. <think> reasoning process here </think> answer here\n" + prompt_text
223
+
224
+ try:
225
+ m = [{"role": "user", "content": prompt_text}]
226
+ processed_prompt_text = TOKENIZER.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
227
+ except Exception as e:
228
+ yield [("Error applying chat template.", "ERROR")], f"Chat template error: {e}"
229
+ processed_prompt_text = prompt_text
230
+ try:
231
+ if TOKENIZER.pad_token_id is None:
232
+ if TOKENIZER.eos_token_id is not None:
233
+ TOKENIZER.pad_token_id = TOKENIZER.eos_token_id
234
+ else: # Should have been caught by load_model, but double check
235
+ yield [("Tokenizer Error", "ERROR")], "pad_token_id is not set in tokenizer."
236
+ return
237
+
238
+ input_ids = TOKENIZER(text=processed_prompt_text, return_tensors="pt", padding="longest", padding_side="left", truncation=True, max_length=MODEL.config.max_position_embeddings if hasattr(MODEL.config, 'max_position_embeddings') else 2048)['input_ids'].to(DEVICE)
239
+ raw_prompt_attention_mask = None
240
+
241
+ except Exception as e:
242
+ yield [("Error tokenizing prompt.", "ERROR")], f"Tokenization error: {e}"
243
+ return
244
+
245
+
246
+
247
+ batch_size = input_ids.shape[0]
248
+ prompt_len = input_ids.shape[1]
249
+
250
+ x = torch.full((batch_size, prompt_len + gen_length), MASK_ID, dtype=torch.long, device=DEVICE)
251
+ x[:, :prompt_len] = input_ids.clone()
252
+
253
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), "Starting generation: Prompt + Initial Masks"
254
+
255
+ if gen_length == 0:
256
+ final_text_output = TOKENIZER.batch_decode(x[:,prompt_len:], skip_special_tokens=True)
257
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_output[0] if final_text_output else ""
258
+ return
259
+
260
+ if block_length <= 0 or gen_length % block_length != 0 :
261
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
262
+ f"Error: gen_length ({gen_length}) must be divisible by block_length ({block_length}) and block_length > 0."
263
+ return
264
+ num_blocks = gen_length // block_length
265
+
266
+ if steps <=0 or steps % num_blocks != 0:
267
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
268
+ f"Error: steps ({steps}) must be positive and divisible by num_blocks ({num_blocks}). Steps: {steps}, Num Blocks: {num_blocks}"
269
+ return
270
+ steps_per_block = steps // num_blocks
271
+
272
+ for num_block_iter in range(num_blocks):
273
+ current_block_start_idx_in_x = prompt_len + num_block_iter * block_length
274
+ current_block_end_idx_in_x = prompt_len + (num_block_iter + 1) * block_length
275
+
276
+ block_masks_bool_current = torch.zeros_like(x, dtype=torch.bool)
277
+ block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x] = \
278
+ (x[:, current_block_start_idx_in_x:current_block_end_idx_in_x] == MASK_ID)
279
+
280
+ num_transfer_tokens_for_this_block = get_num_transfer_tokens(
281
+ block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x],
282
+ steps_per_block
283
+ )
284
+
285
+ for i_step_in_block in range(steps_per_block):
286
+ mask_index_global = (x == MASK_ID)
287
+
288
+ if cfg_scale > 0.:
289
+ un_x = x.clone()
290
+ # For unconditional pass, mask out the original prompt tokens that are not padding
291
+ # raw_prompt_attention_mask is (B, prompt_len)
292
+ prompt_active_tokens_mask = raw_prompt_attention_mask.bool() # True where actual prompt tokens are
293
+ un_x[:, :prompt_len][prompt_active_tokens_mask] = MASK_ID
294
+
295
+ x_cfg_input = torch.cat([x, un_x], dim=0)
296
+ # Pass attention_mask for CFG if model expects it, covering both parts
297
+ # For simplicity, not passing explicit attention_mask here; relies on model's internal handling.
298
+ model_output = MODEL(x_cfg_input)
299
+ logits_cond, logits_uncond = torch.chunk(model_output.logits, 2, dim=0)
300
+ logits = logits_uncond + (cfg_scale + 1) * (logits_cond - logits_uncond)
301
+ else:
302
+ # Not passing explicit attention_mask here; relies on model's internal handling.
303
+ model_output = MODEL(x)
304
+ logits = model_output.logits
305
+
306
+ logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
307
+ x0_predicted_tokens = torch.argmax(logits_with_noise, dim=-1)
308
+
309
+ if remasking_strategy == 'low_confidence':
310
+ if DEVICE == "mps":
311
+ probs = F.softmax(logits.to(torch.float32), dim=-1)
312
+ else:
313
+ probs = F.softmax(logits.to(torch.float64), dim=-1)
314
+ x0_probs = torch.gather(probs, dim=-1, index=x0_predicted_tokens.unsqueeze(-1)).squeeze(-1)
315
+ elif remasking_strategy == 'random':
316
+ if DEVICE == "mps":
317
+ x0_probs = torch.rand(x.shape, device=x.device, dtype=torch.float32)
318
+ else:
319
+ x0_probs = torch.rand(x.shape, device=x.device, dtype=torch.float64)
320
+ else:
321
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), f"Error: Unknown remasking strategy '{remasking_strategy}'"
322
+ return
323
+
324
+ confidence_for_selection = torch.full_like(x0_probs, -torch.inf)
325
+ candidate_positions_for_unmasking = mask_index_global & block_masks_bool_current
326
+ confidence_for_selection = torch.where(
327
+ candidate_positions_for_unmasking,
328
+ x0_probs,
329
+ -torch.inf
330
+ )
331
+
332
+ x0_final_candidates = torch.where(mask_index_global, x0_predicted_tokens, x)
333
+
334
+ transfer_indices_bool = torch.zeros_like(x, dtype=torch.bool)
335
+ num_to_transfer_this_step_batch = num_transfer_tokens_for_this_block[:, i_step_in_block]
336
+
337
+ for j_batch_idx in range(batch_size):
338
+ k_val = min(num_to_transfer_this_step_batch[j_batch_idx].item(),
339
+ candidate_positions_for_unmasking[j_batch_idx].sum().item()) # ensure k isn't too large
340
+
341
+ if k_val > 0:
342
+ # Ensure confidence_for_selection[j_batch_idx] is 1D for topk
343
+ conf_slice = confidence_for_selection[j_batch_idx]
344
+ if conf_slice.ndim > 1: conf_slice = conf_slice.view(-1) # Should already be 1D from x0_probs
345
+
346
+ # Check if there are enough valid (non -inf) confidences
347
+ valid_conf_count = (conf_slice > -torch.inf).sum().item()
348
+ actual_k = min(k_val, valid_conf_count)
349
+
350
+ if actual_k > 0:
351
+ _, topk_indices_in_x = torch.topk(conf_slice, k=actual_k)
352
+ transfer_indices_bool[j_batch_idx, topk_indices_in_x] = True
353
+
354
+ x[transfer_indices_bool] = x0_final_candidates[transfer_indices_bool]
355
+
356
+ current_total_step = num_block_iter * steps_per_block + i_step_in_block + 1
357
+ total_overall_steps = num_blocks * steps_per_block
358
+ status_msg = f"Block {num_block_iter+1}/{num_blocks}, Step {i_step_in_block+1}/{steps_per_block} (Total: {current_total_step}/{total_overall_steps})"
359
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), status_msg
360
+
361
+ final_generated_ids = x[:, prompt_len:]
362
+ final_text_output = TOKENIZER.batch_decode(final_generated_ids, skip_special_tokens=True)
363
+
364
+ final_text_str = final_text_output[0] if final_text_output and len(final_text_output) > 0 else ""
365
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_str
366
+
367
+ @torch.no_grad()
368
+ def generate_viz_wrapper(uploaded_image_pil, prompt_text, steps, gen_length, block_length, temperature,
369
+ cfg_scale, remasking_strategy, thinking_mode_mmu):
370
+ global MODEL, TOKENIZER, MASK_ID, DEVICE
371
+
372
+ if MODEL is None or TOKENIZER is None or MASK_ID is None:
373
+ yield [("Error: Model not loaded. Please load the model first.", "ERROR")], "Model not loaded."
374
+ return
375
+
376
+ steps = int(steps)
377
+ gen_length = int(gen_length)
378
+ block_length = int(block_length)
379
+
380
+ if thinking_mode_mmu:
381
+ prompt_text = "You should first think about the reasoning process in the mind and then provide the user with the answer. The reasoning process is enclosed within <think> </think> tags, i.e. <think> reasoning process here </think> answer here\n" + prompt_text
382
+
383
+ try:
384
+ m = [{"role": "user", "content": prompt_text}]
385
+ processed_prompt_text = TOKENIZER.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
386
+ except Exception as e:
387
+ yield [("Error applying chat template.", "ERROR")], f"Chat template error: {e}"
388
+ processed_prompt_text = prompt_text
389
+
390
+ image_vq_ids_tensor = None
391
+ if uploaded_image_pil is not None:
392
+ try:
393
+
394
+ image = image_transform(uploaded_image_pil, resolution=512).to(DEVICE)
395
+ image = image.unsqueeze(0)
396
+ image_vq_ids_tensor = VQ_MODEL.get_code(image) + 126349
397
+ except Exception as e:
398
+ yield [("Error processing image.", "ERROR")], f"Image to VQ tokens conversion failed: {str(e)}"
399
+ return
400
+
401
+
402
+ try:
403
+ if TOKENIZER.pad_token_id is None:
404
+ if TOKENIZER.eos_token_id is not None:
405
+ TOKENIZER.pad_token_id = TOKENIZER.eos_token_id
406
+ else:
407
+ yield [("Tokenizer Error", "ERROR")], "pad_token_id is not set in tokenizer."
408
+ return
409
+
410
+ input_ids = TOKENIZER(text=processed_prompt_text, return_tensors="pt", padding="longest", padding_side="left", truncation=True, max_length=MODEL.config.max_position_embeddings if hasattr(MODEL.config, 'max_position_embeddings') else 2048)['input_ids'].to(DEVICE)
411
+ raw_prompt_attention_mask = None
412
+ if image_vq_ids_tensor is not None:
413
+ if image_vq_ids_tensor.ndim == 1:
414
+ image_vq_ids_tensor = image_vq_ids_tensor.unsqueeze(0)
415
+
416
+ input_ids = torch.cat([
417
+ (torch.ones(input_ids.shape[0], 1) * torch.tensor([126089])).to(DEVICE),
418
+ (torch.ones(input_ids.shape[0], 1) * torch.tensor([126084])).to(DEVICE),
419
+ image_vq_ids_tensor,
420
+ (torch.ones(input_ids.shape[0], 1) * torch.tensor([126085])).to(DEVICE),
421
+ input_ids
422
+ ], dim=1).long()
423
+
424
+ else:
425
+ input_ids = input_ids
426
+
427
+
428
+ except Exception as e:
429
+ yield [("Error tokenizing prompt.", "ERROR")], f"Tokenization error: {e}"
430
+ return
431
+
432
+
433
+
434
+ batch_size = input_ids.shape[0]
435
+ prompt_len = input_ids.shape[1]
436
+
437
+ x = torch.full((batch_size, prompt_len + gen_length), MASK_ID, dtype=torch.long, device=DEVICE)
438
+ x[:, :prompt_len] = input_ids.clone()
439
+
440
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), "Starting generation: Prompt + Initial Masks"
441
+
442
+ if gen_length == 0:
443
+ final_text_output = TOKENIZER.batch_decode(x[:,prompt_len:], skip_special_tokens=True)
444
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_output[0] if final_text_output else ""
445
+ return
446
+
447
+ if block_length <= 0 or gen_length % block_length != 0 :
448
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
449
+ f"Error: gen_length ({gen_length}) must be divisible by block_length ({block_length}) and block_length > 0."
450
+ return
451
+ num_blocks = gen_length // block_length
452
+
453
+ if steps <=0 or steps % num_blocks != 0:
454
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
455
+ f"Error: steps ({steps}) must be positive and divisible by num_blocks ({num_blocks}). Steps: {steps}, Num Blocks: {num_blocks}"
456
+ return
457
+ steps_per_block = steps // num_blocks
458
+
459
+ for num_block_iter in range(num_blocks):
460
+ current_block_start_idx_in_x = prompt_len + num_block_iter * block_length
461
+ current_block_end_idx_in_x = prompt_len + (num_block_iter + 1) * block_length
462
+
463
+ block_masks_bool_current = torch.zeros_like(x, dtype=torch.bool)
464
+ block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x] = \
465
+ (x[:, current_block_start_idx_in_x:current_block_end_idx_in_x] == MASK_ID)
466
+
467
+ num_transfer_tokens_for_this_block = get_num_transfer_tokens(
468
+ block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x],
469
+ steps_per_block
470
+ )
471
+
472
+ for i_step_in_block in range(steps_per_block):
473
+ mask_index_global = (x == MASK_ID)
474
+
475
+ if cfg_scale > 0.:
476
+ un_x = x.clone()
477
+ # For unconditional pass, mask out the original prompt tokens that are not padding
478
+ # raw_prompt_attention_mask is (B, prompt_len)
479
+ prompt_active_tokens_mask = raw_prompt_attention_mask.bool() # True where actual prompt tokens are
480
+ un_x[:, :prompt_len][prompt_active_tokens_mask] = MASK_ID
481
+
482
+ x_cfg_input = torch.cat([x, un_x], dim=0)
483
+ # Pass attention_mask for CFG if model expects it, covering both parts
484
+ # For simplicity, not passing explicit attention_mask here; relies on model's internal handling.
485
+ model_output = MODEL(x_cfg_input)
486
+ logits_cond, logits_uncond = torch.chunk(model_output.logits, 2, dim=0)
487
+ logits = logits_uncond + (cfg_scale + 1) * (logits_cond - logits_uncond)
488
+ else:
489
+ # Not passing explicit attention_mask here; relies on model's internal handling.
490
+ model_output = MODEL(x)
491
+ logits = model_output.logits
492
+
493
+ logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
494
+ x0_predicted_tokens = torch.argmax(logits_with_noise, dim=-1)
495
+
496
+ if remasking_strategy == 'low_confidence':
497
+ if DEVICE == "mps":
498
+ probs = F.softmax(logits.to(torch.float32), dim=-1)
499
+ else:
500
+ probs = F.softmax(logits.to(torch.float64), dim=-1)
501
+ x0_probs = torch.gather(probs, dim=-1, index=x0_predicted_tokens.unsqueeze(-1)).squeeze(-1)
502
+ elif remasking_strategy == 'random':
503
+ if DEVICE == "mps":
504
+ x0_probs = torch.rand(x.shape, device=x.device, dtype=torch.float32)
505
+ else:
506
+ x0_probs = torch.rand(x.shape, device=x.device, dtype=torch.float64)
507
+ else:
508
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), f"Error: Unknown remasking strategy '{remasking_strategy}'"
509
+ return
510
+
511
+ confidence_for_selection = torch.full_like(x0_probs, -torch.inf)
512
+ candidate_positions_for_unmasking = mask_index_global & block_masks_bool_current
513
+ confidence_for_selection = torch.where(
514
+ candidate_positions_for_unmasking,
515
+ x0_probs,
516
+ -torch.inf
517
+ )
518
+
519
+ x0_final_candidates = torch.where(mask_index_global, x0_predicted_tokens, x)
520
+
521
+ transfer_indices_bool = torch.zeros_like(x, dtype=torch.bool)
522
+ num_to_transfer_this_step_batch = num_transfer_tokens_for_this_block[:, i_step_in_block]
523
+
524
+ for j_batch_idx in range(batch_size):
525
+ k_val = min(num_to_transfer_this_step_batch[j_batch_idx].item(),
526
+ candidate_positions_for_unmasking[j_batch_idx].sum().item()) # ensure k isn't too large
527
+
528
+ if k_val > 0:
529
+ # Ensure confidence_for_selection[j_batch_idx] is 1D for topk
530
+ conf_slice = confidence_for_selection[j_batch_idx]
531
+ if conf_slice.ndim > 1: conf_slice = conf_slice.view(-1) # Should already be 1D from x0_probs
532
+
533
+ # Check if there are enough valid (non -inf) confidences
534
+ valid_conf_count = (conf_slice > -torch.inf).sum().item()
535
+ actual_k = min(k_val, valid_conf_count)
536
+
537
+ if actual_k > 0:
538
+ _, topk_indices_in_x = torch.topk(conf_slice, k=actual_k)
539
+ transfer_indices_bool[j_batch_idx, topk_indices_in_x] = True
540
+
541
+ x[transfer_indices_bool] = x0_final_candidates[transfer_indices_bool]
542
+
543
+ current_total_step = num_block_iter * steps_per_block + i_step_in_block + 1
544
+ total_overall_steps = num_blocks * steps_per_block
545
+ status_msg = f"Block {num_block_iter+1}/{num_blocks}, Step {i_step_in_block+1}/{steps_per_block} (Total: {current_total_step}/{total_overall_steps})"
546
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), status_msg
547
+
548
+ final_generated_ids = x[:, prompt_len:]
549
+ final_text_output = TOKENIZER.batch_decode(final_generated_ids, skip_special_tokens=True)
550
+
551
+ final_text_str = final_text_output[0] if final_text_output and len(final_text_output) > 0 else ""
552
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_str
553
+
554
+
555
+ css_styles = """
556
+ .gradio-container{font-family:'IBM Plex Sans',sans-serif;margin:auto;}
557
+ .gr-input {background:#f9f9f9 !important;border:1px solid #e0e0e0 !important;}
558
+ .gr-output{background:#f0f0f0 !important;border:1px solid #d0d0d0 !important;}
559
+
560
+ .highlighted-text span{
561
+ padding:2px 4px;border-radius:4px;margin:1px 2px;display:inline-block;line-height:1.6;
562
+ }
563
+
564
+ footer{display:none !important}
565
+
566
+ #live-update-scrollable-box {
567
+ max-height: 800px; /* 您可以根据需要调整这个最大高度,例如 '300px', '50vh' 等 */
568
+ overflow-y: auto !important; /* 当内容超出 max-height 时显示垂直滚动条 */
569
+ display: block; /* 确保元素是块级元素,以便 max-height 生效 */
570
+
571
+ }
572
+ #think_btn {
573
+ background-color: #f3f4f6 !important;
574
+ border: 1px solid #d0d0d0 !important;
575
+ color: #111827 !important;
576
+ font-size: 16px !important;
577
+ font-weight: bold !important;
578
+ }
579
+ #think_btn:hover {
580
+ background-color: #e0e0e0 !important;
581
+ border: 1px solid #c0c0c0 !important;
582
+ color: #222 !important;
583
+ }
584
+ #think_btn:active {
585
+ background-color: #2563eb !important;
586
+ border: 1px solid #b0b0b0 !important;
587
+ color: white !important;
588
+ }
589
+ """
590
+
591
+
592
+ # thinking_mode_t2i = gr.State(False)
593
+ def toggle_thinking_mode_lm(current_thinking_mode):
594
+ # print(f"current_thinking_mode: {current_thinking_mode}")
595
+ new_state = not current_thinking_mode
596
+ new_label = "Thinking Mode ✅" if new_state else "Thinking Mode ❌"
597
+ return new_state, gr.update(value=new_label)
598
+
599
+ def toggle_thinking_mode_mmu(current_thinking_mode):
600
+ new_state = not current_thinking_mode
601
+ new_label = "Thinking Mode ✅" if new_state else "Thinking Mode ❌"
602
+ return new_state, gr.update(value=new_label)
603
+
604
+
605
+ color_map_config = {
606
+ "MASK": "lightgrey",
607
+ "GEN": "#DCABFA",
608
+ }
609
+
610
+ theme = gr.themes.Ocean(
611
+ primary_hue="fuchsia",
612
+ )
613
+ with gr.Blocks(css=css_styles, theme=theme) as demo:
614
+ # with gr.Blocks(css=css_styles, theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as demo:
615
+ # with gr.Blocks() as demo:
616
+ thinking_mode_lm = gr.State(False)
617
+ thinking_mode_mmu = gr.State(False)
618
+ gr.Markdown("<h1 style='text-align: center; margin-bottom: 20px;'>MMaDA: Multimodal Large Diffusion Language Models</h1>")
619
+ gr.Markdown("MMaDA is a novel class of multimodal diffusion foundation models designed to achieve superior performance across diverse domains such as textual reasoning, multimodal understanding, and text-to-image generation")
620
+ gr.Markdown("Github: [Gen-Verse/MMaDA](https://github.com/Gen-Verse/MMaDA)")
621
+ gr.Markdown("Paper: [MMaDA: Multimodal Large Diffusion Language Models]()")
622
+ gr.Markdown("### Select Model")
623
+ with gr.Row():
624
+ model_select_radio = gr.Radio(
625
+ label="Select Text Generation Model",
626
+ choices=MODEL_CHOICES,
627
+ value=MODEL_CHOICES[0]
628
+ )
629
+ model_load_status_box = gr.Textbox(
630
+ label="Model Load Status",
631
+ interactive=False,
632
+ lines=3,
633
+ max_lines=5
634
+ )
635
+
636
+ gr.Markdown("## Part 1. Text Generation")
637
+ with gr.Row():
638
+ with gr.Column(scale=2):
639
+ prompt_input_box_lm = gr.Textbox(label="Enter your prompt:", lines=3, value="A rectangular prism has a length of 5 units, a width of 4 units, and a height of 3 units. What is the volume of the prism?")
640
+ think_button_lm = gr.Button("🧠 Enable Thinking Mode", elem_id="think_btn")
641
+ with gr.Accordion("Generation Parameters", open=True):
642
+ with gr.Row():
643
+ gen_length_slider_lm = gr.Slider(minimum=8, maximum=1024, value=512, step=64, label="Generation Length", info="Number of tokens to generate.")
644
+ steps_slider_lm = gr.Slider(minimum=1, maximum=512, value=256, step=32, label="Total Sampling Steps", info="Must be divisible by (gen_length / block_length).")
645
+ with gr.Row():
646
+ block_length_slider_lm = gr.Slider(minimum=8, maximum=1024, value=128, step=32, label="Block Length", info="gen_length must be divisible by this.")
647
+ remasking_dropdown_lm = gr.Dropdown(choices=['low_confidence', 'random'], value='low_confidence', label="Remasking Strategy")
648
+ with gr.Row():
649
+ cfg_scale_slider_lm = gr.Slider(minimum=0.0, maximum=2.0, value=0.0, step=0.1, label="CFG Scale", info="Classifier-Free Guidance. 0 disables it.")
650
+ temperature_slider_lm = gr.Slider(minimum=0.0, maximum=2.0, value=1, step=0.05, label="Temperature", info="Controls randomness via Gumbel noise. 0 is deterministic.")
651
+
652
+
653
+ with gr.Row():
654
+ run_button_ui_lm = gr.Button("Generate Sequence", variant="primary", scale=3)
655
+ clear_button_ui_lm = gr.Button("Clear Outputs", scale=1)
656
+
657
+ with gr.Column(scale=3):
658
+ # gr.Markdown("## Live Generation Process")
659
+ output_visualization_box_lm = gr.HighlightedText(
660
+ label="Live Generation Process",
661
+ show_legend=True,
662
+ color_map=color_map_config,
663
+ combine_adjacent=False,
664
+ interactive=False,
665
+ elem_id="live-update-scrollable-box",
666
+ )
667
+ # gr.Markdown("## Final Generated Text")
668
+ output_final_text_box_lm = gr.Textbox(label="Final Output", lines=8, interactive=False, show_copy_button=True)
669
+
670
+
671
+
672
+ gr.Examples(
673
+ examples=[
674
+ ["A rectangular prism has a length of 5 units, a width of 4 units, and a height of 3 units. What is the volume of the prism?", 256, 512, 128, 1, 0, "low_confidence"],
675
+ ["Lily can run 12 kilometers per hour for 4 hours. After that, she can run 6 kilometers per hour. How many kilometers can she run in 8 hours?", 256, 512, 64, 1, 0, "low_confidence"]
676
+ ],
677
+ inputs=[prompt_input_box_lm, steps_slider_lm, gen_length_slider_lm, block_length_slider_lm, temperature_slider_lm, cfg_scale_slider_lm, remasking_dropdown_lm],
678
+ outputs=[output_visualization_box_lm, output_final_text_box_lm],
679
+ fn=generate_viz_wrapper_lm,
680
+ )
681
+
682
+ gr.Markdown("---")
683
+ gr.Markdown("## Part 2. Multimodal Understanding")
684
+ with gr.Row():
685
+ with gr.Column(scale=2):
686
+ prompt_input_box_mmu = gr.Textbox(
687
+ label="Enter your prompt:",
688
+ lines=3,
689
+ value="Please describe this image in detail."
690
+ )
691
+ think_button_mmu = gr.Button("🧠 Enable Thinking Mode", elem_id="think_btn")
692
+ with gr.Accordion("Generation Parameters", open=True):
693
+ with gr.Row():
694
+ gen_length_slider_mmu = gr.Slider(minimum=64, maximum=1024, value=512, step=64, label="Generation Length", info="Number of tokens to generate.")
695
+ steps_slider_mmu = gr.Slider(minimum=1, maximum=512, value=256, step=32, label="Total Sampling Steps", info="Must be divisible by (gen_length / block_length).")
696
+ with gr.Row():
697
+ block_length_slider_mmu = gr.Slider(minimum=32, maximum=1024, value=128, step=32, label="Block Length", info="gen_length must be divisible by this.")
698
+ remasking_dropdown_mmu = gr.Dropdown(choices=['low_confidence', 'random'], value='low_confidence', label="Remasking Strategy")
699
+ with gr.Row():
700
+ cfg_scale_slider_mmu = gr.Slider(minimum=0.0, maximum=2.0, value=0.0, step=0.1, label="CFG Scale", info="Classifier-Free Guidance. 0 disables it.")
701
+ temperature_slider_mmu = gr.Slider(minimum=0.0, maximum=2.0, value=1, step=0.05, label="Temperature", info="Controls randomness via Gumbel noise. 0 is deterministic.")
702
+
703
+ with gr.Row():
704
+ image_upload_box = gr.Image(type="pil", label="Upload Image")
705
+
706
+ with gr.Row():
707
+ run_button_ui_mmu = gr.Button("Generate Description", variant="primary", scale=3)
708
+ clear_button_ui_mmu = gr.Button("Clear Outputs", scale=1)
709
+
710
+ with gr.Column(scale=3):
711
+ gr.Markdown("## Live Generation Process")
712
+ output_visualization_box_mmu = gr.HighlightedText(
713
+ label="Token Sequence (Live Update)",
714
+ show_legend=True,
715
+ color_map=color_map_config,
716
+ combine_adjacent=False,
717
+ interactive=False,
718
+ elem_id="live-update-scrollable-box",
719
+ )
720
+ gr.Markdown("## Final Generated Text")
721
+ output_final_text_box_mmu = gr.Textbox(label="Final Output", lines=8, interactive=False, show_copy_button=True)
722
+
723
+
724
+ gr.Examples(
725
+ examples=[
726
+ [
727
+ "mmu_validation_2/sunflower.jpg",
728
+ "Please describe this image in detail.",
729
+ 256,
730
+ 512,
731
+ 128,
732
+ 1,
733
+ 0,
734
+ "low_confidence"
735
+ ],
736
+ [
737
+ "mmu_validation_2/woman.jpg",
738
+ "Please describe this image in detail.",
739
+ 256,
740
+ 512,
741
+ 128,
742
+ 1,
743
+ 0,
744
+ "low_confidence"
745
+ ]
746
+ ],
747
+ inputs=[
748
+ image_upload_box,
749
+ prompt_input_box_mmu,
750
+ steps_slider_mmu,
751
+ gen_length_slider_mmu,
752
+ block_length_slider_mmu,
753
+ temperature_slider_mmu,
754
+ cfg_scale_slider_mmu,
755
+ remasking_dropdown_mmu
756
+ ],
757
+ outputs=[output_visualization_box_mmu, output_final_text_box_mmu],
758
+ fn=generate_viz_wrapper,
759
+ )
760
+
761
+ gr.Markdown("---")
762
+ gr.Markdown("## Part 3. Text-to-Image Generation")
763
+ with gr.Row():
764
+ with gr.Column(scale=2):
765
+ prompt_input_box_t2i = gr.Textbox(label="Enter your prompt:", lines=3, value="A sea turtle swimming near a coral reef in the ocean, with a clear blue sky and water in the background.")
766
+
767
+ with gr.Accordion("Generation Parameters", open=True):
768
+ with gr.Row():
769
+ steps_slider_t2i = gr.Slider(minimum=5, maximum=100, value=15, step=5, label="Total Sampling Steps", info="Number of sampling steps for image generation.")
770
+ guidance_scale_slider_t2i = gr.Slider(minimum=0.0, maximum=7.0, value=3.5, step=0.5, label="Guidance Scale", info="Classifier-Free Guidance. 0 disables it.")
771
+
772
+
773
+ with gr.Row():
774
+ scheduler_radio_t2i = gr.Radio(
775
+ choices=["cosine", "sigmoid", "linear"],
776
+ value="cosine",
777
+ label="Scheduler",
778
+ )
779
+
780
+ with gr.Row():
781
+ run_button_ui_t2i = gr.Button("Generate Image", variant="primary", scale=3)
782
+ clear_button_ui_t2i = gr.Button("Clear Outputs", scale=1)
783
+
784
+
785
+ with gr.Column(scale=3):
786
+ # gr.Markdown("## Live Generation Process")
787
+ output_image_t2i = gr.Image(label="Generated Image", interactive=False, type="pil")
788
+ output_status_t2i = gr.Textbox(label="Generation Status", interactive=False)
789
+
790
+ gr.Examples(
791
+ examples=[
792
+ ["A sea turtle swimming near a coral reef in the ocean, with a clear blue sky and water in the background.", 15, 3.5, "cosine"],
793
+ ["A beautiful sunset over a calm ocean, with a few clouds in the sky.", 15, 3.5, "cosine"]
794
+ ],
795
+ inputs=[prompt_input_box_t2i, steps_slider_t2i, guidance_scale_slider_t2i, scheduler_radio_t2i],
796
+ outputs=[output_image_t2i, output_status_t2i],
797
+ fn=generate_viz_wrapper_t2i,
798
+ )
799
+
800
+ run_button_ui_t2i.click(
801
+ fn=generate_viz_wrapper_t2i,
802
+ inputs=[
803
+ prompt_input_box_t2i,
804
+ steps_slider_t2i,
805
+ guidance_scale_slider_t2i,
806
+ scheduler_radio_t2i
807
+ ],
808
+ outputs=[output_image_t2i, output_status_t2i]
809
+ )
810
+
811
+ clear_button_ui_t2i.click(
812
+ fn=lambda: (None, ""),
813
+ inputs=None,
814
+ outputs=[output_image_t2i, output_status_t2i],
815
+ queue=False
816
+ )
817
+
818
+ think_button_lm.click(
819
+ fn=toggle_thinking_mode_lm,
820
+ inputs=[thinking_mode_lm],
821
+ outputs=[thinking_mode_lm, think_button_lm]
822
+ )
823
+
824
+ think_button_mmu.click(
825
+ fn=toggle_thinking_mode_mmu,
826
+ inputs=[thinking_mode_mmu],
827
+ outputs=[thinking_mode_mmu, think_button_mmu]
828
+ )
829
+
830
+
831
+
832
+ def initialize_default_model():
833
+ default_model = "MMaDA-8B-Base"
834
+ result = handle_model_selection_change(default_model)
835
+ return default_model, result
836
+
837
+ demo.load(
838
+ fn=initialize_default_model,
839
+ inputs=None,
840
+ outputs=[model_select_radio, model_load_status_box],
841
+ queue=True
842
+ )
843
+
844
+ def clear_outputs():
845
+ return None, None, None # Clear image, visualization, and final text
846
+
847
+ clear_button_ui_lm.click(
848
+ fn=clear_outputs,
849
+ inputs=None,
850
+ outputs=[image_upload_box, output_visualization_box_lm, output_final_text_box_lm],
851
+ queue=False
852
+ )
853
+ clear_button_ui_mmu.click(
854
+ fn=clear_outputs,
855
+ inputs=None,
856
+ outputs=[image_upload_box, output_visualization_box_mmu, output_final_text_box_mmu],
857
+ queue=False
858
+ )
859
+
860
+ run_button_ui_lm.click(
861
+ fn=generate_viz_wrapper_lm,
862
+ inputs=[
863
+ prompt_input_box_lm,
864
+ steps_slider_lm,
865
+ gen_length_slider_lm,
866
+ block_length_slider_lm,
867
+ temperature_slider_lm,
868
+ cfg_scale_slider_lm,
869
+ remasking_dropdown_lm,
870
+ thinking_mode_lm
871
+ ],
872
+ outputs=[output_visualization_box_lm, output_final_text_box_lm]
873
+ )
874
+
875
+ run_button_ui_mmu.click(
876
+ fn=generate_viz_wrapper,
877
+ inputs=[
878
+ image_upload_box,
879
+ prompt_input_box_mmu,
880
+ steps_slider_mmu,
881
+ gen_length_slider_mmu,
882
+ block_length_slider_mmu,
883
+ temperature_slider_mmu,
884
+ cfg_scale_slider_mmu,
885
+ remasking_dropdown_mmu,
886
+ thinking_mode_mmu
887
+ ],
888
+ outputs=[output_visualization_box_mmu, output_final_text_box_mmu]
889
+ )
890
+
891
+
892
+ if __name__ == "__main__":
893
+ print(f"Starting Gradio App. Attempting to use device: {DEVICE}")
894
+ demo.launch(share=True)
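
A note on the sampling loop shown earlier in this file: at each step it reveals, for every batch row, only the k candidate tokens with the highest confidence and keeps the rest masked for later steps. The sketch below isolates that selection rule; variable and function names are illustrative, not the app's own.

```python
import torch

def select_tokens_to_unmask(confidence: torch.Tensor, k_per_row: torch.Tensor) -> torch.Tensor:
    """confidence: (B, L) scores with -inf at positions that must stay masked.
    k_per_row: (B,) number of tokens to reveal this step. Returns a boolean (B, L) transfer mask."""
    batch_size, seq_len = confidence.shape
    transfer = torch.zeros(batch_size, seq_len, dtype=torch.bool)
    for b in range(batch_size):
        valid = int((confidence[b] > -torch.inf).sum().item())   # usable candidates in this row
        k = min(int(k_per_row[b].item()), valid)                  # never request more than exist
        if k > 0:
            _, idx = torch.topk(confidence[b], k=k)
            transfer[b, idx] = True
    return transfer

# reveal the 2 most confident positions of a 1x5 row; -inf positions are never chosen
conf = torch.tensor([[0.9, -torch.inf, 0.2, 0.7, -torch.inf]])
print(select_tokens_to_unmask(conf, torch.tensor([2])))  # tensor([[ True, False, False,  True, False]])
```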
MMaDA/check_lr.py ADDED
@@ -0,0 +1,27 @@
1
+ import torch
2
+ from torch.optim import AdamW
3
+
4
+ from models.lr_schedulers import get_scheduler
5
+
6
+ MAX_TRAINING_STEPS = 100
7
+ WARMUP_STEPS = 80
8
+ INITIAL_LR = 5e-5
9
+ SCHEDULER_TYPE = "cosine" # "linear", "cosine"
10
+ # ---------------------------------------------
11
+
12
+ dummy_model = torch.nn.Linear(1, 1)
13
+ dummy_optimizer = AdamW(dummy_model.parameters(), lr=INITIAL_LR)
14
+
15
+ lr_scheduler = get_scheduler(
16
+ name=SCHEDULER_TYPE,
17
+ optimizer=dummy_optimizer,
18
+ num_warmup_steps=WARMUP_STEPS,
19
+ num_training_steps=MAX_TRAINING_STEPS,
20
+ )
21
+
22
+ all_lrs = []
23
+ for step in range(MAX_TRAINING_STEPS):
24
+ all_lrs.append(lr_scheduler.get_last_lr()[0])
25
+ lr_scheduler.step()
26
+
27
+ print(all_lrs[WARMUP_STEPS - 1])  # learning rate at the last warmup step
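
For reference, with 80 warmup steps the value printed above should sit just below the peak learning rate. A hedged closed-form sketch, assuming HF-style linear warmup followed by cosine decay to zero; the project's models.lr_schedulers may differ in detail.

```python
import math

def expected_lr(step: int, base_lr: float = 5e-5, warmup: int = 80, total: int = 100) -> float:
    # linear warmup, then cosine decay to zero (a common schedule; an assumption here)
    if step < warmup:
        return base_lr * step / warmup
    progress = (step - warmup) / max(1, total - warmup)
    return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))

print(expected_lr(79))  # ~4.94e-05: the last warmup step, just under the 5e-5 peak
```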
MMaDA/check_tokens.py ADDED
@@ -0,0 +1,191 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Usage
4
+ =====
5
+ python check_tokens.py \
6
+ --config configs/omada_instruction_tuning.yaml \
7
+ --samples 20
8
+ """
9
+
10
+ import argparse
11
+ import random
12
+ from pathlib import Path
13
+ from typing import Iterable, Optional, Tuple, Union
14
+
15
+ import numpy as np
16
+ import torch
17
+ from omegaconf import OmegaConf
18
+ from tqdm import tqdm
19
+ from transformers import AutoTokenizer
20
+
21
+ import sys, os
22
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # extend sys.path before the project imports below
23
+ from models.modeling_emova_speech_tokenizer import EMOVASpeechTokenizer
24
+ from training.data import MixedSpeechTextDataset, VideoSpeechDataset
25
+ from training.prompting_utils import UniversalPrompting
26
+ from training.utils import image_transform
27
+
28
+ def _to_tensor(entry: Union[torch.Tensor, np.ndarray, list, tuple, str],
29
+ vq_model: EMOVASpeechTokenizer) -> torch.Tensor:
30
+ """entry가 경로면 encode, 이미 토큰이면 long tensor로 변환."""
31
+ if isinstance(entry, torch.Tensor):
32
+ tokens = entry.clone().long()
33
+ elif isinstance(entry, np.ndarray):
34
+ tokens = torch.from_numpy(entry).long()
35
+ elif isinstance(entry, (list, tuple)):
36
+ tokens = torch.as_tensor(entry, dtype=torch.long)
37
+ elif isinstance(entry, str):
38
+ # EMOVA encode returns (1, L) → flatten to 1D
39
+ tokens = vq_model.encode(entry).squeeze(0).long()
40
+ else:
41
+ raise TypeError(f"Unsupported token entry type: {type(entry)}")
42
+ return tokens.view(-1)
43
+
44
+
45
+ def _log_stats(flow: str, path: str, tokens: torch.Tensor,
46
+ codebook_size: int = 4096) -> Tuple[int, int]:
47
+ max_id = int(tokens.max().item())
48
+ min_id = int(tokens.min().item())
49
+ over = int((tokens >= codebook_size).sum().item())
50
+ under = int((tokens < 0).sum().item())
51
+
52
+ print(
53
+ f"[{flow}] path={path} "
54
+ f"shape={tuple(tokens.shape)} "
55
+ f"min={min_id} max={max_id} "
56
+ f"<0={under} >=4096={over}"
57
+ )
58
+ return over, under
59
+
60
+
61
+ def build_prompting(config) -> UniversalPrompting:
62
+ tokenizer = AutoTokenizer.from_pretrained(
63
+ config.model.omada.tokenizer_path,
64
+ padding_side="left",
65
+ )
66
+ special_tokens = (
67
+ "<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>",
68
+ "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>",
69
+ "<|i2i|>", "<|v2t|>", "<|v2s|>", "<|s2t|>",
70
+ "<|t2s|>", "<|s2s|>", "<|soa|>", "<|eoa|>",
71
+ )
72
+ prompt = UniversalPrompting(
73
+ tokenizer,
74
+ max_text_len=config.dataset.preprocessing.max_seq_length,
75
+ max_audio_len=config.dataset.preprocessing.max_aud_length,
76
+ max_audio_len_short=config.dataset.preprocessing.max_aud_length_short,
77
+ ignore_id=-100,
78
+ cond_dropout_prob=config.training.cond_dropout_prob,
79
+ special_tokens=special_tokens,
80
+ use_reserved_token=True,
81
+ )
82
+ return prompt
83
+
84
+
85
+ def sample_indices(length: int, num: int) -> Tuple[Iterable[int], int]:
86
+ """
87
+ Returns iterable of indices and the total count that will be iterated.
88
+ If num <= 0 or num >= length, iterates through the whole dataset.
89
+ """
90
+ if num is None or num <= 0 or num >= length:
91
+ return range(length), length
92
+ indices = random.sample(range(length), num)
93
+ return indices, len(indices)
94
+
95
+
96
+ @torch.no_grad()
97
+ def inspect_v2s(config, prompting, vq_model, num_samples: int):
98
+ speech_cfg = OmegaConf.to_container(
99
+ config.dataset.params.get("video_speech_dataset", {}),
100
+ resolve=True
101
+ ) or {}
102
+ dataset = VideoSpeechDataset(
103
+ transform=image_transform,
104
+ resolution=config.dataset.preprocessing.resolution,
105
+ num_frames=speech_cfg.get("num_frames_speech", 4),
106
+ video_root=speech_cfg.get(
107
+ "video_root", "/home/work/AIDAS/data/video/openvid1m/video/video"
108
+ ),
109
+ audio_root=speech_cfg.get(
110
+ "audio_root", "/home/work/AIDAS/data/video-speech"
111
+ ),
112
+ speech_dir_name=speech_cfg.get("speech_dir_name", "openvid-speech-trunc"),
113
+ index_path=speech_cfg.get(
114
+ "index_path", "/home/work/AIDAS/data/video-speech/openvid-speech.csv"
115
+ ),
116
+ sample_method=speech_cfg.get("sample_method", "uniform"),
117
+ precomputed_tokens_root=speech_cfg.get("precomputed_tokens_root"),
118
+ )
119
+
120
+ print(f"\n=== VideoSpeechDataset (v2s) | total={len(dataset)} ===")
121
+ total_over = total_under = 0
122
+ indices, total = sample_indices(len(dataset), num_samples)
123
+ for idx in tqdm(indices, total=total, desc="v2s audio", unit="sample"):
124
+ sample = dataset.data[idx]
125
+ speech_path = sample["speech"]
126
+ tokens = dataset._load_precomputed_tokens(speech_path)
127
+ if tokens is not None:
128
+ tokens = tokens.long()
129
+ else:
130
+ tokens = vq_model.encode(speech_path).squeeze(0).long()
131
+ over, under = _log_stats("v2s", speech_path, tokens)
132
+ total_over += over
133
+ total_under += under
134
+
135
+ print(f"[v2s] total >=4096: {total_over} | total <0: {total_under}")
136
+
137
+
138
+ @torch.no_grad()
139
+ def inspect_t2s(config, prompting, vq_model, num_samples: int):
140
+ dataset = MixedSpeechTextDataset(config.dataset.params.audio_data)
141
+
142
+ print(f"\n=== MixedSpeechTextDataset (t2s/s2t 공용) | total={len(dataset)} ===")
143
+ total_over = total_under = 0
144
+ indices, total = sample_indices(len(dataset), num_samples)
145
+ for idx in tqdm(indices, total=total, desc="t2s/s2t audio", unit="sample"):
146
+ sample = dataset[idx]
147
+ entry = sample["audio_path"]
148
+ if isinstance(entry, np.ndarray):
149
+ tokens = torch.from_numpy(entry).long()
150
+ path_repr = "<precomputed-array>"
151
+ elif isinstance(entry, str):
152
+ tokens = vq_model.encode(entry).squeeze(0).long()
153
+ path_repr = entry
154
+ else:
155
+ tokens = torch.as_tensor(entry, dtype=torch.long)
156
+ path_repr = "<sequence>"
157
+ over, under = _log_stats("t2s/s2t-source", path_repr, tokens)
158
+ total_over += over
159
+ total_under += under
160
+
161
+ print(f"[t2s] total >=4096: {total_over} | total <0: {total_under}")
162
+
163
+
164
+ def main():
165
+ parser = argparse.ArgumentParser()
166
+ parser.add_argument("--config", required=True,
167
+ help="학습에 사용한 YAML 설정 파일")
168
+ parser.add_argument(
169
+ "--samples",
170
+ type=int,
171
+ default=-1,
172
+ help="각 데이터셋에서 검사할 샘플 수 (<=0이면 전체 검사)",
173
+ )
174
+ args = parser.parse_args()
175
+
176
+ config = OmegaConf.load(args.config)
177
+ prompting = build_prompting(config)
178
+
179
+ vq_model = EMOVASpeechTokenizer.from_pretrained(
180
+ config.model.vq_model_audio.vq_model_name
181
+ )
182
+ vq_model.eval()
183
+
184
+ inspect_v2s(config, prompting, vq_model, args.samples)
185
+ # inspect_t2s(config, prompting, vq_model, args.samples)
186
+
187
+
188
+ if __name__ == "__main__":
189
+ torch.manual_seed(0)
190
+ random.seed(0)
191
+ main()
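
The script's core invariant is simple: every EMOVA speech token id must fall inside the speech codebook, i.e. in [0, 4096). A standalone sketch of that range check; the codebook size mirrors speech_codebook_size in the configs below.

```python
import torch

def count_out_of_range(tokens: torch.Tensor, codebook_size: int = 4096) -> tuple:
    """Return (# ids >= codebook_size, # ids < 0) for a tensor of token ids."""
    return int((tokens >= codebook_size).sum()), int((tokens < 0).sum())

print(count_out_of_range(torch.tensor([0, 12, 4095, 4096, -1])))  # (1, 1)
```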
MMaDA/configs/mmada_demo.yaml ADDED
@@ -0,0 +1,95 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "demo"
8
+ name: "mmada-demo"
9
+ output_dir: "mmada-demo"
10
+
11
+ model:
12
+ vq_model:
13
+ type: "magvitv2"
14
+ vq_model_name: "showlab/magvitv2"
15
+
16
+ mmada:
17
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
18
+ w_clip_vit: False
19
+ new_vocab_size: 134656
20
+ llm_vocab_size: 126464
21
+ codebook_size: 8192
22
+ num_vq_tokens: 256
23
+ num_new_special_tokens: 0
24
+ tie_word_embeddings: False
25
+
26
+ gradient_checkpointing: True
27
+
28
+ dataset:
29
+ gen_type: "imagenet1k"
30
+ und_type: "captioning"
31
+ combined_loader_mode: "max_size_cycle"
32
+ params:
33
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
34
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
35
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
36
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..01209}.tar"
37
+ ]
38
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
39
+ add_caption_prompt: True
40
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
41
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
42
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
43
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
44
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
45
+ shuffle_buffer_size: 1000
46
+ num_workers: 32
47
+ resolution: 256
48
+ pin_memory: True
49
+ persistent_workers: True
50
+
51
+ preprocessing:
52
+ max_seq_length: 512 # for text tokens
53
+ resolution: 256
54
+ center_crop: False
55
+ random_flip: False
56
+
57
+ optimizer:
58
+ name: adamw
59
+ params: # default adamw params
60
+ learning_rate: 5e-5
61
+ scale_lr: False # scale learning rate by total batch size
62
+ beta1: 0.9
63
+ beta2: 0.999
64
+ weight_decay: 0.01
65
+ epsilon: 1e-8
66
+
67
+ lr_scheduler:
68
+ scheduler: "cosine"
69
+ params:
70
+ learning_rate: ${optimizer.params.learning_rate}
71
+ warmup_steps: 8000
72
+
73
+ training:
74
+ gradient_accumulation_steps: 4
75
+ noise_type: "mask"
76
+ batch_size_t2i: 5
77
+ batch_size_lm: 1
78
+ batch_size_mmu: 2
79
+ mixed_precision: "bf16"
80
+ enable_tf32: True
81
+ seed: 10086
82
+ max_train_steps: 500000
83
+ overfit_one_batch: False
84
+ cond_dropout_prob: 0.1
85
+ min_masking_rate: 0.0
86
+ label_smoothing: 0.0
87
+ max_grad_norm: 1
88
+ guidance_scale: 1.5
89
+ generation_timesteps: 20
90
+ t2i_coeff: 1.0
91
+ lm_coeff: 0.1
92
+ mmu_coeff: 1.0
93
+
94
+ mask_schedule:
95
+ schedule: "cosine"
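
These YAML files are OmegaConf configs: `${optimizer.params.learning_rate}` under `lr_scheduler.params` is an interpolation resolved against the config root, so the learning rate is defined once. A minimal sketch of loading and resolving such a file; the path is illustrative.

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("MMaDA/configs/mmada_demo.yaml")   # illustrative path
OmegaConf.resolve(cfg)                                  # materialise ${...} interpolations in place
assert cfg.lr_scheduler.params.learning_rate == cfg.optimizer.params.learning_rate
print(cfg.training.batch_size_t2i, cfg.training.guidance_scale)  # 5 1.5
```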
MMaDA/configs/mmada_demo_s2t.yaml ADDED
@@ -0,0 +1,131 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-training-stage1"
8
+ name: "omada-training-stage1"
9
+ output_dir: "ckpts/omada/omada-training-stage1"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 10000000000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 5 # task token 3 + eoa / soa
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ # subset for gigaspeech: xs, xl
63
+ # subset for librispeech: train-clean-360, train-clean-100
64
+ # subset for commonvoice: validated, invalidated
65
+ audio_data:
66
+ - name: "gigaspeech"
67
+ subset: "xl"
68
+ split: "train"
69
+ - name: "librispeech"
70
+ subset: "train-clean-360"
71
+ - name: "commonvoice"
72
+ subset: "validated"
73
+ #########################################################################
74
+ shuffle_buffer_size: 1000
75
+ num_workers: 8
76
+ resolution: 256
77
+ pin_memory: True
78
+ persistent_workers: True
79
+
80
+ preprocessing:
81
+ max_seq_length: 128 # for text tokens
82
+ max_aud_length: 256 # for audio tokens
83
+ resolution: 128
84
+ center_crop: False
85
+ random_flip: False
86
+
87
+ optimizer:
88
+ name: adamw
89
+ params: # default adamw params
90
+ learning_rate: 1e-5
91
+ scale_lr: False # scale learning rate by total batch size
92
+ beta1: 0.9
93
+ beta2: 0.999
94
+ weight_decay: 0.01
95
+ epsilon: 1e-8
96
+
97
+ lr_scheduler:
98
+ scheduler: "cosine"
99
+ params:
100
+ learning_rate: ${optimizer.params.learning_rate}
101
+ warmup_steps: 3000
102
+ min_lr_scale: 0.1
103
+
104
+ training:
105
+ gradient_accumulation_steps: 1
106
+ noise_type: "mask"
107
+ batch_size_t2i: 0
108
+ batch_size_lm: 0
109
+ batch_size_mmu: 0
110
+ batch_size_v2t: 2
111
+ batch_size_s2t: 2
112
+ batch_size_t2s: 3
113
+
114
+ mixed_precision: "bf16"
115
+ enable_tf32: True
116
+ seed: 10086
117
+ max_train_steps: 200000
118
+ max_train_epochs: 1
119
+ overfit_one_batch: False
120
+ cond_dropout_prob: 0.1
121
+ min_masking_rate: 0.0
122
+ label_smoothing: 0.0
123
+ max_grad_norm: 1
124
+ guidance_scale: 0.75
125
+ generation_timesteps: 16
126
+ # t2i_coeff: 0.1
127
+ # lm_coeff: 0.1
128
+ # mmu_coeff: 0.1
129
+ v2t_coeff: 1.0
130
+ t2s_coeff: 1.0
131
+ s2t_coeff: 1.0
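
The trailing *_coeff entries weight each task's loss before they are summed for the backward pass. A hedged sketch of that combination; only the coefficient names come from this config, and the per-task loss values are placeholders.

```python
coeffs = {"v2t": 1.0, "t2s": 1.0, "s2t": 1.0}   # values from training.*_coeff above
losses = {"v2t": 2.3, "t2s": 1.7, "s2t": 1.9}   # placeholder per-task scalar losses
total_loss = sum(coeffs[name] * losses[name] for name in coeffs)  # weighted multi-task loss
print(total_loss)  # ≈ 5.9
```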
MMaDA/configs/mmada_demo_speech.yaml ADDED
@@ -0,0 +1,101 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "demo"
8
+ name: "mmada-demo"
9
+ output_dir: "mmada-demo"
10
+
11
+ model:
12
+ vq_model:
13
+ type: "magvitv2"
14
+ vq_model_name: "showlab/magvitv2"
15
+ speech_model:
16
+ type: "emova"
17
+ speech_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
18
+
19
+ mmada:
20
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
21
+ w_clip_vit: False
22
+ new_vocab_size: 138752
23
+ llm_vocab_size: 126464
24
+ codebook_size: 8192
25
+ speech_codebook_size: 4096
26
+ num_vq_tokens: 256
27
+ num_speech_vq_tokens: 100
28
+ num_new_special_tokens: 3
29
+ tie_word_embeddings: False
30
+ train_step: 25000
31
+
32
+ gradient_checkpointing: True
33
+
34
+ dataset:
35
+ gen_type: "imagenet1k"
36
+ und_type: "captioning"
37
+ combined_loader_mode: "max_size_cycle"
38
+ params:
39
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
40
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
41
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
42
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..01209}.tar"
43
+ ]
44
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
45
+ add_caption_prompt: True
46
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
47
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
48
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
49
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
50
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
51
+ shuffle_buffer_size: 1000
52
+ num_workers: 32
53
+ resolution: 256
54
+ pin_memory: True
55
+ persistent_workers: True
56
+
57
+ preprocessing:
58
+ max_seq_length: 512 # for text tokens
59
+ resolution: 256
60
+ center_crop: False
61
+ random_flip: False
62
+
63
+ optimizer:
64
+ name: adamw
65
+ params: # default adamw params
66
+ learning_rate: 5e-5
67
+ scale_lr: False # scale learning rate by total batch size
68
+ beta1: 0.9
69
+ beta2: 0.999
70
+ weight_decay: 0.01
71
+ epsilon: 1e-8
72
+
73
+ lr_scheduler:
74
+ scheduler: "cosine"
75
+ params:
76
+ learning_rate: ${optimizer.params.learning_rate}
77
+ warmup_steps: 8000
78
+
79
+ training:
80
+ gradient_accumulation_steps: 4
81
+ noise_type: "mask"
82
+ batch_size_t2i: 5
83
+ batch_size_lm: 1
84
+ batch_size_mmu: 2
85
+ mixed_precision: "bf16"
86
+ enable_tf32: True
87
+ seed: 10086
88
+ max_train_steps: 500000
89
+ overfit_one_batch: False
90
+ cond_dropout_prob: 0.1
91
+ min_masking_rate: 0.0
92
+ label_smoothing: 0.0
93
+ max_grad_norm: 1
94
+ guidance_scale: 1.5
95
+ generation_timesteps: 20
96
+ t2i_coeff: 1.0
97
+ lm_coeff: 0.1
98
+ mmu_coeff: 1.0
99
+
100
+ mask_schedule:
101
+ schedule: "cosine"
MMaDA/configs/mmada_demo_video.yaml ADDED
@@ -0,0 +1,95 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "demo"
8
+ name: "mmada-demo"
9
+ output_dir: "mmada-demo"
10
+
11
+ model:
12
+ vq_model:
13
+ type: "magvitv2"
14
+ vq_model_name: "showlab/magvitv2"
15
+
16
+ mmada:
17
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
18
+ w_clip_vit: False
19
+ new_vocab_size: 134656
20
+ llm_vocab_size: 126464
21
+ codebook_size: 8192
22
+ num_vq_tokens: 256
23
+ num_new_special_tokens: 0
24
+ tie_word_embeddings: False
25
+
26
+ gradient_checkpointing: True
27
+
28
+ dataset:
29
+ gen_type: "imagenet1k"
30
+ und_type: "captioning"
31
+ combined_loader_mode: "max_size_cycle"
32
+ params:
33
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
34
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
35
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
36
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..01209}.tar"
37
+ ]
38
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
39
+ add_caption_prompt: True
40
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
41
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
42
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
43
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
44
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
45
+ shuffle_buffer_size: 1000
46
+ num_workers: 32
47
+ resolution: 128
48
+ pin_memory: True
49
+ persistent_workers: True
50
+
51
+ preprocessing:
52
+ max_seq_length: 512 # for text tokens
53
+ resolution: 256
54
+ center_crop: False
55
+ random_flip: False
56
+
57
+ optimizer:
58
+ name: adamw
59
+ params: # default adamw params
60
+ learning_rate: 5e-5
61
+ scale_lr: False # scale learning rate by total batch size
62
+ beta1: 0.9
63
+ beta2: 0.999
64
+ weight_decay: 0.01
65
+ epsilon: 1e-8
66
+
67
+ lr_scheduler:
68
+ scheduler: "cosine"
69
+ params:
70
+ learning_rate: ${optimizer.params.learning_rate}
71
+ warmup_steps: 8000
72
+
73
+ training:
74
+ gradient_accumulation_steps: 4
75
+ noise_type: "mask"
76
+ batch_size_t2i: 5
77
+ batch_size_lm: 1
78
+ batch_size_mmu: 2
79
+ mixed_precision: "bf16"
80
+ enable_tf32: True
81
+ seed: 10086
82
+ max_train_steps: 500000
83
+ overfit_one_batch: False
84
+ cond_dropout_prob: 0.1
85
+ min_masking_rate: 0.0
86
+ label_smoothing: 0.0
87
+ max_grad_norm: 1
88
+ guidance_scale: 1.5
89
+ generation_timesteps: 20
90
+ t2i_coeff: 1.0
91
+ lm_coeff: 0.1
92
+ mmu_coeff: 1.0
93
+
94
+ mask_schedule:
95
+ schedule: "cosine"
MMaDA/configs/mmada_demo_video_temp.yaml ADDED
@@ -0,0 +1,95 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "demo"
8
+ name: "mmada-demo"
9
+ output_dir: "mmada-demo"
10
+
11
+ model:
12
+ vq_model:
13
+ type: "magvitv2"
14
+ vq_model_name: "showlab/magvitv2"
15
+
16
+ mmada:
17
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
18
+ w_clip_vit: False
19
+ new_vocab_size: 134656
20
+ llm_vocab_size: 126464
21
+ codebook_size: 8192
22
+ num_vq_tokens: 900
23
+ num_new_special_tokens: 0
24
+ tie_word_embeddings: False
25
+
26
+ gradient_checkpointing: True
27
+
28
+ dataset:
29
+ gen_type: "imagenet1k"
30
+ und_type: "captioning"
31
+ combined_loader_mode: "max_size_cycle"
32
+ params:
33
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
34
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
35
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
36
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..01209}.tar"
37
+ ]
38
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
39
+ add_caption_prompt: True
40
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
41
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
42
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
43
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
44
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
45
+ shuffle_buffer_size: 1000
46
+ num_workers: 32
47
+ resolution: 480
48
+ pin_memory: True
49
+ persistent_workers: True
50
+
51
+ preprocessing:
52
+ max_seq_length: 512 # for text tokens
53
+ resolution: 480
54
+ center_crop: False
55
+ random_flip: False
56
+
57
+ optimizer:
58
+ name: adamw
59
+ params: # default adamw params
60
+ learning_rate: 5e-5
61
+ scale_lr: False # scale learning rate by total batch size
62
+ beta1: 0.9
63
+ beta2: 0.999
64
+ weight_decay: 0.01
65
+ epsilon: 1e-8
66
+
67
+ lr_scheduler:
68
+ scheduler: "cosine"
69
+ params:
70
+ learning_rate: ${optimizer.params.learning_rate}
71
+ warmup_steps: 8000
72
+
73
+ training:
74
+ gradient_accumulation_steps: 4
75
+ noise_type: "mask"
76
+ batch_size_t2i: 5
77
+ batch_size_lm: 1
78
+ batch_size_mmu: 2
79
+ mixed_precision: "bf16"
80
+ enable_tf32: True
81
+ seed: 10086
82
+ max_train_steps: 500000
83
+ overfit_one_batch: False
84
+ cond_dropout_prob: 0.1
85
+ min_masking_rate: 0.0
86
+ label_smoothing: 0.0
87
+ max_grad_norm: 1
88
+ guidance_scale: 1.5
89
+ generation_timesteps: 20
90
+ t2i_coeff: 1.0
91
+ lm_coeff: 0.1
92
+ mmu_coeff: 1.0
93
+
94
+ mask_schedule:
95
+ schedule: "cosine"
MMaDA/configs/mmada_pretraining_i2i.yaml ADDED
@@ -0,0 +1,86 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "ommda-training-i2i_256_0715"
8
+ name: "ommda-training-i2i-mmada-instruct_256_0715"
9
+ output_dir: "ommda-training-i2i-mmada-instruct_256_0715"
10
+ save_every: 5000
11
+ eval_every: 20000
12
+ generate_every: 5000
13
+ num_validation_images: 20
14
+ log_every: 1
15
+ log_grad_norm_every: 100
16
+ resume_from_checkpoint: "latest"
17
+ val_every: 50000
18
+ max_val_examples_t2i: 2000
19
+
20
+ model:
21
+ vq_model:
22
+ type: "magvitv2"
23
+ vq_model_name: "showlab/magvitv2"
24
+
25
+ mmada:
26
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
27
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
28
+ w_clip_vit: False
29
+ new_vocab_size: 134656
30
+ llm_vocab_size: 126464
31
+ codebook_size: 8192
32
+ num_vq_tokens: 256
33
+ num_new_special_tokens: 0
34
+ tie_word_embeddings: False
35
+
36
+ gradient_checkpointing: True
37
+
38
+ dataset:
39
+ params:
40
+ num_workers: 0
41
+ resolution: 256
42
+ pin_memory: True
43
+ persistent_workers: True
44
+
45
+ preprocessing:
46
+ max_seq_length: 256 # for text tokens
47
+ resolution: 256
48
+ center_crop: False
49
+ random_flip: False
50
+
51
+ optimizer:
52
+ name: adamw
53
+ params: # default adamw params
54
+ learning_rate: 5e-5
55
+ scale_lr: False # scale learning rate by total batch size
56
+ beta1: 0.9
57
+ beta2: 0.999
58
+ weight_decay: 0.01
59
+ epsilon: 1e-8
60
+
61
+ lr_scheduler:
62
+ scheduler: "cosine"
63
+ params:
64
+ learning_rate: ${optimizer.params.learning_rate}
65
+ warmup_steps: 5000
66
+ min_lr_scale: 0.1
67
+
68
+ training:
69
+ gradient_accumulation_steps: 4
70
+ noise_type: "mask"
71
+ batch_size_i2i: 1
72
+ mixed_precision: "bf16"
73
+ enable_tf32: True
74
+ seed: 10086
75
+ max_train_steps: 50000
76
+ overfit_one_batch: False
77
+ cond_dropout_prob: 0.1
78
+ min_masking_rate: 0.0
79
+ label_smoothing: 0.0
80
+ max_grad_norm: 1
81
+ guidance_scale: 5
82
+ generation_timesteps: 50
83
+ t2i_coeff: 1.0
84
+ lm_coeff: 0.1
85
+ mmu_coeff: 0.5
86
+ validation_seed: 42
MMaDA/configs/mmada_pretraining_s2t.yaml ADDED
@@ -0,0 +1,96 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "ommda-training-s2t"
8
+ name: "ommda-training-s2t-mmada"
9
+ output_dir: "ommda-training-s2t-mmada"
10
+ save_every: 5000
11
+ eval_every: 20000
12
+ generate_every: 5000
13
+ num_validation_images: 20
14
+ log_every: 1
15
+ log_grad_norm_every: 100
16
+ resume_from_checkpoint: False
17
+ val_every: 50000
18
+ max_val_examples_t2i: 2000
19
+
20
+ model:
21
+ vq_model:
22
+ type: "emova"
23
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
24
+
25
+ mmada:
26
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
27
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
28
+ w_clip_vit: False
29
+ new_vocab_size: 138752
30
+ llm_vocab_size: 126464
31
+ codebook_size: 8192
32
+ speech_codebook_size: 4096
33
+ # num_vq_tokens: 256
34
+ # num_speech_vq_tokens: 250
35
+ num_new_special_tokens: 3
36
+ tie_word_embeddings: False
37
+
38
+ gradient_checkpointing: True
39
+
40
+ dataset:
41
+ params:
42
+ num_workers: 0
43
+ resolution: 256
44
+ pin_memory: True
45
+ persistent_workers: True
46
+
47
+ preprocessing:
48
+ max_seq_length: 256 # for text tokens
49
+ resolution: 256
50
+ center_crop: False
51
+ random_flip: False
52
+
53
+ data:
54
+ # subset for gigaspeech: xs, xl
55
+ # subset for librispeech: train-clean-360, train-clean-100
56
+ # subset for commonvoice: validated, invalidated
57
+ name: "gigaspeech"
58
+ subset: "xl"
59
+ split: "train"
60
+
61
+ optimizer:
62
+ name: adamw
63
+ params: # default adamw params
64
+ learning_rate: 5e-5
65
+ scale_lr: False # scale learning rate by total batch size
66
+ beta1: 0.9
67
+ beta2: 0.999
68
+ weight_decay: 0.01
69
+ epsilon: 1e-8
70
+
71
+ lr_scheduler:
72
+ scheduler: "cosine"
73
+ params:
74
+ learning_rate: ${optimizer.params.learning_rate}
75
+ warmup_steps: 5000
76
+ min_lr_scale: 0.1
77
+
78
+ training:
79
+ gradient_accumulation_steps: 4
80
+ noise_type: "mask"
81
+ batch_size_s2t: 4
82
+ mixed_precision: "bf16"
83
+ enable_tf32: True
84
+ seed: 10086
85
+ max_train_steps: 50000
86
+ overfit_one_batch: False
87
+ cond_dropout_prob: 0.1
88
+ min_masking_rate: 0.0
89
+ label_smoothing: 0.0
90
+ max_grad_norm: 1
91
+ guidance_scale: 5
92
+ generation_timesteps: 50
93
+ t2i_coeff: 1.0
94
+ lm_coeff: 0.1
95
+ mmu_coeff: 0.5
96
+ validation_seed: 42
MMaDA/configs/mmada_pretraining_stage1_llada_instruct.yaml ADDED
@@ -0,0 +1,100 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "mmada-training-stage1"
8
+ name: "mmada-training-stage1-llada-instruct"
9
+ output_dir: "mmada-training-stage1-llada-instruct"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 10000
13
+ eval_every: 2500
14
+ generate_every: 1000
15
+ log_every: 50
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ mmada:
24
+ pretrained_model_path: "GSAI-ML/LLaDA-8B-Instruct"
25
+ w_clip_vit: False
26
+ new_vocab_size: 134656
27
+ llm_vocab_size: 126464
28
+ codebook_size: 8192
29
+ num_vq_tokens: 256
30
+ num_new_special_tokens: 0
31
+ tie_word_embeddings: False
32
+
33
+ gradient_checkpointing: True
34
+
35
+ dataset:
36
+ gen_type: "imagenet1k"
37
+ und_type: "captioning"
38
+ combined_loader_mode: "max_size_cycle"
39
+ params:
40
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
41
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
42
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
43
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
44
+ ]
45
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
46
+ add_caption_prompt: True
47
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
48
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
49
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
50
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
51
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
52
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
53
+ shuffle_buffer_size: 1000
54
+ num_workers: 32
55
+ resolution: 256
56
+ pin_memory: True
57
+ persistent_workers: True
58
+
59
+ preprocessing:
60
+ max_seq_length: 128 # for text tokens
61
+ resolution: 256
62
+ center_crop: False
63
+ random_flip: False
64
+
65
+ optimizer:
66
+ name: adamw
67
+ params: # default adamw params
68
+ learning_rate: 1e-4
69
+ scale_lr: False # scale learning rate by total batch size
70
+ beta1: 0.9
71
+ beta2: 0.999
72
+ weight_decay: 0.01
73
+ epsilon: 1e-8
74
+
75
+ lr_scheduler:
76
+ scheduler: "cosine"
77
+ params:
78
+ learning_rate: ${optimizer.params.learning_rate}
79
+ warmup_steps: 5000
80
+
81
+ training:
82
+ gradient_accumulation_steps: 2
83
+ noise_type: "mask"
84
+ batch_size_t2i: 7
85
+ batch_size_lm: 2
86
+ batch_size_mmu: 6
87
+ mixed_precision: "bf16"
88
+ enable_tf32: True
89
+ seed: 10086
90
+ max_train_steps: 500000
91
+ overfit_one_batch: False
92
+ cond_dropout_prob: 0.1
93
+ min_masking_rate: 0.0
94
+ label_smoothing: 0.0
95
+ max_grad_norm: 1
96
+ guidance_scale: 1.5
97
+ generation_timesteps: 12
98
+ t2i_coeff: 1.0
99
+ lm_coeff: 0.1
100
+ mmu_coeff: 1.0
MMaDA/configs/mmada_pretraining_stage2_llada_instruct.yaml ADDED
@@ -0,0 +1,109 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "mmada-training-stage2"
8
+ name: "mmada-training-stage2-llada-instruct"
9
+ output_dir: "mmada-training-stage2-llada-instruct"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 10000
13
+ eval_every: 2500
14
+ generate_every: 1000
15
+ log_every: 50
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+ val_every: 50
19
+ max_val_examples_t2i: 2000
20
+
21
+ model:
22
+ vq_model:
23
+ type: "magvitv2"
24
+ vq_model_name: "showlab/magvitv2"
25
+
26
+ mmada:
27
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
28
+ pretrained_model_path: "path/to/your/checkpoint"
29
+ w_clip_vit: False
30
+ new_vocab_size: 134656
31
+ llm_vocab_size: 126464
32
+ codebook_size: 8192
33
+ num_vq_tokens: 256
34
+ num_new_special_tokens: 0
35
+ tie_word_embeddings: False
36
+
37
+ gradient_checkpointing: True
38
+
39
+ dataset:
40
+ gen_type: "t2i"
41
+ und_type: "captioning"
42
+ combined_loader_mode: "max_size_cycle"
43
+ params:
44
+ train_t2i_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
45
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
46
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
47
+ ]
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/ty/datasets/laion-aesthetics-12m-images-2"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/new_captions"
58
+ validation_prompts_file: "validation_prompts/text2image_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ shuffle_buffer_size: 1000
61
+ num_workers: 32
62
+ resolution: 256
63
+ pin_memory: True
64
+ persistent_workers: True
65
+
66
+ preprocessing:
67
+ max_seq_length: 256 # for text tokens
68
+ resolution: 256
69
+ center_crop: False
70
+ random_flip: False
71
+
72
+ optimizer:
73
+ name: adamw
74
+ params: # default adamw params
75
+ learning_rate: 5e-5
76
+ scale_lr: False # scale learning rate by total batch size
77
+ beta1: 0.9
78
+ beta2: 0.999
79
+ weight_decay: 0.01
80
+ epsilon: 1e-8
81
+
82
+ lr_scheduler:
83
+ scheduler: "cosine"
84
+ params:
85
+ learning_rate: ${optimizer.params.learning_rate}
86
+ warmup_steps: 5000
87
+ min_lr_scale: 0.1
88
+
89
+ training:
90
+ gradient_accumulation_steps: 2
91
+ noise_type: "mask"
92
+ batch_size_t2i: 7
93
+ batch_size_lm: 2
94
+ batch_size_mmu: 3
95
+ mixed_precision: "bf16"
96
+ enable_tf32: True
97
+ seed: 10086
98
+ max_train_steps: 1000000
99
+ overfit_one_batch: False
100
+ cond_dropout_prob: 0.1
101
+ min_masking_rate: 0.0
102
+ label_smoothing: 0.0
103
+ max_grad_norm: 1
104
+ guidance_scale: 3
105
+ generation_timesteps: 12
106
+ t2i_coeff: 1.0
107
+ lm_coeff: 0.1
108
+ mmu_coeff: 0.5
109
+ validation_seed: 42
MMaDA/configs/mmada_pretraining_stage3_llada_instruct.yaml ADDED
@@ -0,0 +1,112 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "mmada-training-stage3"
8
+ name: "mmada-training-stage3-llada-instruct"
9
+ output_dir: "mmada-training-stage3-llada-instruct"
10
+ max_train_examples_t2i: 40000000 #
11
+ max_train_examples_mmu: 40000000 #
12
+ save_every: 10000
13
+ eval_every: 2500
14
+ generate_every: 1000
15
+ log_every: 50
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+ val_every: 50
19
+ max_val_examples_t2i: 2000
20
+
21
+ model:
22
+ vq_model:
23
+ type: "magvitv2"
24
+ vq_model_name: "showlab/magvitv2"
25
+
26
+ mmada:
27
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
28
+ pretrained_model_path: "path/to/your/checkpoint"
29
+ w_clip_vit: False
30
+ new_vocab_size: 134656
31
+ llm_vocab_size: 126464
32
+ codebook_size: 8192
33
+ num_vq_tokens: 256
34
+ num_new_special_tokens: 0
35
+ tie_word_embeddings: False
36
+
37
+ gradient_checkpointing: True
38
+
39
+ dataset:
40
+ gen_type: "t2i"
41
+ und_type: "captioning"
42
+ combined_loader_mode: "max_size_cycle"
43
+ params:
44
+ train_t2i_shards_path_or_url: [ #
45
+ "/data_storage/shared/datasets/JourneyDB/train/imgs/data/train/imgs/{000..199}.tgz",
46
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar",
47
+ "/data_storage/shared/datasets/text-to-image-2M/data_512_2M"
48
+ ]
49
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar", #
50
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
51
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
52
+ ]
53
+ train_lm_shards_path_or_url: "/data_storage/ty/shared/datasets/3-instruct-datasets/parquet/*.parquet"
54
+ add_caption_prompt: True
55
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
56
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
57
+ external_laion12m_caption_path: "/data_storage/ty/datasets/laion-aesthetics-12m-images-2"
58
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/new_captions"
59
+ external_text_to_image_2M_512_caption_path: "/data_storage/shared/datasets/text-to-image-2M/data_512_2M_captions"
60
+ validation_prompts_file: "validation_prompts/text2image_prompts.txt"
61
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
62
+ lm_chat_validation_jsonl: "/data_storage/ty/MMaDA/lm_chat_validation/questions.jsonl"
63
+ shuffle_buffer_size: 1000
64
+ num_workers: 32
65
+ resolution: 512
66
+ pin_memory: True
67
+ persistent_workers: True
68
+
69
+ preprocessing:
70
+ max_seq_length: 512 # for text tokens 512
71
+ resolution: 512
72
+ center_crop: False
73
+ random_flip: False
74
+
75
+ optimizer:
76
+ name: adamw
77
+ params: # default adamw params
78
+ learning_rate: 5e-5
79
+ scale_lr: False # scale learning rate by total batch size
80
+ beta1: 0.9
81
+ beta2: 0.999
82
+ weight_decay: 0.01
83
+ epsilon: 1e-8
84
+
85
+ lr_scheduler:
86
+ scheduler: "cosine"
87
+ params:
88
+ learning_rate: ${optimizer.params.learning_rate}
89
+ warmup_steps: 5000
90
+ min_lr_scale: 0.1
91
+
92
+ training:
93
+ gradient_accumulation_steps: 4 # 4
94
+ noise_type: "mask"
95
+ batch_size_t2i: 4 # 3~4
96
+ batch_size_lm: 1
97
+ batch_size_mmu: 1
98
+ mixed_precision: "bf16"
99
+ enable_tf32: True
100
+ seed: 10086
101
+ max_train_steps: 1000000
102
+ overfit_one_batch: False
103
+ cond_dropout_prob: 0.1
104
+ min_masking_rate: 0.0
105
+ label_smoothing: 0.0
106
+ max_grad_norm: 1
107
+ guidance_scale: 3
108
+ generation_timesteps: 12
109
+ t2i_coeff: 1.0
110
+ lm_coeff: 0.4 # ~0.5
111
+ mmu_coeff: 0.5
112
+ validation_seed: 42
MMaDA/configs/mmada_pretraining_stage3_llada_instruct_512_cot.yaml ADDED
@@ -0,0 +1,123 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "mmada-training-stage3"
8
+ name: "mmada-training-stage3-llada-instruct-512-cot-uni"
9
+ output_dir: "mmada-training-stage3-llada-instruct-512-cot-uni"
10
+ max_train_examples_t2i: 40000000 #
11
+ max_train_examples_mmu: 40000000 #
12
+ save_every: 10000
13
+ eval_every: 2500
14
+ generate_every: 1000
15
+ log_every: 50
16
+ log_grad_norm_every: 100
17
+ # resume_from_checkpoint: False
18
+ resume_from_checkpoint: "latest"
19
+ val_every: 50
20
+ max_val_examples_t2i: 2000
21
+
22
+ model:
23
+ vq_model:
24
+ type: "magvitv2"
25
+ vq_model_name: "showlab/magvitv2"
26
+
27
+ mmada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "path/to/your/checkpoint"
30
+ w_clip_vit: False
31
+ new_vocab_size: 134656
32
+ llm_vocab_size: 126464
33
+ codebook_size: 8192
34
+ num_vq_tokens: 1024
35
+ num_new_special_tokens: 0
36
+ tie_word_embeddings: False
37
+
38
+ gradient_checkpointing: True
39
+
40
+ dataset:
41
+ gen_type: "t2i"
42
+ und_type: "captioning"
43
+ combined_loader_mode: "max_size_cycle"
44
+ params:
45
+ train_t2i_shards_path_or_url: [ "/data_storage/shared/datasets/JourneyDB/train/imgs/data/train/imgs/{000..199}.tgz",
46
+ "/data_storage/shared/datasets/laion-aesthetics-12m-filter/{00000..00999}.tar",
47
+ # "/data_storage/shared/datasets/text-to-image-2M/data_512_2M/data_{000000..000046}.tar"
48
+ ]
49
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/multimodal_cot/ai2d/new_images.tar",
50
+ "/data_storage/shared/datasets/multimodal_cot/clevr/images.tar",
51
+ "/data_storage/shared/datasets/multimodal_cot/docvqa/images.tar",
52
+ "/data_storage/shared/datasets/multimodal_cot/geo/images.tar",
53
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar",
54
+ ]
55
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/3-cot-sft/parquet/*.parquet"
56
+ add_caption_prompt: True
57
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
58
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
59
+ external_laion12m_caption_path: "/data_storage/ty/datasets/laion-aesthetics-12m-images-2"
60
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/new_captions"
61
+ external_text_to_image_2M_512_caption_path: "/data_storage/shared/datasets/text-to-image-2M/data_512_2M_captions"
62
+ external_ai2d_caption_path: "/data_storage/shared/datasets/multimodal_cot/ai2d/new_metadata.csv"
63
+ external_clevr_caption_path: "/data_storage/shared/datasets/multimodal_cot/clevr/metadata.csv"
64
+ external_docvqa_caption_path: "/data_storage/shared/datasets/multimodal_cot/docvqa/metadata.csv"
65
+ external_geo_caption_path: "/data_storage/shared/datasets/multimodal_cot/geo/metadata.csv"
66
+ validation_prompts_file: "validation_prompts/text2image_prompts.txt"
67
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
68
+ mmu_validation_prompts_file: "/data_storage/ty/MMaDA/mmu_validation/prompts.jsonl"
69
+ lm_chat_validation_jsonl: "/data_storage/ty/MMaDA/lm_chat_validation/questions.jsonl"
70
+ shuffle_buffer_size: 1000
71
+ num_workers: 32
72
+ resolution: 512
73
+ pin_memory: True
74
+ persistent_workers: True
75
+
76
+ preprocessing:
77
+ max_seq_length: 512 # for text tokens in t2i & mmu
78
+ max_lm_text_length: 1536 # for text tokens in lm/lm_chat
79
+ resolution: 512
80
+ center_crop: False
81
+ random_flip: False
82
+
83
+ optimizer:
84
+ name: adamw
85
+ params: # default adamw params
86
+ learning_rate: 5e-5
87
+ scale_lr: False # scale learning rate by total batch size
88
+ beta1: 0.9
89
+ beta2: 0.999
90
+ weight_decay: 0.01
91
+ epsilon: 1e-8
92
+
93
+ lr_scheduler:
94
+ scheduler: "cosine"
95
+ params:
96
+ learning_rate: ${optimizer.params.learning_rate}
97
+ warmup_steps: 5000
98
+ min_lr_scale: 0.1
99
+
100
+ training:
101
+ gradient_accumulation_steps: 4 # 4
102
+ noise_type: "mask"
103
+ batch_size_t2i: 1
104
+ batch_size_lm: 2
105
+ batch_size_mmu: 1
106
+ mixed_precision: "bf16"
107
+ enable_tf32: True
108
+ seed: 10086
109
+ max_train_steps: 1000000
110
+ overfit_one_batch: False
111
+ cond_dropout_prob: 0.1
112
+ min_masking_rate: 0.0
113
+ label_smoothing: 0.0
114
+ max_grad_norm: 1
115
+ guidance_scale: 5
116
+ generation_timesteps: 20
117
+ t2i_coeff: 1.0
118
+ lm_coeff: 0.5
119
+ mmu_coeff: 0.5
120
+
121
+ validation:
122
+ quantative_prompts_file: "/data_storage/ty/MMaDA/validation_prompts/quantative.txt"
123
+ quantative_batch_size: 8
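These stage configs are plain OmegaConf YAML, and the `${optimizer.params.learning_rate}` entry under `lr_scheduler.params` is an interpolation that resolves to the optimizer value at access time. A minimal loading sketch follows; the config path is illustrative and OmegaConf is assumed to be installed.

```python
# Minimal sketch: load a stage config and resolve the learning-rate interpolation.
# The path below is illustrative; point it at whichever config you are editing.
from omegaconf import OmegaConf

cfg = OmegaConf.load("MMaDA/configs/mmada_pretraining_stage3_llada_instruct_512_cot.yaml")

print(cfg.optimizer.params.learning_rate)     # value as written in the YAML
print(cfg.lr_scheduler.params.learning_rate)  # same value, via ${...} interpolation

# A plain dict with every interpolation resolved, e.g. for logging the run config.
resolved = OmegaConf.to_container(cfg, resolve=True)
```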
MMaDA/configs/mmada_pretraining_stage4_llada_instruct.yaml ADDED
@@ -0,0 +1,134 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "mmada-training-stage4"
8
+ name: "mmada-training-stage4-llada-instruct"
9
+ output_dir: "mmada-training-stage4-llada-instruct"
10
+ max_train_examples_t2i: 40000000 #
11
+ max_train_examples_mmu: 40000000 #
12
+ save_every: 10000
13
+ eval_every: 2500
14
+ generate_every: 1000
15
+ log_every: 50
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+ val_every: 50
19
+ max_val_examples_t2i: 2000
20
+
21
+ model:
22
+ vq_model:
23
+ type: "magvitv2"
24
+ vq_model_name: "showlab/magvitv2"
25
+
26
+ mmada:
27
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
28
+ pretrained_model_path: "/data_storage/ty/MMaDA/mmada-training-stage3-llada-instruct-512-cot-uni/checkpoint-210000/unwrapped_model"
29
+ w_clip_vit: False
30
+ new_vocab_size: 134656
31
+ llm_vocab_size: 126464
32
+ codebook_size: 8192
33
+ num_vq_tokens: 1024
34
+ num_new_special_tokens: 0
35
+ tie_word_embeddings: False
36
+
37
+ gradient_checkpointing: True
38
+
39
+ dataset:
40
+ gen_type: "t2i"
41
+ und_type: "captioning"
42
+ combined_loader_mode: "max_size_cycle"
43
+ params:
44
+ train_t2i_shards_path_or_url: [ "/data_storage/shared/datasets/JourneyDB/train/imgs/data/train/imgs/{000..199}.tgz",
45
+ "/data_storage/shared/datasets/laion-aesthetics-12m-filter/{00000..00999}.tar",
46
+ # "/data_storage/shared/datasets/text-to-image-2M/data_512_2M/data_{000000..000046}.tar"
47
+ ]
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/multimodal_cot/ai2d/new_images.tar",
49
+ "/data_storage/shared/datasets/multimodal_cot/clevr/images.tar",
50
+ "/data_storage/shared/datasets/multimodal_cot/docvqa/images.tar",
51
+ "/data_storage/shared/datasets/multimodal_cot/geo/images.tar",
52
+ ]
53
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
54
+ train_instruct_shards_path_or_url: "/data_storage/shared/datasets/stage4_instruct/*.parquet"
55
+ add_caption_prompt: True
56
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
57
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
58
+ external_laion12m_caption_path: "/data_storage/ty/datasets/laion-aesthetics-12m-images-2"
59
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/new_captions"
60
+ external_text_to_image_2M_512_caption_path: "/data_storage/shared/datasets/text-to-image-2M/data_512_2M_captions"
61
+ external_ai2d_caption_path: "/data_storage/shared/datasets/multimodal_cot/ai2d/new_metadata.csv"
62
+ external_clevr_caption_path: "/data_storage/shared/datasets/multimodal_cot/clevr/metadata.csv"
63
+ external_docvqa_caption_path: "/data_storage/shared/datasets/multimodal_cot/docvqa/metadata.csv"
64
+ external_geo_caption_path: "/data_storage/shared/datasets/multimodal_cot/geo/metadata.csv"
65
+ external_vqa_caption_path: "/data_storage/shared/datasets/LLaVA-Instruct-150K/llava_v1_5_mix665k.json"
66
+ external_clevr2_caption_path: "/data_storage/ty/datasets/Clevr_CoGenT_TrainA_70K_Complex/captions.json"
67
+ external_geo170k_caption_path: "/data_storage/ty/shared/datasets/Geo170K/Geo170K/all.json"
68
+ vqa_images_path: "/data_storage/shared/datasets/LLaVA-Instruct-150K-images"
69
+ clevr2_images_path: "/data_storage/ty/datasets/Clevr_CoGenT_TrainA_70K_Complex/images"
70
+ geo170k_images_path: "/data_storage/ty/shared/datasets/Geo170K/Geo170K/images"
71
+ validation_prompts_file: "validation_prompts/text2image_prompts.txt"
72
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
73
+ mmu_validation_prompts_file: "/data_storage/ty/MMaDA/mmu_validation/prompts_with_vqa.json"
74
+ lm_chat_validation_jsonl: "/data_storage/ty/MMaDA/lm_chat_validation/questions.jsonl"
75
+ shuffle_buffer_size: 1000
76
+ num_workers: 16
77
+ resolution: 512
78
+ pin_memory: True
79
+ persistent_workers: True
80
+
81
+ preprocessing:
82
+ max_seq_length: 512 # for text tokens in t2i & mmu
83
+ max_lm_text_length: 1536 # for text tokens in lm/lm_chat
84
+ resolution: 512
85
+ center_crop: False
86
+ random_flip: False
87
+
88
+ optimizer:
89
+ name: adamw
90
+ params: # default adamw params
91
+ learning_rate: 5e-5
92
+ scale_lr: False # scale learning rate by total batch size
93
+ beta1: 0.9
94
+ beta2: 0.999
95
+ weight_decay: 0.01
96
+ epsilon: 1e-8
97
+
98
+ lr_scheduler:
99
+ scheduler: "cosine"
100
+ params:
101
+ learning_rate: ${optimizer.params.learning_rate}
102
+ warmup_steps: 5000
103
+ min_lr_scale: 0.1
104
+
105
+ training:
106
+ gradient_accumulation_steps: 4 # 4
107
+ noise_type: "mask"
108
+ batch_size_t2i: 1
109
+ batch_size_lm: 2
110
+ batch_size_mmu: 1
111
+ mixed_precision: "bf16"
112
+ enable_tf32: True
113
+ seed: 10086
114
+ max_train_steps: 1000000
115
+ overfit_one_batch: False
116
+ cond_dropout_prob: 0.1
117
+ min_masking_rate: 0.0
118
+ label_smoothing: 0.0
119
+ max_grad_norm: 1
120
+ guidance_scale: 5
121
+ generation_timesteps: 20
122
+ t2i_coeff: 0.05
123
+ lm_coeff: 0.6
124
+ mmu_coeff: 0.4
125
+ cot_in_mmu_coeff: 3.5
126
+ vqa_in_mmu_coeff: 5.5
127
+ clevr2_in_mmu_coeff: 0.5
128
+ geo170k_in_mmu_coeff: 0.5
129
+ base_in_lm_coeff: 0.02
130
+ instruct_in_lm_coeff: 0.98
131
+
132
+ validation:
133
+ quantative_prompts_file: "/data_storage/ty/MMaDA/validation_prompts/quantative.txt"
134
+ quantative_batch_size: 8
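The `*_coeff` entries at the bottom of the training block weight the per-task losses before they are summed into a single objective. A hedged sketch of that combination is below; the actual reduction lives in the training script, and the loss values here are placeholders.

```python
# Hedged sketch: how coefficients such as t2i_coeff / lm_coeff / mmu_coeff are
# typically applied. Loss values are placeholders; the real code lives in the
# training loop, not in this config.
import torch

def combine_losses(losses: dict, coeffs: dict) -> torch.Tensor:
    """Weighted sum of per-task losses, skipping tasks that produced no batch."""
    total = torch.zeros(())
    for task, loss in losses.items():
        if loss is not None:
            total = total + coeffs.get(task, 1.0) * loss
    return total

coeffs = {"t2i": 0.05, "lm": 0.6, "mmu": 0.4}   # values from this config
losses = {"t2i": torch.tensor(2.1), "lm": torch.tensor(3.0), "mmu": None}
print(combine_losses(losses, coeffs))            # 0.05 * 2.1 + 0.6 * 3.0
```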
MMaDA/configs/mmada_pretraining_t2s.yaml ADDED
@@ -0,0 +1,96 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "ommda-training-t2s"
8
+ name: "ommda-training-t2s-mmada"
9
+ output_dir: "ommda-training-t2s-mmada"
10
+ save_every: 5000
11
+ eval_every: 20000
12
+ generate_every: 5000
13
+ num_validation_images: 20
14
+ log_every: 1
15
+ log_grad_norm_every: 100
16
+ resume_from_checkpoint: "latest"
17
+ val_every: 50000
18
+ max_val_examples_t2i: 2000
19
+
20
+ model:
21
+ vq_model:
22
+ type: "emova"
23
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
24
+
25
+ mmada:
26
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
27
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
28
+ w_clip_vit: False
29
+ new_vocab_size: 138752
30
+ llm_vocab_size: 126464
31
+ codebook_size: 8192
32
+ speech_codebook_size: 4096
33
+ # num_vq_tokens: 256
34
+ # num_speech_vq_tokens: 250
35
+ num_new_special_tokens: 3
36
+ tie_word_embeddings: False
37
+
38
+ gradient_checkpointing: True
39
+
40
+ dataset:
41
+ params:
42
+ num_workers: 0
43
+ resolution: 256
44
+ pin_memory: True
45
+ persistent_workers: True
46
+
47
+ preprocessing:
48
+ max_seq_length: 256 # for text tokens
49
+ resolution: 256
50
+ center_crop: False
51
+ random_flip: False
52
+
53
+ data:
54
+ # subset for gigaspeech: xs, xl
55
+ # subset for librispeech: train-clean-360, train-clean-100
56
+ # subset for commonvoice: validated, invalidated
57
+ name: "gigaspeech"
58
+ subset: "xl"
59
+ split: "train"
60
+
61
+ optimizer:
62
+ name: adamw
63
+ params: # default adamw params
64
+ learning_rate: 1e-4
65
+ scale_lr: False # scale learning rate by total batch size
66
+ beta1: 0.9
67
+ beta2: 0.999
68
+ weight_decay: 0.01
69
+ epsilon: 1e-8
70
+
71
+ lr_scheduler:
72
+ scheduler: "cosine"
73
+ params:
74
+ learning_rate: ${optimizer.params.learning_rate}
75
+ warmup_steps: 2500
76
+ min_lr_scale: 0.1
77
+
78
+ training:
79
+ gradient_accumulation_steps: 4
80
+ noise_type: "mask"
81
+ batch_size_s2t: 4
82
+ mixed_precision: "bf16"
83
+ enable_tf32: True
84
+ seed: 10086
85
+ max_train_steps: 50000
86
+ overfit_one_batch: False
87
+ cond_dropout_prob: 0.1
88
+ min_masking_rate: 0.0
89
+ label_smoothing: 0.0
90
+ max_grad_norm: 1
91
+ guidance_scale: 5
92
+ generation_timesteps: 50
93
+ t2i_coeff: 1.0
94
+ lm_coeff: 0.1
95
+ mmu_coeff: 0.5
96
+ validation_seed: 42
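The `lr_scheduler` block describes a linear warmup followed by a cosine decay that bottoms out at `min_lr_scale` times the base learning rate. A self-contained sketch of that shape, using the numbers from this config, is below; the scheduler actually used in training may differ in implementation details.

```python
# Hedged sketch of the cosine schedule with warmup and a floor at
# min_lr_scale * learning_rate, using the numbers from this config.
import math

def lr_at(step: int, base_lr: float = 1e-4, warmup_steps: int = 2500,
          max_steps: int = 50_000, min_lr_scale: float = 0.1) -> float:
    if warmup_steps > 0 and step < warmup_steps:
        return base_lr * step / warmup_steps          # linear warmup
    progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
    progress = min(max(progress, 0.0), 1.0)
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
    return base_lr * (min_lr_scale + (1.0 - min_lr_scale) * cosine)

for step in (0, 2_500, 25_000, 50_000):
    print(step, f"{lr_at(step):.2e}")                 # ends at 1e-5, not 0
```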
MMaDA/configs/mmada_pretraining_v2s.yaml ADDED
@@ -0,0 +1,133 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-training-stage1"
8
+ name: "omada-training-stage1_ignore_SP"
9
+ output_dir: "ckpts/omada/omada-training-stage1_v2s_test"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 5000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 5 # task token 3 + eoa / soa
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ # subset for gigaspeech: xs, xl
63
+ # subset for librispeech: train-clean-360, train-clean-100
64
+ # subset for commonvoice: validated, invalidated
65
+ audio_data:
66
+ - name: "gigaspeech"
67
+ subset: "xl"
68
+ split: "train"
69
+ - name: "librispeech"
70
+ subset: "train-clean-360"
71
+ - name: "commonvoice"
72
+ subset: "validated"
73
+ #########################################################################
74
+ shuffle_buffer_size: 1000
75
+ num_workers: 8
76
+ resolution: 256
77
+ pin_memory: True
78
+ persistent_workers: True
79
+
80
+ preprocessing:
81
+ max_seq_length: 128 # for text tokens
82
+ max_aud_length: 384 # for audio tokens
83
+ resolution: 128
84
+ center_crop: False
85
+ random_flip: False
86
+
87
+ optimizer:
88
+ name: adamw
89
+ params: # default adamw params
90
+ # learning_rate: 1e-4
91
+ learning_rate: 0.000079
92
+ scale_lr: False # scale learning rate by total batch size
93
+ beta1: 0.9
94
+ beta2: 0.999
95
+ weight_decay: 0.01
96
+ epsilon: 1e-8
97
+
98
+ lr_scheduler:
99
+ scheduler: "cosine"
100
+ params:
101
+ learning_rate: ${optimizer.params.learning_rate}
102
+ warmup_steps: 0
103
+ min_lr_scale: 0.1
104
+
105
+ training:
106
+ gradient_accumulation_steps: 1
107
+ noise_type: "mask"
108
+ batch_size_t2i: 0
109
+ batch_size_lm: 0
110
+ batch_size_mmu: 0
111
+ batch_size_v2t: 0
112
+ batch_size_s2t: 0
113
+ batch_size_t2s: 0
114
+ batch_size_v2s: 1
115
+
116
+ mixed_precision: "bf16"
117
+ enable_tf32: True
118
+ seed: 10086
119
+ max_train_steps: 630000 # 2 epochs
120
+ max_train_epochs: NONE
121
+ overfit_one_batch: False
122
+ cond_dropout_prob: 0.1
123
+ min_masking_rate: 0.0
124
+ label_smoothing: 0.0
125
+ max_grad_norm: 1
126
+ guidance_scale: 1.5
127
+ generation_timesteps: 16
128
+ # t2i_coeff: 0.1
129
+ # lm_coeff: 0.1
130
+ # mmu_coeff: 0.1
131
+ v2t_coeff: 0.2
132
+ t2s_coeff: 1.0
133
+ s2t_coeff: 0.2
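Comments such as `max_train_steps: 630000 # 2 epochs` follow from the usual step arithmetic: the global batch size is the per-device batch size times gradient accumulation times the number of processes. A sketch is below; the dataset size and GPU count are illustrative assumptions, not values taken from this repo.

```python
# Hedged sketch of the arithmetic behind "max_train_steps: 630000 # 2 epochs".
# Dataset size and GPU count below are assumptions for illustration only.
def steps_for_epochs(num_examples: int, per_device_batch: int,
                     grad_accum: int, num_processes: int, epochs: int) -> int:
    global_batch = per_device_batch * grad_accum * num_processes
    return (num_examples // global_batch) * epochs

# e.g. ~2.52M speech clips, batch_size_v2s=1, grad_accum=1, 8 GPUs, 2 epochs
print(steps_for_epochs(2_520_000, 1, 1, 8, 2))   # 630000
```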
MMaDA/configs/mmada_pretraining_v2t.yaml ADDED
@@ -0,0 +1,88 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "mmada-training-v2t"
8
+ name: "mmada-training-stage3-llada-instruct-v2t"
9
+ output_dir: "mmada-training-stage3-llada-instruct-v2t-special-token-1e-5"
10
+ max_train_examples_t2i: 40000000 #
11
+ max_train_examples_mmu: 40000000 #
12
+ save_every: 1000
13
+ eval_every: 2500
14
+ generate_every: 1000
15
+ log_every: 10
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+ val_every: 50
19
+ max_val_examples_t2i: 2000
20
+
21
+ model:
22
+ vq_model:
23
+ type: "magvitv2"
24
+ vq_model_name: "showlab/magvitv2"
25
+
26
+ mmada:
27
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
28
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
29
+ w_clip_vit: False
30
+ new_vocab_size: 134656
31
+ llm_vocab_size: 126464
32
+ codebook_size: 8192
33
+ num_vq_tokens: 256
34
+ num_new_special_tokens: 0
35
+ tie_word_embeddings: False
36
+
37
+ gradient_checkpointing: True
38
+
39
+ dataset:
40
+ und_type: "captioning"
41
+ combined_loader_mode: "max_size_cycle"
42
+
43
+ preprocessing:
44
+ max_seq_length: 128 # for text tokens 512
45
+ resolution: 128
46
+ center_crop: False
47
+ random_flip: False
48
+
49
+ params:
50
+ num_workers: 32
51
+
52
+
53
+
54
+ optimizer:
55
+ name: adamw
56
+ params: # default adamw params
57
+ learning_rate: 1e-5
58
+ scale_lr: False # scale learning rate by total batch size
59
+ beta1: 0.9
60
+ beta2: 0.999
61
+ weight_decay: 0.01
62
+ epsilon: 1e-8
63
+
64
+ lr_scheduler:
65
+ scheduler: "cosine"
66
+ params:
67
+ learning_rate: ${optimizer.params.learning_rate}
68
+ warmup_steps: 5000
69
+ min_lr_scale: 0.1
70
+
71
+ training:
72
+ gradient_accumulation_steps: 4 # 4
73
+ noise_type: "mask"
74
+ batch_size_v2t: 4
75
+ batch_size_mmu: 1
76
+ mixed_precision: "bf16"
77
+ enable_tf32: True
78
+ seed: 10086
79
+ max_train_steps: 1000000
80
+ overfit_one_batch: False
81
+ cond_dropout_prob: 0.1
82
+ min_masking_rate: 0.0
83
+ label_smoothing: 0.0
84
+ max_grad_norm: 1
85
+ guidance_scale: 3
86
+ generation_timesteps: 12
87
+ mmu_coeff: 1.0
88
+ validation_seed: 42
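`noise_type: "mask"` together with `min_masking_rate` describes the masked-diffusion corruption: each sample gets a masking rate, and that fraction of target tokens is replaced with a mask id before the model is trained to predict them back. The sketch below shows the general recipe; the mask id and the uniform rate schedule are assumptions, not values read from this repo's code.

```python
# Hedged sketch of mask-based corruption for discrete diffusion training.
# The mask id and the uniform rate schedule are illustrative assumptions.
import torch

def mask_tokens(tokens: torch.Tensor, mask_id: int, min_rate: float = 0.0):
    bsz, seq_len = tokens.shape
    rate = torch.rand(bsz).clamp(min=min_rate)        # one masking rate per sample
    hide = torch.rand(bsz, seq_len) < rate[:, None]   # positions to corrupt
    corrupted = torch.where(hide, torch.full_like(tokens, mask_id), tokens)
    return corrupted, hide                            # hide marks the loss positions

tokens = torch.randint(0, 8192, (2, 16))
corrupted, hide = mask_tokens(tokens, mask_id=8192)   # id outside the codebook range
```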
MMaDA/configs/omada_instruction_tuning.yaml ADDED
@@ -0,0 +1,200 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-instruction-tuning"
8
+ name: "omada-instruction-tuning"
9
+ output_dir: "ckpts/omada/omada-instruction-tuning-tv_sacle_0.7"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 10000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ pretrained_model_path: "/home/work/AIDAS/ckpts/merged_model/hf_common_merge_alpha_999_scale_0p7"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 3 # v2s, s2s, i2i
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ video_speech_dataset:
63
+ sample_mode: "exclusive"
64
+ use_precomputed_tokens: true
65
+ precomputed_tokens_root: "/home/work/AIDAS/cache/speech_tokens"
66
+ llavavid_path: "/home/work/AIDAS/data/video/LLaVA-Video-178K"
67
+ llavavid_local_files_only: true
68
+ llavavid_skip_configs:
69
+ - "llava_hound"
70
+ - "0_30_s_activitynetqa"
71
+ - "30_60_s_activitynetqa"
72
+ - "1_2_m_activitynetqa"
73
+ - "2_3_m_activitynetqa"
74
+ - "0_30_s_activitynet"
75
+ - "30_60_s_activitynet"
76
+ - "1_2_m_activitynet"
77
+ - "2_3_m_activitynet"
78
+ llavavid_skip_video_patterns:
79
+ - "activitynet"
80
+ # video_dataset_name: "openvid1m"
81
+ hqedit_split: "train"
82
+ t2i_dataset: "text2image2m+openimage_i2i+hqedit"
83
+ t2i_split: "train"
84
+ t2i_dataset_name: "jackyhate/text-to-image-2M"
85
+ t2i_local_files_only: true
86
+ openimage_i2i:
87
+ sft_jsonl: "/home/work/AIDAS/data/openimage_source_images/sft_with_local_source_image_path.jsonl"
88
+ pref_jsonl: "/home/work/AIDAS/data/openimage_source_images/pref_with_local_source_image_path.jsonl"
89
+ multi_turn_jsonl: "/home/work/AIDAS/data/openimage_source_images/multi-turn_with_local_source_image_path.jsonl"
90
+ image_root: "/home/work/AIDAS/data/nano_edited_images"
91
+ prefer_summarized_text: true
92
+ pref_positive_only: true
93
+ skip_missing: true
94
+ max_samples_per_source: null
95
+ max_total_samples: null
96
+ seed: 42
97
+ hf_instruction_lm:
98
+ split: "train"
99
+ max_samples_per_source: 1000000
100
+ max_total_samples: 20000000
101
+ seed: 42
102
+ speech2speech:
103
+ - name: "instructs2s"
104
+ use_precomputed_tokens: false
105
+ precomputed_tokens_root: "/home/work/AIDAS/cache/instructs2s_tokens"
106
+ mmu_interleaved:
107
+ local_data_root: /home/work/AIDAS/data/TIGER-Lab/Mantis-Instruct
108
+ local_files_only: true
109
+ # subset for gigaspeech: xs, xl
110
+ # subset for librispeech: train-clean-360, train-clean-100
111
+ # subset for commonvoice: validated, invalidated
112
+ audio_data:
113
+ # - name: "gigaspeech"
114
+ # subset: "xl"
115
+ # split: "train"
116
+ - name: "librispeech"
117
+ subset: "train-clean-360"
118
+ use_precomputed_tokens: true
119
+ precomputed_tokens_root: "/home/work/AIDAS/cache/librispeech_tokens"
120
+ # - name: "commonvoice"
121
+ # subset: "validated"
122
+ #########################################################################
123
+ shuffle_buffer_size: 1000
124
+ num_workers: 0
125
+ resolution: 256
126
+ # resolution: 16
127
+ pin_memory: False
128
+ persistent_workers: False
129
+ dataloader_timeout: 0
130
+
131
+
132
+ speech_token_cache:
133
+ enable: true
134
+ root: "cache/speech_tokens"
135
+ max_items_in_memory: 4096
136
+
137
+ preprocessing:
138
+ max_seq_length: 128 # for text tokens
139
+ max_aud_length: 384 # for audio tokens
140
+ max_aud_length_short: 256 # for short audio tokens
141
+ resolution: 128 # for video tokens
142
+ # max_seq_length: 16 # for text tokens
143
+ # max_aud_length: 16 # for audio tokens
144
+ # resolution: 16 # for video tokens
145
+ center_crop: False
146
+ random_flip: False
147
+
148
+ optimizer:
149
+ name: adamw
150
+ params: # default adamw params
151
+ learning_rate: 5e-5
152
+ # learning_rate: 0.00004859840219369731
153
+ scale_lr: False # scale learning rate by total batch size
154
+ beta1: 0.9
155
+ beta2: 0.999
156
+ weight_decay: 0.01
157
+ epsilon: 1e-8
158
+
159
+ lr_scheduler:
160
+ scheduler: "cosine"
161
+ params:
162
+ learning_rate: ${optimizer.params.learning_rate}
163
+ # warmup_steps: 1000
164
+ warmup_steps: 0
165
+ min_lr_scale: 0.1
166
+
167
+ training:
168
+ gradient_accumulation_steps: 1
169
+ noise_type: "mask"
170
+ batch_size_t2i: 1
171
+ batch_size_lm: 1
172
+ batch_size_mmu: 1
173
+ batch_size_v2t: 1
174
+ batch_size_v2s: 1
175
+ batch_size_s2t: 2
176
+ batch_size_t2s: 2
177
+ batch_size_s2s: 2
178
+
179
+ mixed_precision: "bf16"
180
+ enable_tf32: True
181
+ seed: 10086
182
+ max_train_steps: 500000
183
+ max_train_epochs: NONE
184
+ overfit_one_batch: False
185
+ cond_dropout_prob: 0.1
186
+ min_masking_rate: 0.0
187
+ label_smoothing: 0.0
188
+ max_grad_norm: 1
189
+ guidance_scale: 3.5
190
+ generation_timesteps: 42
191
+
192
+ t2i_coeff: 2.5
193
+ i2i_coeff: 2.5
194
+ lm_coeff: 2.5
195
+ mmu_coeff: 0.1
196
+ v2t_coeff: 0.2
197
+ v2s_coeff: 2.0
198
+ t2s_coeff: 2.5
199
+ s2t_coeff: 0.5
200
+ s2s_coeff: 3.0
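`combined_loader_mode: "max_size_cycle"` means every optimizer step draws one batch per task, with shorter loaders restarted until the longest one is exhausted. A toy sketch of that behaviour is below; the real training loop presumably uses a library combiner rather than this helper.

```python
# Hedged sketch of "max_size_cycle" combining: one batch per task per step,
# shorter loaders cycle until the longest loader runs out.
from itertools import cycle

def max_size_cycle(loaders: dict):
    longest = max(loaders, key=lambda name: len(loaders[name]))
    iters = {name: (iter(dl) if name == longest else cycle(dl))
             for name, dl in loaders.items()}
    for _ in range(len(loaders[longest])):
        yield {name: next(it) for name, it in iters.items()}

# toy example with lists standing in for DataLoaders
for step, batch in enumerate(max_size_cycle({"t2s": [1, 2, 3, 4], "v2t": ["a", "b"]})):
    print(step, batch)   # "v2t" wraps around after "b"
```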
MMaDA/configs/omada_pretraining_stage1-2.yaml ADDED
@@ -0,0 +1,131 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-training-stage1"
8
+ name: "omada-training-stage1"
9
+ output_dir: "ckpts/omada/omada-training-stage1_2nd"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 5000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 5 # task token 3 + eoa / soa
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ # subset for gigaspeech: xs, xl
63
+ # subset for librispeech: train-clean-360, train-clean-100
64
+ # subset for commonvoice: validated, invalidated
65
+ audio_data:
66
+ - name: "gigaspeech"
67
+ subset: "xl"
68
+ split: "train"
69
+ - name: "librispeech"
70
+ subset: "train-clean-360"
71
+ - name: "commonvoice"
72
+ subset: "validated"
73
+ #########################################################################
74
+ shuffle_buffer_size: 1000
75
+ num_workers: 8
76
+ resolution: 256
77
+ pin_memory: True
78
+ persistent_workers: True
79
+
80
+ preprocessing:
81
+ max_seq_length: 128 # for text tokens
82
+ max_aud_length: 256 # for audio tokens
83
+ resolution: 128
84
+ center_crop: False
85
+ random_flip: False
86
+
87
+ optimizer:
88
+ name: adamw
89
+ params: # default adamw params
90
+ learning_rate: 5e-5
91
+ scale_lr: False # scale learning rate by total batch size
92
+ beta1: 0.9
93
+ beta2: 0.999
94
+ weight_decay: 0.01
95
+ epsilon: 1e-8
96
+
97
+ lr_scheduler:
98
+ scheduler: "cosine"
99
+ params:
100
+ learning_rate: ${optimizer.params.learning_rate}
101
+ warmup_steps: 0
102
+ min_lr_scale: 0.1
103
+
104
+ training:
105
+ gradient_accumulation_steps: 1
106
+ noise_type: "mask"
107
+ batch_size_t2i: 0
108
+ batch_size_lm: 0
109
+ batch_size_mmu: 0
110
+ batch_size_v2t: 1
111
+ batch_size_s2t: 1
112
+ batch_size_t2s: 5
113
+
114
+ mixed_precision: "bf16"
115
+ enable_tf32: True
116
+ seed: 10086
117
+ max_train_steps: 315000 # 2 epochs
118
+ max_train_epochs: NONE
119
+ overfit_one_batch: False
120
+ cond_dropout_prob: 0.1
121
+ min_masking_rate: 0.0
122
+ label_smoothing: 0.0
123
+ max_grad_norm: 1
124
+ guidance_scale: 0.0
125
+ generation_timesteps: 64
126
+ # t2i_coeff: 0.1
127
+ # lm_coeff: 0.1
128
+ # mmu_coeff: 0.1
129
+ v2t_coeff: 0.1
130
+ t2s_coeff: 1.0
131
+ s2t_coeff: 0.1
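`cond_dropout_prob` and `guidance_scale` are the two halves of classifier-free guidance: conditioning is dropped with probability `cond_dropout_prob` during training, and at generation the conditional and unconditional logits are mixed using the guidance scale. One common convention is sketched below (so `guidance_scale: 0.0` here falls back to the plain conditional prediction); the exact formula used by this repo's sampler may differ.

```python
# Hedged sketch of classifier-free guidance at generation time.
# Convention shown: w = 0 reduces to the conditional logits.
import torch

def guided_logits(cond: torch.Tensor, uncond: torch.Tensor, w: float) -> torch.Tensor:
    return (1.0 + w) * cond - w * uncond

cond, uncond = torch.randn(1, 8192), torch.randn(1, 8192)
assert torch.allclose(guided_logits(cond, uncond, w=0.0), cond)
```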
MMaDA/configs/omada_pretraining_stage1-3.yaml ADDED
@@ -0,0 +1,132 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-training-stage1"
8
+ name: "omada-training-stage1_ignore_SP"
9
+ output_dir: "ckpts/omada/omada-training-stage1_7th"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 5000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 5 # task token 3 + eoa / soa
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ # subset for gigaspeech: xs, xl
63
+ # subset for librispeech: train-clean-360, train-clean-100
64
+ # subset for commonvoice: validated, invalidated
65
+ audio_data:
66
+ - name: "gigaspeech"
67
+ subset: "xl"
68
+ split: "train"
69
+ - name: "librispeech"
70
+ subset: "train-clean-360"
71
+ - name: "commonvoice"
72
+ subset: "validated"
73
+ #########################################################################
74
+ shuffle_buffer_size: 1000
75
+ num_workers: 8
76
+ resolution: 256
77
+ pin_memory: True
78
+ persistent_workers: True
79
+
80
+ preprocessing:
81
+ max_seq_length: 128 # for text tokens
82
+ max_aud_length: 384 # for audio tokens
83
+ resolution: 128
84
+ center_crop: False
85
+ random_flip: False
86
+
87
+ optimizer:
88
+ name: adamw
89
+ params: # default adamw params
90
+ # learning_rate: 1e-4
91
+ learning_rate: 0.000079
92
+ scale_lr: False # scale learning rate by total batch size
93
+ beta1: 0.9
94
+ beta2: 0.999
95
+ weight_decay: 0.01
96
+ epsilon: 1e-8
97
+
98
+ lr_scheduler:
99
+ scheduler: "cosine"
100
+ params:
101
+ learning_rate: ${optimizer.params.learning_rate}
102
+ warmup_steps: 0
103
+ min_lr_scale: 0.1
104
+
105
+ training:
106
+ gradient_accumulation_steps: 1
107
+ noise_type: "mask"
108
+ batch_size_t2i: 0
109
+ batch_size_lm: 0
110
+ batch_size_mmu: 0
111
+ batch_size_v2t: 1
112
+ batch_size_s2t: 1
113
+ batch_size_t2s: 5
114
+
115
+ mixed_precision: "bf16"
116
+ enable_tf32: True
117
+ seed: 10086
118
+ max_train_steps: 630000 # 2 epochs
119
+ max_train_epochs: NONE
120
+ overfit_one_batch: False
121
+ cond_dropout_prob: 0.1
122
+ min_masking_rate: 0.0
123
+ label_smoothing: 0.0
124
+ max_grad_norm: 1
125
+ guidance_scale: 1.5
126
+ generation_timesteps: 16
127
+ # t2i_coeff: 0.1
128
+ # lm_coeff: 0.1
129
+ # mmu_coeff: 0.1
130
+ v2t_coeff: 0.2
131
+ t2s_coeff: 1.0
132
+ s2t_coeff: 0.2
MMaDA/configs/omada_pretraining_stage1-4.yaml ADDED
@@ -0,0 +1,132 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-training-stage1"
8
+ name: "omada-training-stage1_ignore_SP"
9
+ output_dir: "ckpts/omada/omada-training-stage1_5th"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 5000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 5 # task token 3 + eoa / soa
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ # subset for gigaspeech: xs, xl
63
+ # subset for librispeech: train-clean-360, train-clean-100
64
+ # subset for commonvoice: validated, invalidated
65
+ audio_data:
66
+ - name: "gigaspeech"
67
+ subset: "xl"
68
+ split: "train"
69
+ - name: "librispeech"
70
+ subset: "train-clean-360"
71
+ - name: "commonvoice"
72
+ subset: "validated"
73
+ #########################################################################
74
+ shuffle_buffer_size: 1000
75
+ num_workers: 4
76
+ resolution: 256
77
+ pin_memory: True
78
+ persistent_workers: True
79
+
80
+ preprocessing:
81
+ max_seq_length: 128 # for text tokens
82
+ max_aud_length: 256 # for audio tokens
83
+ resolution: 128
84
+ center_crop: False
85
+ random_flip: False
86
+
87
+ optimizer:
88
+ name: adamw
89
+ params: # default adamw params
90
+ # learning_rate: 5e-6
91
+ learning_rate: 0.00000483
92
+ scale_lr: False # scale learning rate by total batch size
93
+ beta1: 0.9
94
+ beta2: 0.999
95
+ weight_decay: 0.01
96
+ epsilon: 1e-8
97
+
98
+ lr_scheduler:
99
+ scheduler: "cosine"
100
+ params:
101
+ learning_rate: ${optimizer.params.learning_rate}
102
+ warmup_steps: 0
103
+ min_lr_scale: 0.1
104
+
105
+ training:
106
+ gradient_accumulation_steps: 1
107
+ noise_type: "mask"
108
+ batch_size_t2i: 0
109
+ batch_size_lm: 0
110
+ batch_size_mmu: 0
111
+ batch_size_v2t: 1
112
+ batch_size_s2t: 1
113
+ batch_size_t2s: 5
114
+
115
+ mixed_precision: "bf16"
116
+ enable_tf32: True
117
+ seed: 10086
118
+ max_train_steps: 630000 # 2 epochs
119
+ max_train_epochs: NONE
120
+ overfit_one_batch: False
121
+ cond_dropout_prob: 0.1
122
+ min_masking_rate: 0.0
123
+ label_smoothing: 0.0
124
+ max_grad_norm: 1
125
+ guidance_scale: 1.5
126
+ generation_timesteps: 16
127
+ # t2i_coeff: 0.1
128
+ # lm_coeff: 0.1
129
+ # mmu_coeff: 0.1
130
+ v2t_coeff: 0.2
131
+ t2s_coeff: 1.0
132
+ s2t_coeff: 0.2
MMaDA/configs/omada_pretraining_stage1.yaml ADDED
@@ -0,0 +1,131 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-training-stage1"
8
+ name: "omada-training-stage1"
9
+ output_dir: "ckpts/omada/omada-training-stage1"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 10000000000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 5 # task token 3 + eoa / soa
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ # subset for gigaspeech: xs, xl
63
+ # subset for librispeech: train-clean-360, train-clean-100
64
+ # subset for commonvoice: validated, invalidated
65
+ audio_data:
66
+ - name: "gigaspeech"
67
+ subset: "xl"
68
+ split: "train"
69
+ - name: "librispeech"
70
+ subset: "train-clean-360"
71
+ - name: "commonvoice"
72
+ subset: "validated"
73
+ #########################################################################
74
+ shuffle_buffer_size: 1000
75
+ num_workers: 8
76
+ resolution: 256
77
+ pin_memory: True
78
+ persistent_workers: True
79
+
80
+ preprocessing:
81
+ max_seq_length: 128 # for text tokens
82
+ max_aud_length: 256 # for audio tokens
83
+ resolution: 128
84
+ center_crop: False
85
+ random_flip: False
86
+
87
+ optimizer:
88
+ name: adamw
89
+ params: # default adamw params
90
+ learning_rate: 1e-5
91
+ scale_lr: False # scale learning rate by total batch size
92
+ beta1: 0.9
93
+ beta2: 0.999
94
+ weight_decay: 0.01
95
+ epsilon: 1e-8
96
+
97
+ lr_scheduler:
98
+ scheduler: "cosine"
99
+ params:
100
+ learning_rate: ${optimizer.params.learning_rate}
101
+ warmup_steps: 3000
102
+ min_lr_scale: 0.1
103
+
104
+ training:
105
+ gradient_accumulation_steps: 1
106
+ noise_type: "mask"
107
+ batch_size_t2i: 0
108
+ batch_size_lm: 0
109
+ batch_size_mmu: 0
110
+ batch_size_v2t: 2
111
+ batch_size_s2t: 2
112
+ batch_size_t2s: 3
113
+
114
+ mixed_precision: "bf16"
115
+ enable_tf32: True
116
+ seed: 10086
117
+ max_train_steps: 200000
118
+ max_train_epochs: 1
119
+ overfit_one_batch: False
120
+ cond_dropout_prob: 0.1
121
+ min_masking_rate: 0.0
122
+ label_smoothing: 0.0
123
+ max_grad_norm: 1
124
+ guidance_scale: 1.5
125
+ generation_timesteps: 12
126
+ # t2i_coeff: 0.1
127
+ # lm_coeff: 0.1
128
+ # mmu_coeff: 0.1
129
+ v2t_coeff: 1.0
130
+ t2s_coeff: 1.0
131
+ s2t_coeff: 1.0
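`mixed_precision: "bf16"` and `enable_tf32: True` are handled by Accelerate/DeepSpeed in the launch configs, but they map to a couple of well-known PyTorch switches. The sketch below only makes the flags concrete; it is not necessarily how the training script sets them.

```python
# Hedged sketch of what enable_tf32 and bf16 mixed precision usually mean in PyTorch.
import torch

torch.backends.cuda.matmul.allow_tf32 = True   # enable_tf32: True
torch.backends.cudnn.allow_tf32 = True

with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    # the forward pass of a training step would run here in bf16
    pass
```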
MMaDA/configs/omada_pretraining_v2t_inst.yaml ADDED
@@ -0,0 +1,132 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-training-v2t_inst"
8
+ name: "omada-training-v2t_inst"
9
+ output_dir: "ckpts/omada/omada-training-v2t_inst"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 5000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 5 # task token 3 + eoa / soa
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ # subset for gigaspeech: xs, xl
63
+ # subset for librispeech: train-clean-360, train-clean-100
64
+ # subset for commonvoice: validated, invalidated
65
+ audio_data:
66
+ - name: "gigaspeech"
67
+ subset: "xl"
68
+ split: "train"
69
+ - name: "librispeech"
70
+ subset: "train-clean-360"
71
+ - name: "commonvoice"
72
+ subset: "validated"
73
+ #########################################################################
74
+ shuffle_buffer_size: 1000
75
+ num_workers: 8
76
+ resolution: 256
77
+ pin_memory: True
78
+ persistent_workers: True
79
+
80
+ preprocessing:
81
+ max_seq_length: 128 # for text tokens
82
+ max_aud_length: 384 # for audio tokens
83
+ resolution: 128
84
+ center_crop: False
85
+ random_flip: False
86
+
87
+ optimizer:
88
+ name: adamw
89
+ params: # default adamw params
90
+ # learning_rate: 1e-4
91
+ learning_rate: 0.000079
92
+ scale_lr: False # scale learning rate by total batch size
93
+ beta1: 0.9
94
+ beta2: 0.999
95
+ weight_decay: 0.01
96
+ epsilon: 1e-8
97
+
98
+ lr_scheduler:
99
+ scheduler: "cosine"
100
+ params:
101
+ learning_rate: ${optimizer.params.learning_rate}
102
+ warmup_steps: 0
103
+ min_lr_scale: 0.1
104
+
105
+ training:
106
+ gradient_accumulation_steps: 1
107
+ noise_type: "mask"
108
+ batch_size_t2i: 0
109
+ batch_size_lm: 0
110
+ batch_size_mmu: 0
111
+ batch_size_v2t: 1
112
+ batch_size_s2t: 1
113
+ batch_size_t2s: 5
114
+
115
+ mixed_precision: "bf16"
116
+ enable_tf32: True
117
+ seed: 10086
118
+ max_train_steps: 630000 # 2 epochs
119
+ max_train_epochs: NONE
120
+ overfit_one_batch: False
121
+ cond_dropout_prob: 0.1
122
+ min_masking_rate: 0.0
123
+ label_smoothing: 0.0
124
+ max_grad_norm: 1
125
+ guidance_scale: 1.5
126
+ generation_timesteps: 16
127
+ # t2i_coeff: 0.1
128
+ # lm_coeff: 0.1
129
+ # mmu_coeff: 0.1
130
+ v2t_coeff: 0.2
131
+ t2s_coeff: 1.0
132
+ s2t_coeff: 0.2
MMaDA/debug_speech_dataloader.py ADDED
@@ -0,0 +1,222 @@
1
+ #!/usr/bin/env python3
2
+ """Utility to reproduce and debug the speech DataLoader used in training.
3
+
4
+ This script pulls the speech dataset configuration from the Omada
5
+ instruction-tuning config, instantiates the same `MixedSpeechTextDataset`, and
6
+ iterates a configurable number of batches while measuring how long each fetch
7
+ takes. Use it to spot slow or stuck samples without launching the full training
8
+ job.
9
+
10
+ Typical usage::
11
+
12
+ python AIDAS/MMaDA/script/debug_speech_dataloader.py \
13
+ --config AIDAS/MMaDA/configs/omada_instruction_tuning.yaml \
14
+ --flow s2t --max-batches 5 --num-workers 1 --timeout 0
15
+
16
+ Pass `--inspect-items` for a direct `dataset[idx]` sweep when a specific sample
17
+ seems suspicious.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import itertools
24
+ import logging
25
+ import sys
26
+ import time
27
+ from pathlib import Path
28
+ from typing import Any, Iterable, List
29
+
30
+ from omegaconf import OmegaConf
31
+ from torch.utils.data import DataLoader
32
+
33
+ from MMaDA.training.data import MixedSpeechTextDataset
34
+
35
+
36
+ def _collate_fn_audio(batch: List[dict[str, Any]]) -> dict[str, List[Any]]:
37
+ """Match the collate function used in training for speech flows."""
38
+
39
+ return {
40
+ "audio_path": [item["audio_path"] for item in batch],
41
+ "text": [item["text"] for item in batch],
42
+ "audio_tokens": [item.get("audio_tokens") for item in batch],
43
+ }
44
+
45
+
46
+ def _as_list_of_dicts(cfg_fragment: Any) -> List[dict[str, Any]]:
47
+ container = OmegaConf.to_container(cfg_fragment, resolve=True)
48
+ if not isinstance(container, Iterable): # pragma: no cover - sanity guard
49
+ raise TypeError("audio_data config must be a list of dataset dicts")
50
+ return list(container) # type: ignore[arg-type]
51
+
52
+
53
+ def _build_dataset(cfg) -> MixedSpeechTextDataset:
54
+ dataset_cfg = cfg.dataset.params
55
+ audio_data_cfg = _as_list_of_dicts(dataset_cfg.audio_data)
56
+ return MixedSpeechTextDataset(audio_data_cfg)
57
+
58
+
59
+ def _log_batch_summary(idx: int, batch: dict[str, List[Any]], elapsed: float) -> None:
60
+ audio_paths = batch.get("audio_path", [])
61
+ sample = audio_paths[0] if audio_paths else "<empty>"
62
+ logging.info(
63
+ "batch=%d size=%d elapsed=%.2fs sample=%s",
64
+ idx,
65
+ len(audio_paths),
66
+ elapsed,
67
+ sample,
68
+ )
69
+
70
+
71
+ def _inspect_items(dataset: MixedSpeechTextDataset, max_items: int) -> None:
72
+ logging.info("Inspecting individual dataset items (max=%d)", max_items)
73
+ for idx in itertools.islice(range(len(dataset)), max_items):
74
+ tick = time.perf_counter()
75
+ try:
76
+ item = dataset[idx]
77
+ except Exception as exc: # pragma: no cover - diagnostic path
78
+ logging.error("idx=%d failed: %s", idx, exc)
79
+ continue
80
+ elapsed = time.perf_counter() - tick
81
+ logging.info(
82
+ "idx=%d elapsed=%.2fs path=%s text_len=%d tokens=%s",
83
+ idx,
84
+ elapsed,
85
+ item.get("audio_path"),
86
+ len(item.get("text", "")),
87
+ "cached" if item.get("audio_tokens") is not None else "None",
88
+ )
89
+
90
+
91
+ def parse_args(argv: List[str]) -> argparse.Namespace:
92
+ parser = argparse.ArgumentParser(description=__doc__)
93
+ parser.add_argument(
94
+ "--config",
95
+ type=Path,
96
+ default=Path("AIDAS/MMaDA/configs/omada_instruction_tuning.yaml"),
97
+ help="Path to the training config YAML",
98
+ )
99
+ parser.add_argument(
100
+ "--flow",
101
+ choices=["s2t", "t2s"],
102
+ default="s2t",
103
+ help="Which speech flow's batch size defaults to use",
104
+ )
105
+ parser.add_argument(
106
+ "--batch-size",
107
+ type=int,
108
+ default=None,
109
+ help="Override batch size (defaults to config.training.batch_size_<flow>)",
110
+ )
111
+ parser.add_argument(
112
+ "--num-workers",
113
+ type=int,
114
+ default=None,
115
+ help="Override DataLoader workers (defaults to config.dataset.params.num_workers)",
116
+ )
117
+ parser.add_argument(
118
+ "--persistent-workers",
119
+ action="store_true",
120
+ help="Enable persistent workers regardless of config",
121
+ )
122
+ parser.add_argument(
123
+ "--timeout",
124
+ type=float,
125
+ default=None,
126
+ help="DataLoader timeout in seconds (defaults to config.dataset.params.dataloader_timeout)",
127
+ )
128
+ parser.add_argument(
129
+ "--max-batches",
130
+ type=int,
131
+ default=10,
132
+ help="Number of batches to iterate (0 means run through the entire dataset)",
133
+ )
134
+ parser.add_argument(
135
+ "--inspect-items",
136
+ type=int,
137
+ default=0,
138
+ help="If >0, bypass the DataLoader and inspect this many individual dataset items first",
139
+ )
140
+ parser.add_argument(
141
+ "--prefetch-factor",
142
+ type=int,
143
+ default=None,
144
+ help="Optional override for DataLoader prefetch_factor",
145
+ )
146
+ parser.add_argument(
147
+ "--log-level",
148
+ default="INFO",
149
+ help="Logging level",
150
+ )
151
+ return parser.parse_args(argv)
152
+
153
+
154
+ def main(argv: List[str]) -> int:
155
+ args = parse_args(argv)
156
+ logging.basicConfig(
157
+ level=getattr(logging, args.log_level.upper(), logging.INFO),
158
+ format="%(asctime)s | %(levelname)s | %(message)s",
159
+ )
160
+
161
+ cfg = OmegaConf.load(args.config)
162
+ dataset = _build_dataset(cfg)
163
+
164
+ if args.inspect_items:
165
+ _inspect_items(dataset, args.inspect_items)
166
+
167
+ dataset_params = cfg.dataset.params
168
+ batch_size = args.batch_size or getattr(cfg.training, f"batch_size_{args.flow}")
169
+ num_workers = args.num_workers if args.num_workers is not None else dataset_params.num_workers
170
+ timeout = args.timeout if args.timeout is not None else dataset_params.dataloader_timeout
171
+
172
+ if num_workers == 0:
173
+ persistent_workers = False
174
+ else:
175
+ persistent_workers = args.persistent_workers or bool(dataset_params.persistent_workers)
176
+
177
+ dataloader_kwargs = {
178
+ "dataset": dataset,
179
+ "batch_size": batch_size,
180
+ "shuffle": False,
181
+ "num_workers": num_workers,
182
+ "drop_last": True,
183
+ "pin_memory": bool(dataset_params.pin_memory),
184
+ "timeout": timeout,
185
+ "persistent_workers": persistent_workers,
186
+ "collate_fn": _collate_fn_audio,
187
+ }
188
+ if args.prefetch_factor is not None and num_workers > 0:
189
+ dataloader_kwargs["prefetch_factor"] = args.prefetch_factor
190
+
191
+ logging.info(
192
+ "Starting DataLoader debug: batch_size=%d num_workers=%d timeout=%s persistent=%s",
193
+ batch_size,
194
+ num_workers,
195
+ timeout,
196
+ persistent_workers,
197
+ )
198
+
199
+ dataloader = DataLoader(**dataloader_kwargs)
200
+
201
+ max_batches = args.max_batches
202
+ iterator = iter(dataloader)
203
+
204
+ processed = 0
205
+ while True:
206
+ if max_batches and processed >= max_batches:
207
+ break
208
+ tick = time.perf_counter()
209
+ try:
210
+ batch = next(iterator)
211
+ except StopIteration:
212
+ logging.info("Reached end of DataLoader after %d batches", processed)
213
+ break
214
+ elapsed = time.perf_counter() - tick
215
+ _log_batch_summary(processed, batch, elapsed)
216
+ processed += 1
217
+
218
+ return 0
219
+
220
+
221
+ if __name__ == "__main__":
222
+ raise SystemExit(main(sys.argv[1:]))
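Besides the CLI shown in the module docstring, `main()` takes an argv list, so the same sweep can be driven from Python. The config path below is illustrative and assumes the script is importable from the working directory.

```python
# Hedged usage sketch: call the debug entry point programmatically.
from debug_speech_dataloader import main

exit_code = main([
    "--config", "MMaDA/configs/omada_instruction_tuning.yaml",  # illustrative path
    "--flow", "t2s",
    "--max-batches", "3",
    "--num-workers", "0",
    "--inspect-items", "2",
])
print("exit code:", exit_code)
```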
MMaDA/eval_ASR_TTS/test.py ADDED
@@ -0,0 +1,266 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "54c0a618-750f-4bf0-8cdb-c2dda158c433",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import argparse\n",
11
+ "import json\n",
12
+ "import os\n",
13
+ "import editdistance"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "markdown",
18
+ "id": "658bb863-f147-444e-8b14-466e1999d15f",
19
+ "metadata": {},
20
+ "source": [
21
+ "# Speech -> Text"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 2,
27
+ "id": "7e4d5e19-e526-4b33-aa03-0a4cc68abd90",
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "def calculate_WER(recognized_text_list, groundtruth_text_list):\n",
32
+ " word_num = 0.0\n",
33
+ " scores = 0.0\n",
34
+ " for recognized_text, groundtruth_text in zip(recognized_text_list, groundtruth_text_list):\n",
35
+ " if len(recognized_text) > 1000:\n",
36
+ " print(recognized_text)\n",
37
+ " continue\n",
38
+ " recognized_word_list = recognized_text.split()\n",
39
+ " groundtruth_word_list = groundtruth_text.split()\n",
40
+ " current_word_num = len(groundtruth_word_list)\n",
41
+ " word_num += current_word_num\n",
42
+ " # Compute Levenstein's distance\n",
43
+ " current_score = editdistance.eval(recognized_word_list, groundtruth_word_list)\n",
44
+ " scores += current_score\n",
45
+ " WER = scores / word_num\n",
46
+ " return WER, scores, word_num\n",
47
+ "\n",
48
+ "\n",
49
+ "def evaluate_asr(prediction_list, ground_truth_list):\n",
50
+ " wer, scores_wer, word_num_wer = calculate_WER(prediction_list, ground_truth_list)\n",
51
+ " print(f'wer: {wer}, scores_wer: {scores_wer}, word_num_wer: {word_num_wer}')"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": 3,
57
+ "id": "05f4a95c",
58
+ "metadata": {},
59
+ "outputs": [
60
+ {
61
+ "name": "stdout",
62
+ "output_type": "stream",
63
+ "text": [
64
+ "WER (demo): 0.4375 | word errors: 7.0 | total words: 16.0\n"
65
+ ]
66
+ }
67
+ ],
68
+ "source": [
69
+ "\n",
70
+ "gt_0 = \"Hello. We are AIDAS laboratory.\"\n",
71
+ "gt_1 = \"Hello. Let's build an omni model diffusion foundation model.\"\n",
72
+ "gt_2 = \"Pretty intense.\"\n",
73
+ "\n",
74
+ "pred_0 = \"hello, we are AIDAS laboratory.\"\n",
75
+ "pred_1 = \"hello let's build an omni model diffusion foundation model\"\n",
76
+ "pred_2 = \"pretty intense\"\n",
77
+ "\n",
78
+ "groundtruth_text_list = [gt_0, gt_1, gt_2]\n",
79
+ "recognized_text_list = [pred_0, pred_1, pred_2]\n",
80
+ "\n",
81
+ "wer, errors, words = calculate_WER(recognized_text_list, groundtruth_text_list)\n",
82
+ "print(f\"WER (demo): {wer:.4f} | word errors: {errors} | total words: {words}\")"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "markdown",
87
+ "id": "3635f492-2ae2-4ef4-9321-36d08aa6645e",
88
+ "metadata": {},
89
+ "source": [
90
+ "# Text -> Speech (with normalizer)"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": 4,
96
+ "id": "1ac74c9a",
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "# Environment & deps check (safe to run multiple times)\n",
101
+ "import sys, os, importlib\n",
102
+ "from pathlib import Path\n",
103
+ "\n",
104
+ "\n",
105
+ "# optional: ensure packages (comment out if you manage env separately)\n",
106
+ "try:\n",
107
+ " import editdistance # used by calculate_WER\n",
108
+ "except Exception:\n",
109
+ " print(\"Installing editdistance...\")\n",
110
+ " %pip -q install editdistance\n",
111
+ "\n",
112
+ "try:\n",
113
+ " import more_itertools # required by english.py normalizer\n",
114
+ "except Exception:\n",
115
+ " print(\"Installing more-itertools...\")\n",
116
+ " %pip -q install more-itertools\n",
117
+ "\n",
118
+ "# local modules\n",
119
+ "from whisper_asr.whisper_asr import load_whisper_model, EN_ASR_WER\n",
120
+ "from whisper_asr.normalizers.english import EnglishTextNormalizer # EMOVA-style normalizer\n"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": 5,
126
+ "id": "4ffd26a0",
127
+ "metadata": {},
128
+ "outputs": [
129
+ {
130
+ "name": "stderr",
131
+ "output_type": "stream",
132
+ "text": [
133
+ "Device set to use cuda\n",
134
+ "Using `chunk_length_s` is very experimental with seq2seq models. The results will not necessarily be entirely accurate and will have caveats. More information: https://github.com/huggingface/transformers/pull/20104. Ignore this warning with pipeline(..., ignore_warning=True). To use Whisper for long-form transcription, use rather the model's `generate` method directly as the model relies on it's own chunking mechanism (cf. Whisper original paper, section 3.8. Long-form Transcription).\n"
135
+ ]
136
+ },
137
+ {
138
+ "name": "stdout",
139
+ "output_type": "stream",
140
+ "text": [
141
+ "whisper model loaded!\n"
142
+ ]
143
+ },
144
+ {
145
+ "name": "stderr",
146
+ "output_type": "stream",
147
+ "text": [
148
+ " 0%| | 0/1 [00:00<?, ?it/s]Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.\n",
149
+ "Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.\n",
150
+ "100%|██████████| 1/1 [00:02<00:00, 2.81s/it]"
151
+ ]
152
+ },
153
+ {
154
+ "name": "stdout",
155
+ "output_type": "stream",
156
+ "text": [
157
+ "groundtruth text:Hello. We are AIDAS laboratory.\n",
158
+ "recognized text: Hello, we are IDAS Laboratory.\n",
159
+ "groundtruth text:Hello. Let's build an omni model diffusion foundation model.\n",
160
+ "recognized text: Hello! Let's build an Omnimodal Diffusion Foundation model.\n",
161
+ "groundtruth text:It's pretty intense.\n",
162
+ "recognized text: It's pretty intense.\n",
163
+ "Computation Time: 2.8128 s\n",
164
+ "groundtruth:Hello. We are AIDAS laboratory.\n",
165
+ "recognized: Hello, we are IDAS Laboratory.\n",
166
+ "groundtruth:Hello. Let's build an omni model diffusion foundation model.\n",
167
+ "recognized: Hello! Let's build an Omnimodal Diffusion Foundation model.\n",
168
+ "groundtruth:It's pretty intense.\n",
169
+ "recognized: It's pretty intense.\n",
170
+ "Word count: 17\n",
171
+ "Word error: 9\n",
172
+ "utterance num:3\n",
173
+ "WER without Whisper text normalization: 0.5294 \n",
174
+ "normalized_groundtruth:hello we are aidas laboratory\n",
175
+ "normalized_recognized:hello we are idas laboratory\n",
176
+ "normalized_groundtruth:hello let us build an omni model diffusion foundation model\n",
177
+ "normalized_recognized:hello let us build an omnimodal diffusion foundation model\n",
178
+ "normalized_groundtruth:it is pretty intense\n",
179
+ "normalized_recognized:it is pretty intense\n",
180
+ "Word count: 19\n",
181
+ "Word error: 3\n",
182
+ "utterance num:3\n",
183
+ "WER with Whisper text normalization: 0.1579 \n"
184
+ ]
185
+ },
186
+ {
187
+ "name": "stderr",
188
+ "output_type": "stream",
189
+ "text": [
190
+ "\n"
191
+ ]
192
+ }
193
+ ],
194
+ "source": [
195
+ "# TTS → ASR with normalization (EMOVA EnglishTextNormalizer)\n",
196
+ "import torch\n",
197
+ "from pathlib import Path\n",
198
+ "\n",
199
+ "# inputs\n",
200
+ "groundtruth_text_list = [\n",
201
+ " \"Hello. We are AIDAS laboratory.\",\n",
202
+ " \"Hello. Let's build an omni model diffusion foundation model.\",\n",
203
+ " \"It's pretty intense.\",\n",
204
+ "]\n",
205
+ "wav_file_list = [\n",
206
+ " \"./audio/AIDAS_team.wav\",\n",
207
+ " \"./audio/diffusion.wav\",\n",
208
+ " \"./audio/pretty_intense.wav\",\n",
209
+ "]\n",
210
+ "\n",
211
+ "# Load Whisper large-v3\n",
212
+ "model_id = \"openai/whisper-large-v3\"\n",
213
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
214
+ "pipe = load_whisper_model(model_id, device)\n",
215
+ "\n",
216
+ "# Run batch inference and print both raw and normalized WERs\n",
217
+ "EN_ASR_WER(pipe, wav_file_list, groundtruth_text_list, batch_size=3, print_verbose=True)"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": null,
223
+ "id": "dd157230-07a5-4b05-a8c1-2f7a49475cdd",
224
+ "metadata": {},
225
+ "outputs": [],
226
+ "source": []
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "id": "310f5f23-43c6-40e1-a20c-09cd1ce287ad",
232
+ "metadata": {},
233
+ "outputs": [],
234
+ "source": []
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "execution_count": null,
239
+ "id": "01b93914-2624-4c1a-b893-ed2cd3b944b7",
240
+ "metadata": {},
241
+ "outputs": [],
242
+ "source": []
243
+ }
244
+ ],
245
+ "metadata": {
246
+ "kernelspec": {
247
+ "display_name": "diff",
248
+ "language": "python",
249
+ "name": "diff"
250
+ },
251
+ "language_info": {
252
+ "codemirror_mode": {
253
+ "name": "ipython",
254
+ "version": 3
255
+ },
256
+ "file_extension": ".py",
257
+ "mimetype": "text/x-python",
258
+ "name": "python",
259
+ "nbconvert_exporter": "python",
260
+ "pygments_lexer": "ipython3",
261
+ "version": "3.11.11"
262
+ }
263
+ },
264
+ "nbformat": 4,
265
+ "nbformat_minor": 5
266
+ }
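
For reference, the notebook above computes WER as the summed word-level edit distance divided by the total number of reference words. A standalone sketch of the same computation, assuming only that the editdistance package is installed (the example strings are illustrative):

import editdistance


def word_error_rate(hypotheses, references):
    # Summed word-level edit distance divided by the total reference word count.
    errors, words = 0, 0
    for hyp, ref in zip(hypotheses, references):
        hyp_words, ref_words = hyp.split(), ref.split()
        errors += editdistance.eval(hyp_words, ref_words)
        words += len(ref_words)
    return errors / max(words, 1)


references = ["Hello. We are AIDAS laboratory.", "It's pretty intense."]
hypotheses = ["hello we are AIDAS laboratory", "it's pretty intense"]
# Case and attached punctuation count as errors here, which is why the notebook
# also reports WER after Whisper's EnglishTextNormalizer.
print(f"WER: {word_error_rate(hypotheses, references):.4f}")
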
MMaDA/eval_ASR_TTS/whisper_asr/normalizers/__init__.py ADDED
@@ -0,0 +1,2 @@
 
1
+ from .basic import BasicTextNormalizer as BasicTextNormalizer
2
+ from .english import EnglishTextNormalizer as EnglishTextNormalizer
MMaDA/eval_ASR_TTS/whisper_asr/normalizers/basic.py ADDED
@@ -0,0 +1,76 @@
 
1
+ import re
2
+ import unicodedata
3
+
4
+ import regex
5
+
6
+ # non-ASCII letters that are not separated by "NFKD" normalization
7
+ ADDITIONAL_DIACRITICS = {
8
+ "œ": "oe",
9
+ "Œ": "OE",
10
+ "ø": "o",
11
+ "Ø": "O",
12
+ "æ": "ae",
13
+ "Æ": "AE",
14
+ "ß": "ss",
15
+ "ẞ": "SS",
16
+ "đ": "d",
17
+ "Đ": "D",
18
+ "ð": "d",
19
+ "Ð": "D",
20
+ "þ": "th",
21
+ "Þ": "th",
22
+ "ł": "l",
23
+ "Ł": "L",
24
+ }
25
+
26
+
27
+ def remove_symbols_and_diacritics(s: str, keep=""):
28
+ """
29
+ Replace any other markers, symbols, and punctuations with a space,
30
+ and drop any diacritics (category 'Mn' and some manual mappings)
31
+ """
32
+ return "".join(
33
+ c
34
+ if c in keep
35
+ else ADDITIONAL_DIACRITICS[c]
36
+ if c in ADDITIONAL_DIACRITICS
37
+ else ""
38
+ if unicodedata.category(c) == "Mn"
39
+ else " "
40
+ if unicodedata.category(c)[0] in "MSP"
41
+ else c
42
+ for c in unicodedata.normalize("NFKD", s)
43
+ )
44
+
45
+
46
+ def remove_symbols(s: str):
47
+ """
48
+ Replace any other markers, symbols, punctuations with a space, keeping diacritics
49
+ """
50
+ return "".join(
51
+ " " if unicodedata.category(c)[0] in "MSP" else c
52
+ for c in unicodedata.normalize("NFKC", s)
53
+ )
54
+
55
+
56
+ class BasicTextNormalizer:
57
+ def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
58
+ self.clean = (
59
+ remove_symbols_and_diacritics if remove_diacritics else remove_symbols
60
+ )
61
+ self.split_letters = split_letters
62
+
63
+ def __call__(self, s: str):
64
+ s = s.lower()
65
+ s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets
66
+ s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis
67
+ s = self.clean(s).lower()
68
+
69
+ if self.split_letters:
70
+ s = " ".join(regex.findall(r"\X", s, regex.U))
71
+
72
+ s = re.sub(
73
+ r"\s+", " ", s
74
+ ) # replace any successive whitespace characters with a space
75
+
76
+ return s
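
A usage sketch for BasicTextNormalizer (illustrative, not from the repo): it assumes the regex package is installed and that the interpreter is started from MMaDA/eval_ASR_TTS/ so the whisper_asr package resolves; the sample string is made up.

from whisper_asr.normalizers.basic import BasicTextNormalizer

normalizer = BasicTextNormalizer()
raw = "Hello, [noise] we are the AIDAS (research) lab!"
# Bracketed and parenthesized spans are dropped, punctuation becomes whitespace,
# and the text is lowercased; the result is roughly "hello we are the aidas lab".
print(normalizer(raw))
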
MMaDA/eval_ASR_TTS/whisper_asr/normalizers/english.json ADDED
@@ -0,0 +1,1741 @@
 
1
+ {
2
+ "accessorise": "accessorize",
3
+ "accessorised": "accessorized",
4
+ "accessorises": "accessorizes",
5
+ "accessorising": "accessorizing",
6
+ "acclimatisation": "acclimatization",
7
+ "acclimatise": "acclimatize",
8
+ "acclimatised": "acclimatized",
9
+ "acclimatises": "acclimatizes",
10
+ "acclimatising": "acclimatizing",
11
+ "accoutrements": "accouterments",
12
+ "aeon": "eon",
13
+ "aeons": "eons",
14
+ "aerogramme": "aerogram",
15
+ "aerogrammes": "aerograms",
16
+ "aeroplane": "airplane",
17
+ "aeroplanes": "airplanes",
18
+ "aesthete": "esthete",
19
+ "aesthetes": "esthetes",
20
+ "aesthetic": "esthetic",
21
+ "aesthetically": "esthetically",
22
+ "aesthetics": "esthetics",
23
+ "aetiology": "etiology",
24
+ "ageing": "aging",
25
+ "aggrandisement": "aggrandizement",
26
+ "agonise": "agonize",
27
+ "agonised": "agonized",
28
+ "agonises": "agonizes",
29
+ "agonising": "agonizing",
30
+ "agonisingly": "agonizingly",
31
+ "almanack": "almanac",
32
+ "almanacks": "almanacs",
33
+ "aluminium": "aluminum",
34
+ "amortisable": "amortizable",
35
+ "amortisation": "amortization",
36
+ "amortisations": "amortizations",
37
+ "amortise": "amortize",
38
+ "amortised": "amortized",
39
+ "amortises": "amortizes",
40
+ "amortising": "amortizing",
41
+ "amphitheatre": "amphitheater",
42
+ "amphitheatres": "amphitheaters",
43
+ "anaemia": "anemia",
44
+ "anaemic": "anemic",
45
+ "anaesthesia": "anesthesia",
46
+ "anaesthetic": "anesthetic",
47
+ "anaesthetics": "anesthetics",
48
+ "anaesthetise": "anesthetize",
49
+ "anaesthetised": "anesthetized",
50
+ "anaesthetises": "anesthetizes",
51
+ "anaesthetising": "anesthetizing",
52
+ "anaesthetist": "anesthetist",
53
+ "anaesthetists": "anesthetists",
54
+ "anaesthetize": "anesthetize",
55
+ "anaesthetized": "anesthetized",
56
+ "anaesthetizes": "anesthetizes",
57
+ "anaesthetizing": "anesthetizing",
58
+ "analogue": "analog",
59
+ "analogues": "analogs",
60
+ "analyse": "analyze",
61
+ "analysed": "analyzed",
62
+ "analyses": "analyzes",
63
+ "analysing": "analyzing",
64
+ "anglicise": "anglicize",
65
+ "anglicised": "anglicized",
66
+ "anglicises": "anglicizes",
67
+ "anglicising": "anglicizing",
68
+ "annualised": "annualized",
69
+ "antagonise": "antagonize",
70
+ "antagonised": "antagonized",
71
+ "antagonises": "antagonizes",
72
+ "antagonising": "antagonizing",
73
+ "apologise": "apologize",
74
+ "apologised": "apologized",
75
+ "apologises": "apologizes",
76
+ "apologising": "apologizing",
77
+ "appal": "appall",
78
+ "appals": "appalls",
79
+ "appetiser": "appetizer",
80
+ "appetisers": "appetizers",
81
+ "appetising": "appetizing",
82
+ "appetisingly": "appetizingly",
83
+ "arbour": "arbor",
84
+ "arbours": "arbors",
85
+ "archeological": "archaeological",
86
+ "archaeologically": "archeologically",
87
+ "archaeologist": "archeologist",
88
+ "archaeologists": "archeologists",
89
+ "archaeology": "archeology</span>",
90
+ "ardour": "ardor",
91
+ "armour": "armor",
92
+ "armoured": "armored",
93
+ "armourer": "armorer",
94
+ "armourers": "armorers",
95
+ "armouries": "armories",
96
+ "armoury": "armory",
97
+ "artefact": "artifact",
98
+ "artefacts": "artifacts",
99
+ "authorise": "authorize",
100
+ "authorised": "authorized",
101
+ "authorises": "authorizes",
102
+ "authorising": "authorizing",
103
+ "axe": "ax",
104
+ "backpedalled": "backpedaled",
105
+ "backpedalling": "backpedaling",
106
+ "bannister": "banister",
107
+ "bannisters": "banisters",
108
+ "baptise": "baptize",
109
+ "baptised": "baptized",
110
+ "baptises": "baptizes",
111
+ "baptising": "baptizing",
112
+ "bastardise": "bastardize",
113
+ "bastardised": "bastardized",
114
+ "bastardises": "bastardizes",
115
+ "bastardising": "bastardizing",
116
+ "battleax": "battleaxe",
117
+ "baulk": "balk",
118
+ "baulked": "balked",
119
+ "baulking": "balking",
120
+ "baulks": "balks",
121
+ "bedevilled": "bedeviled",
122
+ "bedevilling": "bedeviling",
123
+ "behaviour": "behavior",
124
+ "behavioural": "behavioral",
125
+ "behaviourism": "behaviorism",
126
+ "behaviourist": "behaviorist",
127
+ "behaviourists": "behaviorists",
128
+ "behaviours": "behaviors",
129
+ "behove": "behoove",
130
+ "behoved": "behooved",
131
+ "behoves": "behooves",
132
+ "bejewelled": "bejeweled",
133
+ "belabour": "belabor",
134
+ "belaboured": "belabored",
135
+ "belabouring": "belaboring",
136
+ "belabours": "belabors",
137
+ "bevelled": "beveled",
138
+ "bevvies": "bevies",
139
+ "bevvy": "bevy",
140
+ "biassed": "biased",
141
+ "biassing": "biasing",
142
+ "bingeing": "binging",
143
+ "bougainvillaea": "bougainvillea",
144
+ "bougainvillaeas": "bougainvilleas",
145
+ "bowdlerise": "bowdlerize",
146
+ "bowdlerised": "bowdlerized",
147
+ "bowdlerises": "bowdlerizes",
148
+ "bowdlerising": "bowdlerizing",
149
+ "breathalyse": "breathalyze",
150
+ "breathalysed": "breathalyzed",
151
+ "breathalyser": "breathalyzer",
152
+ "breathalysers": "breathalyzers",
153
+ "breathalyses": "breathalyzes",
154
+ "breathalysing": "breathalyzing",
155
+ "brutalise": "brutalize",
156
+ "brutalised": "brutalized",
157
+ "brutalises": "brutalizes",
158
+ "brutalising": "brutalizing",
159
+ "busses": "buses",
160
+ "bussing": "busing",
161
+ "caesarean": "cesarean",
162
+ "caesareans": "cesareans",
163
+ "calibre": "caliber",
164
+ "calibres": "calibers",
165
+ "calliper": "caliper",
166
+ "callipers": "calipers",
167
+ "callisthenics": "calisthenics",
168
+ "canalise": "canalize",
169
+ "canalised": "canalized",
170
+ "canalises": "canalizes",
171
+ "canalising": "canalizing",
172
+ "cancelation": "cancellation",
173
+ "cancelations": "cancellations",
174
+ "cancelled": "canceled",
175
+ "cancelling": "canceling",
176
+ "candour": "candor",
177
+ "cannibalise": "cannibalize",
178
+ "cannibalised": "cannibalized",
179
+ "cannibalises": "cannibalizes",
180
+ "cannibalising": "cannibalizing",
181
+ "canonise": "canonize",
182
+ "canonised": "canonized",
183
+ "canonises": "canonizes",
184
+ "canonising": "canonizing",
185
+ "capitalise": "capitalize",
186
+ "capitalised": "capitalized",
187
+ "capitalises": "capitalizes",
188
+ "capitalising": "capitalizing",
189
+ "caramelise": "caramelize",
190
+ "caramelised": "caramelized",
191
+ "caramelises": "caramelizes",
192
+ "caramelising": "caramelizing",
193
+ "carbonise": "carbonize",
194
+ "carbonised": "carbonized",
195
+ "carbonises": "carbonizes",
196
+ "carbonising": "carbonizing",
197
+ "carolled": "caroled",
198
+ "carolling": "caroling",
199
+ "catalogue": "catalog",
200
+ "catalogued": "cataloged",
201
+ "catalogues": "catalogs",
202
+ "cataloguing": "cataloging",
203
+ "catalyse": "catalyze",
204
+ "catalysed": "catalyzed",
205
+ "catalyses": "catalyzes",
206
+ "catalysing": "catalyzing",
207
+ "categorise": "categorize",
208
+ "categorised": "categorized",
209
+ "categorises": "categorizes",
210
+ "categorising": "categorizing",
211
+ "cauterise": "cauterize",
212
+ "cauterised": "cauterized",
213
+ "cauterises": "cauterizes",
214
+ "cauterising": "cauterizing",
215
+ "cavilled": "caviled",
216
+ "cavilling": "caviling",
217
+ "centigramme": "centigram",
218
+ "centigrammes": "centigrams",
219
+ "centilitre": "centiliter",
220
+ "centilitres": "centiliters",
221
+ "centimetre": "centimeter",
222
+ "centimetres": "centimeters",
223
+ "centralise": "centralize",
224
+ "centralised": "centralized",
225
+ "centralises": "centralizes",
226
+ "centralising": "centralizing",
227
+ "centre": "center",
228
+ "centred": "centered",
229
+ "centrefold": "centerfold",
230
+ "centrefolds": "centerfolds",
231
+ "centrepiece": "centerpiece",
232
+ "centrepieces": "centerpieces",
233
+ "centres": "centers",
234
+ "channelled": "channeled",
235
+ "channelling": "channeling",
236
+ "characterise": "characterize",
237
+ "characterised": "characterized",
238
+ "characterises": "characterizes",
239
+ "characterising": "characterizing",
240
+ "cheque": "check",
241
+ "chequebook": "checkbook",
242
+ "chequebooks": "checkbooks",
243
+ "chequered": "checkered",
244
+ "cheques": "checks",
245
+ "chilli": "chili",
246
+ "chimaera": "chimera",
247
+ "chimaeras": "chimeras",
248
+ "chiselled": "chiseled",
249
+ "chiselling": "chiseling",
250
+ "circularise": "circularize",
251
+ "circularised": "circularized",
252
+ "circularises": "circularizes",
253
+ "circularising": "circularizing",
254
+ "civilise": "civilize",
255
+ "civilised": "civilized",
256
+ "civilises": "civilizes",
257
+ "civilising": "civilizing",
258
+ "clamour": "clamor",
259
+ "clamoured": "clamored",
260
+ "clamouring": "clamoring",
261
+ "clamours": "clamors",
262
+ "clangour": "clangor",
263
+ "clarinettist": "clarinetist",
264
+ "clarinettists": "clarinetists",
265
+ "collectivise": "collectivize",
266
+ "collectivised": "collectivized",
267
+ "collectivises": "collectivizes",
268
+ "collectivising": "collectivizing",
269
+ "colonisation": "colonization",
270
+ "colonise": "colonize",
271
+ "colonised": "colonized",
272
+ "coloniser": "colonizer",
273
+ "colonisers": "colonizers",
274
+ "colonises": "colonizes",
275
+ "colonising": "colonizing",
276
+ "colour": "color",
277
+ "colourant": "colorant",
278
+ "colourants": "colorants",
279
+ "coloured": "colored",
280
+ "coloureds": "coloreds",
281
+ "colourful": "colorful",
282
+ "colourfully": "colorfully",
283
+ "colouring": "coloring",
284
+ "colourize": "colorize",
285
+ "colourized": "colorized",
286
+ "colourizes": "colorizes",
287
+ "colourizing": "colorizing",
288
+ "colourless": "colorless",
289
+ "colours": "colors",
290
+ "commercialise": "commercialize",
291
+ "commercialised": "commercialized",
292
+ "commercialises": "commercializes",
293
+ "commercialising": "commercializing",
294
+ "compartmentalise": "compartmentalize",
295
+ "compartmentalised": "compartmentalized",
296
+ "compartmentalises": "compartmentalizes",
297
+ "compartmentalising": "compartmentalizing",
298
+ "computerise": "computerize",
299
+ "computerised": "computerized",
300
+ "computerises": "computerizes",
301
+ "computerising": "computerizing",
302
+ "conceptualise": "conceptualize",
303
+ "conceptualised": "conceptualized",
304
+ "conceptualises": "conceptualizes",
305
+ "conceptualising": "conceptualizing",
306
+ "connexion": "connection",
307
+ "connexions": "connections",
308
+ "contextualise": "contextualize",
309
+ "contextualised": "contextualized",
310
+ "contextualises": "contextualizes",
311
+ "contextualising": "contextualizing",
312
+ "cosier": "cozier",
313
+ "cosies": "cozies",
314
+ "cosiest": "coziest",
315
+ "cosily": "cozily",
316
+ "cosiness": "coziness",
317
+ "cosy": "cozy",
318
+ "councillor": "councilor",
319
+ "councillors": "councilors",
320
+ "counselled": "counseled",
321
+ "counselling": "counseling",
322
+ "counsellor": "counselor",
323
+ "counsellors": "counselors",
324
+ "crenelated": "crenellated",
325
+ "criminalise": "criminalize",
326
+ "criminalised": "criminalized",
327
+ "criminalises": "criminalizes",
328
+ "criminalising": "criminalizing",
329
+ "criticise": "criticize",
330
+ "criticised": "criticized",
331
+ "criticises": "criticizes",
332
+ "criticising": "criticizing",
333
+ "crueller": "crueler",
334
+ "cruellest": "cruelest",
335
+ "crystallisation": "crystallization",
336
+ "crystallise": "crystallize",
337
+ "crystallised": "crystallized",
338
+ "crystallises": "crystallizes",
339
+ "crystallising": "crystallizing",
340
+ "cudgelled": "cudgeled",
341
+ "cudgelling": "cudgeling",
342
+ "customise": "customize",
343
+ "customised": "customized",
344
+ "customises": "customizes",
345
+ "customising": "customizing",
346
+ "cypher": "cipher",
347
+ "cyphers": "ciphers",
348
+ "decentralisation": "decentralization",
349
+ "decentralise": "decentralize",
350
+ "decentralised": "decentralized",
351
+ "decentralises": "decentralizes",
352
+ "decentralising": "decentralizing",
353
+ "decriminalisation": "decriminalization",
354
+ "decriminalise": "decriminalize",
355
+ "decriminalised": "decriminalized",
356
+ "decriminalises": "decriminalizes",
357
+ "decriminalising": "decriminalizing",
358
+ "defence": "defense",
359
+ "defenceless": "defenseless",
360
+ "defences": "defenses",
361
+ "dehumanisation": "dehumanization",
362
+ "dehumanise": "dehumanize",
363
+ "dehumanised": "dehumanized",
364
+ "dehumanises": "dehumanizes",
365
+ "dehumanising": "dehumanizing",
366
+ "demeanour": "demeanor",
367
+ "demilitarisation": "demilitarization",
368
+ "demilitarise": "demilitarize",
369
+ "demilitarised": "demilitarized",
370
+ "demilitarises": "demilitarizes",
371
+ "demilitarising": "demilitarizing",
372
+ "demobilisation": "demobilization",
373
+ "demobilise": "demobilize",
374
+ "demobilised": "demobilized",
375
+ "demobilises": "demobilizes",
376
+ "demobilising": "demobilizing",
377
+ "democratisation": "democratization",
378
+ "democratise": "democratize",
379
+ "democratised": "democratized",
380
+ "democratises": "democratizes",
381
+ "democratising": "democratizing",
382
+ "demonise": "demonize",
383
+ "demonised": "demonized",
384
+ "demonises": "demonizes",
385
+ "demonising": "demonizing",
386
+ "demoralisation": "demoralization",
387
+ "demoralise": "demoralize",
388
+ "demoralised": "demoralized",
389
+ "demoralises": "demoralizes",
390
+ "demoralising": "demoralizing",
391
+ "denationalisation": "denationalization",
392
+ "denationalise": "denationalize",
393
+ "denationalised": "denationalized",
394
+ "denationalises": "denationalizes",
395
+ "denationalising": "denationalizing",
396
+ "deodorise": "deodorize",
397
+ "deodorised": "deodorized",
398
+ "deodorises": "deodorizes",
399
+ "deodorising": "deodorizing",
400
+ "depersonalise": "depersonalize",
401
+ "depersonalised": "depersonalized",
402
+ "depersonalises": "depersonalizes",
403
+ "depersonalising": "depersonalizing",
404
+ "deputise": "deputize",
405
+ "deputised": "deputized",
406
+ "deputises": "deputizes",
407
+ "deputising": "deputizing",
408
+ "desensitisation": "desensitization",
409
+ "desensitise": "desensitize",
410
+ "desensitised": "desensitized",
411
+ "desensitises": "desensitizes",
412
+ "desensitising": "desensitizing",
413
+ "destabilisation": "destabilization",
414
+ "destabilise": "destabilize",
415
+ "destabilised": "destabilized",
416
+ "destabilises": "destabilizes",
417
+ "destabilising": "destabilizing",
418
+ "dialled": "dialed",
419
+ "dialling": "dialing",
420
+ "dialogue": "dialog",
421
+ "dialogues": "dialogs",
422
+ "diarrhoea": "diarrhea",
423
+ "digitise": "digitize",
424
+ "digitised": "digitized",
425
+ "digitises": "digitizes",
426
+ "digitising": "digitizing",
427
+ "disc": "disk",
428
+ "discolour": "discolor",
429
+ "discoloured": "discolored",
430
+ "discolouring": "discoloring",
431
+ "discolours": "discolors",
432
+ "discs": "disks",
433
+ "disembowelled": "disemboweled",
434
+ "disembowelling": "disemboweling",
435
+ "disfavour": "disfavor",
436
+ "dishevelled": "disheveled",
437
+ "dishonour": "dishonor",
438
+ "dishonourable": "dishonorable",
439
+ "dishonourably": "dishonorably",
440
+ "dishonoured": "dishonored",
441
+ "dishonouring": "dishonoring",
442
+ "dishonours": "dishonors",
443
+ "disorganisation": "disorganization",
444
+ "disorganised": "disorganized",
445
+ "distil": "distill",
446
+ "distils": "distills",
447
+ "dramatisation": "dramatization",
448
+ "dramatisations": "dramatizations",
449
+ "dramatise": "dramatize",
450
+ "dramatised": "dramatized",
451
+ "dramatises": "dramatizes",
452
+ "dramatising": "dramatizing",
453
+ "draught": "draft",
454
+ "draughtboard": "draftboard",
455
+ "draughtboards": "draftboards",
456
+ "draughtier": "draftier",
457
+ "draughtiest": "draftiest",
458
+ "draughts": "drafts",
459
+ "draughtsman": "draftsman",
460
+ "draughtsmanship": "draftsmanship",
461
+ "draughtsmen": "draftsmen",
462
+ "draughtswoman": "draftswoman",
463
+ "draughtswomen": "draftswomen",
464
+ "draughty": "drafty",
465
+ "drivelled": "driveled",
466
+ "drivelling": "driveling",
467
+ "duelled": "dueled",
468
+ "duelling": "dueling",
469
+ "economise": "economize",
470
+ "economised": "economized",
471
+ "economises": "economizes",
472
+ "economising": "economizing",
473
+ "edoema": "edema",
474
+ "editorialise": "editorialize",
475
+ "editorialised": "editorialized",
476
+ "editorialises": "editorializes",
477
+ "editorialising": "editorializing",
478
+ "empathise": "empathize",
479
+ "empathised": "empathized",
480
+ "empathises": "empathizes",
481
+ "empathising": "empathizing",
482
+ "emphasise": "emphasize",
483
+ "emphasised": "emphasized",
484
+ "emphasises": "emphasizes",
485
+ "emphasising": "emphasizing",
486
+ "enamelled": "enameled",
487
+ "enamelling": "enameling",
488
+ "enamoured": "enamored",
489
+ "encyclopaedia": "encyclopedia",
490
+ "encyclopaedias": "encyclopedias",
491
+ "encyclopaedic": "encyclopedic",
492
+ "endeavour": "endeavor",
493
+ "endeavoured": "endeavored",
494
+ "endeavouring": "endeavoring",
495
+ "endeavours": "endeavors",
496
+ "energise": "energize",
497
+ "energised": "energized",
498
+ "energises": "energizes",
499
+ "energising": "energizing",
500
+ "enrol": "enroll",
501
+ "enrols": "enrolls",
502
+ "enthral": "enthrall",
503
+ "enthrals": "enthralls",
504
+ "epaulette": "epaulet",
505
+ "epaulettes": "epaulets",
506
+ "epicentre": "epicenter",
507
+ "epicentres": "epicenters",
508
+ "epilogue": "epilog",
509
+ "epilogues": "epilogs",
510
+ "epitomise": "epitomize",
511
+ "epitomised": "epitomized",
512
+ "epitomises": "epitomizes",
513
+ "epitomising": "epitomizing",
514
+ "equalisation": "equalization",
515
+ "equalise": "equalize",
516
+ "equalised": "equalized",
517
+ "equaliser": "equalizer",
518
+ "equalisers": "equalizers",
519
+ "equalises": "equalizes",
520
+ "equalising": "equalizing",
521
+ "eulogise": "eulogize",
522
+ "eulogised": "eulogized",
523
+ "eulogises": "eulogizes",
524
+ "eulogising": "eulogizing",
525
+ "evangelise": "evangelize",
526
+ "evangelised": "evangelized",
527
+ "evangelises": "evangelizes",
528
+ "evangelising": "evangelizing",
529
+ "exorcise": "exorcize",
530
+ "exorcised": "exorcized",
531
+ "exorcises": "exorcizes",
532
+ "exorcising": "exorcizing",
533
+ "extemporisation": "extemporization",
534
+ "extemporise": "extemporize",
535
+ "extemporised": "extemporized",
536
+ "extemporises": "extemporizes",
537
+ "extemporising": "extemporizing",
538
+ "externalisation": "externalization",
539
+ "externalisations": "externalizations",
540
+ "externalise": "externalize",
541
+ "externalised": "externalized",
542
+ "externalises": "externalizes",
543
+ "externalising": "externalizing",
544
+ "factorise": "factorize",
545
+ "factorised": "factorized",
546
+ "factorises": "factorizes",
547
+ "factorising": "factorizing",
548
+ "faecal": "fecal",
549
+ "faeces": "feces",
550
+ "familiarisation": "familiarization",
551
+ "familiarise": "familiarize",
552
+ "familiarised": "familiarized",
553
+ "familiarises": "familiarizes",
554
+ "familiarising": "familiarizing",
555
+ "fantasise": "fantasize",
556
+ "fantasised": "fantasized",
557
+ "fantasises": "fantasizes",
558
+ "fantasising": "fantasizing",
559
+ "favour": "favor",
560
+ "favourable": "favorable",
561
+ "favourably": "favorably",
562
+ "favoured": "favored",
563
+ "favouring": "favoring",
564
+ "favourite": "favorite",
565
+ "favourites": "favorites",
566
+ "favouritism": "favoritism",
567
+ "favours": "favors",
568
+ "feminise": "feminize",
569
+ "feminised": "feminized",
570
+ "feminises": "feminizes",
571
+ "feminising": "feminizing",
572
+ "fertilisation": "fertilization",
573
+ "fertilise": "fertilize",
574
+ "fertilised": "fertilized",
575
+ "fertiliser": "fertilizer",
576
+ "fertilisers": "fertilizers",
577
+ "fertilises": "fertilizes",
578
+ "fertilising": "fertilizing",
579
+ "fervour": "fervor",
580
+ "fibre": "fiber",
581
+ "fibreglass": "fiberglass",
582
+ "fibres": "fibers",
583
+ "fictionalisation": "fictionalization",
584
+ "fictionalisations": "fictionalizations",
585
+ "fictionalise": "fictionalize",
586
+ "fictionalised": "fictionalized",
587
+ "fictionalises": "fictionalizes",
588
+ "fictionalising": "fictionalizing",
589
+ "fillet": "filet",
590
+ "filleted": "fileted",
591
+ "filleting": "fileting",
592
+ "fillets": "filets",
593
+ "finalisation": "finalization",
594
+ "finalise": "finalize",
595
+ "finalised": "finalized",
596
+ "finalises": "finalizes",
597
+ "finalising": "finalizing",
598
+ "flautist": "flutist",
599
+ "flautists": "flutists",
600
+ "flavour": "flavor",
601
+ "flavoured": "flavored",
602
+ "flavouring": "flavoring",
603
+ "flavourings": "flavorings",
604
+ "flavourless": "flavorless",
605
+ "flavours": "flavors",
606
+ "flavoursome": "flavorsome",
607
+ "flyer / flier": "flier / flyer",
608
+ "foetal": "fetal",
609
+ "foetid": "fetid",
610
+ "foetus": "fetus",
611
+ "foetuses": "fetuses",
612
+ "formalisation": "formalization",
613
+ "formalise": "formalize",
614
+ "formalised": "formalized",
615
+ "formalises": "formalizes",
616
+ "formalising": "formalizing",
617
+ "fossilisation": "fossilization",
618
+ "fossilise": "fossilize",
619
+ "fossilised": "fossilized",
620
+ "fossilises": "fossilizes",
621
+ "fossilising": "fossilizing",
622
+ "fraternisation": "fraternization",
623
+ "fraternise": "fraternize",
624
+ "fraternised": "fraternized",
625
+ "fraternises": "fraternizes",
626
+ "fraternising": "fraternizing",
627
+ "fulfil": "fulfill",
628
+ "fulfilment": "fulfillment",
629
+ "fulfils": "fulfills",
630
+ "funnelled": "funneled",
631
+ "funnelling": "funneling",
632
+ "galvanise": "galvanize",
633
+ "galvanised": "galvanized",
634
+ "galvanises": "galvanizes",
635
+ "galvanising": "galvanizing",
636
+ "gambolled": "gamboled",
637
+ "gambolling": "gamboling",
638
+ "gaol": "jail",
639
+ "gaolbird": "jailbird",
640
+ "gaolbirds": "jailbirds",
641
+ "gaolbreak": "jailbreak",
642
+ "gaolbreaks": "jailbreaks",
643
+ "gaoled": "jailed",
644
+ "gaoler": "jailer",
645
+ "gaolers": "jailers",
646
+ "gaoling": "jailing",
647
+ "gaols": "jails",
648
+ "gasses": "gases",
649
+ "gage": "gauge",
650
+ "gaged": "gauged",
651
+ "gages": "gauges",
652
+ "gaging": "gauging",
653
+ "generalisation": "generalization",
654
+ "generalisations": "generalizations",
655
+ "generalise": "generalize",
656
+ "generalised": "generalized",
657
+ "generalises": "generalizes",
658
+ "generalising": "generalizing",
659
+ "ghettoise": "ghettoize",
660
+ "ghettoised": "ghettoized",
661
+ "ghettoises": "ghettoizes",
662
+ "ghettoising": "ghettoizing",
663
+ "gipsies": "gypsies",
664
+ "glamorise": "glamorize",
665
+ "glamorised": "glamorized",
666
+ "glamorises": "glamorizes",
667
+ "glamorising": "glamorizing",
668
+ "glamor": "glamour",
669
+ "globalisation": "globalization",
670
+ "globalise": "globalize",
671
+ "globalised": "globalized",
672
+ "globalises": "globalizes",
673
+ "globalising": "globalizing",
674
+ "glueing": "gluing",
675
+ "goitre": "goiter",
676
+ "goitres": "goiters",
677
+ "gonorrhoea": "gonorrhea",
678
+ "gramme": "gram",
679
+ "grammes": "grams",
680
+ "gravelled": "graveled",
681
+ "grey": "gray",
682
+ "greyed": "grayed",
683
+ "greying": "graying",
684
+ "greyish": "grayish",
685
+ "greyness": "grayness",
686
+ "greys": "grays",
687
+ "grovelled": "groveled",
688
+ "grovelling": "groveling",
689
+ "groyne": "groin",
690
+ "groynes": "groins",
691
+ "gruelling": "grueling",
692
+ "gruellingly": "gruelingly",
693
+ "gryphon": "griffin",
694
+ "gryphons": "griffins",
695
+ "gynaecological": "gynecological",
696
+ "gynaecologist": "gynecologist",
697
+ "gynaecologists": "gynecologists",
698
+ "gynaecology": "gynecology",
699
+ "haematological": "hematological",
700
+ "haematologist": "hematologist",
701
+ "haematologists": "hematologists",
702
+ "haematology": "hematology",
703
+ "haemoglobin": "hemoglobin",
704
+ "haemophilia": "hemophilia",
705
+ "haemophiliac": "hemophiliac",
706
+ "haemophiliacs": "hemophiliacs",
707
+ "haemorrhage": "hemorrhage",
708
+ "haemorrhaged": "hemorrhaged",
709
+ "haemorrhages": "hemorrhages",
710
+ "haemorrhaging": "hemorrhaging",
711
+ "haemorrhoids": "hemorrhoids",
712
+ "harbour": "harbor",
713
+ "harboured": "harbored",
714
+ "harbouring": "harboring",
715
+ "harbours": "harbors",
716
+ "harmonisation": "harmonization",
717
+ "harmonise": "harmonize",
718
+ "harmonised": "harmonized",
719
+ "harmonises": "harmonizes",
720
+ "harmonising": "harmonizing",
721
+ "homoeopath": "homeopath",
722
+ "homoeopathic": "homeopathic",
723
+ "homoeopaths": "homeopaths",
724
+ "homoeopathy": "homeopathy",
725
+ "homogenise": "homogenize",
726
+ "homogenised": "homogenized",
727
+ "homogenises": "homogenizes",
728
+ "homogenising": "homogenizing",
729
+ "honour": "honor",
730
+ "honourable": "honorable",
731
+ "honourably": "honorably",
732
+ "honoured": "honored",
733
+ "honouring": "honoring",
734
+ "honours": "honors",
735
+ "hospitalisation": "hospitalization",
736
+ "hospitalise": "hospitalize",
737
+ "hospitalised": "hospitalized",
738
+ "hospitalises": "hospitalizes",
739
+ "hospitalising": "hospitalizing",
740
+ "humanise": "humanize",
741
+ "humanised": "humanized",
742
+ "humanises": "humanizes",
743
+ "humanising": "humanizing",
744
+ "humour": "humor",
745
+ "humoured": "humored",
746
+ "humouring": "humoring",
747
+ "humourless": "humorless",
748
+ "humours": "humors",
749
+ "hybridise": "hybridize",
750
+ "hybridised": "hybridized",
751
+ "hybridises": "hybridizes",
752
+ "hybridising": "hybridizing",
753
+ "hypnotise": "hypnotize",
754
+ "hypnotised": "hypnotized",
755
+ "hypnotises": "hypnotizes",
756
+ "hypnotising": "hypnotizing",
757
+ "hypothesise": "hypothesize",
758
+ "hypothesised": "hypothesized",
759
+ "hypothesises": "hypothesizes",
760
+ "hypothesising": "hypothesizing",
761
+ "idealisation": "idealization",
762
+ "idealise": "idealize",
763
+ "idealised": "idealized",
764
+ "idealises": "idealizes",
765
+ "idealising": "idealizing",
766
+ "idolise": "idolize",
767
+ "idolised": "idolized",
768
+ "idolises": "idolizes",
769
+ "idolising": "idolizing",
770
+ "immobilisation": "immobilization",
771
+ "immobilise": "immobilize",
772
+ "immobilised": "immobilized",
773
+ "immobiliser": "immobilizer",
774
+ "immobilisers": "immobilizers",
775
+ "immobilises": "immobilizes",
776
+ "immobilising": "immobilizing",
777
+ "immortalise": "immortalize",
778
+ "immortalised": "immortalized",
779
+ "immortalises": "immortalizes",
780
+ "immortalising": "immortalizing",
781
+ "immunisation": "immunization",
782
+ "immunise": "immunize",
783
+ "immunised": "immunized",
784
+ "immunises": "immunizes",
785
+ "immunising": "immunizing",
786
+ "impanelled": "impaneled",
787
+ "impanelling": "impaneling",
788
+ "imperilled": "imperiled",
789
+ "imperilling": "imperiling",
790
+ "individualise": "individualize",
791
+ "individualised": "individualized",
792
+ "individualises": "individualizes",
793
+ "individualising": "individualizing",
794
+ "industrialise": "industrialize",
795
+ "industrialised": "industrialized",
796
+ "industrialises": "industrializes",
797
+ "industrialising": "industrializing",
798
+ "inflexion": "inflection",
799
+ "inflexions": "inflections",
800
+ "initialise": "initialize",
801
+ "initialised": "initialized",
802
+ "initialises": "initializes",
803
+ "initialising": "initializing",
804
+ "initialled": "initialed",
805
+ "initialling": "initialing",
806
+ "instal": "install",
807
+ "instalment": "installment",
808
+ "instalments": "installments",
809
+ "instals": "installs",
810
+ "instil": "instill",
811
+ "instils": "instills",
812
+ "institutionalisation": "institutionalization",
813
+ "institutionalise": "institutionalize",
814
+ "institutionalised": "institutionalized",
815
+ "institutionalises": "institutionalizes",
816
+ "institutionalising": "institutionalizing",
817
+ "intellectualise": "intellectualize",
818
+ "intellectualised": "intellectualized",
819
+ "intellectualises": "intellectualizes",
820
+ "intellectualising": "intellectualizing",
821
+ "internalisation": "internalization",
822
+ "internalise": "internalize",
823
+ "internalised": "internalized",
824
+ "internalises": "internalizes",
825
+ "internalising": "internalizing",
826
+ "internationalisation": "internationalization",
827
+ "internationalise": "internationalize",
828
+ "internationalised": "internationalized",
829
+ "internationalises": "internationalizes",
830
+ "internationalising": "internationalizing",
831
+ "ionisation": "ionization",
832
+ "ionise": "ionize",
833
+ "ionised": "ionized",
834
+ "ioniser": "ionizer",
835
+ "ionisers": "ionizers",
836
+ "ionises": "ionizes",
837
+ "ionising": "ionizing",
838
+ "italicise": "italicize",
839
+ "italicised": "italicized",
840
+ "italicises": "italicizes",
841
+ "italicising": "italicizing",
842
+ "itemise": "itemize",
843
+ "itemised": "itemized",
844
+ "itemises": "itemizes",
845
+ "itemising": "itemizing",
846
+ "jeopardise": "jeopardize",
847
+ "jeopardised": "jeopardized",
848
+ "jeopardises": "jeopardizes",
849
+ "jeopardising": "jeopardizing",
850
+ "jewelled": "jeweled",
851
+ "jeweller": "jeweler",
852
+ "jewellers": "jewelers",
853
+ "jewellery": "jewelry",
854
+ "judgement": "judgment",
855
+ "kilogramme": "kilogram",
856
+ "kilogrammes": "kilograms",
857
+ "kilometre": "kilometer",
858
+ "kilometres": "kilometers",
859
+ "labelled": "labeled",
860
+ "labelling": "labeling",
861
+ "labour": "labor",
862
+ "laboured": "labored",
863
+ "labourer": "laborer",
864
+ "labourers": "laborers",
865
+ "labouring": "laboring",
866
+ "labours": "labors",
867
+ "lacklustre": "lackluster",
868
+ "legalisation": "legalization",
869
+ "legalise": "legalize",
870
+ "legalised": "legalized",
871
+ "legalises": "legalizes",
872
+ "legalising": "legalizing",
873
+ "legitimise": "legitimize",
874
+ "legitimised": "legitimized",
875
+ "legitimises": "legitimizes",
876
+ "legitimising": "legitimizing",
877
+ "leukaemia": "leukemia",
878
+ "levelled": "leveled",
879
+ "leveller": "leveler",
880
+ "levellers": "levelers",
881
+ "levelling": "leveling",
882
+ "libelled": "libeled",
883
+ "libelling": "libeling",
884
+ "libellous": "libelous",
885
+ "liberalisation": "liberalization",
886
+ "liberalise": "liberalize",
887
+ "liberalised": "liberalized",
888
+ "liberalises": "liberalizes",
889
+ "liberalising": "liberalizing",
890
+ "licence": "license",
891
+ "licenced": "licensed",
892
+ "licences": "licenses",
893
+ "licencing": "licensing",
894
+ "likeable": "likable",
895
+ "lionisation": "lionization",
896
+ "lionise": "lionize",
897
+ "lionised": "lionized",
898
+ "lionises": "lionizes",
899
+ "lionising": "lionizing",
900
+ "liquidise": "liquidize",
901
+ "liquidised": "liquidized",
902
+ "liquidiser": "liquidizer",
903
+ "liquidisers": "liquidizers",
904
+ "liquidises": "liquidizes",
905
+ "liquidising": "liquidizing",
906
+ "litre": "liter",
907
+ "litres": "liters",
908
+ "localise": "localize",
909
+ "localised": "localized",
910
+ "localises": "localizes",
911
+ "localising": "localizing",
912
+ "louvre": "louver",
913
+ "louvred": "louvered",
914
+ "louvres": "louvers",
915
+ "lustre": "luster",
916
+ "magnetise": "magnetize",
917
+ "magnetised": "magnetized",
918
+ "magnetises": "magnetizes",
919
+ "magnetising": "magnetizing",
920
+ "manoeuvrability": "maneuverability",
921
+ "manoeuvrable": "maneuverable",
922
+ "manoeuvre": "maneuver",
923
+ "manoeuvred": "maneuvered",
924
+ "manoeuvres": "maneuvers",
925
+ "manoeuvring": "maneuvering",
926
+ "manoeuvrings": "maneuverings",
927
+ "marginalisation": "marginalization",
928
+ "marginalise": "marginalize",
929
+ "marginalised": "marginalized",
930
+ "marginalises": "marginalizes",
931
+ "marginalising": "marginalizing",
932
+ "marshalled": "marshaled",
933
+ "marshalling": "marshaling",
934
+ "marvelled": "marveled",
935
+ "marvelling": "marveling",
936
+ "marvellous": "marvelous",
937
+ "marvellously": "marvelously",
938
+ "materialisation": "materialization",
939
+ "materialise": "materialize",
940
+ "materialised": "materialized",
941
+ "materialises": "materializes",
942
+ "materialising": "materializing",
943
+ "maximisation": "maximization",
944
+ "maximise": "maximize",
945
+ "maximised": "maximized",
946
+ "maximises": "maximizes",
947
+ "maximising": "maximizing",
948
+ "meagre": "meager",
949
+ "mechanisation": "mechanization",
950
+ "mechanise": "mechanize",
951
+ "mechanised": "mechanized",
952
+ "mechanises": "mechanizes",
953
+ "mechanising": "mechanizing",
954
+ "mediaeval": "medieval",
955
+ "memorialise": "memorialize",
956
+ "memorialised": "memorialized",
957
+ "memorialises": "memorializes",
958
+ "memorialising": "memorializing",
959
+ "memorise": "memorize",
960
+ "memorised": "memorized",
961
+ "memorises": "memorizes",
962
+ "memorising": "memorizing",
963
+ "mesmerise": "mesmerize",
964
+ "mesmerised": "mesmerized",
965
+ "mesmerises": "mesmerizes",
966
+ "mesmerising": "mesmerizing",
967
+ "metabolise": "metabolize",
968
+ "metabolised": "metabolized",
969
+ "metabolises": "metabolizes",
970
+ "metabolising": "metabolizing",
971
+ "metre": "meter",
972
+ "metres": "meters",
973
+ "micrometre": "micrometer",
974
+ "micrometres": "micrometers",
975
+ "militarise": "militarize",
976
+ "militarised": "militarized",
977
+ "militarises": "militarizes",
978
+ "militarising": "militarizing",
979
+ "milligramme": "milligram",
980
+ "milligrammes": "milligrams",
981
+ "millilitre": "milliliter",
982
+ "millilitres": "milliliters",
983
+ "millimetre": "millimeter",
984
+ "millimetres": "millimeters",
985
+ "miniaturisation": "miniaturization",
986
+ "miniaturise": "miniaturize",
987
+ "miniaturised": "miniaturized",
988
+ "miniaturises": "miniaturizes",
989
+ "miniaturising": "miniaturizing",
990
+ "minibusses": "minibuses",
991
+ "minimise": "minimize",
992
+ "minimised": "minimized",
993
+ "minimises": "minimizes",
994
+ "minimising": "minimizing",
995
+ "misbehaviour": "misbehavior",
996
+ "misdemeanour": "misdemeanor",
997
+ "misdemeanours": "misdemeanors",
998
+ "misspelt": "misspelled",
999
+ "mitre": "miter",
1000
+ "mitres": "miters",
1001
+ "mobilisation": "mobilization",
1002
+ "mobilise": "mobilize",
1003
+ "mobilised": "mobilized",
1004
+ "mobilises": "mobilizes",
1005
+ "mobilising": "mobilizing",
1006
+ "modelled": "modeled",
1007
+ "modeller": "modeler",
1008
+ "modellers": "modelers",
1009
+ "modelling": "modeling",
1010
+ "modernise": "modernize",
1011
+ "modernised": "modernized",
1012
+ "modernises": "modernizes",
1013
+ "modernising": "modernizing",
1014
+ "moisturise": "moisturize",
1015
+ "moisturised": "moisturized",
1016
+ "moisturiser": "moisturizer",
1017
+ "moisturisers": "moisturizers",
1018
+ "moisturises": "moisturizes",
1019
+ "moisturising": "moisturizing",
1020
+ "monologue": "monolog",
1021
+ "monologues": "monologs",
1022
+ "monopolisation": "monopolization",
1023
+ "monopolise": "monopolize",
1024
+ "monopolised": "monopolized",
1025
+ "monopolises": "monopolizes",
1026
+ "monopolising": "monopolizing",
1027
+ "moralise": "moralize",
1028
+ "moralised": "moralized",
1029
+ "moralises": "moralizes",
1030
+ "moralising": "moralizing",
1031
+ "motorised": "motorized",
1032
+ "mould": "mold",
1033
+ "moulded": "molded",
1034
+ "moulder": "molder",
1035
+ "mouldered": "moldered",
1036
+ "mouldering": "moldering",
1037
+ "moulders": "molders",
1038
+ "mouldier": "moldier",
1039
+ "mouldiest": "moldiest",
1040
+ "moulding": "molding",
1041
+ "mouldings": "moldings",
1042
+ "moulds": "molds",
1043
+ "mouldy": "moldy",
1044
+ "moult": "molt",
1045
+ "moulted": "molted",
1046
+ "moulting": "molting",
1047
+ "moults": "molts",
1048
+ "moustache": "mustache",
1049
+ "moustached": "mustached",
1050
+ "moustaches": "mustaches",
1051
+ "moustachioed": "mustachioed",
1052
+ "multicoloured": "multicolored",
1053
+ "nationalisation": "nationalization",
1054
+ "nationalisations": "nationalizations",
1055
+ "nationalise": "nationalize",
1056
+ "nationalised": "nationalized",
1057
+ "nationalises": "nationalizes",
1058
+ "nationalising": "nationalizing",
1059
+ "naturalisation": "naturalization",
1060
+ "naturalise": "naturalize",
1061
+ "naturalised": "naturalized",
1062
+ "naturalises": "naturalizes",
1063
+ "naturalising": "naturalizing",
1064
+ "neighbour": "neighbor",
1065
+ "neighbourhood": "neighborhood",
1066
+ "neighbourhoods": "neighborhoods",
1067
+ "neighbouring": "neighboring",
1068
+ "neighbourliness": "neighborliness",
1069
+ "neighbourly": "neighborly",
1070
+ "neighbours": "neighbors",
1071
+ "neutralisation": "neutralization",
1072
+ "neutralise": "neutralize",
1073
+ "neutralised": "neutralized",
1074
+ "neutralises": "neutralizes",
1075
+ "neutralising": "neutralizing",
1076
+ "normalisation": "normalization",
1077
+ "normalise": "normalize",
1078
+ "normalised": "normalized",
1079
+ "normalises": "normalizes",
1080
+ "normalising": "normalizing",
1081
+ "odour": "odor",
1082
+ "odourless": "odorless",
1083
+ "odours": "odors",
1084
+ "oesophagus": "esophagus",
1085
+ "oesophaguses": "esophaguses",
1086
+ "oestrogen": "estrogen",
1087
+ "offence": "offense",
1088
+ "offences": "offenses",
1089
+ "omelette": "omelet",
1090
+ "omelettes": "omelets",
1091
+ "optimise": "optimize",
1092
+ "optimised": "optimized",
1093
+ "optimises": "optimizes",
1094
+ "optimising": "optimizing",
1095
+ "organisation": "organization",
1096
+ "organisational": "organizational",
1097
+ "organisations": "organizations",
1098
+ "organise": "organize",
1099
+ "organised": "organized",
1100
+ "organiser": "organizer",
1101
+ "organisers": "organizers",
1102
+ "organises": "organizes",
1103
+ "organising": "organizing",
1104
+ "orthopaedic": "orthopedic",
1105
+ "orthopaedics": "orthopedics",
1106
+ "ostracise": "ostracize",
1107
+ "ostracised": "ostracized",
1108
+ "ostracises": "ostracizes",
1109
+ "ostracising": "ostracizing",
1110
+ "outmanoeuvre": "outmaneuver",
1111
+ "outmanoeuvred": "outmaneuvered",
1112
+ "outmanoeuvres": "outmaneuvers",
1113
+ "outmanoeuvring": "outmaneuvering",
1114
+ "overemphasise": "overemphasize",
1115
+ "overemphasised": "overemphasized",
1116
+ "overemphasises": "overemphasizes",
1117
+ "overemphasising": "overemphasizing",
1118
+ "oxidisation": "oxidization",
1119
+ "oxidise": "oxidize",
1120
+ "oxidised": "oxidized",
1121
+ "oxidises": "oxidizes",
1122
+ "oxidising": "oxidizing",
1123
+ "paederast": "pederast",
1124
+ "paederasts": "pederasts",
1125
+ "paediatric": "pediatric",
1126
+ "paediatrician": "pediatrician",
1127
+ "paediatricians": "pediatricians",
1128
+ "paediatrics": "pediatrics",
1129
+ "paedophile": "pedophile",
1130
+ "paedophiles": "pedophiles",
1131
+ "paedophilia": "pedophilia",
1132
+ "palaeolithic": "paleolithic",
1133
+ "palaeontologist": "paleontologist",
1134
+ "palaeontologists": "paleontologists",
1135
+ "palaeontology": "paleontology",
1136
+ "panelled": "paneled",
1137
+ "panelling": "paneling",
1138
+ "panellist": "panelist",
1139
+ "panellists": "panelists",
1140
+ "paralyse": "paralyze",
1141
+ "paralysed": "paralyzed",
1142
+ "paralyses": "paralyzes",
1143
+ "paralysing": "paralyzing",
1144
+ "parcelled": "parceled",
1145
+ "parcelling": "parceling",
1146
+ "parlour": "parlor",
1147
+ "parlours": "parlors",
1148
+ "particularise": "particularize",
1149
+ "particularised": "particularized",
1150
+ "particularises": "particularizes",
1151
+ "particularising": "particularizing",
1152
+ "passivisation": "passivization",
1153
+ "passivise": "passivize",
1154
+ "passivised": "passivized",
1155
+ "passivises": "passivizes",
1156
+ "passivising": "passivizing",
1157
+ "pasteurisation": "pasteurization",
1158
+ "pasteurise": "pasteurize",
1159
+ "pasteurised": "pasteurized",
1160
+ "pasteurises": "pasteurizes",
1161
+ "pasteurising": "pasteurizing",
1162
+ "patronise": "patronize",
1163
+ "patronised": "patronized",
1164
+ "patronises": "patronizes",
1165
+ "patronising": "patronizing",
1166
+ "patronisingly": "patronizingly",
1167
+ "pedalled": "pedaled",
1168
+ "pedalling": "pedaling",
1169
+ "pedestrianisation": "pedestrianization",
1170
+ "pedestrianise": "pedestrianize",
1171
+ "pedestrianised": "pedestrianized",
1172
+ "pedestrianises": "pedestrianizes",
1173
+ "pedestrianising": "pedestrianizing",
1174
+ "penalise": "penalize",
1175
+ "penalised": "penalized",
1176
+ "penalises": "penalizes",
1177
+ "penalising": "penalizing",
1178
+ "pencilled": "penciled",
1179
+ "pencilling": "penciling",
1180
+ "personalise": "personalize",
1181
+ "personalised": "personalized",
1182
+ "personalises": "personalizes",
1183
+ "personalising": "personalizing",
1184
+ "pharmacopoeia": "pharmacopeia",
1185
+ "pharmacopoeias": "pharmacopeias",
1186
+ "philosophise": "philosophize",
1187
+ "philosophised": "philosophized",
1188
+ "philosophises": "philosophizes",
1189
+ "philosophising": "philosophizing",
1190
+ "philtre": "filter",
1191
+ "philtres": "filters",
1192
+ "phoney": "phony",
1193
+ "plagiarise": "plagiarize",
1194
+ "plagiarised": "plagiarized",
1195
+ "plagiarises": "plagiarizes",
1196
+ "plagiarising": "plagiarizing",
1197
+ "plough": "plow",
1198
+ "ploughed": "plowed",
1199
+ "ploughing": "plowing",
1200
+ "ploughman": "plowman",
1201
+ "ploughmen": "plowmen",
1202
+ "ploughs": "plows",
1203
+ "ploughshare": "plowshare",
1204
+ "ploughshares": "plowshares",
1205
+ "polarisation": "polarization",
1206
+ "polarise": "polarize",
1207
+ "polarised": "polarized",
1208
+ "polarises": "polarizes",
1209
+ "polarising": "polarizing",
1210
+ "politicisation": "politicization",
1211
+ "politicise": "politicize",
1212
+ "politicised": "politicized",
1213
+ "politicises": "politicizes",
1214
+ "politicising": "politicizing",
1215
+ "popularisation": "popularization",
1216
+ "popularise": "popularize",
1217
+ "popularised": "popularized",
1218
+ "popularises": "popularizes",
1219
+ "popularising": "popularizing",
1220
+ "pouffe": "pouf",
1221
+ "pouffes": "poufs",
1222
+ "practise": "practice",
1223
+ "practised": "practiced",
1224
+ "practises": "practices",
1225
+ "practising": "practicing",
1226
+ "praesidium": "presidium",
1227
+ "praesidiums": "presidiums",
1228
+ "pressurisation": "pressurization",
1229
+ "pressurise": "pressurize",
1230
+ "pressurised": "pressurized",
1231
+ "pressurises": "pressurizes",
1232
+ "pressurising": "pressurizing",
1233
+ "pretence": "pretense",
1234
+ "pretences": "pretenses",
1235
+ "primaeval": "primeval",
1236
+ "prioritisation": "prioritization",
1237
+ "prioritise": "prioritize",
1238
+ "prioritised": "prioritized",
1239
+ "prioritises": "prioritizes",
1240
+ "prioritising": "prioritizing",
1241
+ "privatisation": "privatization",
1242
+ "privatisations": "privatizations",
1243
+ "privatise": "privatize",
1244
+ "privatised": "privatized",
1245
+ "privatises": "privatizes",
1246
+ "privatising": "privatizing",
1247
+ "professionalisation": "professionalization",
1248
+ "professionalise": "professionalize",
1249
+ "professionalised": "professionalized",
1250
+ "professionalises": "professionalizes",
1251
+ "professionalising": "professionalizing",
1252
+ "programme": "program",
1253
+ "programmes": "programs",
1254
+ "prologue": "prolog",
1255
+ "prologues": "prologs",
1256
+ "propagandise": "propagandize",
1257
+ "propagandised": "propagandized",
1258
+ "propagandises": "propagandizes",
1259
+ "propagandising": "propagandizing",
1260
+ "proselytise": "proselytize",
1261
+ "proselytised": "proselytized",
1262
+ "proselytiser": "proselytizer",
1263
+ "proselytisers": "proselytizers",
1264
+ "proselytises": "proselytizes",
1265
+ "proselytising": "proselytizing",
1266
+ "psychoanalyse": "psychoanalyze",
1267
+ "psychoanalysed": "psychoanalyzed",
1268
+ "psychoanalyses": "psychoanalyzes",
1269
+ "psychoanalysing": "psychoanalyzing",
1270
+ "publicise": "publicize",
1271
+ "publicised": "publicized",
1272
+ "publicises": "publicizes",
1273
+ "publicising": "publicizing",
1274
+ "pulverisation": "pulverization",
1275
+ "pulverise": "pulverize",
1276
+ "pulverised": "pulverized",
1277
+ "pulverises": "pulverizes",
1278
+ "pulverising": "pulverizing",
1279
+ "pummelled": "pummel",
1280
+ "pummelling": "pummeled",
1281
+ "pyjama": "pajama",
1282
+ "pyjamas": "pajamas",
1283
+ "pzazz": "pizzazz",
1284
+ "quarrelled": "quarreled",
1285
+ "quarrelling": "quarreling",
1286
+ "radicalise": "radicalize",
1287
+ "radicalised": "radicalized",
1288
+ "radicalises": "radicalizes",
1289
+ "radicalising": "radicalizing",
1290
+ "rancour": "rancor",
1291
+ "randomise": "randomize",
1292
+ "randomised": "randomized",
1293
+ "randomises": "randomizes",
1294
+ "randomising": "randomizing",
1295
+ "rationalisation": "rationalization",
1296
+ "rationalisations": "rationalizations",
1297
+ "rationalise": "rationalize",
1298
+ "rationalised": "rationalized",
1299
+ "rationalises": "rationalizes",
1300
+ "rationalising": "rationalizing",
1301
+ "ravelled": "raveled",
1302
+ "ravelling": "raveling",
1303
+ "realisable": "realizable",
1304
+ "realisation": "realization",
1305
+ "realisations": "realizations",
1306
+ "realise": "realize",
1307
+ "realised": "realized",
1308
+ "realises": "realizes",
1309
+ "realising": "realizing",
1310
+ "recognisable": "recognizable",
1311
+ "recognisably": "recognizably",
1312
+ "recognisance": "recognizance",
1313
+ "recognise": "recognize",
1314
+ "recognised": "recognized",
1315
+ "recognises": "recognizes",
1316
+ "recognising": "recognizing",
1317
+ "reconnoitre": "reconnoiter",
1318
+ "reconnoitred": "reconnoitered",
1319
+ "reconnoitres": "reconnoiters",
1320
+ "reconnoitring": "reconnoitering",
1321
+ "refuelled": "refueled",
1322
+ "refuelling": "refueling",
1323
+ "regularisation": "regularization",
1324
+ "regularise": "regularize",
1325
+ "regularised": "regularized",
1326
+ "regularises": "regularizes",
1327
+ "regularising": "regularizing",
1328
+ "remodelled": "remodeled",
1329
+ "remodelling": "remodeling",
1330
+ "remould": "remold",
1331
+ "remoulded": "remolded",
1332
+ "remoulding": "remolding",
1333
+ "remoulds": "remolds",
1334
+ "reorganisation": "reorganization",
1335
+ "reorganisations": "reorganizations",
1336
+ "reorganise": "reorganize",
1337
+ "reorganised": "reorganized",
1338
+ "reorganises": "reorganizes",
1339
+ "reorganising": "reorganizing",
1340
+ "revelled": "reveled",
1341
+ "reveller": "reveler",
1342
+ "revellers": "revelers",
1343
+ "revelling": "reveling",
1344
+ "revitalise": "revitalize",
1345
+ "revitalised": "revitalized",
1346
+ "revitalises": "revitalizes",
1347
+ "revitalising": "revitalizing",
1348
+ "revolutionise": "revolutionize",
1349
+ "revolutionised": "revolutionized",
1350
+ "revolutionises": "revolutionizes",
1351
+ "revolutionising": "revolutionizing",
1352
+ "rhapsodise": "rhapsodize",
1353
+ "rhapsodised": "rhapsodized",
1354
+ "rhapsodises": "rhapsodizes",
1355
+ "rhapsodising": "rhapsodizing",
1356
+ "rigour": "rigor",
1357
+ "rigours": "rigors",
1358
+ "ritualised": "ritualized",
1359
+ "rivalled": "rivaled",
1360
+ "rivalling": "rivaling",
1361
+ "romanticise": "romanticize",
1362
+ "romanticised": "romanticized",
1363
+ "romanticises": "romanticizes",
1364
+ "romanticising": "romanticizing",
1365
+ "rumour": "rumor",
1366
+ "rumoured": "rumored",
1367
+ "rumours": "rumors",
1368
+ "sabre": "saber",
1369
+ "sabres": "sabers",
1370
+ "saltpetre": "saltpeter",
1371
+ "sanitise": "sanitize",
1372
+ "sanitised": "sanitized",
1373
+ "sanitises": "sanitizes",
1374
+ "sanitising": "sanitizing",
1375
+ "satirise": "satirize",
1376
+ "satirised": "satirized",
1377
+ "satirises": "satirizes",
1378
+ "satirising": "satirizing",
1379
+ "saviour": "savior",
1380
+ "saviours": "saviors",
1381
+ "savour": "savor",
1382
+ "savoured": "savored",
1383
+ "savouries": "savories",
1384
+ "savouring": "savoring",
1385
+ "savours": "savors",
1386
+ "savoury": "savory",
1387
+ "scandalise": "scandalize",
1388
+ "scandalised": "scandalized",
1389
+ "scandalises": "scandalizes",
1390
+ "scandalising": "scandalizing",
1391
+ "sceptic": "skeptic",
1392
+ "sceptical": "skeptical",
1393
+ "sceptically": "skeptically",
1394
+ "scepticism": "skepticism",
1395
+ "sceptics": "skeptics",
1396
+ "sceptre": "scepter",
1397
+ "sceptres": "scepters",
1398
+ "scrutinise": "scrutinize",
1399
+ "scrutinised": "scrutinized",
1400
+ "scrutinises": "scrutinizes",
1401
+ "scrutinising": "scrutinizing",
1402
+ "secularisation": "secularization",
1403
+ "secularise": "secularize",
1404
+ "secularised": "secularized",
1405
+ "secularises": "secularizes",
1406
+ "secularising": "secularizing",
1407
+ "sensationalise": "sensationalize",
1408
+ "sensationalised": "sensationalized",
1409
+ "sensationalises": "sensationalizes",
1410
+ "sensationalising": "sensationalizing",
1411
+ "sensitise": "sensitize",
1412
+ "sensitised": "sensitized",
1413
+ "sensitises": "sensitizes",
1414
+ "sensitising": "sensitizing",
1415
+ "sentimentalise": "sentimentalize",
1416
+ "sentimentalised": "sentimentalized",
1417
+ "sentimentalises": "sentimentalizes",
1418
+ "sentimentalising": "sentimentalizing",
1419
+ "sepulchre": "sepulcher",
1420
+ "sepulchres": "sepulchers",
1421
+ "serialisation": "serialization",
1422
+ "serialisations": "serializations",
1423
+ "serialise": "serialize",
1424
+ "serialised": "serialized",
1425
+ "serialises": "serializes",
1426
+ "serialising": "serializing",
1427
+ "sermonise": "sermonize",
1428
+ "sermonised": "sermonized",
1429
+ "sermonises": "sermonizes",
1430
+ "sermonising": "sermonizing",
1431
+ "sheikh": "sheik",
1432
+ "shovelled": "shoveled",
1433
+ "shovelling": "shoveling",
1434
+ "shrivelled": "shriveled",
1435
+ "shrivelling": "shriveling",
1436
+ "signalise": "signalize",
1437
+ "signalised": "signalized",
1438
+ "signalises": "signalizes",
1439
+ "signalising": "signalizing",
1440
+ "signalled": "signaled",
1441
+ "signalling": "signaling",
1442
+ "smoulder": "smolder",
1443
+ "smouldered": "smoldered",
1444
+ "smouldering": "smoldering",
1445
+ "smoulders": "smolders",
1446
+ "snivelled": "sniveled",
1447
+ "snivelling": "sniveling",
1448
+ "snorkelled": "snorkeled",
1449
+ "snorkelling": "snorkeling",
1450
+ "snowplough": "snowplow",
1451
+ "snowploughs": "snowplow",
1452
+ "socialisation": "socialization",
1453
+ "socialise": "socialize",
1454
+ "socialised": "socialized",
1455
+ "socialises": "socializes",
1456
+ "socialising": "socializing",
1457
+ "sodomise": "sodomize",
1458
+ "sodomised": "sodomized",
1459
+ "sodomises": "sodomizes",
1460
+ "sodomising": "sodomizing",
1461
+ "solemnise": "solemnize",
1462
+ "solemnised": "solemnized",
1463
+ "solemnises": "solemnizes",
1464
+ "solemnising": "solemnizing",
1465
+ "sombre": "somber",
1466
+ "specialisation": "specialization",
1467
+ "specialisations": "specializations",
1468
+ "specialise": "specialize",
1469
+ "specialised": "specialized",
1470
+ "specialises": "specializes",
1471
+ "specialising": "specializing",
1472
+ "spectre": "specter",
1473
+ "spectres": "specters",
1474
+ "spiralled": "spiraled",
1475
+ "spiralling": "spiraling",
1476
+ "splendour": "splendor",
1477
+ "splendours": "splendors",
1478
+ "squirrelled": "squirreled",
1479
+ "squirrelling": "squirreling",
1480
+ "stabilisation": "stabilization",
1481
+ "stabilise": "stabilize",
1482
+ "stabilised": "stabilized",
1483
+ "stabiliser": "stabilizer",
1484
+ "stabilisers": "stabilizers",
1485
+ "stabilises": "stabilizes",
1486
+ "stabilising": "stabilizing",
1487
+ "standardisation": "standardization",
1488
+ "standardise": "standardize",
1489
+ "standardised": "standardized",
1490
+ "standardises": "standardizes",
1491
+ "standardising": "standardizing",
1492
+ "stencilled": "stenciled",
1493
+ "stencilling": "stenciling",
1494
+ "sterilisation": "sterilization",
1495
+ "sterilisations": "sterilizations",
1496
+ "sterilise": "sterilize",
1497
+ "sterilised": "sterilized",
1498
+ "steriliser": "sterilizer",
1499
+ "sterilisers": "sterilizers",
1500
+ "sterilises": "sterilizes",
1501
+ "sterilising": "sterilizing",
1502
+ "stigmatisation": "stigmatization",
1503
+ "stigmatise": "stigmatize",
1504
+ "stigmatised": "stigmatized",
1505
+ "stigmatises": "stigmatizes",
1506
+ "stigmatising": "stigmatizing",
1507
+ "storey": "story",
1508
+ "storeys": "stories",
1509
+ "subsidisation": "subsidization",
1510
+ "subsidise": "subsidize",
1511
+ "subsidised": "subsidized",
1512
+ "subsidiser": "subsidizer",
1513
+ "subsidisers": "subsidizers",
1514
+ "subsidises": "subsidizes",
1515
+ "subsidising": "subsidizing",
1516
+ "succour": "succor",
1517
+ "succoured": "succored",
1518
+ "succouring": "succoring",
1519
+ "succours": "succors",
1520
+ "sulphate": "sulfate",
1521
+ "sulphates": "sulfates",
1522
+ "sulphide": "sulfide",
1523
+ "sulphides": "sulfides",
1524
+ "sulphur": "sulfur",
1525
+ "sulphurous": "sulfurous",
1526
+ "summarise": "summarize",
1527
+ "summarised": "summarized",
1528
+ "summarises": "summarizes",
1529
+ "summarising": "summarizing",
1530
+ "swivelled": "swiveled",
1531
+ "swivelling": "swiveling",
1532
+ "symbolise": "symbolize",
1533
+ "symbolised": "symbolized",
1534
+ "symbolises": "symbolizes",
1535
+ "symbolising": "symbolizing",
1536
+ "sympathise": "sympathize",
1537
+ "sympathised": "sympathized",
1538
+ "sympathiser": "sympathizer",
1539
+ "sympathisers": "sympathizers",
1540
+ "sympathises": "sympathizes",
1541
+ "sympathising": "sympathizing",
1542
+ "synchronisation": "synchronization",
1543
+ "synchronise": "synchronize",
1544
+ "synchronised": "synchronized",
1545
+ "synchronises": "synchronizes",
1546
+ "synchronising": "synchronizing",
1547
+ "synthesise": "synthesize",
1548
+ "synthesised": "synthesized",
1549
+ "synthesiser": "synthesizer",
1550
+ "synthesisers": "synthesizers",
1551
+ "synthesises": "synthesizes",
1552
+ "synthesising": "synthesizing",
1553
+ "syphon": "siphon",
1554
+ "syphoned": "siphoned",
1555
+ "syphoning": "siphoning",
1556
+ "syphons": "siphons",
1557
+ "systematisation": "systematization",
1558
+ "systematise": "systematize",
1559
+ "systematised": "systematized",
1560
+ "systematises": "systematizes",
1561
+ "systematising": "systematizing",
1562
+ "tantalise": "tantalize",
1563
+ "tantalised": "tantalized",
1564
+ "tantalises": "tantalizes",
1565
+ "tantalising": "tantalizing",
1566
+ "tantalisingly": "tantalizingly",
1567
+ "tasselled": "tasseled",
1568
+ "technicolour": "technicolor",
1569
+ "temporise": "temporize",
1570
+ "temporised": "temporized",
1571
+ "temporises": "temporizes",
1572
+ "temporising": "temporizing",
1573
+ "tenderise": "tenderize",
1574
+ "tenderised": "tenderized",
1575
+ "tenderises": "tenderizes",
1576
+ "tenderising": "tenderizing",
1577
+ "terrorise": "terrorize",
1578
+ "terrorised": "terrorized",
1579
+ "terrorises": "terrorizes",
1580
+ "terrorising": "terrorizing",
1581
+ "theatre": "theater",
1582
+ "theatregoer": "theatergoer",
1583
+ "theatregoers": "theatergoers",
1584
+ "theatres": "theaters",
1585
+ "theorise": "theorize",
1586
+ "theorised": "theorized",
1587
+ "theorises": "theorizes",
1588
+ "theorising": "theorizing",
1589
+ "tonne": "ton",
1590
+ "tonnes": "tons",
1591
+ "towelled": "toweled",
1592
+ "towelling": "toweling",
1593
+ "toxaemia": "toxemia",
1594
+ "tranquillise": "tranquilize",
1595
+ "tranquillised": "tranquilized",
1596
+ "tranquilliser": "tranquilizer",
1597
+ "tranquillisers": "tranquilizers",
1598
+ "tranquillises": "tranquilizes",
1599
+ "tranquillising": "tranquilizing",
1600
+ "tranquillity": "tranquility",
1601
+ "tranquillize": "tranquilize",
1602
+ "tranquillized": "tranquilized",
1603
+ "tranquillizer": "tranquilizer",
1604
+ "tranquillizers": "tranquilizers",
1605
+ "tranquillizes": "tranquilizes",
1606
+ "tranquillizing": "tranquilizing",
1607
+ "tranquilly": "tranquility",
1608
+ "transistorised": "transistorized",
1609
+ "traumatise": "traumatize",
1610
+ "traumatised": "traumatized",
1611
+ "traumatises": "traumatizes",
1612
+ "traumatising": "traumatizing",
1613
+ "travelled": "traveled",
1614
+ "traveller": "traveler",
1615
+ "travellers": "travelers",
1616
+ "travelling": "traveling",
1617
+ "travelog": "travelogue",
1618
+ "travelogs": "travelogues",
1619
+ "trialled": "trialed",
1620
+ "trialling": "trialing",
1621
+ "tricolour": "tricolor",
1622
+ "tricolours": "tricolors",
1623
+ "trivialise": "trivialize",
1624
+ "trivialised": "trivialized",
1625
+ "trivialises": "trivializes",
1626
+ "trivialising": "trivializing",
1627
+ "tumour": "tumor",
1628
+ "tumours": "tumors",
1629
+ "tunnelled": "tunneled",
1630
+ "tunnelling": "tunneling",
1631
+ "tyrannise": "tyrannize",
1632
+ "tyrannised": "tyrannized",
1633
+ "tyrannises": "tyrannizes",
1634
+ "tyrannising": "tyrannizing",
1635
+ "tyre": "tire",
1636
+ "tyres": "tires",
1637
+ "unauthorised": "unauthorized",
1638
+ "uncivilised": "uncivilized",
1639
+ "underutilised": "underutilized",
1640
+ "unequalled": "unequaled",
1641
+ "unfavourable": "unfavorable",
1642
+ "unfavourably": "unfavorably",
1643
+ "unionisation": "unionization",
1644
+ "unionise": "unionize",
1645
+ "unionised": "unionized",
1646
+ "unionises": "unionizes",
1647
+ "unionising": "unionizing",
1648
+ "unorganised": "unorganized",
1649
+ "unravelled": "unraveled",
1650
+ "unravelling": "unraveling",
1651
+ "unrecognisable": "unrecognizable",
1652
+ "unrecognised": "unrecognized",
1653
+ "unrivalled": "unrivaled",
1654
+ "unsavoury": "unsavory",
1655
+ "untrammelled": "untrammeled",
1656
+ "urbanisation": "urbanization",
1657
+ "urbanise": "urbanize",
1658
+ "urbanised": "urbanized",
1659
+ "urbanises": "urbanizes",
1660
+ "urbanising": "urbanizing",
1661
+ "utilisable": "utilizable",
1662
+ "utilisation": "utilization",
1663
+ "utilise": "utilize",
1664
+ "utilised": "utilized",
1665
+ "utilises": "utilizes",
1666
+ "utilising": "utilizing",
1667
+ "valour": "valor",
1668
+ "vandalise": "vandalize",
1669
+ "vandalised": "vandalized",
1670
+ "vandalises": "vandalizes",
1671
+ "vandalising": "vandalizing",
1672
+ "vaporisation": "vaporization",
1673
+ "vaporise": "vaporize",
1674
+ "vaporised": "vaporized",
1675
+ "vaporises": "vaporizes",
1676
+ "vaporising": "vaporizing",
1677
+ "vapour": "vapor",
1678
+ "vapours": "vapors",
1679
+ "verbalise": "verbalize",
1680
+ "verbalised": "verbalized",
1681
+ "verbalises": "verbalizes",
1682
+ "verbalising": "verbalizing",
1683
+ "victimisation": "victimization",
1684
+ "victimise": "victimize",
1685
+ "victimised": "victimized",
1686
+ "victimises": "victimizes",
1687
+ "victimising": "victimizing",
1688
+ "videodisc": "videodisk",
1689
+ "videodiscs": "videodisks",
1690
+ "vigour": "vigor",
1691
+ "visualisation": "visualization",
1692
+ "visualisations": "visualizations",
1693
+ "visualise": "visualize",
1694
+ "visualised": "visualized",
1695
+ "visualises": "visualizes",
1696
+ "visualising": "visualizing",
1697
+ "vocalisation": "vocalization",
1698
+ "vocalisations": "vocalizations",
1699
+ "vocalise": "vocalize",
1700
+ "vocalised": "vocalized",
1701
+ "vocalises": "vocalizes",
1702
+ "vocalising": "vocalizing",
1703
+ "vulcanised": "vulcanized",
1704
+ "vulgarisation": "vulgarization",
1705
+ "vulgarise": "vulgarize",
1706
+ "vulgarised": "vulgarized",
1707
+ "vulgarises": "vulgarizes",
1708
+ "vulgarising": "vulgarizing",
1709
+ "waggon": "wagon",
1710
+ "waggons": "wagons",
1711
+ "watercolour": "watercolor",
1712
+ "watercolours": "watercolors",
1713
+ "weaselled": "weaseled",
1714
+ "weaselling": "weaseling",
1715
+ "westernisation": "westernization",
1716
+ "westernise": "westernize",
1717
+ "westernised": "westernized",
1718
+ "westernises": "westernizes",
1719
+ "westernising": "westernizing",
1720
+ "womanise": "womanize",
1721
+ "womanised": "womanized",
1722
+ "womaniser": "womanizer",
1723
+ "womanisers": "womanizers",
1724
+ "womanises": "womanizes",
1725
+ "womanising": "womanizing",
1726
+ "woollen": "woolen",
1727
+ "woollens": "woolens",
1728
+ "woollies": "woolies",
1729
+ "woolly": "wooly",
1730
+ "worshipped": "worshiped",
1731
+ "worshipping": "worshiping",
1732
+ "worshipper": "worshiper",
1733
+ "yodelled": "yodeled",
1734
+ "yodelling": "yodeling",
1735
+ "yoghourt": "yogurt",
1736
+ "yoghourts": "yogurts",
1737
+ "yoghurt": "yogurt",
1738
+ "yoghurts": "yogurts",
1739
+ "mhm": "hmm",
1740
+ "mmm": "hmm"
1741
+ }
MMaDA/eval_ASR_TTS/whisper_asr/normalizers/english.py ADDED
@@ -0,0 +1,550 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from fractions import Fraction
5
+ from typing import Iterator, List, Match, Optional, Union
6
+
7
+ from more_itertools import windowed
8
+
9
+ from .basic import remove_symbols_and_diacritics
10
+
11
+
12
+ class EnglishNumberNormalizer:
13
+ """
14
+ Convert any spelled-out numbers into arabic numbers, while handling:
15
+
16
+ - remove any commas
17
+ - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
18
+ - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
19
+ - spell out `one` and `ones`
20
+ - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
21
+ """
22
+
23
+ def __init__(self):
24
+ super().__init__()
25
+
26
+ self.zeros = {"o", "oh", "zero"}
27
+ self.ones = {
28
+ name: i
29
+ for i, name in enumerate(
30
+ [
31
+ "one",
32
+ "two",
33
+ "three",
34
+ "four",
35
+ "five",
36
+ "six",
37
+ "seven",
38
+ "eight",
39
+ "nine",
40
+ "ten",
41
+ "eleven",
42
+ "twelve",
43
+ "thirteen",
44
+ "fourteen",
45
+ "fifteen",
46
+ "sixteen",
47
+ "seventeen",
48
+ "eighteen",
49
+ "nineteen",
50
+ ],
51
+ start=1,
52
+ )
53
+ }
54
+ self.ones_plural = {
55
+ "sixes" if name == "six" else name + "s": (value, "s")
56
+ for name, value in self.ones.items()
57
+ }
58
+ self.ones_ordinal = {
59
+ "zeroth": (0, "th"),
60
+ "first": (1, "st"),
61
+ "second": (2, "nd"),
62
+ "third": (3, "rd"),
63
+ "fifth": (5, "th"),
64
+ "twelfth": (12, "th"),
65
+ **{
66
+ name + ("h" if name.endswith("t") else "th"): (value, "th")
67
+ for name, value in self.ones.items()
68
+ if value > 3 and value != 5 and value != 12
69
+ },
70
+ }
71
+ self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}
72
+
73
+ self.tens = {
74
+ "twenty": 20,
75
+ "thirty": 30,
76
+ "forty": 40,
77
+ "fifty": 50,
78
+ "sixty": 60,
79
+ "seventy": 70,
80
+ "eighty": 80,
81
+ "ninety": 90,
82
+ }
83
+ self.tens_plural = {
84
+ name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
85
+ }
86
+ self.tens_ordinal = {
87
+ name.replace("y", "ieth"): (value, "th")
88
+ for name, value in self.tens.items()
89
+ }
90
+ self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}
91
+
92
+ self.multipliers = {
93
+ "hundred": 100,
94
+ "thousand": 1_000,
95
+ "million": 1_000_000,
96
+ "billion": 1_000_000_000,
97
+ "trillion": 1_000_000_000_000,
98
+ "quadrillion": 1_000_000_000_000_000,
99
+ "quintillion": 1_000_000_000_000_000_000,
100
+ "sextillion": 1_000_000_000_000_000_000_000,
101
+ "septillion": 1_000_000_000_000_000_000_000_000,
102
+ "octillion": 1_000_000_000_000_000_000_000_000_000,
103
+ "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
104
+ "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
105
+ }
106
+ self.multipliers_plural = {
107
+ name + "s": (value, "s") for name, value in self.multipliers.items()
108
+ }
109
+ self.multipliers_ordinal = {
110
+ name + "th": (value, "th") for name, value in self.multipliers.items()
111
+ }
112
+ self.multipliers_suffixed = {
113
+ **self.multipliers_plural,
114
+ **self.multipliers_ordinal,
115
+ }
116
+ self.decimals = {*self.ones, *self.tens, *self.zeros}
117
+
118
+ self.preceding_prefixers = {
119
+ "minus": "-",
120
+ "negative": "-",
121
+ "plus": "+",
122
+ "positive": "+",
123
+ }
124
+ self.following_prefixers = {
125
+ "pound": "£",
126
+ "pounds": "£",
127
+ "euro": "€",
128
+ "euros": "€",
129
+ "dollar": "$",
130
+ "dollars": "$",
131
+ "cent": "¢",
132
+ "cents": "¢",
133
+ }
134
+ self.prefixes = set(
135
+ list(self.preceding_prefixers.values())
136
+ + list(self.following_prefixers.values())
137
+ )
138
+ self.suffixers = {
139
+ "per": {"cent": "%"},
140
+ "percent": "%",
141
+ }
142
+ self.specials = {"and", "double", "triple", "point"}
143
+
144
+ self.words = set(
145
+ [
146
+ key
147
+ for mapping in [
148
+ self.zeros,
149
+ self.ones,
150
+ self.ones_suffixed,
151
+ self.tens,
152
+ self.tens_suffixed,
153
+ self.multipliers,
154
+ self.multipliers_suffixed,
155
+ self.preceding_prefixers,
156
+ self.following_prefixers,
157
+ self.suffixers,
158
+ self.specials,
159
+ ]
160
+ for key in mapping
161
+ ]
162
+ )
163
+ self.literal_words = {"one", "ones"}
164
+
165
+ def process_words(self, words: List[str]) -> Iterator[str]:
166
+ prefix: Optional[str] = None
167
+ value: Optional[Union[str, int]] = None
168
+ skip = False
169
+
170
+ def to_fraction(s: str):
171
+ try:
172
+ return Fraction(s)
173
+ except ValueError:
174
+ return None
175
+
176
+ def output(result: Union[str, int]):
177
+ nonlocal prefix, value
178
+ result = str(result)
179
+ if prefix is not None:
180
+ result = prefix + result
181
+ value = None
182
+ prefix = None
183
+ return result
184
+
185
+ if len(words) == 0:
186
+ return
187
+
188
+ for prev, current, next in windowed([None] + words + [None], 3):
189
+ if skip:
190
+ skip = False
191
+ continue
192
+
193
+ next_is_numeric = next is not None and re.match(r"^\d+(\.\d+)?$", next)
194
+ has_prefix = current[0] in self.prefixes
195
+ current_without_prefix = current[1:] if has_prefix else current
196
+ if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
197
+ # arabic numbers (potentially with signs and fractions)
198
+ f = to_fraction(current_without_prefix)
199
+ assert f is not None
200
+ if value is not None:
201
+ if isinstance(value, str) and value.endswith("."):
202
+ # concatenate decimals / ip address components
203
+ value = str(value) + str(current)
204
+ continue
205
+ else:
206
+ yield output(value)
207
+
208
+ prefix = current[0] if has_prefix else prefix
209
+ if f.denominator == 1:
210
+ value = f.numerator # store integers as int
211
+ else:
212
+ value = current_without_prefix
213
+ elif current not in self.words:
214
+ # non-numeric words
215
+ if value is not None:
216
+ yield output(value)
217
+ yield output(current)
218
+ elif current in self.zeros:
219
+ value = str(value or "") + "0"
220
+ elif current in self.ones:
221
+ ones = self.ones[current]
222
+
223
+ if value is None:
224
+ value = ones
225
+ elif isinstance(value, str) or prev in self.ones:
226
+ if (
227
+ prev in self.tens and ones < 10
228
+ ): # replace the last zero with the digit
229
+ assert value[-1] == "0"
230
+ value = value[:-1] + str(ones)
231
+ else:
232
+ value = str(value) + str(ones)
233
+ elif ones < 10:
234
+ if value % 10 == 0:
235
+ value += ones
236
+ else:
237
+ value = str(value) + str(ones)
238
+ else: # eleven to nineteen
239
+ if value % 100 == 0:
240
+ value += ones
241
+ else:
242
+ value = str(value) + str(ones)
243
+ elif current in self.ones_suffixed:
244
+ # ordinal or cardinal; yield the number right away
245
+ ones, suffix = self.ones_suffixed[current]
246
+ if value is None:
247
+ yield output(str(ones) + suffix)
248
+ elif isinstance(value, str) or prev in self.ones:
249
+ if prev in self.tens and ones < 10:
250
+ assert value[-1] == "0"
251
+ yield output(value[:-1] + str(ones) + suffix)
252
+ else:
253
+ yield output(str(value) + str(ones) + suffix)
254
+ elif ones < 10:
255
+ if value % 10 == 0:
256
+ yield output(str(value + ones) + suffix)
257
+ else:
258
+ yield output(str(value) + str(ones) + suffix)
259
+ else: # eleven to nineteen
260
+ if value % 100 == 0:
261
+ yield output(str(value + ones) + suffix)
262
+ else:
263
+ yield output(str(value) + str(ones) + suffix)
264
+ value = None
265
+ elif current in self.tens:
266
+ tens = self.tens[current]
267
+ if value is None:
268
+ value = tens
269
+ elif isinstance(value, str):
270
+ value = str(value) + str(tens)
271
+ else:
272
+ if value % 100 == 0:
273
+ value += tens
274
+ else:
275
+ value = str(value) + str(tens)
276
+ elif current in self.tens_suffixed:
277
+ # ordinal or cardinal; yield the number right away
278
+ tens, suffix = self.tens_suffixed[current]
279
+ if value is None:
280
+ yield output(str(tens) + suffix)
281
+ elif isinstance(value, str):
282
+ yield output(str(value) + str(tens) + suffix)
283
+ else:
284
+ if value % 100 == 0:
285
+ yield output(str(value + tens) + suffix)
286
+ else:
287
+ yield output(str(value) + str(tens) + suffix)
288
+ elif current in self.multipliers:
289
+ multiplier = self.multipliers[current]
290
+ if value is None:
291
+ value = multiplier
292
+ elif isinstance(value, str) or value == 0:
293
+ f = to_fraction(value)
294
+ p = f * multiplier if f is not None else None
295
+ if f is not None and p.denominator == 1:
296
+ value = p.numerator
297
+ else:
298
+ yield output(value)
299
+ value = multiplier
300
+ else:
301
+ before = value // 1000 * 1000
302
+ residual = value % 1000
303
+ value = before + residual * multiplier
304
+ elif current in self.multipliers_suffixed:
305
+ multiplier, suffix = self.multipliers_suffixed[current]
306
+ if value is None:
307
+ yield output(str(multiplier) + suffix)
308
+ elif isinstance(value, str):
309
+ f = to_fraction(value)
310
+ p = f * multiplier if f is not None else None
311
+ if f is not None and p.denominator == 1:
312
+ yield output(str(p.numerator) + suffix)
313
+ else:
314
+ yield output(value)
315
+ yield output(str(multiplier) + suffix)
316
+ else: # int
317
+ before = value // 1000 * 1000
318
+ residual = value % 1000
319
+ value = before + residual * multiplier
320
+ yield output(str(value) + suffix)
321
+ value = None
322
+ elif current in self.preceding_prefixers:
323
+ # apply prefix (positive, minus, etc.) if it precedes a number
324
+ if value is not None:
325
+ yield output(value)
326
+
327
+ if next in self.words or next_is_numeric:
328
+ prefix = self.preceding_prefixers[current]
329
+ else:
330
+ yield output(current)
331
+ elif current in self.following_prefixers:
332
+ # apply prefix (dollars, cents, etc.) only after a number
333
+ if value is not None:
334
+ prefix = self.following_prefixers[current]
335
+ yield output(value)
336
+ else:
337
+ yield output(current)
338
+ elif current in self.suffixers:
339
+ # apply suffix symbols (percent -> '%')
340
+ if value is not None:
341
+ suffix = self.suffixers[current]
342
+ if isinstance(suffix, dict):
343
+ if next in suffix:
344
+ yield output(str(value) + suffix[next])
345
+ skip = True
346
+ else:
347
+ yield output(value)
348
+ yield output(current)
349
+ else:
350
+ yield output(str(value) + suffix)
351
+ else:
352
+ yield output(current)
353
+ elif current in self.specials:
354
+ if next not in self.words and not next_is_numeric:
355
+ # apply special handling only if the next word can be numeric
356
+ if value is not None:
357
+ yield output(value)
358
+ yield output(current)
359
+ elif current == "and":
360
+ # ignore "and" after hundreds, thousands, etc.
361
+ if prev not in self.multipliers:
362
+ if value is not None:
363
+ yield output(value)
364
+ yield output(current)
365
+ elif current == "double" or current == "triple":
366
+ if next in self.ones or next in self.zeros:
367
+ repeats = 2 if current == "double" else 3
368
+ ones = self.ones.get(next, 0)
369
+ value = str(value or "") + str(ones) * repeats
370
+ skip = True
371
+ else:
372
+ if value is not None:
373
+ yield output(value)
374
+ yield output(current)
375
+ elif current == "point":
376
+ if next in self.decimals or next_is_numeric:
377
+ value = str(value or "") + "."
378
+ else:
379
+ # should all have been covered at this point
380
+ raise ValueError(f"Unexpected token: {current}")
381
+ else:
382
+ # all should have been covered at this point
383
+ raise ValueError(f"Unexpected token: {current}")
384
+
385
+ if value is not None:
386
+ yield output(value)
387
+
388
+ def preprocess(self, s: str):
389
+ # replace "<number> and a half" with "<number> point five"
390
+ results = []
391
+
392
+ segments = re.split(r"\band\s+a\s+half\b", s)
393
+ for i, segment in enumerate(segments):
394
+ if len(segment.strip()) == 0:
395
+ continue
396
+ if i == len(segments) - 1:
397
+ results.append(segment)
398
+ else:
399
+ results.append(segment)
400
+ last_word = segment.rsplit(maxsplit=2)[-1]
401
+ if last_word in self.decimals or last_word in self.multipliers:
402
+ results.append("point five")
403
+ else:
404
+ results.append("and a half")
405
+
406
+ s = " ".join(results)
407
+
408
+ # put a space at number/letter boundary
409
+ s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
410
+ s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
411
+
412
+ # but remove spaces which could be a suffix
413
+ s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)
414
+
415
+ return s
416
+
417
+ def postprocess(self, s: str):
418
+ def combine_cents(m: Match):
419
+ try:
420
+ currency = m.group(1)
421
+ integer = m.group(2)
422
+ cents = int(m.group(3))
423
+ return f"{currency}{integer}.{cents:02d}"
424
+ except ValueError:
425
+ return m.string
426
+
427
+ def extract_cents(m: Match):
428
+ try:
429
+ return f"¢{int(m.group(1))}"
430
+ except ValueError:
431
+ return m.string
432
+
433
+ # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
434
+ s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
435
+ s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)
436
+
437
+ # write "one(s)" instead of "1(s)", just for the readability
438
+ s = re.sub(r"\b1(s?)\b", r"one\1", s)
439
+
440
+ return s
441
+
442
+ def __call__(self, s: str):
443
+ s = self.preprocess(s)
444
+ s = " ".join(word for word in self.process_words(s.split()) if word is not None)
445
+ s = self.postprocess(s)
446
+
447
+ return s
448
+
449
+
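A minimal usage sketch of the number normalizer defined above, assuming the whisper_asr/normalizers directory is importable as a package (the import path below is inferred from the file listing, not pinned by the repo):

    from whisper_asr.normalizers.english import EnglishNumberNormalizer

    norm = EnglishNumberNormalizer()
    print(norm("one oh one"))             # successive single digits -> "101"
    print(norm("two hundred and fifty"))  # tens and multipliers combine -> "250"
    print(norm("twenty second"))          # ordinal suffixes are kept -> "22nd"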
450
+ class EnglishSpellingNormalizer:
451
+ """
452
+ Applies British-American spelling mappings as listed in [1].
453
+
454
+ [1] https://www.tysto.com/uk-us-spelling-list.html
455
+ """
456
+
457
+ def __init__(self):
458
+ mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
459
+ self.mapping = json.load(open(mapping_path))
460
+
461
+ def __call__(self, s: str):
462
+ return " ".join(self.mapping.get(word, word) for word in s.split())
463
+
464
+
465
+ class EnglishTextNormalizer:
466
+ def __init__(self):
467
+ self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
468
+ self.replacers = {
469
+ # common contractions
470
+ r"\bwon't\b": "will not",
471
+ r"\bcan't\b": "can not",
472
+ r"\blet's\b": "let us",
473
+ r"\bain't\b": "aint",
474
+ r"\by'all\b": "you all",
475
+ r"\bwanna\b": "want to",
476
+ r"\bgotta\b": "got to",
477
+ r"\bgonna\b": "going to",
478
+ r"\bi'ma\b": "i am going to",
479
+ r"\bimma\b": "i am going to",
480
+ r"\bwoulda\b": "would have",
481
+ r"\bcoulda\b": "could have",
482
+ r"\bshoulda\b": "should have",
483
+ r"\bma'am\b": "madam",
484
+ # contractions in titles/prefixes
485
+ r"\bmr\b": "mister ",
486
+ r"\bmrs\b": "missus ",
487
+ r"\bst\b": "saint ",
488
+ r"\bdr\b": "doctor ",
489
+ r"\bprof\b": "professor ",
490
+ r"\bcapt\b": "captain ",
491
+ r"\bgov\b": "governor ",
492
+ r"\bald\b": "alderman ",
493
+ r"\bgen\b": "general ",
494
+ r"\bsen\b": "senator ",
495
+ r"\brep\b": "representative ",
496
+ r"\bpres\b": "president ",
497
+ r"\brev\b": "reverend ",
498
+ r"\bhon\b": "honorable ",
499
+ r"\basst\b": "assistant ",
500
+ r"\bassoc\b": "associate ",
501
+ r"\blt\b": "lieutenant ",
502
+ r"\bcol\b": "colonel ",
503
+ r"\bjr\b": "junior ",
504
+ r"\bsr\b": "senior ",
505
+ r"\besq\b": "esquire ",
506
+ # perfect tenses; ideally this would cover any past participle, but that is harder
507
+ r"'d been\b": " had been",
508
+ r"'s been\b": " has been",
509
+ r"'d gone\b": " had gone",
510
+ r"'s gone\b": " has gone",
511
+ r"'d done\b": " had done", # "'s done" is ambiguous
512
+ r"'s got\b": " has got",
513
+ # general contractions
514
+ r"n't\b": " not",
515
+ r"'re\b": " are",
516
+ r"'s\b": " is",
517
+ r"'d\b": " would",
518
+ r"'ll\b": " will",
519
+ r"'t\b": " not",
520
+ r"'ve\b": " have",
521
+ r"'m\b": " am",
522
+ }
523
+ self.standardize_numbers = EnglishNumberNormalizer()
524
+ self.standardize_spellings = EnglishSpellingNormalizer()
525
+
526
+ def __call__(self, s: str):
527
+ s = s.lower()
528
+
529
+ s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets
530
+ s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis
531
+ s = re.sub(self.ignore_patterns, "", s)
532
+ s = re.sub(r"\s+'", "'", s) # when there's a space before an apostrophe
533
+
534
+ for pattern, replacement in self.replacers.items():
535
+ s = re.sub(pattern, replacement, s)
536
+
537
+ s = re.sub(r"(\d),(\d)", r"\1\2", s) # remove commas between digits
538
+ s = re.sub(r"\.([^0-9]|$)", r" \1", s) # remove periods not followed by numbers
539
+ s = remove_symbols_and_diacritics(s, keep=".%$¢€£") # keep numeric symbols
540
+
541
+ s = self.standardize_numbers(s)
542
+ s = self.standardize_spellings(s)
543
+
544
+ # now remove prefix/suffix symbols that are not preceded/followed by numbers
545
+ s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
546
+ s = re.sub(r"([^0-9])%", r"\1 ", s)
547
+
548
+ s = re.sub(r"\s+", " ", s) # replace any successive whitespaces with a space
549
+
550
+ return s
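A short end-to-end sketch of the full normalizer above; the exact output also depends on remove_symbols_and_diacritics from basic.py, so the comment shows the expected result rather than a guaranteed one:

    from whisper_asr.normalizers.english import EnglishTextNormalizer

    normalizer = EnglishTextNormalizer()
    print(normalizer("Mr. Smith isn't twenty-two years old."))
    # expected: "mister smith is not 22 years old"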
MMaDA/eval_ASR_TTS/whisper_asr/whisper_asr.py ADDED
File without changes
MMaDA/eval_emova.py ADDED
@@ -0,0 +1,249 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 AIDAS Lab
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ import re
18
+ import logging
19
+ import editdistance
20
+ from functools import partial
21
+
22
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
23
+
24
+ from tqdm import tqdm
25
+ import torch
26
+ import torch.distributed as dist
27
+ from torch.utils.data import Dataset, DataLoader
28
+ from torch.utils.data.distributed import DistributedSampler
29
+ from torch.nn.parallel import DistributedDataParallel as DDP
30
+
31
+ import wandb
32
+ from datasets import load_dataset
33
+ from transformers import AutoModel, AutoProcessor
34
+
35
+ # --- Helper Functions (from your reference script) ---
36
+
37
+ def setup_logger(rank):
38
+ """Sets up a logger for each DDP process."""
39
+ logger = logging.getLogger(__name__)
40
+ if logger.hasHandlers():
41
+ logger.handlers.clear()
42
+
43
+ formatter = logging.Formatter(f'%(asctime)s - [RANK {rank}] - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
44
+ ch = logging.StreamHandler()
45
+ ch.setFormatter(formatter)
46
+ logger.addHandler(ch)
47
+
48
+ logger.setLevel(logging.INFO if rank == 0 else logging.WARNING)
49
+ return logger
50
+
51
+ def calculate_WER(recognized_text_list, groundtruth_text_list):
52
+ """Calculates the Word Error Rate (WER) between predicted and ground truth texts."""
53
+ word_num, scores = 0.0, 0.0
54
+ for recognized_text, groundtruth_text in zip(recognized_text_list, groundtruth_text_list):
55
+ recognized_text = re.sub(r"[^\w\s']", "", recognized_text.lower())
56
+ groundtruth_text = re.sub(r"[^\w\s']", "", groundtruth_text.lower())
57
+
58
+ recognized_word_list = recognized_text.split()
59
+ groundtruth_word_list = groundtruth_text.split()
60
+
61
+ current_word_num = len(groundtruth_word_list)
62
+ word_num += current_word_num
63
+
64
+ scores += editdistance.eval(recognized_word_list, groundtruth_word_list)
65
+
66
+ WER = scores / word_num if word_num > 0 else 0.0
67
+ return WER, scores, word_num
68
+
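A quick sanity check of the metric above, using the same editdistance call on a toy pair (one substitution plus one deletion over six reference words):

    import editdistance

    ref = "the cat sat on the mat".split()
    hyp = "the cat sit on mat".split()
    errors = editdistance.eval(hyp, ref)  # 2
    print(errors / len(ref))              # 0.333...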
69
+ def get_librispeech_dataset(logger, split="test.clean"):
70
+ """Loads the Librispeech ASR dataset from Hugging Face."""
71
+ logger.info(f"Loading librispeech_asr dataset ({split})...")
72
+ dataset = load_dataset("librispeech_asr", split=split, trust_remote_code=True)
73
+ logger.info("Dataset loaded successfully.")
74
+ return dataset
75
+
76
+ def setup_distributed(rank, world_size):
77
+ """Initializes the distributed process group."""
78
+ dist.init_process_group("nccl", rank=rank, world_size=world_size)
79
+
80
+ def cleanup_distributed():
81
+ """Cleans up the distributed process group."""
82
+ dist.destroy_process_group()
83
+
84
+ # --- Custom Dataset and Collate Function for EMOVA ---
85
+
86
+ class LibrispeechAudioDataset(Dataset):
87
+ """A simple dataset that returns audio file path and ground truth text."""
88
+ def __init__(self, hf_dataset):
89
+ self.hf_dataset = hf_dataset
90
+
91
+ def __len__(self):
92
+ return len(self.hf_dataset)
93
+
94
+ def __getitem__(self, idx):
95
+ example = self.hf_dataset[idx]
96
+ return {
97
+ "audio_path": example['file'],
98
+ "gt_text": example['text'],
99
+ "sample_id": example['id']
100
+ }
101
+
102
+ class EmovaS2TCollateFn:
103
+ """
104
+ Collate function to prepare batches for the EMOVA model using its processor.
105
+ """
106
+ def __init__(self, processor):
107
+ self.processor = processor
108
+ self.prompt_text = "Transcribe the given audio."
109
+
110
+ def __call__(self, batch):
111
+ audio_paths = [item["audio_path"] for item in batch]
112
+ gt_texts = [item["gt_text"] for item in batch]
113
+ sample_ids = [item["sample_id"] for item in batch]
114
+
115
+ # Construct the text input for each audio file in the batch
116
+ text_inputs = [
117
+ [
118
+ {"role": "user", "content": [{"type": "audio"}, {"type": "text", "text": self.prompt_text}]}
119
+ ]
120
+ for _ in audio_paths
121
+ ]
122
+
123
+ # Use the EMOVA processor to prepare the multimodal batch
124
+ inputs = self.processor(
125
+ text=text_inputs,
126
+ audios=audio_paths,
127
+ return_tensors="pt",
128
+ padding=True
129
+ )
130
+
131
+ inputs['gt_texts'] = gt_texts
132
+ inputs['sample_ids'] = sample_ids
133
+ return inputs
134
+
135
+ def main():
136
+ """Main function to run the distributed evaluation."""
137
+ rank = int(os.environ['RANK'])
138
+ world_size = int(os.environ['WORLD_SIZE'])
139
+ setup_distributed(rank, world_size)
140
+ device = torch.device(f"cuda:{rank}")
141
+ logger = setup_logger(rank)
142
+
143
+ if rank == 0:
144
+ wandb.init(project="emova-librispeech-eval")
145
+
146
+ # --- 1. Load EMOVA Models and Processors ---
147
+ logger.info("Loading EMOVA models and processors...")
148
+ model_name = "Emova-ollm/emova-qwen-2-5-7b-hf"
149
+
150
+ model = AutoModel.from_pretrained(
151
+ model_name,
152
+ torch_dtype=torch.bfloat16,
153
+ attn_implementation='flash_attention_2',
154
+ low_cpu_mem_usage=True,
155
+ trust_remote_code=True
156
+ ).to(device)
157
+
158
+ processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
159
+
160
+ speech_tokenizer = AutoModel.from_pretrained(
161
+ "Emova-ollm/emova_speech_tokenizer_hf",
162
+ torch_dtype=torch.float32,
163
+ trust_remote_code=True
164
+ ).to(device).eval()
165
+
166
+ processor.set_speech_tokenizer(speech_tokenizer)
167
+
168
+ # Wrap the main model with DDP
169
+ model = DDP(model, device_ids=[rank], find_unused_parameters=True)
170
+ logger.info("✅ Models loaded and wrapped with DDP successfully!")
171
+
172
+ # --- 2. Setup DataLoader ---
173
+ hf_dataset = get_librispeech_dataset(logger, split="test.clean")
174
+ eval_dataset = LibrispeechAudioDataset(hf_dataset)
175
+ sampler = DistributedSampler(eval_dataset, num_replicas=world_size, rank=rank, shuffle=False)
176
+
177
+ collate_fn = EmovaS2TCollateFn(processor)
178
+
179
+ dataloader = DataLoader(
180
+ eval_dataset,
181
+ batch_size=4, # Adjust batch size based on your GPU memory
182
+ sampler=sampler,
183
+ num_workers=4,
184
+ collate_fn=collate_fn,
185
+ pin_memory=True
186
+ )
187
+
188
+ # --- 3. Evaluation Loop ---
189
+ local_results = []
190
+ model.eval()
191
+
192
+ progress_bar = tqdm(dataloader, desc="Evaluating on Librispeech", disable=(rank != 0))
193
+ for batch in progress_bar:
194
+ gt_texts = batch.pop("gt_texts")
195
+ sample_ids = batch.pop("sample_ids")
196
+
197
+ # Move batch tensors to the correct device
198
+ inputs = {k: v.to(device) for k, v in batch.items()}
199
+
200
+ with torch.no_grad():
201
+ outputs = model.module.generate(**inputs, max_new_tokens=256, do_sample=False)
202
+ # Slice to get only the generated tokens
203
+ generated_ids = outputs[:, inputs['input_ids'].shape[1]:]
204
+ decoded_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
205
+
206
+ for i in range(len(decoded_texts)):
207
+ local_results.append({
208
+ "sample_id": sample_ids[i],
209
+ "gt_text": gt_texts[i],
210
+ "decoded_text": decoded_texts[i].strip()
211
+ })
212
+
213
+ if rank == 0 and i == 0 and len(local_results) % 10 == 1: # Log sample every 10 batches on rank 0
214
+ logger.info(f"\n--- Sample ---")
215
+ logger.info(f" ID: {sample_ids[i]}")
216
+ logger.info(f" GT: {gt_texts[i]}")
217
+ logger.info(f" PD: {decoded_texts[i].strip()}")
218
+ logger.info(f"----------------")
219
+
220
+ # --- 4. Gather Results and Calculate Final Score ---
221
+ all_results = [None] * world_size
222
+ dist.all_gather_object(all_results, local_results)
223
+
224
+ if rank == 0:
225
+ logger.info("Gathering and processing results from all GPUs...")
226
+ final_results = [item for sublist in all_results for item in sublist]
227
+
228
+ gt_list = [res["gt_text"] for res in final_results]
229
+ pred_list = [res["decoded_text"] for res in final_results]
230
+
231
+ results_table = wandb.Table(columns=["ID", "Ground Truth", "Prediction"])
232
+ for res in final_results:
233
+ results_table.add_data(res["sample_id"], res["gt_text"], res["decoded_text"])
234
+ wandb.log({"S2T Predictions": results_table})
235
+
236
+ wer, errors, words = calculate_WER(pred_list, gt_list)
237
+ logger.info(f"Final WER (Librispeech test.clean): {wer:.4f} | Word Errors: {errors} | Total Words: {words}")
238
+ wandb.log({"WER": wer, "Total Word Errors": errors, "Total Words": words})
239
+
240
+ # --- Cleanup ---
241
+ if rank == 0:
242
+ wandb.finish()
243
+ cleanup_distributed()
244
+
245
+ if __name__ == '__main__':
246
+ # Set master address and port for DDP
247
+ # os.environ['MASTER_ADDR'] = 'localhost'
248
+ # os.environ['MASTER_PORT'] = '12355'
249
+ main()
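Since main() reads RANK and WORLD_SIZE from the environment and initializes the NCCL process group itself, the script is expected to be started through a distributed launcher such as torchrun, for example `torchrun --nproc_per_node=8 MMaDA/eval_emova.py` (the GPU count is illustrative). torchrun exports RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT for every worker, which is why the manual MASTER_ADDR/MASTER_PORT assignments above are left commented out.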
MMaDA/generate.py ADDED
@@ -0,0 +1,146 @@
1
+ import torch
2
+ import numpy as np
3
+ import torch.nn.functional as F
4
+
5
+ from transformers import AutoTokenizer, AutoModel
6
+ from models import MMadaModelLM
7
+
8
+ def add_gumbel_noise(logits, temperature):
9
+ '''
10
+ The Gumbel max is a method for sampling categorical distributions.
11
+ According to arXiv:2409.02908, for MDM, low-precision Gumbel Max improves perplexity score but reduces generation quality.
12
+ Thus, we use float64.
13
+ '''
14
+ if temperature == 0:
15
+ return logits
16
+ logits = logits.to(torch.float64)
17
+ noise = torch.rand_like(logits, dtype=torch.float64)
18
+ gumbel_noise = (- torch.log(noise)) ** temperature
19
+ return logits.exp() / gumbel_noise
20
+
21
+
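Dividing exp(logits) by (-log u)^temperature is the Gumbel-max trick in disguise: taking the log (which preserves the argmax) gives logits - temperature * log(-log u), and after dividing by the positive temperature the argmax equals argmax(logits / temperature + g) with g a standard Gumbel sample, i.e. a draw from softmax(logits / temperature). A small Monte Carlo check of this equivalence:

    import torch

    logits = torch.tensor([2.0, 1.0, 0.5])
    temperature = 1.0
    noise = torch.rand(20000, 3)
    samples = torch.argmax(logits.exp() / (-noise.log()) ** temperature, dim=-1)
    print(torch.bincount(samples, minlength=3) / 20000.0)  # empirical frequencies
    print(torch.softmax(logits / temperature, dim=-1))     # should roughly match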
22
+ def get_num_transfer_tokens(mask_index, steps):
23
+ '''
24
+ In the reverse process, the interval [0, 1] is uniformly discretized into steps intervals.
25
+ Furthermore, because LLaDA employs a linear noise schedule (as defined in Eq. (8)),
26
+ the expected number of tokens transitioned at each step should be consistent.
27
+
28
+ This function is designed to precompute the number of tokens that need to be transitioned at each step.
29
+ '''
30
+ mask_num = mask_index.sum(dim=1, keepdim=True)
31
+
32
+ base = mask_num // steps
33
+ remainder = mask_num % steps
34
+
35
+ num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.int64) + base
36
+
37
+ for i in range(mask_num.size(0)):
38
+ num_transfer_tokens[i, :remainder[i]] += 1
39
+
40
+ return num_transfer_tokens
41
+
42
+
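For instance, 7 masked positions spread over steps=3 gives base = 7 // 3 = 2 per step plus a remainder of 1 added to the first step, i.e. a schedule of [3, 2, 2]. With the function above in scope:

    import torch

    mask_index = torch.tensor([[True] * 7 + [False] * 3])  # 7 masks in a length-10 row
    print(get_num_transfer_tokens(mask_index, steps=3))    # tensor([[3, 2, 2]])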
43
+ @ torch.no_grad()
44
+ def generate(model, prompt, steps=128, gen_length=128, block_length=128, temperature=0.,
45
+ cfg_scale=0., remasking='low_confidence', mask_id=126336, attention_mask=None):
46
+ '''
47
+ Args:
48
+ model: Mask predictor.
49
+ prompt: A tensor of shape (B, L), where B is batch size.
50
+ steps: Sampling steps, less than or equal to gen_length.
51
+ gen_length: Generated answer length.
52
+ block_length: Block length, less than or equal to gen_length. If less than gen_length, semi-autoregressive remasking is used.
53
+ temperature: Categorical distribution sampling temperature.
54
+ cfg_scale: Unsupervised classifier-free guidance scale.
55
+ remasking: Remasking strategy. 'low_confidence' or 'random'.
56
+ mask_id: The token id of [MASK] is 126336.
57
+ '''
58
+ if attention_mask is not None and 0.0 in attention_mask:
59
+ attention_bias = (attention_mask[:, :, None] & attention_mask[:, None, :]).bool().unsqueeze(1)
60
+ print(f"attention_bias: {attention_bias}")
61
+ else:
62
+ attention_bias = None
63
+ batch_size = prompt.shape[0]
64
+ x = torch.full((batch_size, prompt.shape[1] + gen_length), mask_id, dtype=torch.long).to(model.device)
65
+ x[:, :prompt.shape[1]] = prompt.clone()
66
+
67
+ prompt_index = (x != mask_id)
68
+
69
+ assert gen_length % block_length == 0
70
+ num_blocks = gen_length // block_length
71
+
72
+ assert steps % num_blocks == 0
73
+ steps = steps // num_blocks
74
+
75
+ for num_block in range(num_blocks):
76
+ block_mask_index = (x[:, prompt.shape[1] + num_block * block_length: prompt.shape[1] + (num_block + 1) * block_length:] == mask_id)
77
+ num_transfer_tokens = get_num_transfer_tokens(block_mask_index, steps)
78
+ for i in range(steps):
79
+ mask_index = (x == mask_id)
80
+ if cfg_scale > 0.:
81
+ un_x = x.clone()
82
+ un_x[prompt_index] = mask_id
83
+ x_ = torch.cat([x, un_x], dim=0)
84
+ logits = model(x_).logits
85
+ logits, un_logits = torch.chunk(logits, 2, dim=0)
86
+ logits = un_logits + (cfg_scale + 1) * (logits - un_logits)
87
+ else:
88
+ logits = model(x, attention_bias=attention_bias).logits
89
+
90
+ logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
91
+ x0 = torch.argmax(logits_with_noise, dim=-1) # b, l
92
+
93
+ if remasking == 'low_confidence':
94
+ p = F.softmax(logits.to(torch.float64), dim=-1)
95
+ x0_p = torch.squeeze(
96
+ torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1) # b, l
97
+ elif remasking == 'random':
98
+ x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
99
+ else:
100
+ raise NotImplementedError(remasking)
101
+
102
+ x0_p[:, prompt.shape[1] + (num_block + 1) * block_length:] = -np.inf
103
+
104
+ x0 = torch.where(mask_index, x0, x)
105
+ confidence = torch.where(mask_index, x0_p, -np.inf)
106
+ # print(confidence.shape)
107
+ transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
108
+ for j in range(confidence.shape[0]):
109
+ _, select_index = torch.topk(confidence[j], k=num_transfer_tokens[j, i])
110
+ transfer_index[j, select_index] = True
111
+ x[transfer_index] = x0[transfer_index]
112
+
113
+ return x
114
+
115
+
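The outer loop above is the semi-autoregressive remasking mentioned in the docstring: with gen_length=128 and block_length=32 there are 4 blocks, steps=128 becomes 128 // 4 = 32 denoising iterations per block, and setting x0_p to -inf beyond the current block guarantees each block is fully unmasked before the next one is touched. With block_length == gen_length (as in main() below) there is a single block and the whole answer is refined jointly.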
116
+ def main():
117
+ device = 'cuda'
118
+ # Load from HF
119
+
120
+ # model = MMadaModelLM.from_pretrained("Gen-Verse/MMaDA-8B-Base", trust_remote_code=True, torch_dtype=torch.bfloat16).to(device).eval()
121
+ # tokenizer = AutoTokenizer.from_pretrained("Gen-Verse/MMaDA-8B-Base", trust_remote_code=True)
122
+
123
+ train_step = 135000
124
+ trained_checkpoint_path = f"/home/work/AIDAS/ckpts/omada/omada-training-stage1/checkpoint-{train_step}/unwrapped_model/"
125
+
126
+ model = MMadaModelLM.from_pretrained(
127
+ trained_checkpoint_path,
128
+ trust_remote_code=True,
129
+ torch_dtype=torch.bfloat16,
130
+ config="/home/work/AIDAS/ckpts/omada/omada-training-stage1/config.json"
131
+ ).to(device)
132
+
133
+ tokenizer = AutoTokenizer.from_pretrained("Gen-Verse/MMaDA-8B-MixCoT", trust_remote_code=True)
134
+
135
+ tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}"
136
+ prompt = "Lily can run 12 kilometers per hour for 4 hours. After that, she runs 6 kilometers per hour. How many kilometers can she run in 8 hours?"
137
+ m = [{"role": "user", "content": prompt}, ]
138
+ prompt = tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
139
+ input_ids = tokenizer(text=prompt, return_tensors="pt", padding=True, padding_side="left")['input_ids']
140
+ input_ids = torch.tensor(input_ids).to(device)
141
+ out = generate(model, input_ids, steps=128, gen_length=128, block_length=128, temperature=1, cfg_scale=0., remasking='low_confidence')
142
+ print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True))
143
+
144
+
145
+ if __name__ == '__main__':
146
+ main()
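A minimal inference sketch reusing the pieces above; it assumes model, tokenizer and device have been prepared exactly as in main() (including the chat template assignment), and only swaps in a shorter semi-autoregressive block length:

    # with `generate`, `model`, `tokenizer`, and `device` set up as in main() above
    m = [{"role": "user", "content": "What is 12 * 7?"}]
    prompt = tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
    input_ids = tokenizer(text=prompt, return_tensors="pt")["input_ids"].to(device)

    # steps must divide evenly across gen_length // block_length blocks (here 4 x 32)
    out = generate(model, input_ids, steps=128, gen_length=128, block_length=32,
                   temperature=0.0, remasking="low_confidence")
    print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0])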