jaeikkim commited on
Commit
7bfbdc3
·
0 Parent(s):

Reinit Space without binary assets

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +35 -0
  2. MMaDA/.cursor/rules/python-env.mdc +4 -0
  3. MMaDA/.gitignore +2 -0
  4. MMaDA/AIDAS-Omni-Modal-Diffusion/app.py +16 -0
  5. MMaDA/LICENSE +21 -0
  6. MMaDA/README.md +209 -0
  7. MMaDA/accelerate_configs/1_gpu.yaml +15 -0
  8. MMaDA/accelerate_configs/1_node_8_gpus_deepspeed_zero2.yaml +21 -0
  9. MMaDA/accelerate_configs/1_node_8_gpus_deepspeed_zero3.yaml +24 -0
  10. MMaDA/accelerate_configs/1_node_8_gpus_deepspeed_zero4.yaml +24 -0
  11. MMaDA/accelerate_configs/2_node_8_gpus_deepspeed_zero2_aidas.yaml +25 -0
  12. MMaDA/accelerate_configs/2_node_8_gpus_deepspeed_zero2_aidas2.yaml +25 -0
  13. MMaDA/accelerate_configs/2_node_8_gpus_deepspeed_zero4.yaml +26 -0
  14. MMaDA/accelerate_configs/3_node_8_gpus_deepspeed_zero1.yaml +25 -0
  15. MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2.yaml +21 -0
  16. MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2_aidas.yaml +25 -0
  17. MMaDA/accelerate_configs/8_node_8_gpus_deepspeed_zero2.yaml +21 -0
  18. MMaDA/app.py +894 -0
  19. MMaDA/check_lr.py +27 -0
  20. MMaDA/check_tokens.py +191 -0
  21. MMaDA/configs/mmada_demo.yaml +95 -0
  22. MMaDA/configs/mmada_demo_s2t.yaml +131 -0
  23. MMaDA/configs/mmada_demo_speech.yaml +101 -0
  24. MMaDA/configs/mmada_demo_video.yaml +95 -0
  25. MMaDA/configs/mmada_demo_video_temp.yaml +95 -0
  26. MMaDA/configs/mmada_pretraining_i2i.yaml +86 -0
  27. MMaDA/configs/mmada_pretraining_s2t.yaml +96 -0
  28. MMaDA/configs/mmada_pretraining_stage1_llada_instruct.yaml +100 -0
  29. MMaDA/configs/mmada_pretraining_stage2_llada_instruct.yaml +109 -0
  30. MMaDA/configs/mmada_pretraining_stage3_llada_instruct.yaml +112 -0
  31. MMaDA/configs/mmada_pretraining_stage3_llada_instruct_512_cot.yaml +123 -0
  32. MMaDA/configs/mmada_pretraining_stage4_llada_instruct.yaml +134 -0
  33. MMaDA/configs/mmada_pretraining_t2s.yaml +96 -0
  34. MMaDA/configs/mmada_pretraining_v2s.yaml +133 -0
  35. MMaDA/configs/mmada_pretraining_v2t.yaml +88 -0
  36. MMaDA/configs/omada_instruction_tuning.yaml +200 -0
  37. MMaDA/configs/omada_pretraining_stage1-2.yaml +131 -0
  38. MMaDA/configs/omada_pretraining_stage1-3.yaml +132 -0
  39. MMaDA/configs/omada_pretraining_stage1-4.yaml +132 -0
  40. MMaDA/configs/omada_pretraining_stage1.yaml +131 -0
  41. MMaDA/configs/omada_pretraining_v2t_inst.yaml +132 -0
  42. MMaDA/debug_speech_dataloader.py +222 -0
  43. MMaDA/eval_ASR_TTS/test.py +266 -0
  44. MMaDA/eval_ASR_TTS/whisper_asr/normalizers/__init__.py +2 -0
  45. MMaDA/eval_ASR_TTS/whisper_asr/normalizers/basic.py +76 -0
  46. MMaDA/eval_ASR_TTS/whisper_asr/normalizers/english.json +1741 -0
  47. MMaDA/eval_ASR_TTS/whisper_asr/normalizers/english.py +550 -0
  48. MMaDA/eval_ASR_TTS/whisper_asr/whisper_asr.py +0 -0
  49. MMaDA/eval_emova.py +249 -0
  50. MMaDA/generate.py +146 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
MMaDA/.cursor/rules/python-env.mdc ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ ---
2
+ alwaysApply: true
3
+ ---
4
+ When running python script, use conda env `mmada`.
MMaDA/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ exp
2
+ wandb
MMaDA/AIDAS-Omni-Modal-Diffusion/app.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spaces
3
+ import torch
4
+
5
+ zero = torch.Tensor([0]).cuda()
6
+ print(zero.device) # should print 'cpu' until GPU context is enabled
7
+
8
+
9
+ @spaces.GPU
10
+ def greet(n):
11
+ print(zero.device) # now this should print 'cuda:0'
12
+ return f"Hello {zero + n} Tensor"
13
+
14
+
15
+ demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
16
+ demo.launch()
MMaDA/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Ling Yang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
MMaDA/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+ <br>
3
+ <img src="assets/title.png" width="166">
4
+ <h3>Multimodal Large Diffusion Language Models</h3></div>
5
+
6
+ <p align="center">
7
+ <a href="https://arxiv.org/abs/2505.15809">
8
+ <img
9
+ src="https://img.shields.io/badge/MMaDA-Paper-red?logo=arxiv&logoColor=red"
10
+ alt="MMaDA Paper on arXiv"
11
+ />
12
+ </a>
13
+ <a href="https://huggingface.co/spaces/Gen-Verse/MMaDA">
14
+ <img
15
+ src="https://img.shields.io/badge/MMaDA%20Demo-Hugging%20Face%20Space-blue?logo=huggingface&logoColor=blue"
16
+ alt="MMaDA on Hugging Face"
17
+ />
18
+ </a>
19
+ <a href="https://huggingface.co/Gen-Verse/MMaDA-8B-Base">
20
+ <img
21
+ src="https://img.shields.io/badge/MMaDA--8B--Base-Hugging%20Face%20Model-orange?logo=huggingface&logoColor=yellow"
22
+ alt="MMaDA on Hugging Face"
23
+ />
24
+ </a>
25
+ <a href="https://huggingface.co/Gen-Verse/MMaDA-8B-MixCoT">
26
+ <img
27
+ src="https://img.shields.io/badge/MMaDA--8B--MixCoT-Hugging%20Face%20Model-orange?logo=huggingface&logoColor=yellow"
28
+ alt="MMaDA on Hugging Face"
29
+ />
30
+ </a>
31
+ <a href="https://github.com/Gen-Verse/MMaDA/blob/main/assets/wx-mmada-0613.jpeg">
32
+ <img
33
+ src="https://img.shields.io/badge/Wechat-Join-green?logo=wechat&amp"
34
+ alt="Wechat Group Link"
35
+ />
36
+ </a>
37
+
38
+ </p>
39
+
40
+
41
+ ## 🌌 Introduction
42
+ MMaDA is a new family of **multimodal diffusion foundation models** designed to achieve superior performance across diverse domains such as textual reasoning, multimodal understanding, and text-to-image generation. MMaDA is distinguished by three key innovations:
43
+ 1. MMaDA adopts a **unified diffusion architecture** with a shared probabilistic formulation and a modality-agnostic design, eliminating the need for modality-specific components.
44
+ 2. MMaDA introduces a **mixed long chain-of-thought (CoT) fine-tuning** strategy that curates a unified CoT format across modalities.
45
+ 3. MMaDA adopts a unified policy-gradient-based RL algorithm, which we call **UniGRPO**, tailored for diffusion foundation models. Utilizing diversified reward modeling, **UniGRPO** unifies post-training across both reasoning and generation tasks, ensuring consistent performance improvements.
46
+
47
+ <div align="center" style="width: 600px; margin: auto;">
48
+ <img src="assets/showcase0.8.gif" alt="MMaDA decoding demo" width="550" />
49
+ <p style="font-style: italic; font-size: 14px; color: #555; margin-top: 6px;">
50
+ MMaDA's decoding demo. This video showcases how a diffusion foundation model generates text and image.<br>
51
+ The "Text Generation" part uses a semi-autoregressive sampling method, while the "Multimodal Generation" part adopts non-autoregressive diffusion denoising.
52
+ </p>
53
+ </div>
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+ <!--
62
+
63
+
64
+
65
+ ## Decoding Demo
66
+ We demonstrate the decoding process of MMaDA with a teaser video to show how a diffusion model generates text and image. The "Text Generation" part adopts a "semi-autoregressive" sampling method and the "MultiModal Generation" part adopts a non-autoregressive sampling method which is purely diffusion denoising.
67
+
68
+ <!-- <div style="display: flex; justify-content: center; flex-wrap: wrap;">
69
+ <img src="assets/showcase0.8.gif" style="width: 90%" />
70
+ </div> -->
71
+
72
+ ## 📰 Latest Updates
73
+ * **[2025-06-02]** We open source our **MMaDA-8B-MixCoT** at [Huggingface](https://huggingface.co/Gen-Verse/MMaDA-8B-MixCoT).
74
+ * **[2025-05-24]** We add support for MPS inference, tested on M4.
75
+ * **[2025-05-22]** We release the inference and training code of MMaDA for text generation, multimodal generation and image generation.
76
+ * **[2025-05-22]** We open source our **MMaDA-8B-Base** at [Huggingface](https://huggingface.co/Gen-Verse/MMaDA-8B-Base). **MMaDA-8B-MixCoT** and **MMaDA-8B-Max** will be released in the near future.
77
+ * **[2025-05-22]** We release our [research paper](https://arxiv.org/abs/2505.15809) and [demo](https://huggingface.co/spaces/Gen-Verse/MMaDA) for the first unified multimodal diffusion model: MMaDA.
78
+
79
+
80
+ ## 🧬 MMaDA Series Overview
81
+
82
+ MMaDA includes a series of checkpoints reflecting different training stages:
83
+ 1. **[MMaDA-8B-Base](https://huggingface.co/Gen-Verse/MMaDA-8B-Base)**: After pretraining and instruction tuning. Capable of basic text generation, image generation, image captioning and **thinking ablities**.
84
+ 2. **[MMaDA-8B-MixCoT](https://huggingface.co/Gen-Verse/MMaDA-8B-MixCoT)**: After mixed long chain-of-thought (CoT) fine-tuning. Capable of **complex** textual, multimodal and image generation reasoning.
85
+ 3. **MMaDA-8B-Max (coming soon)**: After UniGRPO reinforment learning. Excels at complex reasoning and awesome visual generation. Will be released in the future.
86
+ <div align="center">
87
+ <img src="assets/example_compare.png" width="800">
88
+ <p><i>Overview of MMaDA's capablities.</i></p>
89
+ </div>
90
+
91
+
92
+
93
+
94
+ ## ✅ TODO
95
+ - [x] Release [MMaDA-8B-MixCoT](https://huggingface.co/Gen-Verse/MMaDA-8B-MixCoT)
96
+ - [ ] Release MMaDA-8B-Max and OpenRLHF-based UniGRPO training code.
97
+
98
+ ## ⚙️ Quick Start
99
+ First, set up the enviroment:
100
+ ```
101
+ pip install -r requirements.txt
102
+ ```
103
+ Launch local Gradio demo:
104
+ ```
105
+ python app.py
106
+ ```
107
+ Or try it online via our [Huggingface Demo](https://huggingface.co/spaces/Gen-Verse/MMaDA).
108
+
109
+ ## 🚀 Inference
110
+ For batch-level inference, we provide our inference scripts here.
111
+ ### 1. Text Generation
112
+ For text generation, we follow LLaDA's configuration and generation script. Simple run:
113
+ ```bash
114
+ python generate.py
115
+ ```
116
+
117
+ ### 2. MultiModal Generation
118
+ For multimodal generation and text-to-image generation, first login your wandb account:
119
+ ```
120
+ wandb login
121
+ ```
122
+ Inference demo for MultiModal Generation and you can view the results on wandb:
123
+ ```
124
+ python3 inference_mmu.py config=configs/mmada_demo.yaml mmu_image_root=./mmu_validation question='Please describe this image in detail.'
125
+ ```
126
+
127
+ ### 3. Text-to-Image Genertion
128
+ For multimodal generation and text-to-image generation, first login your wandb account:
129
+ ```
130
+ wandb login
131
+ ```
132
+ Inference demo for Text-to-Image Genertion and you can view the results on wandb:
133
+ ```
134
+ python3 inference_t2i.py config=configs/mmada_demo.yaml batch_size=1 validation_prompts_file=validation_prompts/text2image_prompts.txt guidance_scale=3.5 generation_timesteps=15
135
+ mode='t2i'
136
+ ```
137
+
138
+ ## 🔧 Training
139
+ **Update your training data path in `configs/xx.yaml`.**
140
+
141
+ ### Stage 0. Prepare your accelerate configs
142
+ Please first prepare your accelerate configs. You can simple run
143
+ ```
144
+ accelerate config
145
+ ```
146
+
147
+ Or use our provided configs in `accelerate_configs`:
148
+ ```
149
+ ├── accelerate_configs/
150
+ | ├── 1_gpu.yaml
151
+ | └── 8_node_8_gpus_deepspeed_zero2.yaml (for 8 * 8 gpus)
152
+ ```
153
+
154
+ ### Stage 1.1: Pre-training on ImageNet
155
+ First we use LLaDA-8B-Instruct to initialize our model, and train on ImageNet for basic visual capbalities.
156
+ ```
157
+ accelerate launch --config_file path/to/your/accelerate_config --main_process_port=8888 training/train_mmada.py config=configs/mmada_pretraining_stage1_llada_instruct.yaml
158
+ ```
159
+
160
+ ### Stage 1.2 Pre-training on Image-Text Dataset
161
+ Then we replace the ImageNet dataset in Stage 1.1 with Image-Text Dataset. Please change the pretrained model path in `mmada_pretraining_stage2_llada_instruct.yaml` with your checkpoint in Stage 1.1
162
+ ```
163
+ accelerate launch --config_file path/to/your/accelerate_config --main_process_port=8888 training/train_mmada_stage2.py config=configs/mmada_pretraining_stage2_llada_instruct.yaml
164
+ ```
165
+
166
+ ### Stage 1.3 Pre-training on Text Instruction following
167
+ In this stage, we begin training on text instruction following and include corresponding validations. Please change the pretrained model path in `mmada_pretraining_stage3_llada_instruct.yaml` with your checkpoint in Stage 1.2
168
+ ```
169
+ accelerate launch --config_file path/to/your/accelerate_config --main_process_port=8888 training/train_mmada_stage3.py config=configs/mmada_pretraining_stage3_llada_instruct.yaml
170
+ ```
171
+
172
+ ### Stage 2.1 Mix-CoT Training (Text Only)
173
+ In this stage, we begin our Mix-CoT finetuning with text reasoning first, along with improved image quality. Please change the pretrained model path in `mmada_pretraining_stage3_llada_instruct.yaml` with your checkpoint in Stage 1.3 and prepare your CoT data.
174
+ ```
175
+ accelerate launch --config_file path/to/your/accelerate_config --main_process_port=8888 training/train_mmada_stage_cot_sft.py config=configs/mmada_pretraining_stage3_llada_instruct_512_cot.yaml
176
+ ```
177
+
178
+ ### Stage 2.2 Mix-CoT Training (with MultiModal Reasoning)
179
+ In this stage, we include multimodal reasoning, along with improved image quality. Please change the pretrained model path in `mmada_pretraining_stage3_llada_instruct.yaml` with your checkpoint in Stage 2.1 and prepare your CoT data.
180
+ ```
181
+ accelerate launch --config_file path/to/your/accelerate_config --main_process_port=8888 training/train_mmada_stage4.py config=configs/mmada_pretraining_stage4_llada_instruct.yaml
182
+ ```
183
+
184
+ ### Stage 3 UniGRPO RL
185
+ [Will be released once we finished our code transition to OpenRLHF]
186
+
187
+
188
+ ## 📖 Citation
189
+ ```
190
+ @article{yang2025mmada,
191
+ title={MMaDA: Multimodal Large Diffusion Language Models},
192
+ author={Yang, Ling and Tian, Ye and Li, Bowen and Zhang, Xinchen and Shen, Ke and Tong, Yunhai and Wang, Mengdi},
193
+ journal={arXiv preprint arXiv:2505.15809},
194
+ year={2025}
195
+ }
196
+ ```
197
+
198
+ ## 🤝 Acknowledgments
199
+ This work is heavily based on [Show-o](https://github.com/showlab/Show-o), [LLaDA](https://github.com/ML-GSAI/LLaDA), [maskgit](https://github.com/google-research/maskgit), [transformers](https://github.com/huggingface/transformers), [accelerate](https://github.com/huggingface/accelerate) and [webdataset](https://github.com/webdataset/webdataset). Thanks to all the authors for their great work.
200
+
201
+ ## 💬 Discussion and Collaboration
202
+
203
+ Welcome to discuss and collaborate with us for continuously improving MMaDA. If you have any bad cases, please kindly share them in the [Issue](https://github.com/Gen-Verse/MMaDA/issues/4#issue-3083196081).
204
+
205
+ Also, you can reach us with this WeChat QR code!
206
+ <p align="center">
207
+ <img src="assets/wx-mmada-0613.jpeg" width="256">
208
+ </p>
209
+
MMaDA/accelerate_configs/1_gpu.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ distributed_type: 'NO'
3
+ downcast_bf16: 'no'
4
+ gpu_ids: '0'
5
+ machine_rank: 0
6
+ main_training_function: main
7
+ mixed_precision: bf16
8
+ num_machines: 1
9
+ num_processes: 1
10
+ rdzv_backend: static
11
+ same_network: true
12
+ tpu_env: []
13
+ tpu_use_cluster: false
14
+ tpu_use_sudo: false
15
+ use_cpu: false
MMaDA/accelerate_configs/1_node_8_gpus_deepspeed_zero2.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 1
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero_stage: 2
10
+ distributed_type: DEEPSPEED
11
+ downcast_bf16: 'no'
12
+ main_training_function: main
13
+ mixed_precision: bf16
14
+ num_machines: 1
15
+ num_processes: 8
16
+ rdzv_backend: static
17
+ same_network: true
18
+ tpu_env: []
19
+ tpu_use_cluster: false
20
+ tpu_use_sudo: false
21
+ use_cpu: false
MMaDA/accelerate_configs/1_node_8_gpus_deepspeed_zero3.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 2
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero3_save_16bit_model: true
10
+ zero_stage: 3
11
+ zero_optimization:
12
+ overlap_comm: false
13
+ distributed_type: DEEPSPEED
14
+ downcast_bf16: 'no'
15
+ main_training_function: main
16
+ mixed_precision: bf16
17
+ num_machines: 1
18
+ num_processes: 8
19
+ rdzv_backend: static
20
+ same_network: true
21
+ tpu_env: []
22
+ tpu_use_cluster: false
23
+ tpu_use_sudo: false
24
+ use_cpu: false
MMaDA/accelerate_configs/1_node_8_gpus_deepspeed_zero4.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 1
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero3_save_16bit_model: true
10
+ zero_stage: 2
11
+ zero_optimization:
12
+ overlap_comm: false
13
+ distributed_type: DEEPSPEED
14
+ downcast_bf16: 'no'
15
+ main_training_function: main
16
+ mixed_precision: bf16
17
+ num_machines: 1
18
+ num_processes: 8
19
+ rdzv_backend: static
20
+ same_network: true
21
+ tpu_env: []
22
+ tpu_use_cluster: false
23
+ tpu_use_sudo: false
24
+ use_cpu: false
MMaDA/accelerate_configs/2_node_8_gpus_deepspeed_zero2_aidas.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 1
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero3_save_16bit_model: true
10
+ zero_stage: 2
11
+ zero_optimization:
12
+ overlap_comm: false
13
+ distributed_type: DEEPSPEED
14
+ downcast_bf16: 'no'
15
+ enable_cpu_affinity: false
16
+ main_process_ip: 172.51.80.134
17
+ main_training_function: main
18
+ num_machines: 2
19
+ num_processes: 16
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
MMaDA/accelerate_configs/2_node_8_gpus_deepspeed_zero2_aidas2.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 1
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero3_save_16bit_model: true
10
+ zero_stage: 2
11
+ zero_optimization:
12
+ overlap_comm: false
13
+ distributed_type: DEEPSPEED
14
+ downcast_bf16: 'no'
15
+ enable_cpu_affinity: false
16
+ main_process_ip: 172.51.80.136
17
+ main_training_function: main
18
+ num_machines: 4
19
+ num_processes: 32
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
MMaDA/accelerate_configs/2_node_8_gpus_deepspeed_zero4.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 4
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero3_save_16bit_model: true
10
+ zero_stage: 2
11
+ zero_optimization:
12
+ overlap_comm: false
13
+ distributed_type: DEEPSPEED
14
+ downcast_bf16: 'no'
15
+ enable_cpu_affinity: false
16
+ main_process_ip: 172.51.64.134
17
+ main_training_function: main
18
+ num_machines: 2
19
+ num_processes: 16
20
+ machine_rank: 1
21
+ rdzv_backend: static
22
+ same_network: true
23
+ tpu_env: []
24
+ tpu_use_cluster: false
25
+ tpu_use_sudo: false
26
+ use_cpu: false
MMaDA/accelerate_configs/3_node_8_gpus_deepspeed_zero1.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 4
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero3_save_16bit_model: true
10
+ zero_stage: 2
11
+ zero_optimization:
12
+ overlap_comm: false
13
+ distributed_type: DEEPSPEED
14
+ downcast_bf16: 'no'
15
+ enable_cpu_affinity: false
16
+ main_process_ip: 172.51.64.130
17
+ main_training_function: main
18
+ num_machines: 3
19
+ num_processes: 24
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 4
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero_stage: 2
10
+ distributed_type: DEEPSPEED
11
+ downcast_bf16: 'no'
12
+ main_training_function: main
13
+ mixed_precision: bf16
14
+ num_machines: 4
15
+ num_processes: 32
16
+ rdzv_backend: static
17
+ same_network: true
18
+ tpu_env: []
19
+ tpu_use_cluster: false
20
+ tpu_use_sudo: false
21
+ use_cpu: false
MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2_aidas.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 1
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: none #cpu
7
+ offload_param_device: none #cpu
8
+ zero3_init_flag: true
9
+ zero3_save_16bit_model: true
10
+ zero_stage: 2
11
+ zero_optimization:
12
+ overlap_comm: false
13
+ distributed_type: DEEPSPEED
14
+ downcast_bf16: 'no'
15
+ enable_cpu_affinity: true
16
+ main_process_ip: 172.51.133.6
17
+ main_training_function: main
18
+ num_machines: 4
19
+ num_processes: 32
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
MMaDA/accelerate_configs/8_node_8_gpus_deepspeed_zero2.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ deepspeed_multinode_launcher: standard
4
+ gradient_accumulation_steps: 1
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: true
9
+ zero_stage: 2
10
+ distributed_type: DEEPSPEED
11
+ downcast_bf16: 'no'
12
+ main_training_function: main
13
+ mixed_precision: bf16
14
+ num_machines: 8
15
+ num_processes: 64
16
+ rdzv_backend: static
17
+ same_network: true
18
+ tpu_env: []
19
+ tpu_use_cluster: false
20
+ tpu_use_sudo: false
21
+ use_cpu: false
MMaDA/app.py ADDED
@@ -0,0 +1,894 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import torch.nn.functional as F
5
+ from transformers import AutoTokenizer
6
+ from torchvision import transforms
7
+ from models import MAGVITv2, get_mask_schedule, MMadaModelLM
8
+ from training.prompting_utils import UniversalPrompting
9
+ from PIL import Image
10
+
11
+ def image_transform(image, resolution=256, normalize=True):
12
+ image = transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BICUBIC)(image)
13
+ image = transforms.CenterCrop((resolution, resolution))(image)
14
+ image = transforms.ToTensor()(image)
15
+ if normalize:
16
+ image = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)(image)
17
+ return image
18
+
19
+ def add_gumbel_noise(logits, temperature):
20
+ """
21
+ Adds Gumbel noise to logits for stochastic sampling.
22
+ Equivalent to argmax(logits + temperature * G) where G ~ Gumbel(0,1).
23
+ This version is more numerically stable than a version involving exp() and division.
24
+ """
25
+ if abs(temperature) < 1e-9: # Effectively zero temperature
26
+ return logits
27
+ # Ensure logits are float64 for precision with noise, as suggested by user context
28
+ if DEVICE == "mps":
29
+ logits = logits.to(torch.float32)
30
+ else:
31
+ logits = logits.to(torch.float64)
32
+ # Standard Gumbel noise: -log(-log(U)), U ~ Uniform(0,1)
33
+ # Add small epsilon for numerical stability inside logs
34
+ if DEVICE == "mps":
35
+ noise = torch.rand_like(logits, dtype=torch.float32)
36
+ else:
37
+ noise = torch.rand_like(logits, dtype=torch.float64)
38
+ standard_gumbel_noise = -torch.log(-torch.log(noise + 1e-20) + 1e-20)
39
+ return logits + temperature * standard_gumbel_noise
40
+
41
+ def get_num_transfer_tokens(mask_index, steps):
42
+ mask_num = mask_index.sum(dim=1, keepdim=True)
43
+ # Ensure steps is at least 1 to avoid division by zero if mask_num is also 0 (though sum should be >=0)
44
+ steps = max(1, int(steps)) # Ensure steps is a positive integer
45
+ base = mask_num // steps
46
+ remainder = mask_num % steps
47
+ num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.long) + base
48
+ for i in range(mask_num.size(0)): # Iterate over batch
49
+ if remainder[i] > 0 : # Ensure remainder is positive before indexing
50
+ num_transfer_tokens[i, :remainder[i].item()] += 1 # .item() for single value tensor to int
51
+ return num_transfer_tokens
52
+
53
+ MODEL = None
54
+ TOKENIZER = None
55
+ DEVICE = (
56
+ "cuda"
57
+ if torch.cuda.is_available()
58
+ else "mps" if torch.backends.mps.is_available() else "cpu"
59
+ )
60
+ MASK_ID = None
61
+ uni_prompting = None
62
+ VQ_MODEL = MAGVITv2().from_pretrained("showlab/magvitv2").to(DEVICE)
63
+
64
+ DEFAULT_MODEL_PATH = "Gen-Verse/MMaDA-8B-Base" # Default
65
+ CURRENT_MODEL_PATH = None
66
+
67
+ MODEL_CHOICES = [
68
+ "MMaDA-8B-Base",
69
+ "MMaDA-8B-MixCoT (coming soon)",
70
+ "MMaDA-8B-Max (coming soon)"
71
+ ]
72
+ MODEL_ACTUAL_PATHS = {
73
+ "MMaDA-8B-Base": DEFAULT_MODEL_PATH,
74
+ }
75
+
76
+ def clear_outputs_action():
77
+ return None, None
78
+
79
+ def _load_model_and_tokenizer_core(model_path_to_load, model_display_name_for_status):
80
+ global MODEL, TOKENIZER, MASK_ID, CURRENT_MODEL_PATH, DEVICE, uni_prompting
81
+
82
+ if MODEL is not None and CURRENT_MODEL_PATH == model_path_to_load:
83
+ return f"Model '{model_display_name_for_status}' from '{model_path_to_load}' is already loaded. MASK_ID: {MASK_ID}"
84
+
85
+ CURRENT_MODEL_PATH = model_path_to_load
86
+
87
+ status_msg_parts = [f"Loading '{model_display_name_for_status}'..."]
88
+ try:
89
+ TOKENIZER = AutoTokenizer.from_pretrained(model_path_to_load, trust_remote_code=True)
90
+ status_msg_parts.append(f"Tokenizer for '{model_display_name_for_status}' loaded.")
91
+
92
+ MODEL = MMadaModelLM.from_pretrained(model_path_to_load, trust_remote_code=True, torch_dtype=torch.bfloat16).to(DEVICE).eval()
93
+ status_msg_parts.append(f"Model '{model_display_name_for_status}' loaded to {DEVICE}.")
94
+
95
+ uni_prompting = UniversalPrompting(TOKENIZER, max_text_len=512, special_tokens=("<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>", "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>"),ignore_id=-100, cond_dropout_prob=0.1, use_reserved_token=True)
96
+
97
+ if hasattr(TOKENIZER, 'mask_token_id') and TOKENIZER.mask_token_id is not None:
98
+ MASK_ID = TOKENIZER.mask_token_id
99
+ status_msg_parts.append(f"Using MASK_ID from tokenizer: {MASK_ID}.")
100
+ else:
101
+ MASK_ID = 126336
102
+ status_msg_parts.append(f"Using default MASK_ID: {MASK_ID}.")
103
+
104
+ if TOKENIZER.pad_token_id is None:
105
+ if TOKENIZER.eos_token_id is not None:
106
+ TOKENIZER.pad_token_id = TOKENIZER.eos_token_id
107
+ TOKENIZER.pad_token = TOKENIZER.eos_token
108
+ status_msg_parts.append(f"Set pad_token_id to eos_token_id ({TOKENIZER.eos_token_id}).")
109
+ else:
110
+ status_msg_parts.append("Warning: pad_token_id is None and no eos_token_id.")
111
+
112
+ if TOKENIZER.eos_token_id is None: # Important for cleaning up output in visualization
113
+ status_msg_parts.append("Warning: tokenizer.eos_token_id is None. EOS cleanup might not work.")
114
+
115
+ TOKENIZER.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}"
116
+
117
+ return " ".join(status_msg_parts)
118
+ except Exception as e:
119
+ MODEL = None
120
+ TOKENIZER = None
121
+ MASK_ID = None
122
+ CURRENT_MODEL_PATH = None
123
+ return f"Error loading model '{model_display_name_for_status}': {str(e)}"
124
+
125
+ def handle_model_selection_change(selected_model_name_ui):
126
+ if "coming soon" in selected_model_name_ui.lower():
127
+ global MODEL, TOKENIZER, MASK_ID, CURRENT_MODEL_PATH
128
+ MODEL = None
129
+ TOKENIZER = None
130
+ MASK_ID = None
131
+ CURRENT_MODEL_PATH = None
132
+ return f"'{selected_model_name_ui}' is not yet available. Please select 'Model A'."
133
+
134
+ actual_path = MODEL_ACTUAL_PATHS.get(selected_model_name_ui)
135
+ if not actual_path:
136
+ return f"Path for '{selected_model_name_ui}' is not defined. Cannot load."
137
+
138
+ return _load_model_and_tokenizer_core(actual_path, selected_model_name_ui)
139
+
140
+
141
+ def get_highlighted_text_tuples(current_x_ids_batch, prompt_input_ids, prompt_len, tk, current_mask_id, raw_prompt_attention_mask):
142
+ if current_x_ids_batch is None or current_x_ids_batch.ndim == 0 or current_x_ids_batch.shape[0] == 0:
143
+ return [("Error in sequence data for visualization.", "ERROR")]
144
+ # only answer part
145
+ current_x_ids_batch = current_x_ids_batch[:, prompt_len:]
146
+ seq_ids = current_x_ids_batch[0].tolist()
147
+ eos_token_id = tk.eos_token_id # Get EOS token ID
148
+
149
+ # Stage 1: Build initial list of tuples with (token_str, label, token_id_int)
150
+ # This helps in identifying EOS tokens later without re-checking the type.
151
+ intermediate_tuples = []
152
+ for j, token_id_int in enumerate(seq_ids):
153
+ try:
154
+ token_str = tk.decode([token_id_int], skip_special_tokens=True, clean_up_tokenization_spaces=False)
155
+ except Exception: # Handle cases where a token ID might be problematic (e.g. with mock)
156
+ token_str = f"[ID:{token_id_int}]"
157
+
158
+ label = "ERROR"
159
+ if token_id_int == current_mask_id:
160
+ token_str = "[MASK]"
161
+ label = "MASK"
162
+ else:
163
+ label = "GEN"
164
+ intermediate_tuples.append((token_str, label, token_id_int))
165
+
166
+ return intermediate_tuples
167
+
168
+ @torch.no_grad()
169
+ def generate_viz_wrapper_t2i(prompt_text, steps, guidance_scale, mask_schedule="cosine"):
170
+ global MODEL, TOKENIZER, MASK_ID, DEVICE, uni_prompting
171
+
172
+ if MODEL is None or TOKENIZER is None or MASK_ID is None:
173
+ yield [("Error: Model not loaded. Please load the model first.", "ERROR")], "Model not loaded."
174
+ return
175
+ steps = int(steps)
176
+ guidance_scale = float(guidance_scale)
177
+
178
+ image_tokens = torch.ones((1, 1024), dtype=torch.long, device=DEVICE) * MASK_ID
179
+ prompt_text = [prompt_text]
180
+ input_ids, attention_mask = uni_prompting((prompt_text, image_tokens), 't2i_gen')
181
+
182
+ if guidance_scale > 0:
183
+ uncond_input_ids, uncond_attention_mask = uni_prompting(([''], image_tokens), 't2i_gen')
184
+ else:
185
+ uncond_input_ids, uncond_attention_mask = None, None
186
+
187
+ mask_schedule = get_mask_schedule(mask_schedule)
188
+ blank_image = Image.new("RGB", (512, 512), (255, 255, 255))
189
+ yield blank_image, "Starting generation..."
190
+ for image_step, status_msg_step in MODEL.t2i_generate_decoding_stepwise(
191
+ input_ids = input_ids,
192
+ uncond_input_ids = uncond_input_ids,
193
+ attention_mask = attention_mask,
194
+ uncond_attention_mask = uncond_attention_mask,
195
+ temperature=1.0,
196
+ timesteps = steps,
197
+ guidance_scale = guidance_scale,
198
+ noise_schedule = mask_schedule,
199
+ noise_type = "mask",
200
+ seq_len = 1024,
201
+ vq_model = VQ_MODEL,
202
+ uni_prompting=uni_prompting):
203
+ yield image_step, status_msg_step
204
+
205
+
206
+
207
+
208
+ @torch.no_grad()
209
+ def generate_viz_wrapper_lm(prompt_text, steps, gen_length, block_length, temperature,
210
+ cfg_scale, remasking_strategy, thinking_mode_lm):
211
+ global MODEL, TOKENIZER, MASK_ID, DEVICE
212
+ print(f"thinking_mode_lm: {thinking_mode_lm}")
213
+ if MODEL is None or TOKENIZER is None or MASK_ID is None:
214
+ yield [("Error: Model not loaded. Please load the model first.", "ERROR")], "Model not loaded."
215
+ return
216
+
217
+ steps = int(steps)
218
+ gen_length = int(gen_length)
219
+ block_length = int(block_length)
220
+
221
+ if thinking_mode_lm:
222
+ prompt_text = "You should first think about the reasoning process in the mind and then provide the user with the answer. The reasoning process is enclosed within <think> </think> tags, i.e. <think> reasoning process here </think> answer here\n" + prompt_text
223
+
224
+ try:
225
+ m = [{"role": "user", "content": prompt_text}]
226
+ processed_prompt_text = TOKENIZER.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
227
+ except Exception as e:
228
+ yield [("Error applying chat template.", "ERROR")], f"Chat template error: {e}"
229
+ processed_prompt_text = prompt_text
230
+ try:
231
+ if TOKENIZER.pad_token_id is None:
232
+ if TOKENIZER.eos_token_id is not None:
233
+ TOKENIZER.pad_token_id = TOKENIZER.eos_token_id
234
+ else: # Should have been caught by load_model, but double check
235
+ yield [("Tokenizer Error", "ERROR")], "pad_token_id is not set in tokenizer."
236
+ return
237
+
238
+ input_ids = TOKENIZER(text=processed_prompt_text, return_tensors="pt", padding="longest", padding_side="left", truncation=True, max_length=MODEL.config.max_position_embeddings if hasattr(MODEL.config, 'max_position_embeddings') else 2048)['input_ids'].to(DEVICE)
239
+ raw_prompt_attention_mask = None
240
+
241
+ except Exception as e:
242
+ yield [("Error tokenizing prompt.", "ERROR")], f"Tokenization error: {e}"
243
+ return
244
+
245
+
246
+
247
+ batch_size = input_ids.shape[0]
248
+ prompt_len = input_ids.shape[1]
249
+
250
+ x = torch.full((batch_size, prompt_len + gen_length), MASK_ID, dtype=torch.long, device=DEVICE)
251
+ x[:, :prompt_len] = input_ids.clone()
252
+
253
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), "Starting generation: Prompt + Initial Masks"
254
+
255
+ if gen_length == 0:
256
+ final_text_output = TOKENIZER.batch_decode(x[:,prompt_len:], skip_special_tokens=True)
257
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_output[0] if final_text_output else ""
258
+ return
259
+
260
+ if block_length <= 0 or gen_length % block_length != 0 :
261
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
262
+ f"Error: gen_length ({gen_length}) must be divisible by block_length ({block_length}) and block_length > 0."
263
+ return
264
+ num_blocks = gen_length // block_length
265
+
266
+ if steps <=0 or steps % num_blocks != 0:
267
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
268
+ f"Error: steps ({steps}) must be positive and divisible by num_blocks ({num_blocks}). Steps: {steps}, Num Blocks: {num_blocks}"
269
+ return
270
+ steps_per_block = steps // num_blocks
271
+
272
+ for num_block_iter in range(num_blocks):
273
+ current_block_start_idx_in_x = prompt_len + num_block_iter * block_length
274
+ current_block_end_idx_in_x = prompt_len + (num_block_iter + 1) * block_length
275
+
276
+ block_masks_bool_current = torch.zeros_like(x, dtype=torch.bool)
277
+ block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x] = \
278
+ (x[:, current_block_start_idx_in_x:current_block_end_idx_in_x] == MASK_ID)
279
+
280
+ num_transfer_tokens_for_this_block = get_num_transfer_tokens(
281
+ block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x],
282
+ steps_per_block
283
+ )
284
+
285
+ for i_step_in_block in range(steps_per_block):
286
+ mask_index_global = (x == MASK_ID)
287
+
288
+ if cfg_scale > 0.:
289
+ un_x = x.clone()
290
+ # For unconditional pass, mask out the original prompt tokens that are not padding
291
+ # raw_prompt_attention_mask is (B, prompt_len)
292
+ prompt_active_tokens_mask = raw_prompt_attention_mask.bool() # True where actual prompt tokens are
293
+ un_x[:, :prompt_len][prompt_active_tokens_mask] = MASK_ID
294
+
295
+ x_cfg_input = torch.cat([x, un_x], dim=0)
296
+ # Pass attention_mask for CFG if model expects it, covering both parts
297
+ # For simplicity, not passing explicit attention_mask here; relies on model's internal handling.
298
+ model_output = MODEL(x_cfg_input)
299
+ logits_cond, logits_uncond = torch.chunk(model_output.logits, 2, dim=0)
300
+ logits = logits_uncond + (cfg_scale + 1) * (logits_cond - logits_uncond)
301
+ else:
302
+ # Not passing explicit attention_mask here; relies on model's internal handling.
303
+ model_output = MODEL(x)
304
+ logits = model_output.logits
305
+
306
+ logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
307
+ x0_predicted_tokens = torch.argmax(logits_with_noise, dim=-1)
308
+
309
+ if remasking_strategy == 'low_confidence':
310
+ if DEVICE == "mps":
311
+ probs = F.softmax(logits.to(torch.float32), dim=-1)
312
+ else:
313
+ probs = F.softmax(logits.to(torch.float64), dim=-1)
314
+ x0_probs = torch.gather(probs, dim=-1, index=x0_predicted_tokens.unsqueeze(-1)).squeeze(-1)
315
+ elif remasking_strategy == 'random':
316
+ if DEVICE == "mps":
317
+ x0_probs = torch.rand(x.shape, device=x.device, dtype=torch.float32)
318
+ else:
319
+ x0_probs = torch.rand(x.shape, device=x.device, dtype=torch.float64)
320
+ else:
321
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), f"Error: Unknown remasking strategy '{remasking_strategy}'"
322
+ return
323
+
324
+ confidence_for_selection = torch.full_like(x0_probs, -torch.inf)
325
+ candidate_positions_for_unmasking = mask_index_global & block_masks_bool_current
326
+ confidence_for_selection = torch.where(
327
+ candidate_positions_for_unmasking,
328
+ x0_probs,
329
+ -torch.inf
330
+ )
331
+
332
+ x0_final_candidates = torch.where(mask_index_global, x0_predicted_tokens, x)
333
+
334
+ transfer_indices_bool = torch.zeros_like(x, dtype=torch.bool)
335
+ num_to_transfer_this_step_batch = num_transfer_tokens_for_this_block[:, i_step_in_block]
336
+
337
+ for j_batch_idx in range(batch_size):
338
+ k_val = min(num_to_transfer_this_step_batch[j_batch_idx].item(),
339
+ candidate_positions_for_unmasking[j_batch_idx].sum().item()) # ensure k isn't too large
340
+
341
+ if k_val > 0:
342
+ # Ensure confidence_for_selection[j_batch_idx] is 1D for topk
343
+ conf_slice = confidence_for_selection[j_batch_idx]
344
+ if conf_slice.ndim > 1: conf_slice = conf_slice.view(-1) # Should already be 1D from x0_probs
345
+
346
+ # Check if there are enough valid (non -inf) confidences
347
+ valid_conf_count = (conf_slice > -torch.inf).sum().item()
348
+ actual_k = min(k_val, valid_conf_count)
349
+
350
+ if actual_k > 0:
351
+ _, topk_indices_in_x = torch.topk(conf_slice, k=actual_k)
352
+ transfer_indices_bool[j_batch_idx, topk_indices_in_x] = True
353
+
354
+ x[transfer_indices_bool] = x0_final_candidates[transfer_indices_bool]
355
+
356
+ current_total_step = num_block_iter * steps_per_block + i_step_in_block + 1
357
+ total_overall_steps = num_blocks * steps_per_block
358
+ status_msg = f"Block {num_block_iter+1}/{num_blocks}, Step {i_step_in_block+1}/{steps_per_block} (Total: {current_total_step}/{total_overall_steps})"
359
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), status_msg
360
+
361
+ final_generated_ids = x[:, prompt_len:]
362
+ final_text_output = TOKENIZER.batch_decode(final_generated_ids, skip_special_tokens=True)
363
+
364
+ final_text_str = final_text_output[0] if final_text_output and len(final_text_output) > 0 else ""
365
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_str
366
+
367
+ @torch.no_grad()
368
+ def generate_viz_wrapper(uploaded_image_pil, prompt_text, steps, gen_length, block_length, temperature,
369
+ cfg_scale, remasking_strategy, thinking_mode_mmu):
370
+ global MODEL, TOKENIZER, MASK_ID, DEVICE
371
+
372
+ if MODEL is None or TOKENIZER is None or MASK_ID is None:
373
+ yield [("Error: Model not loaded. Please load the model first.", "ERROR")], "Model not loaded."
374
+ return
375
+
376
+ steps = int(steps)
377
+ gen_length = int(gen_length)
378
+ block_length = int(block_length)
379
+
380
+ if thinking_mode_mmu:
381
+ prompt_text = "You should first think about the reasoning process in the mind and then provide the user with the answer. The reasoning process is enclosed within <think> </think> tags, i.e. <think> reasoning process here </think> answer here\n" + prompt_text
382
+
383
+ try:
384
+ m = [{"role": "user", "content": prompt_text}]
385
+ processed_prompt_text = TOKENIZER.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
386
+ except Exception as e:
387
+ yield [("Error applying chat template.", "ERROR")], f"Chat template error: {e}"
388
+ processed_prompt_text = prompt_text
389
+
390
+ image_vq_ids_tensor = None
391
+ if uploaded_image_pil is not None:
392
+ try:
393
+
394
+ image = image_transform(uploaded_image_pil, resolution=512).to(DEVICE)
395
+ image = image.unsqueeze(0)
396
+ image_vq_ids_tensor = VQ_MODEL.get_code(image) + 126349
397
+ except Exception as e:
398
+ yield [("Error processing image.", "ERROR")], f"Image to VQ tokens conversion failed: {str(e)}"
399
+ return
400
+
401
+
402
+ try:
403
+ if TOKENIZER.pad_token_id is None:
404
+ if TOKENIZER.eos_token_id is not None:
405
+ TOKENIZER.pad_token_id = TOKENIZER.eos_token_id
406
+ else:
407
+ yield [("Tokenizer Error", "ERROR")], "pad_token_id is not set in tokenizer."
408
+ return
409
+
410
+ input_ids = TOKENIZER(text=processed_prompt_text, return_tensors="pt", padding="longest", padding_side="left", truncation=True, max_length=MODEL.config.max_position_embeddings if hasattr(MODEL.config, 'max_position_embeddings') else 2048)['input_ids'].to(DEVICE)
411
+ raw_prompt_attention_mask = None
412
+ if image_vq_ids_tensor is not None:
413
+ if image_vq_ids_tensor.ndim == 1:
414
+ image_vq_ids_tensor = image_vq_ids_tensor.unsqueeze(0)
415
+
416
+ input_ids = torch.cat([
417
+ (torch.ones(input_ids.shape[0], 1) * torch.tensor([126089])).to(DEVICE),
418
+ (torch.ones(input_ids.shape[0], 1) * torch.tensor([126084])).to(DEVICE),
419
+ image_vq_ids_tensor,
420
+ (torch.ones(input_ids.shape[0], 1) * torch.tensor([126085])).to(DEVICE),
421
+ input_ids
422
+ ], dim=1).long()
423
+
424
+ else:
425
+ input_ids = input_ids
426
+
427
+
428
+ except Exception as e:
429
+ yield [("Error tokenizing prompt.", "ERROR")], f"Tokenization error: {e}"
430
+ return
431
+
432
+
433
+
434
+ batch_size = input_ids.shape[0]
435
+ prompt_len = input_ids.shape[1]
436
+
437
+ x = torch.full((batch_size, prompt_len + gen_length), MASK_ID, dtype=torch.long, device=DEVICE)
438
+ x[:, :prompt_len] = input_ids.clone()
439
+
440
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), "Starting generation: Prompt + Initial Masks"
441
+
442
+ if gen_length == 0:
443
+ final_text_output = TOKENIZER.batch_decode(x[:,prompt_len:], skip_special_tokens=True)
444
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_output[0] if final_text_output else ""
445
+ return
446
+
447
+ if block_length <= 0 or gen_length % block_length != 0 :
448
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
449
+ f"Error: gen_length ({gen_length}) must be divisible by block_length ({block_length}) and block_length > 0."
450
+ return
451
+ num_blocks = gen_length // block_length
452
+
453
+ if steps <=0 or steps % num_blocks != 0:
454
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
455
+ f"Error: steps ({steps}) must be positive and divisible by num_blocks ({num_blocks}). Steps: {steps}, Num Blocks: {num_blocks}"
456
+ return
457
+ steps_per_block = steps // num_blocks
458
+
459
+ for num_block_iter in range(num_blocks):
460
+ current_block_start_idx_in_x = prompt_len + num_block_iter * block_length
461
+ current_block_end_idx_in_x = prompt_len + (num_block_iter + 1) * block_length
462
+
463
+ block_masks_bool_current = torch.zeros_like(x, dtype=torch.bool)
464
+ block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x] = \
465
+ (x[:, current_block_start_idx_in_x:current_block_end_idx_in_x] == MASK_ID)
466
+
467
+ num_transfer_tokens_for_this_block = get_num_transfer_tokens(
468
+ block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x],
469
+ steps_per_block
470
+ )
471
+
472
+ for i_step_in_block in range(steps_per_block):
473
+ mask_index_global = (x == MASK_ID)
474
+
475
+ if cfg_scale > 0.:
476
+ un_x = x.clone()
477
+ # For unconditional pass, mask out the original prompt tokens that are not padding
478
+ # raw_prompt_attention_mask is (B, prompt_len)
479
+ prompt_active_tokens_mask = raw_prompt_attention_mask.bool() # True where actual prompt tokens are
480
+ un_x[:, :prompt_len][prompt_active_tokens_mask] = MASK_ID
481
+
482
+ x_cfg_input = torch.cat([x, un_x], dim=0)
483
+ # Pass attention_mask for CFG if model expects it, covering both parts
484
+ # For simplicity, not passing explicit attention_mask here; relies on model's internal handling.
485
+ model_output = MODEL(x_cfg_input)
486
+ logits_cond, logits_uncond = torch.chunk(model_output.logits, 2, dim=0)
487
+ logits = logits_uncond + (cfg_scale + 1) * (logits_cond - logits_uncond)
488
+ else:
489
+ # Not passing explicit attention_mask here; relies on model's internal handling.
490
+ model_output = MODEL(x)
491
+ logits = model_output.logits
492
+
493
+ logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
494
+ x0_predicted_tokens = torch.argmax(logits_with_noise, dim=-1)
495
+
496
+ if remasking_strategy == 'low_confidence':
497
+ if DEVICE == "mps":
498
+ probs = F.softmax(logits.to(torch.float32), dim=-1)
499
+ else:
500
+ probs = F.softmax(logits.to(torch.float64), dim=-1)
501
+ x0_probs = torch.gather(probs, dim=-1, index=x0_predicted_tokens.unsqueeze(-1)).squeeze(-1)
502
+ elif remasking_strategy == 'random':
503
+ if DEVICE == "mps":
504
+ x0_probs = torch.rand(x.shape, device=x.device, dtype=torch.float32)
505
+ else:
506
+ x0_probs = torch.rand(x.shape, device=x.device, dtype=torch.float64)
507
+ else:
508
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), f"Error: Unknown remasking strategy '{remasking_strategy}'"
509
+ return
510
+
511
+ confidence_for_selection = torch.full_like(x0_probs, -torch.inf)
512
+ candidate_positions_for_unmasking = mask_index_global & block_masks_bool_current
513
+ confidence_for_selection = torch.where(
514
+ candidate_positions_for_unmasking,
515
+ x0_probs,
516
+ -torch.inf
517
+ )
518
+
519
+ x0_final_candidates = torch.where(mask_index_global, x0_predicted_tokens, x)
520
+
521
+ transfer_indices_bool = torch.zeros_like(x, dtype=torch.bool)
522
+ num_to_transfer_this_step_batch = num_transfer_tokens_for_this_block[:, i_step_in_block]
523
+
524
+ for j_batch_idx in range(batch_size):
525
+ k_val = min(num_to_transfer_this_step_batch[j_batch_idx].item(),
526
+ candidate_positions_for_unmasking[j_batch_idx].sum().item()) # ensure k isn't too large
527
+
528
+ if k_val > 0:
529
+ # Ensure confidence_for_selection[j_batch_idx] is 1D for topk
530
+ conf_slice = confidence_for_selection[j_batch_idx]
531
+ if conf_slice.ndim > 1: conf_slice = conf_slice.view(-1) # Should already be 1D from x0_probs
532
+
533
+ # Check if there are enough valid (non -inf) confidences
534
+ valid_conf_count = (conf_slice > -torch.inf).sum().item()
535
+ actual_k = min(k_val, valid_conf_count)
536
+
537
+ if actual_k > 0:
538
+ _, topk_indices_in_x = torch.topk(conf_slice, k=actual_k)
539
+ transfer_indices_bool[j_batch_idx, topk_indices_in_x] = True
540
+
541
+ x[transfer_indices_bool] = x0_final_candidates[transfer_indices_bool]
542
+
543
+ current_total_step = num_block_iter * steps_per_block + i_step_in_block + 1
544
+ total_overall_steps = num_blocks * steps_per_block
545
+ status_msg = f"Block {num_block_iter+1}/{num_blocks}, Step {i_step_in_block+1}/{steps_per_block} (Total: {current_total_step}/{total_overall_steps})"
546
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), status_msg
547
+
548
+ final_generated_ids = x[:, prompt_len:]
549
+ final_text_output = TOKENIZER.batch_decode(final_generated_ids, skip_special_tokens=True)
550
+
551
+ final_text_str = final_text_output[0] if final_text_output and len(final_text_output) > 0 else ""
552
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_str
553
+
554
+
555
+ css_styles = """
556
+ .gradio-container{font-family:'IBM Plex Sans',sans-serif;margin:auto;}
557
+ .gr-input {background:#f9f9f9 !important;border:1px solid #e0e0e0 !important;}
558
+ .gr-output{background:#f0f0f0 !important;border:1px solid #d0d0d0 !important;}
559
+
560
+ .highlighted-text span{
561
+ padding:2px 4px;border-radius:4px;margin:1px 2px;display:inline-block;line-height:1.6;
562
+ }
563
+
564
+ footer{display:none !important}
565
+
566
+ #live-update-scrollable-box {
567
+ max-height: 800px; /* 您可以根据需要调整这个最大高度,例如 '300px', '50vh' 等 */
568
+ overflow-y: auto !important; /* 当内容超出 max-height 时显示垂直滚动条 */
569
+ display: block; /* 确保元素是块级元素,以便 max-height 生效 */
570
+
571
+ }
572
+ #think_btn {
573
+ background-color: #f3f4f6 !important;
574
+ border: 1px solid #d0d0d0 !important;
575
+ color: #111827 !important;
576
+ font-size: 16px !important;
577
+ font-weight: bold !important;
578
+ }
579
+ #think_btn:hover {
580
+ background-color: #e0e0e0 !important;
581
+ border: 1px solid #c0c0c0 !important;
582
+ color: #222 !important;
583
+ }
584
+ #think_btn:active {
585
+ background-color: #2563eb !important;
586
+ border: 1px solid #b0b0b0 !important;
587
+ color: white !important;
588
+ }
589
+ """
590
+
591
+
592
+ # thinking_mode_t2i = gr.State(False)
593
+ def toggle_thinking_mode_lm(current_thinking_mode):
594
+ # print(f"current_thinking_mode: {current_thinking_mode}")
595
+ new_state = not current_thinking_mode
596
+ new_label = "Thinking Mode ✅" if new_state else "Thinking Mode ❌"
597
+ return new_state, gr.update(value=new_label)
598
+
599
+ def toggle_thinking_mode_mmu(current_thinking_mode):
600
+ new_state = not current_thinking_mode
601
+ new_label = "Thinking Mode ✅" if new_state else "Thinking Mode ❌"
602
+ return new_state, gr.update(value=new_label)
603
+
604
+
605
+ color_map_config = {
606
+ "MASK": "lightgrey",
607
+ "GEN": "#DCABFA",
608
+ }
609
+
610
+ theme = gr.themes.Ocean(
611
+ primary_hue="fuchsia",
612
+ )
613
+ with gr.Blocks(css=css_styles, theme=theme) as demo:
614
+ # with gr.Blocks(css=css_styles, theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as demo:
615
+ # with gr.Blocks() as demo:
616
+ thinking_mode_lm = gr.State(False)
617
+ thinking_mode_mmu = gr.State(False)
618
+ gr.Markdown("<h1 style='text-align: center; margin-bottom: 20px;'>MMaDA: Multimodal Large Diffusion Language Models</h1>")
619
+ gr.Markdown("MMaDA is a novel class of multimodal diffusion foundation models designed to achieve superior performance across diverse domains such as textual reasoning, multimodal understanding, and text-to-image generation")
620
+ gr.Markdown("Github: [Gen-Verse/MMaDA](https://github.com/Gen-Verse/MMaDA)")
621
+ gr.Markdown("Paper: [MMaDA: Multimodal Large Diffusion Language Models]()")
622
+ gr.Markdown("### Select Model")
623
+ with gr.Row():
624
+ model_select_radio = gr.Radio(
625
+ label="Select Text Generation Model",
626
+ choices=MODEL_CHOICES,
627
+ value=MODEL_CHOICES[0]
628
+ )
629
+ model_load_status_box = gr.Textbox(
630
+ label="Model Load Status",
631
+ interactive=False,
632
+ lines=3,
633
+ max_lines=5
634
+ )
635
+
636
+ gr.Markdown("## Part 1. Text Generation")
637
+ with gr.Row():
638
+ with gr.Column(scale=2):
639
+ prompt_input_box_lm = gr.Textbox(label="Enter your prompt:", lines=3, value="A rectangular prism has a length of 5 units, a width of 4 units, and a height of 3 units. What is the volume of the prism?")
640
+ think_button_lm = gr.Button("🧠 Enable Thinking Mode", elem_id="think_btn")
641
+ with gr.Accordion("Generation Parameters", open=True):
642
+ with gr.Row():
643
+ gen_length_slider_lm = gr.Slider(minimum=8, maximum=1024, value=512, step=64, label="Generation Length", info="Number of tokens to generate.")
644
+ steps_slider_lm = gr.Slider(minimum=1, maximum=512, value=256, step=32, label="Total Sampling Steps", info="Must be divisible by (gen_length / block_length).")
645
+ with gr.Row():
646
+ block_length_slider_lm = gr.Slider(minimum=8, maximum=1024, value=128, step=32, label="Block Length", info="gen_length must be divisible by this.")
647
+ remasking_dropdown_lm = gr.Dropdown(choices=['low_confidence', 'random'], value='low_confidence', label="Remasking Strategy")
648
+ with gr.Row():
649
+ cfg_scale_slider_lm = gr.Slider(minimum=0.0, maximum=2.0, value=0.0, step=0.1, label="CFG Scale", info="Classifier-Free Guidance. 0 disables it.")
650
+ temperature_slider_lm = gr.Slider(minimum=0.0, maximum=2.0, value=1, step=0.05, label="Temperature", info="Controls randomness via Gumbel noise. 0 is deterministic.")
651
+
652
+
653
+ with gr.Row():
654
+ run_button_ui_lm = gr.Button("Generate Sequence", variant="primary", scale=3)
655
+ clear_button_ui_lm = gr.Button("Clear Outputs", scale=1)
656
+
657
+ with gr.Column(scale=3):
658
+ # gr.Markdown("## Live Generation Process")
659
+ output_visualization_box_lm = gr.HighlightedText(
660
+ label="Live Generation Process",
661
+ show_legend=True,
662
+ color_map=color_map_config,
663
+ combine_adjacent=False,
664
+ interactive=False,
665
+ elem_id="live-update-scrollable-box",
666
+ )
667
+ # gr.Markdown("## Final Generated Text")
668
+ output_final_text_box_lm = gr.Textbox(label="Final Output", lines=8, interactive=False, show_copy_button=True)
669
+
670
+
671
+
672
+ gr.Examples(
673
+ examples=[
674
+ ["A rectangular prism has a length of 5 units, a width of 4 units, and a height of 3 units. What is the volume of the prism?", 256, 512, 128, 1, 0, "low_confidence"],
675
+ ["Lily can run 12 kilometers per hour for 4 hours. After that, she can run 6 kilometers per hour. How many kilometers can she run in 8 hours?", 256, 512, 64, 1, 0, "low_confidence"]
676
+ ],
677
+ inputs=[prompt_input_box_lm, steps_slider_lm, gen_length_slider_lm, block_length_slider_lm, temperature_slider_lm, cfg_scale_slider_lm, remasking_dropdown_lm],
678
+ outputs=[output_visualization_box_lm, output_final_text_box_lm],
679
+ fn=generate_viz_wrapper_lm,
680
+ )
681
+
682
+ gr.Markdown("---")
683
+ gr.Markdown("## Part 2. Multimodal Understanding")
684
+ with gr.Row():
685
+ with gr.Column(scale=2):
686
+ prompt_input_box_mmu = gr.Textbox(
687
+ label="Enter your prompt:",
688
+ lines=3,
689
+ value="Please describe this image in detail."
690
+ )
691
+ think_button_mmu = gr.Button("🧠 Enable Thinking Mode", elem_id="think_btn")
692
+ with gr.Accordion("Generation Parameters", open=True):
693
+ with gr.Row():
694
+ gen_length_slider_mmu = gr.Slider(minimum=64, maximum=1024, value=512, step=64, label="Generation Length", info="Number of tokens to generate.")
695
+ steps_slider_mmu = gr.Slider(minimum=1, maximum=512, value=256, step=32, label="Total Sampling Steps", info="Must be divisible by (gen_length / block_length).")
696
+ with gr.Row():
697
+ block_length_slider_mmu = gr.Slider(minimum=32, maximum=1024, value=128, step=32, label="Block Length", info="gen_length must be divisible by this.")
698
+ remasking_dropdown_mmu = gr.Dropdown(choices=['low_confidence', 'random'], value='low_confidence', label="Remasking Strategy")
699
+ with gr.Row():
700
+ cfg_scale_slider_mmu = gr.Slider(minimum=0.0, maximum=2.0, value=0.0, step=0.1, label="CFG Scale", info="Classifier-Free Guidance. 0 disables it.")
701
+ temperature_slider_mmu = gr.Slider(minimum=0.0, maximum=2.0, value=1, step=0.05, label="Temperature", info="Controls randomness via Gumbel noise. 0 is deterministic.")
702
+
703
+ with gr.Row():
704
+ image_upload_box = gr.Image(type="pil", label="Upload Image")
705
+
706
+ with gr.Row():
707
+ run_button_ui_mmu = gr.Button("Generate Description", variant="primary", scale=3)
708
+ clear_button_ui_mmu = gr.Button("Clear Outputs", scale=1)
709
+
710
+ with gr.Column(scale=3):
711
+ gr.Markdown("## Live Generation Process")
712
+ output_visualization_box_mmu = gr.HighlightedText(
713
+ label="Token Sequence (Live Update)",
714
+ show_legend=True,
715
+ color_map=color_map_config,
716
+ combine_adjacent=False,
717
+ interactive=False,
718
+ elem_id="live-update-scrollable-box",
719
+ )
720
+ gr.Markdown("## Final Generated Text")
721
+ output_final_text_box_mmu = gr.Textbox(label="Final Output", lines=8, interactive=False, show_copy_button=True)
722
+
723
+
724
+ gr.Examples(
725
+ examples=[
726
+ [
727
+ "mmu_validation_2/sunflower.jpg",
728
+ "Please describe this image in detail.",
729
+ 256,
730
+ 512,
731
+ 128,
732
+ 1,
733
+ 0,
734
+ "low_confidence"
735
+ ],
736
+ [
737
+ "mmu_validation_2/woman.jpg",
738
+ "Please describe this image in detail.",
739
+ 256,
740
+ 512,
741
+ 128,
742
+ 1,
743
+ 0,
744
+ "low_confidence"
745
+ ]
746
+ ],
747
+ inputs=[
748
+ image_upload_box,
749
+ prompt_input_box_mmu,
750
+ steps_slider_mmu,
751
+ gen_length_slider_mmu,
752
+ block_length_slider_mmu,
753
+ temperature_slider_mmu,
754
+ cfg_scale_slider_mmu,
755
+ remasking_dropdown_mmu
756
+ ],
757
+ outputs=[output_visualization_box_mmu, output_final_text_box_mmu],
758
+ fn=generate_viz_wrapper,
759
+ )
760
+
761
+ gr.Markdown("---")
762
+ gr.Markdown("## Part 3. Text-to-Image Generation")
763
+ with gr.Row():
764
+ with gr.Column(scale=2):
765
+ prompt_input_box_t2i = gr.Textbox(label="Enter your prompt:", lines=3, value="A sea turtle swimming near a coral reef in the ocean, with a clear blue sky and water in the background.")
766
+
767
+ with gr.Accordion("Generation Parameters", open=True):
768
+ with gr.Row():
769
+ steps_slider_t2i = gr.Slider(minimum=5, maximum=100, value=15, step=5, label="Total Sampling Steps", info="Number of sampling steps for image generation.")
770
+ guidance_scale_slider_t2i = gr.Slider(minimum=0.0, maximum=7.0, value=3.5, step=0.5, label="Guidance Scale", info="Classifier-Free Guidance. 0 disables it.")
771
+
772
+
773
+ with gr.Row():
774
+ scheduler_radio_t2i = gr.Radio(
775
+ choices=["cosine", "sigmoid", "linear"],
776
+ value="cosine",
777
+ label="Scheduler",
778
+ )
779
+
780
+ with gr.Row():
781
+ run_button_ui_t2i = gr.Button("Generate Image", variant="primary", scale=3)
782
+ clear_button_ui_t2i = gr.Button("Clear Outputs", scale=1)
783
+
784
+
785
+ with gr.Column(scale=3):
786
+ # gr.Markdown("## Live Generation Process")
787
+ output_image_t2i = gr.Image(label="Generated Image", interactive=False, type="pil")
788
+ output_status_t2i = gr.Textbox(label="Generation Status", interactive=False)
789
+
790
+ gr.Examples(
791
+ examples=[
792
+ ["A sea turtle swimming near a coral reef in the ocean, with a clear blue sky and water in the background.", 15, 3.5, "cosine"],
793
+ ["A beautiful sunset over a calm ocean, with a few clouds in the sky.", 15, 3.5, "cosine"]
794
+ ],
795
+ inputs=[prompt_input_box_t2i, steps_slider_t2i, guidance_scale_slider_t2i, scheduler_radio_t2i],
796
+ outputs=[output_image_t2i, output_status_t2i],
797
+ fn=generate_viz_wrapper_t2i,
798
+ )
799
+
800
+ run_button_ui_t2i.click(
801
+ fn=generate_viz_wrapper_t2i,
802
+ inputs=[
803
+ prompt_input_box_t2i,
804
+ steps_slider_t2i,
805
+ guidance_scale_slider_t2i,
806
+ scheduler_radio_t2i
807
+ ],
808
+ outputs=[output_image_t2i, output_status_t2i]
809
+ )
810
+
811
+ clear_button_ui_t2i.click(
812
+ fn=lambda: (None, ""),
813
+ inputs=None,
814
+ outputs=[output_image_t2i, output_status_t2i],
815
+ queue=False
816
+ )
817
+
818
+ think_button_lm.click(
819
+ fn=toggle_thinking_mode_lm,
820
+ inputs=[thinking_mode_lm],
821
+ outputs=[thinking_mode_lm, think_button_lm]
822
+ )
823
+
824
+ think_button_mmu.click(
825
+ fn=toggle_thinking_mode_mmu,
826
+ inputs=[thinking_mode_mmu],
827
+ outputs=[thinking_mode_mmu, think_button_mmu]
828
+ )
829
+
830
+
831
+
832
+ def initialize_default_model():
833
+ default_model = "MMaDA-8B-Base"
834
+ result = handle_model_selection_change(default_model)
835
+ return default_model, result
836
+
837
+ demo.load(
838
+ fn=initialize_default_model,
839
+ inputs=None,
840
+ outputs=[model_select_radio, model_load_status_box],
841
+ queue=True
842
+ )
843
+
844
+ def clear_outputs():
845
+ return None, None, None # Clear image, visualization, and final text
846
+
847
+ clear_button_ui_lm.click(
848
+ fn=clear_outputs,
849
+ inputs=None,
850
+ outputs=[image_upload_box, output_visualization_box_lm, output_final_text_box_lm],
851
+ queue=False
852
+ )
853
+ clear_button_ui_mmu.click(
854
+ fn=clear_outputs,
855
+ inputs=None,
856
+ outputs=[image_upload_box, output_visualization_box_mmu, output_final_text_box_mmu],
857
+ queue=False
858
+ )
859
+
860
+ run_button_ui_lm.click(
861
+ fn=generate_viz_wrapper_lm,
862
+ inputs=[
863
+ prompt_input_box_lm,
864
+ steps_slider_lm,
865
+ gen_length_slider_lm,
866
+ block_length_slider_lm,
867
+ temperature_slider_lm,
868
+ cfg_scale_slider_lm,
869
+ remasking_dropdown_lm,
870
+ thinking_mode_lm
871
+ ],
872
+ outputs=[output_visualization_box_lm, output_final_text_box_lm]
873
+ )
874
+
875
+ run_button_ui_mmu.click(
876
+ fn=generate_viz_wrapper,
877
+ inputs=[
878
+ image_upload_box,
879
+ prompt_input_box_mmu,
880
+ steps_slider_mmu,
881
+ gen_length_slider_mmu,
882
+ block_length_slider_mmu,
883
+ temperature_slider_mmu,
884
+ cfg_scale_slider_mmu,
885
+ remasking_dropdown_mmu,
886
+ thinking_mode_mmu
887
+ ],
888
+ outputs=[output_visualization_box_mmu, output_final_text_box_mmu]
889
+ )
890
+
891
+
892
+ if __name__ == "__main__":
893
+ print(f"Starting Gradio App. Attempting to use device: {DEVICE}")
894
+ demo.launch(share=True)
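
A note on the sampling loop shown earlier in this file: at each step it reveals, for every batch row, only the k candidate tokens with the highest confidence and keeps the rest masked for later steps. The sketch below isolates that selection rule; variable and function names are illustrative, not the app's own.

```python
import torch

def select_tokens_to_unmask(confidence: torch.Tensor, k_per_row: torch.Tensor) -> torch.Tensor:
    """confidence: (B, L) scores with -inf at positions that must stay masked.
    k_per_row: (B,) number of tokens to reveal this step. Returns a boolean (B, L) transfer mask."""
    batch_size, seq_len = confidence.shape
    transfer = torch.zeros(batch_size, seq_len, dtype=torch.bool)
    for b in range(batch_size):
        valid = int((confidence[b] > -torch.inf).sum().item())   # usable candidates in this row
        k = min(int(k_per_row[b].item()), valid)                  # never request more than exist
        if k > 0:
            _, idx = torch.topk(confidence[b], k=k)
            transfer[b, idx] = True
    return transfer

# reveal the 2 most confident positions of a 1x5 row; -inf positions are never chosen
conf = torch.tensor([[0.9, -torch.inf, 0.2, 0.7, -torch.inf]])
print(select_tokens_to_unmask(conf, torch.tensor([2])))  # tensor([[ True, False, False,  True, False]])
```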
MMaDA/check_lr.py ADDED
@@ -0,0 +1,27 @@
1
+ import torch
2
+ from torch.optim import AdamW
3
+
4
+ from models.lr_schedulers import get_scheduler
5
+
6
+ MAX_TRAINING_STEPS = 100
7
+ WARMUP_STEPS = 80
8
+ INITIAL_LR = 5e-5
9
+ SCHEDULER_TYPE = "cosine" # "linear", "cosine"
10
+ # ---------------------------------------------
11
+
12
+ dummy_model = torch.nn.Linear(1, 1)
13
+ dummy_optimizer = AdamW(dummy_model.parameters(), lr=INITIAL_LR)
14
+
15
+ lr_scheduler = get_scheduler(
16
+ name=SCHEDULER_TYPE,
17
+ optimizer=dummy_optimizer,
18
+ num_warmup_steps=WARMUP_STEPS,
19
+ num_training_steps=MAX_TRAINING_STEPS,
20
+ )
21
+
22
+ all_lrs = []
23
+ for step in range(MAX_TRAINING_STEPS):
24
+ all_lrs.append(lr_scheduler.get_last_lr()[0])
25
+ lr_scheduler.step()
26
+
27
+ print(all_lrs[WARMUP_STEPS - 1])  # learning rate at the last warmup step
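
For reference, with 80 warmup steps the value printed above should sit just below the peak learning rate. A hedged closed-form sketch, assuming HF-style linear warmup followed by cosine decay to zero; the project's models.lr_schedulers may differ in detail.

```python
import math

def expected_lr(step: int, base_lr: float = 5e-5, warmup: int = 80, total: int = 100) -> float:
    # linear warmup, then cosine decay to zero (a common schedule; an assumption here)
    if step < warmup:
        return base_lr * step / warmup
    progress = (step - warmup) / max(1, total - warmup)
    return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))

print(expected_lr(79))  # ~4.94e-05: the last warmup step, just under the 5e-5 peak
```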
MMaDA/check_tokens.py ADDED
@@ -0,0 +1,191 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Usage
4
+ =====
5
+ python check_tokens.py \
6
+ --config configs/omada_instruction_tuning.yaml \
7
+ --samples 20
8
+ """
9
+
10
+ import argparse
11
+ import random
12
+ from pathlib import Path
13
+ from typing import Iterable, Optional, Tuple, Union
14
+
15
+ import numpy as np
16
+ import torch
17
+ from omegaconf import OmegaConf
18
+ from tqdm import tqdm
19
+ from transformers import AutoTokenizer
20
+
21
+ import sys, os
22
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # extend sys.path before the project imports below
23
+ from models.modeling_emova_speech_tokenizer import EMOVASpeechTokenizer
24
+ from training.data import MixedSpeechTextDataset, VideoSpeechDataset
25
+ from training.prompting_utils import UniversalPrompting
26
+ from training.utils import image_transform
27
+
28
+ def _to_tensor(entry: Union[torch.Tensor, np.ndarray, list, tuple, str],
29
+ vq_model: EMOVASpeechTokenizer) -> torch.Tensor:
30
+ """entry가 경로면 encode, 이미 토큰이면 long tensor로 변환."""
31
+ if isinstance(entry, torch.Tensor):
32
+ tokens = entry.clone().long()
33
+ elif isinstance(entry, np.ndarray):
34
+ tokens = torch.from_numpy(entry).long()
35
+ elif isinstance(entry, (list, tuple)):
36
+ tokens = torch.as_tensor(entry, dtype=torch.long)
37
+ elif isinstance(entry, str):
38
+ # EMOVA encode returns (1, L) → flatten to 1D
39
+ tokens = vq_model.encode(entry).squeeze(0).long()
40
+ else:
41
+ raise TypeError(f"Unsupported token entry type: {type(entry)}")
42
+ return tokens.view(-1)
43
+
44
+
45
+ def _log_stats(flow: str, path: str, tokens: torch.Tensor,
46
+ codebook_size: int = 4096) -> Tuple[int, int]:
47
+ max_id = int(tokens.max().item())
48
+ min_id = int(tokens.min().item())
49
+ over = int((tokens >= codebook_size).sum().item())
50
+ under = int((tokens < 0).sum().item())
51
+
52
+ print(
53
+ f"[{flow}] path={path} "
54
+ f"shape={tuple(tokens.shape)} "
55
+ f"min={min_id} max={max_id} "
56
+ f"<0={under} >=4096={over}"
57
+ )
58
+ return over, under
59
+
60
+
61
+ def build_prompting(config) -> UniversalPrompting:
62
+ tokenizer = AutoTokenizer.from_pretrained(
63
+ config.model.omada.tokenizer_path,
64
+ padding_side="left",
65
+ )
66
+ special_tokens = (
67
+ "<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>",
68
+ "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>",
69
+ "<|i2i|>", "<|v2t|>", "<|v2s|>", "<|s2t|>",
70
+ "<|t2s|>", "<|s2s|>", "<|soa|>", "<|eoa|>",
71
+ )
72
+ prompt = UniversalPrompting(
73
+ tokenizer,
74
+ max_text_len=config.dataset.preprocessing.max_seq_length,
75
+ max_audio_len=config.dataset.preprocessing.max_aud_length,
76
+ max_audio_len_short=config.dataset.preprocessing.max_aud_length_short,
77
+ ignore_id=-100,
78
+ cond_dropout_prob=config.training.cond_dropout_prob,
79
+ special_tokens=special_tokens,
80
+ use_reserved_token=True,
81
+ )
82
+ return prompt
83
+
84
+
85
+ def sample_indices(length: int, num: int) -> Tuple[Iterable[int], int]:
86
+ """
87
+ Returns iterable of indices and the total count that will be iterated.
88
+ If num <= 0 or num >= length, iterates through the whole dataset.
89
+ """
90
+ if num is None or num <= 0 or num >= length:
91
+ return range(length), length
92
+ indices = random.sample(range(length), num)
93
+ return indices, len(indices)
94
+
95
+
96
+ @torch.no_grad()
97
+ def inspect_v2s(config, prompting, vq_model, num_samples: int):
98
+ speech_cfg = OmegaConf.to_container(
99
+ config.dataset.params.get("video_speech_dataset", {}),
100
+ resolve=True
101
+ ) or {}
102
+ dataset = VideoSpeechDataset(
103
+ transform=image_transform,
104
+ resolution=config.dataset.preprocessing.resolution,
105
+ num_frames=speech_cfg.get("num_frames_speech", 4),
106
+ video_root=speech_cfg.get(
107
+ "video_root", "/home/work/AIDAS/data/video/openvid1m/video/video"
108
+ ),
109
+ audio_root=speech_cfg.get(
110
+ "audio_root", "/home/work/AIDAS/data/video-speech"
111
+ ),
112
+ speech_dir_name=speech_cfg.get("speech_dir_name", "openvid-speech-trunc"),
113
+ index_path=speech_cfg.get(
114
+ "index_path", "/home/work/AIDAS/data/video-speech/openvid-speech.csv"
115
+ ),
116
+ sample_method=speech_cfg.get("sample_method", "uniform"),
117
+ precomputed_tokens_root=speech_cfg.get("precomputed_tokens_root"),
118
+ )
119
+
120
+ print(f"\n=== VideoSpeechDataset (v2s) | total={len(dataset)} ===")
121
+ total_over = total_under = 0
122
+ indices, total = sample_indices(len(dataset), num_samples)
123
+ for idx in tqdm(indices, total=total, desc="v2s audio", unit="sample"):
124
+ sample = dataset.data[idx]
125
+ speech_path = sample["speech"]
126
+ tokens = dataset._load_precomputed_tokens(speech_path)
127
+ if tokens is not None:
128
+ tokens = tokens.long()
129
+ else:
130
+ tokens = vq_model.encode(speech_path).squeeze(0).long()
131
+ over, under = _log_stats("v2s", speech_path, tokens)
132
+ total_over += over
133
+ total_under += under
134
+
135
+ print(f"[v2s] total >=4096: {total_over} | total <0: {total_under}")
136
+
137
+
138
+ @torch.no_grad()
139
+ def inspect_t2s(config, prompting, vq_model, num_samples: int):
140
+ dataset = MixedSpeechTextDataset(config.dataset.params.audio_data)
141
+
142
+ print(f"\n=== MixedSpeechTextDataset (t2s/s2t 공용) | total={len(dataset)} ===")
143
+ total_over = total_under = 0
144
+ indices, total = sample_indices(len(dataset), num_samples)
145
+ for idx in tqdm(indices, total=total, desc="t2s/s2t audio", unit="sample"):
146
+ sample = dataset[idx]
147
+ entry = sample["audio_path"]
148
+ if isinstance(entry, np.ndarray):
149
+ tokens = torch.from_numpy(entry).long()
150
+ path_repr = "<precomputed-array>"
151
+ elif isinstance(entry, str):
152
+ tokens = vq_model.encode(entry).squeeze(0).long()
153
+ path_repr = entry
154
+ else:
155
+ tokens = torch.as_tensor(entry, dtype=torch.long)
156
+ path_repr = "<sequence>"
157
+ over, under = _log_stats("t2s/s2t-source", path_repr, tokens)
158
+ total_over += over
159
+ total_under += under
160
+
161
+ print(f"[t2s] total >=4096: {total_over} | total <0: {total_under}")
162
+
163
+
164
+ def main():
165
+ parser = argparse.ArgumentParser()
166
+ parser.add_argument("--config", required=True,
167
+ help="학습에 사용한 YAML 설정 파일")
168
+ parser.add_argument(
169
+ "--samples",
170
+ type=int,
171
+ default=-1,
172
+ help="각 데이터셋에서 검사할 샘플 수 (<=0이면 전체 검사)",
173
+ )
174
+ args = parser.parse_args()
175
+
176
+ config = OmegaConf.load(args.config)
177
+ prompting = build_prompting(config)
178
+
179
+ vq_model = EMOVASpeechTokenizer.from_pretrained(
180
+ config.model.vq_model_audio.vq_model_name
181
+ )
182
+ vq_model.eval()
183
+
184
+ inspect_v2s(config, prompting, vq_model, args.samples)
185
+ # inspect_t2s(config, prompting, vq_model, args.samples)
186
+
187
+
188
+ if __name__ == "__main__":
189
+ torch.manual_seed(0)
190
+ random.seed(0)
191
+ main()
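
The script's core invariant is simple: every EMOVA speech token id must fall inside the speech codebook, i.e. in [0, 4096). A standalone sketch of that range check; the codebook size mirrors speech_codebook_size in the configs below.

```python
import torch

def count_out_of_range(tokens: torch.Tensor, codebook_size: int = 4096) -> tuple:
    """Return (# ids >= codebook_size, # ids < 0) for a tensor of token ids."""
    return int((tokens >= codebook_size).sum()), int((tokens < 0).sum())

print(count_out_of_range(torch.tensor([0, 12, 4095, 4096, -1])))  # (1, 1)
```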
MMaDA/configs/mmada_demo.yaml ADDED
@@ -0,0 +1,95 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "demo"
8
+ name: "mmada-demo"
9
+ output_dir: "mmada-demo"
10
+
11
+ model:
12
+ vq_model:
13
+ type: "magvitv2"
14
+ vq_model_name: "showlab/magvitv2"
15
+
16
+ mmada:
17
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
18
+ w_clip_vit: False
19
+ new_vocab_size: 134656
20
+ llm_vocab_size: 126464
21
+ codebook_size: 8192
22
+ num_vq_tokens: 256
23
+ num_new_special_tokens: 0
24
+ tie_word_embeddings: False
25
+
26
+ gradient_checkpointing: True
27
+
28
+ dataset:
29
+ gen_type: "imagenet1k"
30
+ und_type: "captioning"
31
+ combined_loader_mode: "max_size_cycle"
32
+ params:
33
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
34
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
35
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
36
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..01209}.tar"
37
+ ]
38
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
39
+ add_caption_prompt: True
40
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
41
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
42
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
43
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
44
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
45
+ shuffle_buffer_size: 1000
46
+ num_workers: 32
47
+ resolution: 256
48
+ pin_memory: True
49
+ persistent_workers: True
50
+
51
+ preprocessing:
52
+ max_seq_length: 512 # for text tokens
53
+ resolution: 256
54
+ center_crop: False
55
+ random_flip: False
56
+
57
+ optimizer:
58
+ name: adamw
59
+ params: # default adamw params
60
+ learning_rate: 5e-5
61
+ scale_lr: False # scale learning rate by total batch size
62
+ beta1: 0.9
63
+ beta2: 0.999
64
+ weight_decay: 0.01
65
+ epsilon: 1e-8
66
+
67
+ lr_scheduler:
68
+ scheduler: "cosine"
69
+ params:
70
+ learning_rate: ${optimizer.params.learning_rate}
71
+ warmup_steps: 8000
72
+
73
+ training:
74
+ gradient_accumulation_steps: 4
75
+ noise_type: "mask"
76
+ batch_size_t2i: 5
77
+ batch_size_lm: 1
78
+ batch_size_mmu: 2
79
+ mixed_precision: "bf16"
80
+ enable_tf32: True
81
+ seed: 10086
82
+ max_train_steps: 500000
83
+ overfit_one_batch: False
84
+ cond_dropout_prob: 0.1
85
+ min_masking_rate: 0.0
86
+ label_smoothing: 0.0
87
+ max_grad_norm: 1
88
+ guidance_scale: 1.5
89
+ generation_timesteps: 20
90
+ t2i_coeff: 1.0
91
+ lm_coeff: 0.1
92
+ mmu_coeff: 1.0
93
+
94
+ mask_schedule:
95
+ schedule: "cosine"
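
These YAML files are OmegaConf configs: `${optimizer.params.learning_rate}` under `lr_scheduler.params` is an interpolation resolved against the config root, so the learning rate is defined once. A minimal sketch of loading and resolving such a file; the path is illustrative.

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("MMaDA/configs/mmada_demo.yaml")   # illustrative path
OmegaConf.resolve(cfg)                                  # materialise ${...} interpolations in place
assert cfg.lr_scheduler.params.learning_rate == cfg.optimizer.params.learning_rate
print(cfg.training.batch_size_t2i, cfg.training.guidance_scale)  # 5 1.5
```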
MMaDA/configs/mmada_demo_s2t.yaml ADDED
@@ -0,0 +1,131 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-training-stage1"
8
+ name: "omada-training-stage1"
9
+ output_dir: "ckpts/omada/omada-training-stage1"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 10000000000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 5 # task token 3 + eoa / soa
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ # subset for gigaspeech: xs, xl
63
+ # subset for librispeech: train-clean-360, train-clean-100
64
+ # subset for commonvoice: validated, invalidated
65
+ audio_data:
66
+ - name: "gigaspeech"
67
+ subset: "xl"
68
+ split: "train"
69
+ - name: "librispeech"
70
+ subset: "train-clean-360"
71
+ - name: "commonvoice"
72
+ subset: "validated"
73
+ #########################################################################
74
+ shuffle_buffer_size: 1000
75
+ num_workers: 8
76
+ resolution: 256
77
+ pin_memory: True
78
+ persistent_workers: True
79
+
80
+ preprocessing:
81
+ max_seq_length: 128 # for text tokens
82
+ max_aud_length: 256 # for audio tokens
83
+ resolution: 128
84
+ center_crop: False
85
+ random_flip: False
86
+
87
+ optimizer:
88
+ name: adamw
89
+ params: # default adamw params
90
+ learning_rate: 1e-5
91
+ scale_lr: False # scale learning rate by total batch size
92
+ beta1: 0.9
93
+ beta2: 0.999
94
+ weight_decay: 0.01
95
+ epsilon: 1e-8
96
+
97
+ lr_scheduler:
98
+ scheduler: "cosine"
99
+ params:
100
+ learning_rate: ${optimizer.params.learning_rate}
101
+ warmup_steps: 3000
102
+ min_lr_scale: 0.1
103
+
104
+ training:
105
+ gradient_accumulation_steps: 1
106
+ noise_type: "mask"
107
+ batch_size_t2i: 0
108
+ batch_size_lm: 0
109
+ batch_size_mmu: 0
110
+ batch_size_v2t: 2
111
+ batch_size_s2t: 2
112
+ batch_size_t2s: 3
113
+
114
+ mixed_precision: "bf16"
115
+ enable_tf32: True
116
+ seed: 10086
117
+ max_train_steps: 200000
118
+ max_train_epochs: 1
119
+ overfit_one_batch: False
120
+ cond_dropout_prob: 0.1
121
+ min_masking_rate: 0.0
122
+ label_smoothing: 0.0
123
+ max_grad_norm: 1
124
+ guidance_scale: 0.75
125
+ generation_timesteps: 16
126
+ # t2i_coeff: 0.1
127
+ # lm_coeff: 0.1
128
+ # mmu_coeff: 0.1
129
+ v2t_coeff: 1.0
130
+ t2s_coeff: 1.0
131
+ s2t_coeff: 1.0
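
The trailing *_coeff entries weight each task's loss before they are summed for the backward pass. A hedged sketch of that combination; only the coefficient names come from this config, and the per-task loss values are placeholders.

```python
coeffs = {"v2t": 1.0, "t2s": 1.0, "s2t": 1.0}   # values from training.*_coeff above
losses = {"v2t": 2.3, "t2s": 1.7, "s2t": 1.9}   # placeholder per-task scalar losses
total_loss = sum(coeffs[name] * losses[name] for name in coeffs)  # weighted multi-task loss
print(total_loss)  # ≈ 5.9
```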
MMaDA/configs/mmada_demo_speech.yaml ADDED
@@ -0,0 +1,101 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "demo"
8
+ name: "mmada-demo"
9
+ output_dir: "mmada-demo"
10
+
11
+ model:
12
+ vq_model:
13
+ type: "magvitv2"
14
+ vq_model_name: "showlab/magvitv2"
15
+ speech_model:
16
+ type: "emova"
17
+ speech_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
18
+
19
+ mmada:
20
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
21
+ w_clip_vit: False
22
+ new_vocab_size: 138752
23
+ llm_vocab_size: 126464
24
+ codebook_size: 8192
25
+ speech_codebook_size: 4096
26
+ num_vq_tokens: 256
27
+ num_speech_vq_tokens: 100
28
+ num_new_special_tokens: 3
29
+ tie_word_embeddings: False
30
+ train_step: 25000
31
+
32
+ gradient_checkpointing: True
33
+
34
+ dataset:
35
+ gen_type: "imagenet1k"
36
+ und_type: "captioning"
37
+ combined_loader_mode: "max_size_cycle"
38
+ params:
39
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
40
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
41
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
42
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..01209}.tar"
43
+ ]
44
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
45
+ add_caption_prompt: True
46
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
47
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
48
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
49
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
50
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
51
+ shuffle_buffer_size: 1000
52
+ num_workers: 32
53
+ resolution: 256
54
+ pin_memory: True
55
+ persistent_workers: True
56
+
57
+ preprocessing:
58
+ max_seq_length: 512 # for text tokens
59
+ resolution: 256
60
+ center_crop: False
61
+ random_flip: False
62
+
63
+ optimizer:
64
+ name: adamw
65
+ params: # default adamw params
66
+ learning_rate: 5e-5
67
+ scale_lr: False # scale learning rate by total batch size
68
+ beta1: 0.9
69
+ beta2: 0.999
70
+ weight_decay: 0.01
71
+ epsilon: 1e-8
72
+
73
+ lr_scheduler:
74
+ scheduler: "cosine"
75
+ params:
76
+ learning_rate: ${optimizer.params.learning_rate}
77
+ warmup_steps: 8000
78
+
79
+ training:
80
+ gradient_accumulation_steps: 4
81
+ noise_type: "mask"
82
+ batch_size_t2i: 5
83
+ batch_size_lm: 1
84
+ batch_size_mmu: 2
85
+ mixed_precision: "bf16"
86
+ enable_tf32: True
87
+ seed: 10086
88
+ max_train_steps: 500000
89
+ overfit_one_batch: False
90
+ cond_dropout_prob: 0.1
91
+ min_masking_rate: 0.0
92
+ label_smoothing: 0.0
93
+ max_grad_norm: 1
94
+ guidance_scale: 1.5
95
+ generation_timesteps: 20
96
+ t2i_coeff: 1.0
97
+ lm_coeff: 0.1
98
+ mmu_coeff: 1.0
99
+
100
+ mask_schedule:
101
+ schedule: "cosine"
MMaDA/configs/mmada_demo_video.yaml ADDED
@@ -0,0 +1,95 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "demo"
8
+ name: "mmada-demo"
9
+ output_dir: "mmada-demo"
10
+
11
+ model:
12
+ vq_model:
13
+ type: "magvitv2"
14
+ vq_model_name: "showlab/magvitv2"
15
+
16
+ mmada:
17
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
18
+ w_clip_vit: False
19
+ new_vocab_size: 134656
20
+ llm_vocab_size: 126464
21
+ codebook_size: 8192
22
+ num_vq_tokens: 256
23
+ num_new_special_tokens: 0
24
+ tie_word_embeddings: False
25
+
26
+ gradient_checkpointing: True
27
+
28
+ dataset:
29
+ gen_type: "imagenet1k"
30
+ und_type: "captioning"
31
+ combined_loader_mode: "max_size_cycle"
32
+ params:
33
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
34
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
35
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
36
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..01209}.tar"
37
+ ]
38
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
39
+ add_caption_prompt: True
40
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
41
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
42
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
43
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
44
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
45
+ shuffle_buffer_size: 1000
46
+ num_workers: 32
47
+ resolution: 128
48
+ pin_memory: True
49
+ persistent_workers: True
50
+
51
+ preprocessing:
52
+ max_seq_length: 512 # for text tokens
53
+ resolution: 256
54
+ center_crop: False
55
+ random_flip: False
56
+
57
+ optimizer:
58
+ name: adamw
59
+ params: # default adamw params
60
+ learning_rate: 5e-5
61
+ scale_lr: False # scale learning rate by total batch size
62
+ beta1: 0.9
63
+ beta2: 0.999
64
+ weight_decay: 0.01
65
+ epsilon: 1e-8
66
+
67
+ lr_scheduler:
68
+ scheduler: "cosine"
69
+ params:
70
+ learning_rate: ${optimizer.params.learning_rate}
71
+ warmup_steps: 8000
72
+
73
+ training:
74
+ gradient_accumulation_steps: 4
75
+ noise_type: "mask"
76
+ batch_size_t2i: 5
77
+ batch_size_lm: 1
78
+ batch_size_mmu: 2
79
+ mixed_precision: "bf16"
80
+ enable_tf32: True
81
+ seed: 10086
82
+ max_train_steps: 500000
83
+ overfit_one_batch: False
84
+ cond_dropout_prob: 0.1
85
+ min_masking_rate: 0.0
86
+ label_smoothing: 0.0
87
+ max_grad_norm: 1
88
+ guidance_scale: 1.5
89
+ generation_timesteps: 20
90
+ t2i_coeff: 1.0
91
+ lm_coeff: 0.1
92
+ mmu_coeff: 1.0
93
+
94
+ mask_schedule:
95
+ schedule: "cosine"
MMaDA/configs/mmada_demo_video_temp.yaml ADDED
@@ -0,0 +1,95 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "demo"
8
+ name: "mmada-demo"
9
+ output_dir: "mmada-demo"
10
+
11
+ model:
12
+ vq_model:
13
+ type: "magvitv2"
14
+ vq_model_name: "showlab/magvitv2"
15
+
16
+ mmada:
17
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
18
+ w_clip_vit: False
19
+ new_vocab_size: 134656
20
+ llm_vocab_size: 126464
21
+ codebook_size: 8192
22
+ num_vq_tokens: 900
23
+ num_new_special_tokens: 0
24
+ tie_word_embeddings: False
25
+
26
+ gradient_checkpointing: True
27
+
28
+ dataset:
29
+ gen_type: "imagenet1k"
30
+ und_type: "captioning"
31
+ combined_loader_mode: "max_size_cycle"
32
+ params:
33
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
34
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
35
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
36
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..01209}.tar"
37
+ ]
38
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
39
+ add_caption_prompt: True
40
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
41
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
42
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
43
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
44
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
45
+ shuffle_buffer_size: 1000
46
+ num_workers: 32
47
+ resolution: 480
48
+ pin_memory: True
49
+ persistent_workers: True
50
+
51
+ preprocessing:
52
+ max_seq_length: 512 # for text tokens
53
+ resolution: 480
54
+ center_crop: False
55
+ random_flip: False
56
+
57
+ optimizer:
58
+ name: adamw
59
+ params: # default adamw params
60
+ learning_rate: 5e-5
61
+ scale_lr: False # scale learning rate by total batch size
62
+ beta1: 0.9
63
+ beta2: 0.999
64
+ weight_decay: 0.01
65
+ epsilon: 1e-8
66
+
67
+ lr_scheduler:
68
+ scheduler: "cosine"
69
+ params:
70
+ learning_rate: ${optimizer.params.learning_rate}
71
+ warmup_steps: 8000
72
+
73
+ training:
74
+ gradient_accumulation_steps: 4
75
+ noise_type: "mask"
76
+ batch_size_t2i: 5
77
+ batch_size_lm: 1
78
+ batch_size_mmu: 2
79
+ mixed_precision: "bf16"
80
+ enable_tf32: True
81
+ seed: 10086
82
+ max_train_steps: 500000
83
+ overfit_one_batch: False
84
+ cond_dropout_prob: 0.1
85
+ min_masking_rate: 0.0
86
+ label_smoothing: 0.0
87
+ max_grad_norm: 1
88
+ guidance_scale: 1.5
89
+ generation_timesteps: 20
90
+ t2i_coeff: 1.0
91
+ lm_coeff: 0.1
92
+ mmu_coeff: 1.0
93
+
94
+ mask_schedule:
95
+ schedule: "cosine"
MMaDA/configs/mmada_pretraining_i2i.yaml ADDED
@@ -0,0 +1,86 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "ommda-training-i2i_256_0715"
8
+ name: "ommda-training-i2i-mmada-instruct_256_0715"
9
+ output_dir: "ommda-training-i2i-mmada-instruct_256_0715"
10
+ save_every: 5000
11
+ eval_every: 20000
12
+ generate_every: 5000
13
+ num_validation_images: 20
14
+ log_every: 1
15
+ log_grad_norm_every: 100
16
+ resume_from_checkpoint: "latest"
17
+ val_every: 50000
18
+ max_val_examples_t2i: 2000
19
+
20
+ model:
21
+ vq_model:
22
+ type: "magvitv2"
23
+ vq_model_name: "showlab/magvitv2"
24
+
25
+ mmada:
26
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
27
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
28
+ w_clip_vit: False
29
+ new_vocab_size: 134656
30
+ llm_vocab_size: 126464
31
+ codebook_size: 8192
32
+ num_vq_tokens: 256
33
+ num_new_special_tokens: 0
34
+ tie_word_embeddings: False
35
+
36
+ gradient_checkpointing: True
37
+
38
+ dataset:
39
+ params:
40
+ num_workers: 0
41
+ resolution: 256
42
+ pin_memory: True
43
+ persistent_workers: True
44
+
45
+ preprocessing:
46
+ max_seq_length: 256 # for text tokens
47
+ resolution: 256
48
+ center_crop: False
49
+ random_flip: False
50
+
51
+ optimizer:
52
+ name: adamw
53
+ params: # default adamw params
54
+ learning_rate: 5e-5
55
+ scale_lr: False # scale learning rate by total batch size
56
+ beta1: 0.9
57
+ beta2: 0.999
58
+ weight_decay: 0.01
59
+ epsilon: 1e-8
60
+
61
+ lr_scheduler:
62
+ scheduler: "cosine"
63
+ params:
64
+ learning_rate: ${optimizer.params.learning_rate}
65
+ warmup_steps: 5000
66
+ min_lr_scale: 0.1
67
+
68
+ training:
69
+ gradient_accumulation_steps: 4
70
+ noise_type: "mask"
71
+ batch_size_i2i: 1
72
+ mixed_precision: "bf16"
73
+ enable_tf32: True
74
+ seed: 10086
75
+ max_train_steps: 50000
76
+ overfit_one_batch: False
77
+ cond_dropout_prob: 0.1
78
+ min_masking_rate: 0.0
79
+ label_smoothing: 0.0
80
+ max_grad_norm: 1
81
+ guidance_scale: 5
82
+ generation_timesteps: 50
83
+ t2i_coeff: 1.0
84
+ lm_coeff: 0.1
85
+ mmu_coeff: 0.5
86
+ validation_seed: 42
MMaDA/configs/mmada_pretraining_s2t.yaml ADDED
@@ -0,0 +1,96 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "ommda-training-s2t"
8
+ name: "ommda-training-s2t-mmada"
9
+ output_dir: "ommda-training-s2t-mmada"
10
+ save_every: 5000
11
+ eval_every: 20000
12
+ generate_every: 5000
13
+ num_validation_images: 20
14
+ log_every: 1
15
+ log_grad_norm_every: 100
16
+ resume_from_checkpoint: False
17
+ val_every: 50000
18
+ max_val_examples_t2i: 2000
19
+
20
+ model:
21
+ vq_model:
22
+ type: "emova"
23
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
24
+
25
+ mmada:
26
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
27
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
28
+ w_clip_vit: False
29
+ new_vocab_size: 138752
30
+ llm_vocab_size: 126464
31
+ codebook_size: 8192
32
+ speech_codebook_size: 4096
33
+ # num_vq_tokens: 256
34
+ # num_speech_vq_tokens: 250
35
+ num_new_special_tokens: 3
36
+ tie_word_embeddings: False
37
+
38
+ gradient_checkpointing: True
39
+
40
+ dataset:
41
+ params:
42
+ num_workers: 0
43
+ resolution: 256
44
+ pin_memory: True
45
+ persistent_workers: True
46
+
47
+ preprocessing:
48
+ max_seq_length: 256 # for text tokens
49
+ resolution: 256
50
+ center_crop: False
51
+ random_flip: False
52
+
53
+ data:
54
+ # subset for gigaspeech: xs, xl
55
+ # subset for librispeech: train-clean-360, train-clean-100
56
+ # subset for commonvoice: validated, invalidated
57
+ name: "gigaspeech"
58
+ subset: "xl"
59
+ split: "train"
60
+
61
+ optimizer:
62
+ name: adamw
63
+ params: # default adamw params
64
+ learning_rate: 5e-5
65
+ scale_lr: False # scale learning rate by total batch size
66
+ beta1: 0.9
67
+ beta2: 0.999
68
+ weight_decay: 0.01
69
+ epsilon: 1e-8
70
+
71
+ lr_scheduler:
72
+ scheduler: "cosine"
73
+ params:
74
+ learning_rate: ${optimizer.params.learning_rate}
75
+ warmup_steps: 5000
76
+ min_lr_scale: 0.1
77
+
78
+ training:
79
+ gradient_accumulation_steps: 4
80
+ noise_type: "mask"
81
+ batch_size_s2t: 4
82
+ mixed_precision: "bf16"
83
+ enable_tf32: True
84
+ seed: 10086
85
+ max_train_steps: 50000
86
+ overfit_one_batch: False
87
+ cond_dropout_prob: 0.1
88
+ min_masking_rate: 0.0
89
+ label_smoothing: 0.0
90
+ max_grad_norm: 1
91
+ guidance_scale: 5
92
+ generation_timesteps: 50
93
+ t2i_coeff: 1.0
94
+ lm_coeff: 0.1
95
+ mmu_coeff: 0.5
96
+ validation_seed: 42
MMaDA/configs/mmada_pretraining_stage1_llada_instruct.yaml ADDED
@@ -0,0 +1,100 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "mmada-training-stage1"
8
+ name: "mmada-training-stage1-llada-instruct"
9
+ output_dir: "mmada-training-stage1-llada-instruct"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 10000
13
+ eval_every: 2500
14
+ generate_every: 1000
15
+ log_every: 50
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ mmada:
24
+ pretrained_model_path: "GSAI-ML/LLaDA-8B-Instruct"
25
+ w_clip_vit: False
26
+ new_vocab_size: 134656
27
+ llm_vocab_size: 126464
28
+ codebook_size: 8192
29
+ num_vq_tokens: 256
30
+ num_new_special_tokens: 0
31
+ tie_word_embeddings: False
32
+
33
+ gradient_checkpointing: True
34
+
35
+ dataset:
36
+ gen_type: "imagenet1k"
37
+ und_type: "captioning"
38
+ combined_loader_mode: "max_size_cycle"
39
+ params:
40
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
41
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
42
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
43
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
44
+ ]
45
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
46
+ add_caption_prompt: True
47
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
48
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
49
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
50
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
51
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
52
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
53
+ shuffle_buffer_size: 1000
54
+ num_workers: 32
55
+ resolution: 256
56
+ pin_memory: True
57
+ persistent_workers: True
58
+
59
+ preprocessing:
60
+ max_seq_length: 128 # for text tokens
61
+ resolution: 256
62
+ center_crop: False
63
+ random_flip: False
64
+
65
+ optimizer:
66
+ name: adamw
67
+ params: # default adamw params
68
+ learning_rate: 1e-4
69
+ scale_lr: False # scale learning rate by total batch size
70
+ beta1: 0.9
71
+ beta2: 0.999
72
+ weight_decay: 0.01
73
+ epsilon: 1e-8
74
+
75
+ lr_scheduler:
76
+ scheduler: "cosine"
77
+ params:
78
+ learning_rate: ${optimizer.params.learning_rate}
79
+ warmup_steps: 5000
80
+
81
+ training:
82
+ gradient_accumulation_steps: 2
83
+ noise_type: "mask"
84
+ batch_size_t2i: 7
85
+ batch_size_lm: 2
86
+ batch_size_mmu: 6
87
+ mixed_precision: "bf16"
88
+ enable_tf32: True
89
+ seed: 10086
90
+ max_train_steps: 500000
91
+ overfit_one_batch: False
92
+ cond_dropout_prob: 0.1
93
+ min_masking_rate: 0.0
94
+ label_smoothing: 0.0
95
+ max_grad_norm: 1
96
+ guidance_scale: 1.5
97
+ generation_timesteps: 12
98
+ t2i_coeff: 1.0
99
+ lm_coeff: 0.1
100
+ mmu_coeff: 1.0
MMaDA/configs/mmada_pretraining_stage2_llada_instruct.yaml ADDED
@@ -0,0 +1,109 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "mmada-training-stage2"
8
+ name: "mmada-training-stage2-llada-instruct"
9
+ output_dir: "mmada-training-stage2-llada-instruct"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 10000
13
+ eval_every: 2500
14
+ generate_every: 1000
15
+ log_every: 50
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+ val_every: 50
19
+ max_val_examples_t2i: 2000
20
+
21
+ model:
22
+ vq_model:
23
+ type: "magvitv2"
24
+ vq_model_name: "showlab/magvitv2"
25
+
26
+ mmada:
27
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
28
+ pretrained_model_path: "path/to/your/checkpoint"
29
+ w_clip_vit: False
30
+ new_vocab_size: 134656
31
+ llm_vocab_size: 126464
32
+ codebook_size: 8192
33
+ num_vq_tokens: 256
34
+ num_new_special_tokens: 0
35
+ tie_word_embeddings: False
36
+
37
+ gradient_checkpointing: True
38
+
39
+ dataset:
40
+ gen_type: "t2i"
41
+ und_type: "captioning"
42
+ combined_loader_mode: "max_size_cycle"
43
+ params:
44
+ train_t2i_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
45
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
46
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
47
+ ]
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/ty/datasets/laion-aesthetics-12m-images-2"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/new_captions"
58
+ validation_prompts_file: "validation_prompts/text2image_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ shuffle_buffer_size: 1000
61
+ num_workers: 32
62
+ resolution: 256
63
+ pin_memory: True
64
+ persistent_workers: True
65
+
66
+ preprocessing:
67
+ max_seq_length: 256 # for text tokens
68
+ resolution: 256
69
+ center_crop: False
70
+ random_flip: False
71
+
72
+ optimizer:
73
+ name: adamw
74
+ params: # default adamw params
75
+ learning_rate: 5e-5
76
+ scale_lr: False # scale learning rate by total batch size
77
+ beta1: 0.9
78
+ beta2: 0.999
79
+ weight_decay: 0.01
80
+ epsilon: 1e-8
81
+
82
+ lr_scheduler:
83
+ scheduler: "cosine"
84
+ params:
85
+ learning_rate: ${optimizer.params.learning_rate}
86
+ warmup_steps: 5000
87
+ min_lr_scale: 0.1
88
+
89
+ training:
90
+ gradient_accumulation_steps: 2
91
+ noise_type: "mask"
92
+ batch_size_t2i: 7
93
+ batch_size_lm: 2
94
+ batch_size_mmu: 3
95
+ mixed_precision: "bf16"
96
+ enable_tf32: True
97
+ seed: 10086
98
+ max_train_steps: 1000000
99
+ overfit_one_batch: False
100
+ cond_dropout_prob: 0.1
101
+ min_masking_rate: 0.0
102
+ label_smoothing: 0.0
103
+ max_grad_norm: 1
104
+ guidance_scale: 3
105
+ generation_timesteps: 12
106
+ t2i_coeff: 1.0
107
+ lm_coeff: 0.1
108
+ mmu_coeff: 0.5
109
+ validation_seed: 42
MMaDA/configs/mmada_pretraining_stage3_llada_instruct.yaml ADDED
@@ -0,0 +1,112 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "mmada-training-stage3"
8
+ name: "mmada-training-stage3-llada-instruct"
9
+ output_dir: "mmada-training-stage3-llada-instruct"
10
+ max_train_examples_t2i: 40000000 #
11
+ max_train_examples_mmu: 40000000 #
12
+ save_every: 10000
13
+ eval_every: 2500
14
+ generate_every: 1000
15
+ log_every: 50
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+ val_every: 50
19
+ max_val_examples_t2i: 2000
20
+
21
+ model:
22
+ vq_model:
23
+ type: "magvitv2"
24
+ vq_model_name: "showlab/magvitv2"
25
+
26
+ mmada:
27
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
28
+ pretrained_model_path: "path/to/your/checkpoint"
29
+ w_clip_vit: False
30
+ new_vocab_size: 134656
31
+ llm_vocab_size: 126464
32
+ codebook_size: 8192
33
+ num_vq_tokens: 256
34
+ num_new_special_tokens: 0
35
+ tie_word_embeddings: False
36
+
37
+ gradient_checkpointing: True
38
+
39
+ dataset:
40
+ gen_type: "t2i"
41
+ und_type: "captioning"
42
+ combined_loader_mode: "max_size_cycle"
43
+ params:
44
+ train_t2i_shards_path_or_url: [ #
45
+ "/data_storage/shared/datasets/JourneyDB/train/imgs/data/train/imgs/{000..199}.tgz",
46
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar",
47
+ "/data_storage/shared/datasets/text-to-image-2M/data_512_2M"
48
+ ]
49
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar", #
50
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
51
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
52
+ ]
53
+ train_lm_shards_path_or_url: "/data_storage/ty/shared/datasets/3-instruct-datasets/parquet/*.parquet"
54
+ add_caption_prompt: True
55
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
56
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
57
+ external_laion12m_caption_path: "/data_storage/ty/datasets/laion-aesthetics-12m-images-2"
58
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/new_captions"
59
+ external_text_to_image_2M_512_caption_path: "/data_storage/shared/datasets/text-to-image-2M/data_512_2M_captions"
60
+ validation_prompts_file: "validation_prompts/text2image_prompts.txt"
61
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
62
+ lm_chat_validation_jsonl: "/data_storage/ty/MMaDA/lm_chat_validation/questions.jsonl"
63
+ shuffle_buffer_size: 1000
64
+ num_workers: 32
65
+ resolution: 512
66
+ pin_memory: True
67
+ persistent_workers: True
68
+
69
+ preprocessing:
70
+ max_seq_length: 512 # for text tokens 512
71
+ resolution: 512
72
+ center_crop: False
73
+ random_flip: False
74
+
75
+ optimizer:
76
+ name: adamw
77
+ params: # default adamw params
78
+ learning_rate: 5e-5
79
+ scale_lr: False # scale learning rate by total batch size
80
+ beta1: 0.9
81
+ beta2: 0.999
82
+ weight_decay: 0.01
83
+ epsilon: 1e-8
84
+
85
+ lr_scheduler:
86
+ scheduler: "cosine"
87
+ params:
88
+ learning_rate: ${optimizer.params.learning_rate}
89
+ warmup_steps: 5000
90
+ min_lr_scale: 0.1
91
+
92
+ training:
93
+ gradient_accumulation_steps: 4 # 4
94
+ noise_type: "mask"
95
+ batch_size_t2i: 4 # 3~4
96
+ batch_size_lm: 1
97
+ batch_size_mmu: 1
98
+ mixed_precision: "bf16"
99
+ enable_tf32: True
100
+ seed: 10086
101
+ max_train_steps: 1000000
102
+ overfit_one_batch: False
103
+ cond_dropout_prob: 0.1
104
+ min_masking_rate: 0.0
105
+ label_smoothing: 0.0
106
+ max_grad_norm: 1
107
+ guidance_scale: 3
108
+ generation_timesteps: 12
109
+ t2i_coeff: 1.0
110
+ lm_coeff: 0.4 # ~0.5
111
+ mmu_coeff: 0.5
112
+ validation_seed: 42
MMaDA/configs/mmada_pretraining_stage3_llada_instruct_512_cot.yaml ADDED
@@ -0,0 +1,123 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "mmada-training-stage3"
8
+ name: "mmada-training-stage3-llada-instruct-512-cot-uni"
9
+ output_dir: "mmada-training-stage3-llada-instruct-512-cot-uni"
10
+ max_train_examples_t2i: 40000000 #
11
+ max_train_examples_mmu: 40000000 #
12
+ save_every: 10000
13
+ eval_every: 2500
14
+ generate_every: 1000
15
+ log_every: 50
16
+ log_grad_norm_every: 100
17
+ # resume_from_checkpoint: False
18
+ resume_from_checkpoint: "latest"
19
+ val_every: 50
20
+ max_val_examples_t2i: 2000
21
+
22
+ model:
23
+ vq_model:
24
+ type: "magvitv2"
25
+ vq_model_name: "showlab/magvitv2"
26
+
27
+ mmada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "path/to/your/checkpoint"
30
+ w_clip_vit: False
31
+ new_vocab_size: 134656
32
+ llm_vocab_size: 126464
33
+ codebook_size: 8192
34
+ num_vq_tokens: 1024
35
+ num_new_special_tokens: 0
36
+ tie_word_embeddings: False
37
+
38
+ gradient_checkpointing: True
39
+
40
+ dataset:
41
+ gen_type: "t2i"
42
+ und_type: "captioning"
43
+ combined_loader_mode: "max_size_cycle"
44
+ params:
45
+ train_t2i_shards_path_or_url: [ "/data_storage/shared/datasets/JourneyDB/train/imgs/data/train/imgs/{000..199}.tgz",
46
+ "/data_storage/shared/datasets/laion-aesthetics-12m-filter/{00000..00999}.tar",
47
+ # "/data_storage/shared/datasets/text-to-image-2M/data_512_2M/data_{000000..000046}.tar"
48
+ ]
49
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/multimodal_cot/ai2d/new_images.tar",
50
+ "/data_storage/shared/datasets/multimodal_cot/clevr/images.tar",
51
+ "/data_storage/shared/datasets/multimodal_cot/docvqa/images.tar",
52
+ "/data_storage/shared/datasets/multimodal_cot/geo/images.tar",
53
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar",
54
+ ]
55
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/3-cot-sft/parquet/*.parquet"
56
+ add_caption_prompt: True
57
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
58
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
59
+ external_laion12m_caption_path: "/data_storage/ty/datasets/laion-aesthetics-12m-images-2"
60
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/new_captions"
61
+ external_text_to_image_2M_512_caption_path: "/data_storage/shared/datasets/text-to-image-2M/data_512_2M_captions"
62
+ external_ai2d_caption_path: "/data_storage/shared/datasets/multimodal_cot/ai2d/new_metadata.csv"
63
+ external_clevr_caption_path: "/data_storage/shared/datasets/multimodal_cot/clevr/metadata.csv"
64
+ external_docvqa_caption_path: "/data_storage/shared/datasets/multimodal_cot/docvqa/metadata.csv"
65
+ external_geo_caption_path: "/data_storage/shared/datasets/multimodal_cot/geo/metadata.csv"
66
+ validation_prompts_file: "validation_prompts/text2image_prompts.txt"
67
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
68
+ mmu_validation_prompts_file: "/data_storage/ty/MMaDA/mmu_validation/prompts.jsonl"
69
+ lm_chat_validation_jsonl: "/data_storage/ty/MMaDA/lm_chat_validation/questions.jsonl"
70
+ shuffle_buffer_size: 1000
71
+ num_workers: 32
72
+ resolution: 512
73
+ pin_memory: True
74
+ persistent_workers: True
75
+
76
+ preprocessing:
77
+ max_seq_length: 512 # for text tokens in t2i & mmu
78
+ max_lm_text_length: 1536 # for text tokens in lm/lm_chat
79
+ resolution: 512
80
+ center_crop: False
81
+ random_flip: False
82
+
83
+ optimizer:
84
+ name: adamw
85
+ params: # default adamw params
86
+ learning_rate: 5e-5
87
+ scale_lr: False # scale learning rate by total batch size
88
+ beta1: 0.9
89
+ beta2: 0.999
90
+ weight_decay: 0.01
91
+ epsilon: 1e-8
92
+
93
+ lr_scheduler:
94
+ scheduler: "cosine"
95
+ params:
96
+ learning_rate: ${optimizer.params.learning_rate}
97
+ warmup_steps: 5000
98
+ min_lr_scale: 0.1
99
+
100
+ training:
101
+ gradient_accumulation_steps: 4 # 4
102
+ noise_type: "mask"
103
+ batch_size_t2i: 1
104
+ batch_size_lm: 2
105
+ batch_size_mmu: 1
106
+ mixed_precision: "bf16"
107
+ enable_tf32: True
108
+ seed: 10086
109
+ max_train_steps: 1000000
110
+ overfit_one_batch: False
111
+ cond_dropout_prob: 0.1
112
+ min_masking_rate: 0.0
113
+ label_smoothing: 0.0
114
+ max_grad_norm: 1
115
+ guidance_scale: 5
116
+ generation_timesteps: 20
117
+ t2i_coeff: 1.0
118
+ lm_coeff: 0.5
119
+ mmu_coeff: 0.5
120
+
121
+ validation:
122
+ quantative_prompts_file: "/data_storage/ty/MMaDA/validation_prompts/quantative.txt"
123
+ quantative_batch_size: 8
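These stage configs are plain OmegaConf YAML, and the `${optimizer.params.learning_rate}` entry under `lr_scheduler.params` is an interpolation that resolves to the optimizer value at access time. A minimal loading sketch follows; the config path is illustrative and OmegaConf is assumed to be installed.

```python
# Minimal sketch: load a stage config and resolve the learning-rate interpolation.
# The path below is illustrative; point it at whichever config you are editing.
from omegaconf import OmegaConf

cfg = OmegaConf.load("MMaDA/configs/mmada_pretraining_stage3_llada_instruct_512_cot.yaml")

print(cfg.optimizer.params.learning_rate)     # value as written in the YAML
print(cfg.lr_scheduler.params.learning_rate)  # same value, via ${...} interpolation

# A plain dict with every interpolation resolved, e.g. for logging the run config.
resolved = OmegaConf.to_container(cfg, resolve=True)
```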
MMaDA/configs/mmada_pretraining_stage4_llada_instruct.yaml ADDED
@@ -0,0 +1,134 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "mmada-training-stage4"
8
+ name: "mmada-training-stage4-llada-instruct"
9
+ output_dir: "mmada-training-stage4-llada-instruct"
10
+ max_train_examples_t2i: 40000000 #
11
+ max_train_examples_mmu: 40000000 #
12
+ save_every: 10000
13
+ eval_every: 2500
14
+ generate_every: 1000
15
+ log_every: 50
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+ val_every: 50
19
+ max_val_examples_t2i: 2000
20
+
21
+ model:
22
+ vq_model:
23
+ type: "magvitv2"
24
+ vq_model_name: "showlab/magvitv2"
25
+
26
+ mmada:
27
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
28
+ pretrained_model_path: "/data_storage/ty/MMaDA/mmada-training-stage3-llada-instruct-512-cot-uni/checkpoint-210000/unwrapped_model"
29
+ w_clip_vit: False
30
+ new_vocab_size: 134656
31
+ llm_vocab_size: 126464
32
+ codebook_size: 8192
33
+ num_vq_tokens: 1024
34
+ num_new_special_tokens: 0
35
+ tie_word_embeddings: False
36
+
37
+ gradient_checkpointing: True
38
+
39
+ dataset:
40
+ gen_type: "t2i"
41
+ und_type: "captioning"
42
+ combined_loader_mode: "max_size_cycle"
43
+ params:
44
+ train_t2i_shards_path_or_url: [ "/data_storage/shared/datasets/JourneyDB/train/imgs/data/train/imgs/{000..199}.tgz",
45
+ "/data_storage/shared/datasets/laion-aesthetics-12m-filter/{00000..00999}.tar",
46
+ # "/data_storage/shared/datasets/text-to-image-2M/data_512_2M/data_{000000..000046}.tar"
47
+ ]
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/multimodal_cot/ai2d/new_images.tar",
49
+ "/data_storage/shared/datasets/multimodal_cot/clevr/images.tar",
50
+ "/data_storage/shared/datasets/multimodal_cot/docvqa/images.tar",
51
+ "/data_storage/shared/datasets/multimodal_cot/geo/images.tar",
52
+ ]
53
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
54
+ train_instruct_shards_path_or_url: "/data_storage/shared/datasets/stage4_instruct/*.parquet"
55
+ add_caption_prompt: True
56
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
57
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
58
+ external_laion12m_caption_path: "/data_storage/ty/datasets/laion-aesthetics-12m-images-2"
59
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/new_captions"
60
+ external_text_to_image_2M_512_caption_path: "/data_storage/shared/datasets/text-to-image-2M/data_512_2M_captions"
61
+ external_ai2d_caption_path: "/data_storage/shared/datasets/multimodal_cot/ai2d/new_metadata.csv"
62
+ external_clevr_caption_path: "/data_storage/shared/datasets/multimodal_cot/clevr/metadata.csv"
63
+ external_docvqa_caption_path: "/data_storage/shared/datasets/multimodal_cot/docvqa/metadata.csv"
64
+ external_geo_caption_path: "/data_storage/shared/datasets/multimodal_cot/geo/metadata.csv"
65
+ external_vqa_caption_path: "/data_storage/shared/datasets/LLaVA-Instruct-150K/llava_v1_5_mix665k.json"
66
+ external_clevr2_caption_path: "/data_storage/ty/datasets/Clevr_CoGenT_TrainA_70K_Complex/captions.json"
67
+ external_geo170k_caption_path: "/data_storage/ty/shared/datasets/Geo170K/Geo170K/all.json"
68
+ vqa_images_path: "/data_storage/shared/datasets/LLaVA-Instruct-150K-images"
69
+ clevr2_images_path: "/data_storage/ty/datasets/Clevr_CoGenT_TrainA_70K_Complex/images"
70
+ geo170k_images_path: "/data_storage/ty/shared/datasets/Geo170K/Geo170K/images"
71
+ validation_prompts_file: "validation_prompts/text2image_prompts.txt"
72
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
73
+ mmu_validation_prompts_file: "/data_storage/ty/MMaDA/mmu_validation/prompts_with_vqa.json"
74
+ lm_chat_validation_jsonl: "/data_storage/ty/MMaDA/lm_chat_validation/questions.jsonl"
75
+ shuffle_buffer_size: 1000
76
+ num_workers: 16
77
+ resolution: 512
78
+ pin_memory: True
79
+ persistent_workers: True
80
+
81
+ preprocessing:
82
+ max_seq_length: 512 # for text tokens in t2i & mmu
83
+ max_lm_text_length: 1536 # for text tokens in lm/lm_chat
84
+ resolution: 512
85
+ center_crop: False
86
+ random_flip: False
87
+
88
+ optimizer:
89
+ name: adamw
90
+ params: # default adamw params
91
+ learning_rate: 5e-5
92
+ scale_lr: False # scale learning rate by total batch size
93
+ beta1: 0.9
94
+ beta2: 0.999
95
+ weight_decay: 0.01
96
+ epsilon: 1e-8
97
+
98
+ lr_scheduler:
99
+ scheduler: "cosine"
100
+ params:
101
+ learning_rate: ${optimizer.params.learning_rate}
102
+ warmup_steps: 5000
103
+ min_lr_scale: 0.1
104
+
105
+ training:
106
+ gradient_accumulation_steps: 4 # 4
107
+ noise_type: "mask"
108
+ batch_size_t2i: 1
109
+ batch_size_lm: 2
110
+ batch_size_mmu: 1
111
+ mixed_precision: "bf16"
112
+ enable_tf32: True
113
+ seed: 10086
114
+ max_train_steps: 1000000
115
+ overfit_one_batch: False
116
+ cond_dropout_prob: 0.1
117
+ min_masking_rate: 0.0
118
+ label_smoothing: 0.0
119
+ max_grad_norm: 1
120
+ guidance_scale: 5
121
+ generation_timesteps: 20
122
+ t2i_coeff: 0.05
123
+ lm_coeff: 0.6
124
+ mmu_coeff: 0.4
125
+ cot_in_mmu_coeff: 3.5
126
+ vqa_in_mmu_coeff: 5.5
127
+ clevr2_in_mmu_coeff: 0.5
128
+ geo170k_in_mmu_coeff: 0.5
129
+ base_in_lm_coeff: 0.02
130
+ instruct_in_lm_coeff: 0.98
131
+
132
+ validation:
133
+ quantative_prompts_file: "/data_storage/ty/MMaDA/validation_prompts/quantative.txt"
134
+ quantative_batch_size: 8
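The `*_coeff` entries at the bottom of the training block weight the per-task losses before they are summed into a single objective. A hedged sketch of that combination is below; the actual reduction lives in the training script, and the loss values here are placeholders.

```python
# Hedged sketch: how coefficients such as t2i_coeff / lm_coeff / mmu_coeff are
# typically applied. Loss values are placeholders; the real code lives in the
# training loop, not in this config.
import torch

def combine_losses(losses: dict, coeffs: dict) -> torch.Tensor:
    """Weighted sum of per-task losses, skipping tasks that produced no batch."""
    total = torch.zeros(())
    for task, loss in losses.items():
        if loss is not None:
            total = total + coeffs.get(task, 1.0) * loss
    return total

coeffs = {"t2i": 0.05, "lm": 0.6, "mmu": 0.4}   # values from this config
losses = {"t2i": torch.tensor(2.1), "lm": torch.tensor(3.0), "mmu": None}
print(combine_losses(losses, coeffs))            # 0.05 * 2.1 + 0.6 * 3.0
```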
MMaDA/configs/mmada_pretraining_t2s.yaml ADDED
@@ -0,0 +1,96 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "ommda-training-t2s"
8
+ name: "ommda-training-t2s-mmada"
9
+ output_dir: "ommda-training-t2s-mmada"
10
+ save_every: 5000
11
+ eval_every: 20000
12
+ generate_every: 5000
13
+ num_validation_images: 20
14
+ log_every: 1
15
+ log_grad_norm_every: 100
16
+ resume_from_checkpoint: "latest"
17
+ val_every: 50000
18
+ max_val_examples_t2i: 2000
19
+
20
+ model:
21
+ vq_model:
22
+ type: "emova"
23
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
24
+
25
+ mmada:
26
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
27
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
28
+ w_clip_vit: False
29
+ new_vocab_size: 138752
30
+ llm_vocab_size: 126464
31
+ codebook_size: 8192
32
+ speech_codebook_size: 4096
33
+ # num_vq_tokens: 256
34
+ # num_speech_vq_tokens: 250
35
+ num_new_special_tokens: 3
36
+ tie_word_embeddings: False
37
+
38
+ gradient_checkpointing: True
39
+
40
+ dataset:
41
+ params:
42
+ num_workers: 0
43
+ resolution: 256
44
+ pin_memory: True
45
+ persistent_workers: True
46
+
47
+ preprocessing:
48
+ max_seq_length: 256 # for text tokens
49
+ resolution: 256
50
+ center_crop: False
51
+ random_flip: False
52
+
53
+ data:
54
+ # subset for gigaspeech: xs, xl
55
+ # subset for librispeech: train-clean-360, train-clean-100
56
+ # subset for commonvoice: validated, invalidated
57
+ name: "gigaspeech"
58
+ subset: "xl"
59
+ split: "train"
60
+
61
+ optimizer:
62
+ name: adamw
63
+ params: # default adamw params
64
+ learning_rate: 1e-4
65
+ scale_lr: False # scale learning rate by total batch size
66
+ beta1: 0.9
67
+ beta2: 0.999
68
+ weight_decay: 0.01
69
+ epsilon: 1e-8
70
+
71
+ lr_scheduler:
72
+ scheduler: "cosine"
73
+ params:
74
+ learning_rate: ${optimizer.params.learning_rate}
75
+ warmup_steps: 2500
76
+ min_lr_scale: 0.1
77
+
78
+ training:
79
+ gradient_accumulation_steps: 4
80
+ noise_type: "mask"
81
+ batch_size_s2t: 4
82
+ mixed_precision: "bf16"
83
+ enable_tf32: True
84
+ seed: 10086
85
+ max_train_steps: 50000
86
+ overfit_one_batch: False
87
+ cond_dropout_prob: 0.1
88
+ min_masking_rate: 0.0
89
+ label_smoothing: 0.0
90
+ max_grad_norm: 1
91
+ guidance_scale: 5
92
+ generation_timesteps: 50
93
+ t2i_coeff: 1.0
94
+ lm_coeff: 0.1
95
+ mmu_coeff: 0.5
96
+ validation_seed: 42
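The `lr_scheduler` block describes a linear warmup followed by a cosine decay that bottoms out at `min_lr_scale` times the base learning rate. A self-contained sketch of that shape, using the numbers from this config, is below; the scheduler actually used in training may differ in implementation details.

```python
# Hedged sketch of the cosine schedule with warmup and a floor at
# min_lr_scale * learning_rate, using the numbers from this config.
import math

def lr_at(step: int, base_lr: float = 1e-4, warmup_steps: int = 2500,
          max_steps: int = 50_000, min_lr_scale: float = 0.1) -> float:
    if warmup_steps > 0 and step < warmup_steps:
        return base_lr * step / warmup_steps          # linear warmup
    progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
    progress = min(max(progress, 0.0), 1.0)
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
    return base_lr * (min_lr_scale + (1.0 - min_lr_scale) * cosine)

for step in (0, 2_500, 25_000, 50_000):
    print(step, f"{lr_at(step):.2e}")                 # ends at 1e-5, not 0
```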
MMaDA/configs/mmada_pretraining_v2s.yaml ADDED
@@ -0,0 +1,133 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-training-stage1"
8
+ name: "omada-training-stage1_ignore_SP"
9
+ output_dir: "ckpts/omada/omada-training-stage1_v2s_test"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 5000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 5 # task token 3 + eoa / soa
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ # subset for gigaspeech: xs, xl
63
+ # subset for librispeech: train-clean-360, train-clean-100
64
+ # subset for commonvoice: validated, invalidated
65
+ audio_data:
66
+ - name: "gigaspeech"
67
+ subset: "xl"
68
+ split: "train"
69
+ - name: "librispeech"
70
+ subset: "train-clean-360"
71
+ - name: "commonvoice"
72
+ subset: "validated"
73
+ #########################################################################
74
+ shuffle_buffer_size: 1000
75
+ num_workers: 8
76
+ resolution: 256
77
+ pin_memory: True
78
+ persistent_workers: True
79
+
80
+ preprocessing:
81
+ max_seq_length: 128 # for text tokens
82
+ max_aud_length: 384 # for audio tokens
83
+ resolution: 128
84
+ center_crop: False
85
+ random_flip: False
86
+
87
+ optimizer:
88
+ name: adamw
89
+ params: # default adamw params
90
+ # learning_rate: 1e-4
91
+ learning_rate: 0.000079
92
+ scale_lr: False # scale learning rate by total batch size
93
+ beta1: 0.9
94
+ beta2: 0.999
95
+ weight_decay: 0.01
96
+ epsilon: 1e-8
97
+
98
+ lr_scheduler:
99
+ scheduler: "cosine"
100
+ params:
101
+ learning_rate: ${optimizer.params.learning_rate}
102
+ warmup_steps: 0
103
+ min_lr_scale: 0.1
104
+
105
+ training:
106
+ gradient_accumulation_steps: 1
107
+ noise_type: "mask"
108
+ batch_size_t2i: 0
109
+ batch_size_lm: 0
110
+ batch_size_mmu: 0
111
+ batch_size_v2t: 0
112
+ batch_size_s2t: 0
113
+ batch_size_t2s: 0
114
+ batch_size_v2s: 1
115
+
116
+ mixed_precision: "bf16"
117
+ enable_tf32: True
118
+ seed: 10086
119
+ max_train_steps: 630000 # 2 epochs
120
+ max_train_epochs: NONE
121
+ overfit_one_batch: False
122
+ cond_dropout_prob: 0.1
123
+ min_masking_rate: 0.0
124
+ label_smoothing: 0.0
125
+ max_grad_norm: 1
126
+ guidance_scale: 1.5
127
+ generation_timesteps: 16
128
+ # t2i_coeff: 0.1
129
+ # lm_coeff: 0.1
130
+ # mmu_coeff: 0.1
131
+ v2t_coeff: 0.2
132
+ t2s_coeff: 1.0
133
+ s2t_coeff: 0.2
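Comments such as `max_train_steps: 630000 # 2 epochs` follow from the usual step arithmetic: the global batch size is the per-device batch size times gradient accumulation times the number of processes. A sketch is below; the dataset size and GPU count are illustrative assumptions, not values taken from this repo.

```python
# Hedged sketch of the arithmetic behind "max_train_steps: 630000 # 2 epochs".
# Dataset size and GPU count below are assumptions for illustration only.
def steps_for_epochs(num_examples: int, per_device_batch: int,
                     grad_accum: int, num_processes: int, epochs: int) -> int:
    global_batch = per_device_batch * grad_accum * num_processes
    return (num_examples // global_batch) * epochs

# e.g. ~2.52M speech clips, batch_size_v2s=1, grad_accum=1, 8 GPUs, 2 epochs
print(steps_for_epochs(2_520_000, 1, 1, 8, 2))   # 630000
```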
MMaDA/configs/mmada_pretraining_v2t.yaml ADDED
@@ -0,0 +1,88 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "mmada-training-v2t"
8
+ name: "mmada-training-stage3-llada-instruct-v2t"
9
+ output_dir: "mmada-training-stage3-llada-instruct-v2t-special-token-1e-5"
10
+ max_train_examples_t2i: 40000000 #
11
+ max_train_examples_mmu: 40000000 #
12
+ save_every: 1000
13
+ eval_every: 2500
14
+ generate_every: 1000
15
+ log_every: 10
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+ val_every: 50
19
+ max_val_examples_t2i: 2000
20
+
21
+ model:
22
+ vq_model:
23
+ type: "magvitv2"
24
+ vq_model_name: "showlab/magvitv2"
25
+
26
+ mmada:
27
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
28
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
29
+ w_clip_vit: False
30
+ new_vocab_size: 134656
31
+ llm_vocab_size: 126464
32
+ codebook_size: 8192
33
+ num_vq_tokens: 256
34
+ num_new_special_tokens: 0
35
+ tie_word_embeddings: False
36
+
37
+ gradient_checkpointing: True
38
+
39
+ dataset:
40
+ und_type: "captioning"
41
+ combined_loader_mode: "max_size_cycle"
42
+
43
+ preprocessing:
44
+ max_seq_length: 128 # for text tokens 512
45
+ resolution: 128
46
+ center_crop: False
47
+ random_flip: False
48
+
49
+ params:
50
+ num_workers: 32
51
+
52
+
53
+
54
+ optimizer:
55
+ name: adamw
56
+ params: # default adamw params
57
+ learning_rate: 1e-5
58
+ scale_lr: False # scale learning rate by total batch size
59
+ beta1: 0.9
60
+ beta2: 0.999
61
+ weight_decay: 0.01
62
+ epsilon: 1e-8
63
+
64
+ lr_scheduler:
65
+ scheduler: "cosine"
66
+ params:
67
+ learning_rate: ${optimizer.params.learning_rate}
68
+ warmup_steps: 5000
69
+ min_lr_scale: 0.1
70
+
71
+ training:
72
+ gradient_accumulation_steps: 4 # 4
73
+ noise_type: "mask"
74
+ batch_size_v2t: 4
75
+ batch_size_mmu: 1
76
+ mixed_precision: "bf16"
77
+ enable_tf32: True
78
+ seed: 10086
79
+ max_train_steps: 1000000
80
+ overfit_one_batch: False
81
+ cond_dropout_prob: 0.1
82
+ min_masking_rate: 0.0
83
+ label_smoothing: 0.0
84
+ max_grad_norm: 1
85
+ guidance_scale: 3
86
+ generation_timesteps: 12
87
+ mmu_coeff: 1.0
88
+ validation_seed: 42
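`noise_type: "mask"` together with `min_masking_rate` describes the masked-diffusion corruption: each sample gets a masking rate, and that fraction of target tokens is replaced with a mask id before the model is trained to predict them back. The sketch below shows the general recipe; the mask id and the uniform rate schedule are assumptions, not values read from this repo's code.

```python
# Hedged sketch of mask-based corruption for discrete diffusion training.
# The mask id and the uniform rate schedule are illustrative assumptions.
import torch

def mask_tokens(tokens: torch.Tensor, mask_id: int, min_rate: float = 0.0):
    bsz, seq_len = tokens.shape
    rate = torch.rand(bsz).clamp(min=min_rate)        # one masking rate per sample
    hide = torch.rand(bsz, seq_len) < rate[:, None]   # positions to corrupt
    corrupted = torch.where(hide, torch.full_like(tokens, mask_id), tokens)
    return corrupted, hide                            # hide marks the loss positions

tokens = torch.randint(0, 8192, (2, 16))
corrupted, hide = mask_tokens(tokens, mask_id=8192)   # id outside the codebook range
```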
MMaDA/configs/omada_instruction_tuning.yaml ADDED
@@ -0,0 +1,200 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-instruction-tuning"
8
+ name: "omada-instruction-tuning"
9
+ output_dir: "ckpts/omada/omada-instruction-tuning-tv_sacle_0.7"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 10000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ pretrained_model_path: "/home/work/AIDAS/ckpts/merged_model/hf_common_merge_alpha_999_scale_0p7"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 3 # v2s, s2s, i2i
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ video_speech_dataset:
63
+ sample_mode: "exclusive"
64
+ use_precomputed_tokens: true
65
+ precomputed_tokens_root: "/home/work/AIDAS/cache/speech_tokens"
66
+ llavavid_path: "/home/work/AIDAS/data/video/LLaVA-Video-178K"
67
+ llavavid_local_files_only: true
68
+ llavavid_skip_configs:
69
+ - "llava_hound"
70
+ - "0_30_s_activitynetqa"
71
+ - "30_60_s_activitynetqa"
72
+ - "1_2_m_activitynetqa"
73
+ - "2_3_m_activitynetqa"
74
+ - "0_30_s_activitynet"
75
+ - "30_60_s_activitynet"
76
+ - "1_2_m_activitynet"
77
+ - "2_3_m_activitynet"
78
+ llavavid_skip_video_patterns:
79
+ - "activitynet"
80
+ # video_dataset_name: "openvid1m"
81
+ hqedit_split: "train"
82
+ t2i_dataset: "text2image2m+openimage_i2i+hqedit"
83
+ t2i_split: "train"
84
+ t2i_dataset_name: "jackyhate/text-to-image-2M"
85
+ t2i_local_files_only: true
86
+ openimage_i2i:
87
+ sft_jsonl: "/home/work/AIDAS/data/openimage_source_images/sft_with_local_source_image_path.jsonl"
88
+ pref_jsonl: "/home/work/AIDAS/data/openimage_source_images/pref_with_local_source_image_path.jsonl"
89
+ multi_turn_jsonl: "/home/work/AIDAS/data/openimage_source_images/multi-turn_with_local_source_image_path.jsonl"
90
+ image_root: "/home/work/AIDAS/data/nano_edited_images"
91
+ prefer_summarized_text: true
92
+ pref_positive_only: true
93
+ skip_missing: true
94
+ max_samples_per_source: null
95
+ max_total_samples: null
96
+ seed: 42
97
+ hf_instruction_lm:
98
+ split: "train"
99
+ max_samples_per_source: 1000000
100
+ max_total_samples: 20000000
101
+ seed: 42
102
+ speech2speech:
103
+ - name: "instructs2s"
104
+ use_precomputed_tokens: false
105
+ precomputed_tokens_root: "/home/work/AIDAS/cache/instructs2s_tokens"
106
+ mmu_interleaved:
107
+ local_data_root: /home/work/AIDAS/data/TIGER-Lab/Mantis-Instruct
108
+ local_files_only: true
109
+ # subset for gigaspeech: xs, xl
110
+ # subset for librispeech: train-clean-360, train-clean-100
111
+ # subset for commonvoice: validated, invalidated
112
+ audio_data:
113
+ # - name: "gigaspeech"
114
+ # subset: "xl"
115
+ # split: "train"
116
+ - name: "librispeech"
117
+ subset: "train-clean-360"
118
+ use_precomputed_tokens: true
119
+ precomputed_tokens_root: "/home/work/AIDAS/cache/librispeech_tokens"
120
+ # - name: "commonvoice"
121
+ # subset: "validated"
122
+ #########################################################################
123
+ shuffle_buffer_size: 1000
124
+ num_workers: 0
125
+ resolution: 256
126
+ # resolution: 16
127
+ pin_memory: False
128
+ persistent_workers: False
129
+ dataloader_timeout: 0
130
+
131
+
132
+ speech_token_cache:
133
+ enable: true
134
+ root: "cache/speech_tokens"
135
+ max_items_in_memory: 4096
136
+
137
+ preprocessing:
138
+ max_seq_length: 128 # for text tokens
139
+ max_aud_length: 384 # for audio tokens
140
+ max_aud_length_short: 256 # for short audio tokens
141
+ resolution: 128 # for video tokens
142
+ # max_seq_length: 16 # for text tokens
143
+ # max_aud_length: 16 # for audio tokens
144
+ # resolution: 16 # for video tokens
145
+ center_crop: False
146
+ random_flip: False
147
+
148
+ optimizer:
149
+ name: adamw
150
+ params: # default adamw params
151
+ learning_rate: 5e-5
152
+ # learning_rate: 0.00004859840219369731
153
+ scale_lr: False # scale learning rate by total batch size
154
+ beta1: 0.9
155
+ beta2: 0.999
156
+ weight_decay: 0.01
157
+ epsilon: 1e-8
158
+
159
+ lr_scheduler:
160
+ scheduler: "cosine"
161
+ params:
162
+ learning_rate: ${optimizer.params.learning_rate}
163
+ # warmup_steps: 1000
164
+ warmup_steps: 0
165
+ min_lr_scale: 0.1
166
+
167
+ training:
168
+ gradient_accumulation_steps: 1
169
+ noise_type: "mask"
170
+ batch_size_t2i: 1
171
+ batch_size_lm: 1
172
+ batch_size_mmu: 1
173
+ batch_size_v2t: 1
174
+ batch_size_v2s: 1
175
+ batch_size_s2t: 2
176
+ batch_size_t2s: 2
177
+ batch_size_s2s: 2
178
+
179
+ mixed_precision: "bf16"
180
+ enable_tf32: True
181
+ seed: 10086
182
+ max_train_steps: 500000
183
+ max_train_epochs: NONE
184
+ overfit_one_batch: False
185
+ cond_dropout_prob: 0.1
186
+ min_masking_rate: 0.0
187
+ label_smoothing: 0.0
188
+ max_grad_norm: 1
189
+ guidance_scale: 3.5
190
+ generation_timesteps: 42
191
+
192
+ t2i_coeff: 2.5
193
+ i2i_coeff: 2.5
194
+ lm_coeff: 2.5
195
+ mmu_coeff: 0.1
196
+ v2t_coeff: 0.2
197
+ v2s_coeff: 2.0
198
+ t2s_coeff: 2.5
199
+ s2t_coeff: 0.5
200
+ s2s_coeff: 3.0
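`combined_loader_mode: "max_size_cycle"` means every optimizer step draws one batch per task, with shorter loaders restarted until the longest one is exhausted. A toy sketch of that behaviour is below; the real training loop presumably uses a library combiner rather than this helper.

```python
# Hedged sketch of "max_size_cycle" combining: one batch per task per step,
# shorter loaders cycle until the longest loader runs out.
from itertools import cycle

def max_size_cycle(loaders: dict):
    longest = max(loaders, key=lambda name: len(loaders[name]))
    iters = {name: (iter(dl) if name == longest else cycle(dl))
             for name, dl in loaders.items()}
    for _ in range(len(loaders[longest])):
        yield {name: next(it) for name, it in iters.items()}

# toy example with lists standing in for DataLoaders
for step, batch in enumerate(max_size_cycle({"t2s": [1, 2, 3, 4], "v2t": ["a", "b"]})):
    print(step, batch)   # "v2t" wraps around after "b"
```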
MMaDA/configs/omada_pretraining_stage1-2.yaml ADDED
@@ -0,0 +1,131 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-training-stage1"
8
+ name: "omada-training-stage1"
9
+ output_dir: "ckpts/omada/omada-training-stage1_2nd"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 5000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 5 # task token 3 + eoa / soa
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ # subset for gigaspeech: xs, xl
63
+ # subset for librispeech: train-clean-360, train-clean-100
64
+ # subset for commonvoice: validated, invalidated
65
+ audio_data:
66
+ - name: "gigaspeech"
67
+ subset: "xl"
68
+ split: "train"
69
+ - name: "librispeech"
70
+ subset: "train-clean-360"
71
+ - name: "commonvoice"
72
+ subset: "validated"
73
+ #########################################################################
74
+ shuffle_buffer_size: 1000
75
+ num_workers: 8
76
+ resolution: 256
77
+ pin_memory: True
78
+ persistent_workers: True
79
+
80
+ preprocessing:
81
+ max_seq_length: 128 # for text tokens
82
+ max_aud_length: 256 # for audio tokens
83
+ resolution: 128
84
+ center_crop: False
85
+ random_flip: False
86
+
87
+ optimizer:
88
+ name: adamw
89
+ params: # default adamw params
90
+ learning_rate: 5e-5
91
+ scale_lr: False # scale learning rate by total batch size
92
+ beta1: 0.9
93
+ beta2: 0.999
94
+ weight_decay: 0.01
95
+ epsilon: 1e-8
96
+
97
+ lr_scheduler:
98
+ scheduler: "cosine"
99
+ params:
100
+ learning_rate: ${optimizer.params.learning_rate}
101
+ warmup_steps: 0
102
+ min_lr_scale: 0.1
103
+
104
+ training:
105
+ gradient_accumulation_steps: 1
106
+ noise_type: "mask"
107
+ batch_size_t2i: 0
108
+ batch_size_lm: 0
109
+ batch_size_mmu: 0
110
+ batch_size_v2t: 1
111
+ batch_size_s2t: 1
112
+ batch_size_t2s: 5
113
+
114
+ mixed_precision: "bf16"
115
+ enable_tf32: True
116
+ seed: 10086
117
+ max_train_steps: 315000 # 2 epochs
118
+ max_train_epochs: NONE
119
+ overfit_one_batch: False
120
+ cond_dropout_prob: 0.1
121
+ min_masking_rate: 0.0
122
+ label_smoothing: 0.0
123
+ max_grad_norm: 1
124
+ guidance_scale: 0.0
125
+ generation_timesteps: 64
126
+ # t2i_coeff: 0.1
127
+ # lm_coeff: 0.1
128
+ # mmu_coeff: 0.1
129
+ v2t_coeff: 0.1
130
+ t2s_coeff: 1.0
131
+ s2t_coeff: 0.1
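`cond_dropout_prob` and `guidance_scale` are the two halves of classifier-free guidance: conditioning is dropped with probability `cond_dropout_prob` during training, and at generation the conditional and unconditional logits are mixed using the guidance scale. One common convention is sketched below (so `guidance_scale: 0.0` here falls back to the plain conditional prediction); the exact formula used by this repo's sampler may differ.

```python
# Hedged sketch of classifier-free guidance at generation time.
# Convention shown: w = 0 reduces to the conditional logits.
import torch

def guided_logits(cond: torch.Tensor, uncond: torch.Tensor, w: float) -> torch.Tensor:
    return (1.0 + w) * cond - w * uncond

cond, uncond = torch.randn(1, 8192), torch.randn(1, 8192)
assert torch.allclose(guided_logits(cond, uncond, w=0.0), cond)
```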
MMaDA/configs/omada_pretraining_stage1-3.yaml ADDED
@@ -0,0 +1,132 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-training-stage1"
8
+ name: "omada-training-stage1_ignore_SP"
9
+ output_dir: "ckpts/omada/omada-training-stage1_7th"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 5000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 5 # task token 3 + eoa / soa
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ # subset for gigaspeech: xs, xl
63
+ # subset for librispeech: train-clean-360, train-clean-100
64
+ # subset for commonvoice: validated, invalidated
65
+ audio_data:
66
+ - name: "gigaspeech"
67
+ subset: "xl"
68
+ split: "train"
69
+ - name: "librispeech"
70
+ subset: "train-clean-360"
71
+ - name: "commonvoice"
72
+ subset: "validated"
73
+ #########################################################################
74
+ shuffle_buffer_size: 1000
75
+ num_workers: 8
76
+ resolution: 256
77
+ pin_memory: True
78
+ persistent_workers: True
79
+
80
+ preprocessing:
81
+ max_seq_length: 128 # for text tokens
82
+ max_aud_length: 384 # for audio tokens
83
+ resolution: 128
84
+ center_crop: False
85
+ random_flip: False
86
+
87
+ optimizer:
88
+ name: adamw
89
+ params: # default adamw params
90
+ # learning_rate: 1e-4
91
+ learning_rate: 0.000079
92
+ scale_lr: False # scale learning rate by total batch size
93
+ beta1: 0.9
94
+ beta2: 0.999
95
+ weight_decay: 0.01
96
+ epsilon: 1e-8
97
+
98
+ lr_scheduler:
99
+ scheduler: "cosine"
100
+ params:
101
+ learning_rate: ${optimizer.params.learning_rate}
102
+ warmup_steps: 0
103
+ min_lr_scale: 0.1
104
+
105
+ training:
106
+ gradient_accumulation_steps: 1
107
+ noise_type: "mask"
108
+ batch_size_t2i: 0
109
+ batch_size_lm: 0
110
+ batch_size_mmu: 0
111
+ batch_size_v2t: 1
112
+ batch_size_s2t: 1
113
+ batch_size_t2s: 5
114
+
115
+ mixed_precision: "bf16"
116
+ enable_tf32: True
117
+ seed: 10086
118
+ max_train_steps: 630000 # 2 epochs
119
+ max_train_epochs: NONE
120
+ overfit_one_batch: False
121
+ cond_dropout_prob: 0.1
122
+ min_masking_rate: 0.0
123
+ label_smoothing: 0.0
124
+ max_grad_norm: 1
125
+ guidance_scale: 1.5
126
+ generation_timesteps: 16
127
+ # t2i_coeff: 0.1
128
+ # lm_coeff: 0.1
129
+ # mmu_coeff: 0.1
130
+ v2t_coeff: 0.2
131
+ t2s_coeff: 1.0
132
+ s2t_coeff: 0.2
MMaDA/configs/omada_pretraining_stage1-4.yaml ADDED
@@ -0,0 +1,132 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-training-stage1"
8
+ name: "omada-training-stage1_ignore_SP"
9
+ output_dir: "ckpts/omada/omada-training-stage1_5th"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 5000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 5 # task token 3 + eoa / soa
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ # subset for gigaspeech: xs, xl
63
+ # subset for librispeech: train-clean-360, train-clean-100
64
+ # subset for commonvoice: validated, invalidated
65
+ audio_data:
66
+ - name: "gigaspeech"
67
+ subset: "xl"
68
+ split: "train"
69
+ - name: "librispeech"
70
+ subset: "train-clean-360"
71
+ - name: "commonvoice"
72
+ subset: "validated"
73
+ #########################################################################
74
+ shuffle_buffer_size: 1000
75
+ num_workers: 4
76
+ resolution: 256
77
+ pin_memory: True
78
+ persistent_workers: True
79
+
80
+ preprocessing:
81
+ max_seq_length: 128 # for text tokens
82
+ max_aud_length: 256 # for audio tokens
83
+ resolution: 128
84
+ center_crop: False
85
+ random_flip: False
86
+
87
+ optimizer:
88
+ name: adamw
89
+ params: # default adamw params
90
+ # learning_rate: 5e-6
91
+ learning_rate: 0.00000483
92
+ scale_lr: False # scale learning rate by total batch size
93
+ beta1: 0.9
94
+ beta2: 0.999
95
+ weight_decay: 0.01
96
+ epsilon: 1e-8
97
+
98
+ lr_scheduler:
99
+ scheduler: "cosine"
100
+ params:
101
+ learning_rate: ${optimizer.params.learning_rate}
102
+ warmup_steps: 0
103
+ min_lr_scale: 0.1
104
+
105
+ training:
106
+ gradient_accumulation_steps: 1
107
+ noise_type: "mask"
108
+ batch_size_t2i: 0
109
+ batch_size_lm: 0
110
+ batch_size_mmu: 0
111
+ batch_size_v2t: 1
112
+ batch_size_s2t: 1
113
+ batch_size_t2s: 5
114
+
115
+ mixed_precision: "bf16"
116
+ enable_tf32: True
117
+ seed: 10086
118
+ max_train_steps: 630000 # 2 epochs
119
+ max_train_epochs: NONE
120
+ overfit_one_batch: False
121
+ cond_dropout_prob: 0.1
122
+ min_masking_rate: 0.0
123
+ label_smoothing: 0.0
124
+ max_grad_norm: 1
125
+ guidance_scale: 1.5
126
+ generation_timesteps: 16
127
+ # t2i_coeff: 0.1
128
+ # lm_coeff: 0.1
129
+ # mmu_coeff: 0.1
130
+ v2t_coeff: 0.2
131
+ t2s_coeff: 1.0
132
+ s2t_coeff: 0.2
MMaDA/configs/omada_pretraining_stage1.yaml ADDED
@@ -0,0 +1,131 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-training-stage1"
8
+ name: "omada-training-stage1"
9
+ output_dir: "ckpts/omada/omada-training-stage1"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 10000000000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 5 # task token 3 + eoa / soa
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ # subset for gigaspeech: xs, xl
63
+ # subset for librispeech: train-clean-360, train-clean-100
64
+ # subset for commonvoice: validated, invalidated
65
+ audio_data:
66
+ - name: "gigaspeech"
67
+ subset: "xl"
68
+ split: "train"
69
+ - name: "librispeech"
70
+ subset: "train-clean-360"
71
+ - name: "commonvoice"
72
+ subset: "validated"
73
+ #########################################################################
74
+ shuffle_buffer_size: 1000
75
+ num_workers: 8
76
+ resolution: 256
77
+ pin_memory: True
78
+ persistent_workers: True
79
+
80
+ preprocessing:
81
+ max_seq_length: 128 # for text tokens
82
+ max_aud_length: 256 # for audio tokens
83
+ resolution: 128
84
+ center_crop: False
85
+ random_flip: False
86
+
87
+ optimizer:
88
+ name: adamw
89
+ params: # default adamw params
90
+ learning_rate: 1e-5
91
+ scale_lr: False # scale learning rate by total batch size
92
+ beta1: 0.9
93
+ beta2: 0.999
94
+ weight_decay: 0.01
95
+ epsilon: 1e-8
96
+
97
+ lr_scheduler:
98
+ scheduler: "cosine"
99
+ params:
100
+ learning_rate: ${optimizer.params.learning_rate}
101
+ warmup_steps: 3000
102
+ min_lr_scale: 0.1
103
+
104
+ training:
105
+ gradient_accumulation_steps: 1
106
+ noise_type: "mask"
107
+ batch_size_t2i: 0
108
+ batch_size_lm: 0
109
+ batch_size_mmu: 0
110
+ batch_size_v2t: 2
111
+ batch_size_s2t: 2
112
+ batch_size_t2s: 3
113
+
114
+ mixed_precision: "bf16"
115
+ enable_tf32: True
116
+ seed: 10086
117
+ max_train_steps: 200000
118
+ max_train_epochs: 1
119
+ overfit_one_batch: False
120
+ cond_dropout_prob: 0.1
121
+ min_masking_rate: 0.0
122
+ label_smoothing: 0.0
123
+ max_grad_norm: 1
124
+ guidance_scale: 1.5
125
+ generation_timesteps: 12
126
+ # t2i_coeff: 0.1
127
+ # lm_coeff: 0.1
128
+ # mmu_coeff: 0.1
129
+ v2t_coeff: 1.0
130
+ t2s_coeff: 1.0
131
+ s2t_coeff: 1.0
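`mixed_precision: "bf16"` and `enable_tf32: True` are handled by Accelerate/DeepSpeed in the launch configs, but they map to a couple of well-known PyTorch switches. The sketch below only makes the flags concrete; it is not necessarily how the training script sets them.

```python
# Hedged sketch of what enable_tf32 and bf16 mixed precision usually mean in PyTorch.
import torch

torch.backends.cuda.matmul.allow_tf32 = True   # enable_tf32: True
torch.backends.cudnn.allow_tf32 = True

with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    # the forward pass of a training step would run here in bf16
    pass
```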
MMaDA/configs/omada_pretraining_v2t_inst.yaml ADDED
@@ -0,0 +1,132 @@
1
+ wandb:
2
+ entity: null
3
+ # run_id: askkz9i2
4
+ resume: 'auto'
5
+
6
+ experiment:
7
+ project: "omada-training-v2t_inst"
8
+ name: "omada-training-v2t_inst"
9
+ output_dir: "ckpts/omada/omada-training-v2t_inst"
10
+ max_train_examples_t2i: 40000000
11
+ max_train_examples_mmu: 40000000
12
+ save_every: 5000
13
+ eval_every: 5000
14
+ generate_every: 1000000000
15
+ log_every: 1
16
+ log_grad_norm_every: 100
17
+ resume_from_checkpoint: "latest"
18
+
19
+ model:
20
+ vq_model_image:
21
+ type: "magvitv2"
22
+ vq_model_name: "showlab/magvitv2"
23
+ ### Omada ###############################################################
24
+ vq_model_audio:
25
+ type: "emova"
26
+ vq_model_name: "Emova-ollm/emova_speech_tokenizer_hf"
27
+ omada:
28
+ tokenizer_path: "GSAI-ML/LLaDA-8B-Instruct"
29
+ pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
30
+ # pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
31
+ w_clip_vit: False
32
+ new_vocab_size: 138752
33
+ llm_vocab_size: 126464
34
+ codebook_size: 8192
35
+ num_vq_tokens: 256
36
+ num_new_special_tokens: 5 # task token 3 + eoa / soa
37
+ tie_word_embeddings: False
38
+ #########################################################################
39
+
40
+ gradient_checkpointing: True
41
+
42
+ dataset:
43
+ gen_type: "pass"
44
+ und_type: "pass"
45
+ combined_loader_mode: "max_size_cycle"
46
+ params:
47
+ train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
48
+ train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
49
+ "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
50
+ "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
51
+ ]
52
+ train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
53
+ add_caption_prompt: True
54
+ external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
55
+ external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
56
+ external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
57
+ external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
58
+ validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
59
+ mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
60
+ ### Omada ###############################################################
61
+ video_root: "/home/work/AIDAS/data/video/panda70m/panda70m_training_2m"
62
+ # subset for gigaspeech: xs, xl
63
+ # subset for librispeech: train-clean-360, train-clean-100
64
+ # subset for commonvoice: validated, invalidated
65
+ audio_data:
66
+ - name: "gigaspeech"
67
+ subset: "xl"
68
+ split: "train"
69
+ - name: "librispeech"
70
+ subset: "train-clean-360"
71
+ - name: "commonvoice"
72
+ subset: "validated"
73
+ #########################################################################
74
+ shuffle_buffer_size: 1000
75
+ num_workers: 8
76
+ resolution: 256
77
+ pin_memory: True
78
+ persistent_workers: True
79
+
80
+ preprocessing:
81
+ max_seq_length: 128 # for text tokens
82
+ max_aud_length: 384 # for audio tokens
83
+ resolution: 128
84
+ center_crop: False
85
+ random_flip: False
86
+
87
+ optimizer:
88
+ name: adamw
89
+ params: # default adamw params
90
+ # learning_rate: 1e-4
91
+ learning_rate: 0.000079
92
+ scale_lr: False # scale learning rate by total batch size
93
+ beta1: 0.9
94
+ beta2: 0.999
95
+ weight_decay: 0.01
96
+ epsilon: 1e-8
97
+
98
+ lr_scheduler:
99
+ scheduler: "cosine"
100
+ params:
101
+ learning_rate: ${optimizer.params.learning_rate}
102
+ warmup_steps: 0
103
+ min_lr_scale: 0.1
104
+
105
+ training:
106
+ gradient_accumulation_steps: 1
107
+ noise_type: "mask"
108
+ batch_size_t2i: 0
109
+ batch_size_lm: 0
110
+ batch_size_mmu: 0
111
+ batch_size_v2t: 1
112
+ batch_size_s2t: 1
113
+ batch_size_t2s: 5
114
+
115
+ mixed_precision: "bf16"
116
+ enable_tf32: True
117
+ seed: 10086
118
+ max_train_steps: 630000 # 2 epochs
119
+ max_train_epochs: NONE
120
+ overfit_one_batch: False
121
+ cond_dropout_prob: 0.1
122
+ min_masking_rate: 0.0
123
+ label_smoothing: 0.0
124
+ max_grad_norm: 1
125
+ guidance_scale: 1.5
126
+ generation_timesteps: 16
127
+ # t2i_coeff: 0.1
128
+ # lm_coeff: 0.1
129
+ # mmu_coeff: 0.1
130
+ v2t_coeff: 0.2
131
+ t2s_coeff: 1.0
132
+ s2t_coeff: 0.2
MMaDA/debug_speech_dataloader.py ADDED
@@ -0,0 +1,222 @@
1
+ #!/usr/bin/env python3
2
+ """Utility to reproduce and debug the speech DataLoader used in training.
3
+
4
+ This script pulls the speech dataset configuration from the Omada
5
+ instruction-tuning config, instantiates the same `MixedSpeechTextDataset`, and
6
+ iterates a configurable number of batches while measuring how long each fetch
7
+ takes. Use it to spot slow or stuck samples without launching the full training
8
+ job.
9
+
10
+ Typical usage::
11
+
12
+ python AIDAS/MMaDA/script/debug_speech_dataloader.py \
13
+ --config AIDAS/MMaDA/configs/omada_instruction_tuning.yaml \
14
+ --flow s2t --max-batches 5 --num-workers 1 --timeout 0
15
+
16
+ Pass `--inspect-items` for a direct `dataset[idx]` sweep when a specific sample
17
+ seems suspicious.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import itertools
24
+ import logging
25
+ import sys
26
+ import time
27
+ from pathlib import Path
28
+ from typing import Any, Iterable, List
29
+
30
+ from omegaconf import OmegaConf
31
+ from torch.utils.data import DataLoader
32
+
33
+ from MMaDA.training.data import MixedSpeechTextDataset
34
+
35
+
36
+ def _collate_fn_audio(batch: List[dict[str, Any]]) -> dict[str, List[Any]]:
37
+ """Match the collate function used in training for speech flows."""
38
+
39
+ return {
40
+ "audio_path": [item["audio_path"] for item in batch],
41
+ "text": [item["text"] for item in batch],
42
+ "audio_tokens": [item.get("audio_tokens") for item in batch],
43
+ }
44
+
45
+
46
+ def _as_list_of_dicts(cfg_fragment: Any) -> List[dict[str, Any]]:
47
+ container = OmegaConf.to_container(cfg_fragment, resolve=True)
48
+ if not isinstance(container, Iterable): # pragma: no cover - sanity guard
49
+ raise TypeError("audio_data config must be a list of dataset dicts")
50
+ return list(container) # type: ignore[arg-type]
51
+
52
+
53
+ def _build_dataset(cfg) -> MixedSpeechTextDataset:
54
+ dataset_cfg = cfg.dataset.params
55
+ audio_data_cfg = _as_list_of_dicts(dataset_cfg.audio_data)
56
+ return MixedSpeechTextDataset(audio_data_cfg)
57
+
58
+
59
+ def _log_batch_summary(idx: int, batch: dict[str, List[Any]], elapsed: float) -> None:
60
+ audio_paths = batch.get("audio_path", [])
61
+ sample = audio_paths[0] if audio_paths else "<empty>"
62
+ logging.info(
63
+ "batch=%d size=%d elapsed=%.2fs sample=%s",
64
+ idx,
65
+ len(audio_paths),
66
+ elapsed,
67
+ sample,
68
+ )
69
+
70
+
71
+ def _inspect_items(dataset: MixedSpeechTextDataset, max_items: int) -> None:
72
+ logging.info("Inspecting individual dataset items (max=%d)", max_items)
73
+ for idx in itertools.islice(range(len(dataset)), max_items):
74
+ tick = time.perf_counter()
75
+ try:
76
+ item = dataset[idx]
77
+ except Exception as exc: # pragma: no cover - diagnostic path
78
+ logging.error("idx=%d failed: %s", idx, exc)
79
+ continue
80
+ elapsed = time.perf_counter() - tick
81
+ logging.info(
82
+ "idx=%d elapsed=%.2fs path=%s text_len=%d tokens=%s",
83
+ idx,
84
+ elapsed,
85
+ item.get("audio_path"),
86
+ len(item.get("text", "")),
87
+ "cached" if item.get("audio_tokens") is not None else "None",
88
+ )
89
+
90
+
91
+ def parse_args(argv: List[str]) -> argparse.Namespace:
92
+ parser = argparse.ArgumentParser(description=__doc__)
93
+ parser.add_argument(
94
+ "--config",
95
+ type=Path,
96
+ default=Path("AIDAS/MMaDA/configs/omada_instruction_tuning.yaml"),
97
+ help="Path to the training config YAML",
98
+ )
99
+ parser.add_argument(
100
+ "--flow",
101
+ choices=["s2t", "t2s"],
102
+ default="s2t",
103
+ help="Which speech flow's batch size defaults to use",
104
+ )
105
+ parser.add_argument(
106
+ "--batch-size",
107
+ type=int,
108
+ default=None,
109
+ help="Override batch size (defaults to config.training.batch_size_<flow>)",
110
+ )
111
+ parser.add_argument(
112
+ "--num-workers",
113
+ type=int,
114
+ default=None,
115
+ help="Override DataLoader workers (defaults to config.dataset.params.num_workers)",
116
+ )
117
+ parser.add_argument(
118
+ "--persistent-workers",
119
+ action="store_true",
120
+ help="Enable persistent workers regardless of config",
121
+ )
122
+ parser.add_argument(
123
+ "--timeout",
124
+ type=float,
125
+ default=None,
126
+ help="DataLoader timeout in seconds (defaults to config.dataset.params.dataloader_timeout)",
127
+ )
128
+ parser.add_argument(
129
+ "--max-batches",
130
+ type=int,
131
+ default=10,
132
+ help="Number of batches to iterate (0 means run through the entire dataset)",
133
+ )
134
+ parser.add_argument(
135
+ "--inspect-items",
136
+ type=int,
137
+ default=0,
138
+ help="If >0, bypass the DataLoader and inspect this many individual dataset items first",
139
+ )
140
+ parser.add_argument(
141
+ "--prefetch-factor",
142
+ type=int,
143
+ default=None,
144
+ help="Optional override for DataLoader prefetch_factor",
145
+ )
146
+ parser.add_argument(
147
+ "--log-level",
148
+ default="INFO",
149
+ help="Logging level",
150
+ )
151
+ return parser.parse_args(argv)
152
+
153
+
154
+ def main(argv: List[str]) -> int:
155
+ args = parse_args(argv)
156
+ logging.basicConfig(
157
+ level=getattr(logging, args.log_level.upper(), logging.INFO),
158
+ format="%(asctime)s | %(levelname)s | %(message)s",
159
+ )
160
+
161
+ cfg = OmegaConf.load(args.config)
162
+ dataset = _build_dataset(cfg)
163
+
164
+ if args.inspect_items:
165
+ _inspect_items(dataset, args.inspect_items)
166
+
167
+ dataset_params = cfg.dataset.params
168
+ batch_size = args.batch_size or getattr(cfg.training, f"batch_size_{args.flow}")
169
+ num_workers = args.num_workers if args.num_workers is not None else dataset_params.num_workers
170
+ timeout = args.timeout if args.timeout is not None else dataset_params.dataloader_timeout
171
+
172
+ if num_workers == 0:
173
+ persistent_workers = False
174
+ else:
175
+ persistent_workers = args.persistent_workers or bool(dataset_params.persistent_workers)
176
+
177
+ dataloader_kwargs = {
178
+ "dataset": dataset,
179
+ "batch_size": batch_size,
180
+ "shuffle": False,
181
+ "num_workers": num_workers,
182
+ "drop_last": True,
183
+ "pin_memory": bool(dataset_params.pin_memory),
184
+ "timeout": timeout,
185
+ "persistent_workers": persistent_workers,
186
+ "collate_fn": _collate_fn_audio,
187
+ }
188
+ if args.prefetch_factor is not None and num_workers > 0:
189
+ dataloader_kwargs["prefetch_factor"] = args.prefetch_factor
190
+
191
+ logging.info(
192
+ "Starting DataLoader debug: batch_size=%d num_workers=%d timeout=%s persistent=%s",
193
+ batch_size,
194
+ num_workers,
195
+ timeout,
196
+ persistent_workers,
197
+ )
198
+
199
+ dataloader = DataLoader(**dataloader_kwargs)
200
+
201
+ max_batches = args.max_batches
202
+ iterator = iter(dataloader)
203
+
204
+ processed = 0
205
+ while True:
206
+ if max_batches and processed >= max_batches:
207
+ break
208
+ tick = time.perf_counter()
209
+ try:
210
+ batch = next(iterator)
211
+ except StopIteration:
212
+ logging.info("Reached end of DataLoader after %d batches", processed)
213
+ break
214
+ elapsed = time.perf_counter() - tick
215
+ _log_batch_summary(processed, batch, elapsed)
216
+ processed += 1
217
+
218
+ return 0
219
+
220
+
221
+ if __name__ == "__main__":
222
+ raise SystemExit(main(sys.argv[1:]))
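Besides the CLI shown in the module docstring, `main()` takes an argv list, so the same sweep can be driven from Python. The config path below is illustrative and assumes the script is importable from the working directory.

```python
# Hedged usage sketch: call the debug entry point programmatically.
from debug_speech_dataloader import main

exit_code = main([
    "--config", "MMaDA/configs/omada_instruction_tuning.yaml",  # illustrative path
    "--flow", "t2s",
    "--max-batches", "3",
    "--num-workers", "0",
    "--inspect-items", "2",
])
print("exit code:", exit_code)
```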
MMaDA/eval_ASR_TTS/test.py ADDED
@@ -0,0 +1,266 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "54c0a618-750f-4bf0-8cdb-c2dda158c433",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import argparse\n",
11
+ "import json\n",
12
+ "import os\n",
13
+ "import editdistance"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "markdown",
18
+ "id": "658bb863-f147-444e-8b14-466e1999d15f",
19
+ "metadata": {},
20
+ "source": [
21
+ "# Speech -> Text"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 2,
27
+ "id": "7e4d5e19-e526-4b33-aa03-0a4cc68abd90",
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "def calculate_WER(recognized_text_list, groundtruth_text_list):\n",
32
+ " word_num = 0.0\n",
33
+ " scores = 0.0\n",
34
+ " for recognized_text, groundtruth_text in zip(recognized_text_list, groundtruth_text_list):\n",
35
+ " if len(recognized_text) > 1000:\n",
36
+ " print(recognized_text)\n",
37
+ " continue\n",
38
+ " recognized_word_list = recognized_text.split()\n",
39
+ " groundtruth_word_list = groundtruth_text.split()\n",
40
+ " current_word_num = len(groundtruth_word_list)\n",
41
+ " word_num += current_word_num\n",
42
+ " # Compute Levenstein's distance\n",
43
+ " current_score = editdistance.eval(recognized_word_list, groundtruth_word_list)\n",
44
+ " scores += current_score\n",
45
+ " WER = scores / word_num\n",
46
+ " return WER, scores, word_num\n",
47
+ "\n",
48
+ "\n",
49
+ "def evaluate_asr(prediction_list, ground_truth_list):\n",
50
+ " wer, scores_wer, word_num_wer = calculate_WER(prediction_list, ground_truth_list)\n",
51
+ " print(f'wer: {wer}, scores_wer: {scores_wer}, word_num_wer: {word_num_wer}')"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": 3,
57
+ "id": "05f4a95c",
58
+ "metadata": {},
59
+ "outputs": [
60
+ {
61
+ "name": "stdout",
62
+ "output_type": "stream",
63
+ "text": [
64
+ "WER (demo): 0.4375 | word errors: 7.0 | total words: 16.0\n"
65
+ ]
66
+ }
67
+ ],
68
+ "source": [
69
+ "\n",
70
+ "gt_0 = \"Hello. We are AIDAS laboratory.\"\n",
71
+ "gt_1 = \"Hello. Let's build an omni model diffusion foundation model.\"\n",
72
+ "gt_2 = \"Pretty intense.\"\n",
73
+ "\n",
74
+ "pred_0 = \"hello, we are AIDAS laboratory.\"\n",
75
+ "pred_1 = \"hello let's build an omni model diffusion foundation model\"\n",
76
+ "pred_2 = \"pretty intense\"\n",
77
+ "\n",
78
+ "groundtruth_text_list = [gt_0, gt_1, gt_2]\n",
79
+ "recognized_text_list = [pred_0, pred_1, pred_2]\n",
80
+ "\n",
81
+ "wer, errors, words = calculate_WER(recognized_text_list, groundtruth_text_list)\n",
82
+ "print(f\"WER (demo): {wer:.4f} | word errors: {errors} | total words: {words}\")"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "markdown",
87
+ "id": "3635f492-2ae2-4ef4-9321-36d08aa6645e",
88
+ "metadata": {},
89
+ "source": [
90
+ "# Text -> Speech (with normalizer)"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": 4,
96
+ "id": "1ac74c9a",
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "# Environment & deps check (safe to run multiple times)\n",
101
+ "import sys, os, importlib\n",
102
+ "from pathlib import Path\n",
103
+ "\n",
104
+ "\n",
105
+ "# optional: ensure packages (comment out if you manage env separately)\n",
106
+ "try:\n",
107
+ " import editdistance # used by calculate_WER\n",
108
+ "except Exception:\n",
109
+ " print(\"Installing editdistance...\")\n",
110
+ " %pip -q install editdistance\n",
111
+ "\n",
112
+ "try:\n",
113
+ " import more_itertools # required by english.py normalizer\n",
114
+ "except Exception:\n",
115
+ " print(\"Installing more-itertools...\")\n",
116
+ " %pip -q install more-itertools\n",
117
+ "\n",
118
+ "# local modules\n",
119
+ "from whisper_asr.whisper_asr import load_whisper_model, EN_ASR_WER\n",
120
+ "from whisper_asr.normalizers.english import EnglishTextNormalizer # EMOVA-style normalizer\n"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": 5,
126
+ "id": "4ffd26a0",
127
+ "metadata": {},
128
+ "outputs": [
129
+ {
130
+ "name": "stderr",
131
+ "output_type": "stream",
132
+ "text": [
133
+ "Device set to use cuda\n",
134
+ "Using `chunk_length_s` is very experimental with seq2seq models. The results will not necessarily be entirely accurate and will have caveats. More information: https://github.com/huggingface/transformers/pull/20104. Ignore this warning with pipeline(..., ignore_warning=True). To use Whisper for long-form transcription, use rather the model's `generate` method directly as the model relies on it's own chunking mechanism (cf. Whisper original paper, section 3.8. Long-form Transcription).\n"
135
+ ]
136
+ },
137
+ {
138
+ "name": "stdout",
139
+ "output_type": "stream",
140
+ "text": [
141
+ "whisper model loaded!\n"
142
+ ]
143
+ },
144
+ {
145
+ "name": "stderr",
146
+ "output_type": "stream",
147
+ "text": [
148
+ " 0%| | 0/1 [00:00<?, ?it/s]Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.\n",
149
+ "Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.\n",
150
+ "100%|██████████| 1/1 [00:02<00:00, 2.81s/it]"
151
+ ]
152
+ },
153
+ {
154
+ "name": "stdout",
155
+ "output_type": "stream",
156
+ "text": [
157
+ "groundtruth text:Hello. We are AIDAS laboratory.\n",
158
+ "recognized text: Hello, we are IDAS Laboratory.\n",
159
+ "groundtruth text:Hello. Let's build an omni model diffusion foundation model.\n",
160
+ "recognized text: Hello! Let's build an Omnimodal Diffusion Foundation model.\n",
161
+ "groundtruth text:It's pretty intense.\n",
162
+ "recognized text: It's pretty intense.\n",
163
+ "Computation Time: 2.8128 s\n",
164
+ "groundtruth:Hello. We are AIDAS laboratory.\n",
165
+ "recognized: Hello, we are IDAS Laboratory.\n",
166
+ "groundtruth:Hello. Let's build an omni model diffusion foundation model.\n",
167
+ "recognized: Hello! Let's build an Omnimodal Diffusion Foundation model.\n",
168
+ "groundtruth:It's pretty intense.\n",
169
+ "recognized: It's pretty intense.\n",
170
+ "Word count: 17\n",
171
+ "Word error: 9\n",
172
+ "utterance num:3\n",
173
+ "WER without Whisper text normalization: 0.5294 \n",
174
+ "normalized_groundtruth:hello we are aidas laboratory\n",
175
+ "normalized_recognized:hello we are idas laboratory\n",
176
+ "normalized_groundtruth:hello let us build an omni model diffusion foundation model\n",
177
+ "normalized_recognized:hello let us build an omnimodal diffusion foundation model\n",
178
+ "normalized_groundtruth:it is pretty intense\n",
179
+ "normalized_recognized:it is pretty intense\n",
180
+ "Word count: 19\n",
181
+ "Word error: 3\n",
182
+ "utterance num:3\n",
183
+ "WER with Whisper text normalization: 0.1579 \n"
184
+ ]
185
+ },
186
+ {
187
+ "name": "stderr",
188
+ "output_type": "stream",
189
+ "text": [
190
+ "\n"
191
+ ]
192
+ }
193
+ ],
194
+ "source": [
195
+ "# TTS → ASR with normalization (EMOVA EnglishTextNormalizer)\n",
196
+ "import torch\n",
197
+ "from pathlib import Path\n",
198
+ "\n",
199
+ "# inputs\n",
200
+ "groundtruth_text_list = [\n",
201
+ " \"Hello. We are AIDAS laboratory.\",\n",
202
+ " \"Hello. Let's build an omni model diffusion foundation model.\",\n",
203
+ " \"It's pretty intense.\",\n",
204
+ "]\n",
205
+ "wav_file_list = [\n",
206
+ " \"./audio/AIDAS_team.wav\",\n",
207
+ " \"./audio/diffusion.wav\",\n",
208
+ " \"./audio/pretty_intense.wav\",\n",
209
+ "]\n",
210
+ "\n",
211
+ "# Load Whisper large-v3\n",
212
+ "model_id = \"openai/whisper-large-v3\"\n",
213
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
214
+ "pipe = load_whisper_model(model_id, device)\n",
215
+ "\n",
216
+ "# Run batch inference and print both raw and normalized WERs\n",
217
+ "EN_ASR_WER(pipe, wav_file_list, groundtruth_text_list, batch_size=3, print_verbose=True)"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": null,
223
+ "id": "dd157230-07a5-4b05-a8c1-2f7a49475cdd",
224
+ "metadata": {},
225
+ "outputs": [],
226
+ "source": []
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "id": "310f5f23-43c6-40e1-a20c-09cd1ce287ad",
232
+ "metadata": {},
233
+ "outputs": [],
234
+ "source": []
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "execution_count": null,
239
+ "id": "01b93914-2624-4c1a-b893-ed2cd3b944b7",
240
+ "metadata": {},
241
+ "outputs": [],
242
+ "source": []
243
+ }
244
+ ],
245
+ "metadata": {
246
+ "kernelspec": {
247
+ "display_name": "diff",
248
+ "language": "python",
249
+ "name": "diff"
250
+ },
251
+ "language_info": {
252
+ "codemirror_mode": {
253
+ "name": "ipython",
254
+ "version": 3
255
+ },
256
+ "file_extension": ".py",
257
+ "mimetype": "text/x-python",
258
+ "name": "python",
259
+ "nbconvert_exporter": "python",
260
+ "pygments_lexer": "ipython3",
261
+ "version": "3.11.11"
262
+ }
263
+ },
264
+ "nbformat": 4,
265
+ "nbformat_minor": 5
266
+ }
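
For reference, the notebook above computes WER as the summed word-level edit distance divided by the total number of reference words. A standalone sketch of the same computation, assuming only that the editdistance package is installed (the example strings are illustrative):

import editdistance


def word_error_rate(hypotheses, references):
    # Summed word-level edit distance divided by the total reference word count.
    errors, words = 0, 0
    for hyp, ref in zip(hypotheses, references):
        hyp_words, ref_words = hyp.split(), ref.split()
        errors += editdistance.eval(hyp_words, ref_words)
        words += len(ref_words)
    return errors / max(words, 1)


references = ["Hello. We are AIDAS laboratory.", "It's pretty intense."]
hypotheses = ["hello we are AIDAS laboratory", "it's pretty intense"]
# Case and attached punctuation count as errors here, which is why the notebook
# also reports WER after Whisper's EnglishTextNormalizer.
print(f"WER: {word_error_rate(hypotheses, references):.4f}")
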
MMaDA/eval_ASR_TTS/whisper_asr/normalizers/__init__.py ADDED
@@ -0,0 +1,2 @@
 
1
+ from .basic import BasicTextNormalizer as BasicTextNormalizer
2
+ from .english import EnglishTextNormalizer as EnglishTextNormalizer
MMaDA/eval_ASR_TTS/whisper_asr/normalizers/basic.py ADDED
@@ -0,0 +1,76 @@
 
1
+ import re
2
+ import unicodedata
3
+
4
+ import regex
5
+
6
+ # non-ASCII letters that are not separated by "NFKD" normalization
7
+ ADDITIONAL_DIACRITICS = {
8
+ "œ": "oe",
9
+ "Œ": "OE",
10
+ "ø": "o",
11
+ "Ø": "O",
12
+ "æ": "ae",
13
+ "Æ": "AE",
14
+ "ß": "ss",
15
+ "ẞ": "SS",
16
+ "đ": "d",
17
+ "Đ": "D",
18
+ "ð": "d",
19
+ "Ð": "D",
20
+ "þ": "th",
21
+ "Þ": "th",
22
+ "ł": "l",
23
+ "Ł": "L",
24
+ }
25
+
26
+
27
+ def remove_symbols_and_diacritics(s: str, keep=""):
28
+ """
29
+ Replace any other markers, symbols, and punctuations with a space,
30
+ and drop any diacritics (category 'Mn' and some manual mappings)
31
+ """
32
+ return "".join(
33
+ c
34
+ if c in keep
35
+ else ADDITIONAL_DIACRITICS[c]
36
+ if c in ADDITIONAL_DIACRITICS
37
+ else ""
38
+ if unicodedata.category(c) == "Mn"
39
+ else " "
40
+ if unicodedata.category(c)[0] in "MSP"
41
+ else c
42
+ for c in unicodedata.normalize("NFKD", s)
43
+ )
44
+
45
+
46
+ def remove_symbols(s: str):
47
+ """
48
+ Replace any other markers, symbols, punctuations with a space, keeping diacritics
49
+ """
50
+ return "".join(
51
+ " " if unicodedata.category(c)[0] in "MSP" else c
52
+ for c in unicodedata.normalize("NFKC", s)
53
+ )
54
+
55
+
56
+ class BasicTextNormalizer:
57
+ def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
58
+ self.clean = (
59
+ remove_symbols_and_diacritics if remove_diacritics else remove_symbols
60
+ )
61
+ self.split_letters = split_letters
62
+
63
+ def __call__(self, s: str):
64
+ s = s.lower()
65
+ s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets
66
+ s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis
67
+ s = self.clean(s).lower()
68
+
69
+ if self.split_letters:
70
+ s = " ".join(regex.findall(r"\X", s, regex.U))
71
+
72
+ s = re.sub(
73
+ r"\s+", " ", s
74
+ ) # replace any successive whitespace characters with a space
75
+
76
+ return s
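
A usage sketch for BasicTextNormalizer (illustrative, not from the repo): it assumes the regex package is installed and that the interpreter is started from MMaDA/eval_ASR_TTS/ so the whisper_asr package resolves; the sample string is made up.

from whisper_asr.normalizers.basic import BasicTextNormalizer

normalizer = BasicTextNormalizer()
raw = "Hello, [noise] we are the AIDAS (research) lab!"
# Bracketed and parenthesized spans are dropped, punctuation becomes whitespace,
# and the text is lowercased; the result is roughly "hello we are the aidas lab".
print(normalizer(raw))
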
MMaDA/eval_ASR_TTS/whisper_asr/normalizers/english.json ADDED
@@ -0,0 +1,1741 @@
 
1
+ {
2
+ "accessorise": "accessorize",
3
+ "accessorised": "accessorized",
4
+ "accessorises": "accessorizes",
5
+ "accessorising": "accessorizing",
6
+ "acclimatisation": "acclimatization",
7
+ "acclimatise": "acclimatize",
8
+ "acclimatised": "acclimatized",
9
+ "acclimatises": "acclimatizes",
10
+ "acclimatising": "acclimatizing",
11
+ "accoutrements": "accouterments",
12
+ "aeon": "eon",
13
+ "aeons": "eons",
14
+ "aerogramme": "aerogram",
15
+ "aerogrammes": "aerograms",
16
+ "aeroplane": "airplane",
17
+ "aeroplanes": "airplanes",
18
+ "aesthete": "esthete",
19
+ "aesthetes": "esthetes",
20
+ "aesthetic": "esthetic",
21
+ "aesthetically": "esthetically",
22
+ "aesthetics": "esthetics",
23
+ "aetiology": "etiology",
24
+ "ageing": "aging",
25
+ "aggrandisement": "aggrandizement",
26
+ "agonise": "agonize",
27
+ "agonised": "agonized",
28
+ "agonises": "agonizes",
29
+ "agonising": "agonizing",
30
+ "agonisingly": "agonizingly",
31
+ "almanack": "almanac",
32
+ "almanacks": "almanacs",
33
+ "aluminium": "aluminum",
34
+ "amortisable": "amortizable",
35
+ "amortisation": "amortization",
36
+ "amortisations": "amortizations",
37
+ "amortise": "amortize",
38
+ "amortised": "amortized",
39
+ "amortises": "amortizes",
40
+ "amortising": "amortizing",
41
+ "amphitheatre": "amphitheater",
42
+ "amphitheatres": "amphitheaters",
43
+ "anaemia": "anemia",
44
+ "anaemic": "anemic",
45
+ "anaesthesia": "anesthesia",
46
+ "anaesthetic": "anesthetic",
47
+ "anaesthetics": "anesthetics",
48
+ "anaesthetise": "anesthetize",
49
+ "anaesthetised": "anesthetized",
50
+ "anaesthetises": "anesthetizes",
51
+ "anaesthetising": "anesthetizing",
52
+ "anaesthetist": "anesthetist",
53
+ "anaesthetists": "anesthetists",
54
+ "anaesthetize": "anesthetize",
55
+ "anaesthetized": "anesthetized",
56
+ "anaesthetizes": "anesthetizes",
57
+ "anaesthetizing": "anesthetizing",
58
+ "analogue": "analog",
59
+ "analogues": "analogs",
60
+ "analyse": "analyze",
61
+ "analysed": "analyzed",
62
+ "analyses": "analyzes",
63
+ "analysing": "analyzing",
64
+ "anglicise": "anglicize",
65
+ "anglicised": "anglicized",
66
+ "anglicises": "anglicizes",
67
+ "anglicising": "anglicizing",
68
+ "annualised": "annualized",
69
+ "antagonise": "antagonize",
70
+ "antagonised": "antagonized",
71
+ "antagonises": "antagonizes",
72
+ "antagonising": "antagonizing",
73
+ "apologise": "apologize",
74
+ "apologised": "apologized",
75
+ "apologises": "apologizes",
76
+ "apologising": "apologizing",
77
+ "appal": "appall",
78
+ "appals": "appalls",
79
+ "appetiser": "appetizer",
80
+ "appetisers": "appetizers",
81
+ "appetising": "appetizing",
82
+ "appetisingly": "appetizingly",
83
+ "arbour": "arbor",
84
+ "arbours": "arbors",
85
+ "archeological": "archaeological",
86
+ "archaeologically": "archeologically",
87
+ "archaeologist": "archeologist",
88
+ "archaeologists": "archeologists",
89
+ "archaeology": "archeology</span>",
90
+ "ardour": "ardor",
91
+ "armour": "armor",
92
+ "armoured": "armored",
93
+ "armourer": "armorer",
94
+ "armourers": "armorers",
95
+ "armouries": "armories",
96
+ "armoury": "armory",
97
+ "artefact": "artifact",
98
+ "artefacts": "artifacts",
99
+ "authorise": "authorize",
100
+ "authorised": "authorized",
101
+ "authorises": "authorizes",
102
+ "authorising": "authorizing",
103
+ "axe": "ax",
104
+ "backpedalled": "backpedaled",
105
+ "backpedalling": "backpedaling",
106
+ "bannister": "banister",
107
+ "bannisters": "banisters",
108
+ "baptise": "baptize",
109
+ "baptised": "baptized",
110
+ "baptises": "baptizes",
111
+ "baptising": "baptizing",
112
+ "bastardise": "bastardize",
113
+ "bastardised": "bastardized",
114
+ "bastardises": "bastardizes",
115
+ "bastardising": "bastardizing",
116
+ "battleax": "battleaxe",
117
+ "baulk": "balk",
118
+ "baulked": "balked",
119
+ "baulking": "balking",
120
+ "baulks": "balks",
121
+ "bedevilled": "bedeviled",
122
+ "bedevilling": "bedeviling",
123
+ "behaviour": "behavior",
124
+ "behavioural": "behavioral",
125
+ "behaviourism": "behaviorism",
126
+ "behaviourist": "behaviorist",
127
+ "behaviourists": "behaviorists",
128
+ "behaviours": "behaviors",
129
+ "behove": "behoove",
130
+ "behoved": "behooved",
131
+ "behoves": "behooves",
132
+ "bejewelled": "bejeweled",
133
+ "belabour": "belabor",
134
+ "belaboured": "belabored",
135
+ "belabouring": "belaboring",
136
+ "belabours": "belabors",
137
+ "bevelled": "beveled",
138
+ "bevvies": "bevies",
139
+ "bevvy": "bevy",
140
+ "biassed": "biased",
141
+ "biassing": "biasing",
142
+ "bingeing": "binging",
143
+ "bougainvillaea": "bougainvillea",
144
+ "bougainvillaeas": "bougainvilleas",
145
+ "bowdlerise": "bowdlerize",
146
+ "bowdlerised": "bowdlerized",
147
+ "bowdlerises": "bowdlerizes",
148
+ "bowdlerising": "bowdlerizing",
149
+ "breathalyse": "breathalyze",
150
+ "breathalysed": "breathalyzed",
151
+ "breathalyser": "breathalyzer",
152
+ "breathalysers": "breathalyzers",
153
+ "breathalyses": "breathalyzes",
154
+ "breathalysing": "breathalyzing",
155
+ "brutalise": "brutalize",
156
+ "brutalised": "brutalized",
157
+ "brutalises": "brutalizes",
158
+ "brutalising": "brutalizing",
159
+ "busses": "buses",
160
+ "bussing": "busing",
161
+ "caesarean": "cesarean",
162
+ "caesareans": "cesareans",
163
+ "calibre": "caliber",
164
+ "calibres": "calibers",
165
+ "calliper": "caliper",
166
+ "callipers": "calipers",
167
+ "callisthenics": "calisthenics",
168
+ "canalise": "canalize",
169
+ "canalised": "canalized",
170
+ "canalises": "canalizes",
171
+ "canalising": "canalizing",
172
+ "cancelation": "cancellation",
173
+ "cancelations": "cancellations",
174
+ "cancelled": "canceled",
175
+ "cancelling": "canceling",
176
+ "candour": "candor",
177
+ "cannibalise": "cannibalize",
178
+ "cannibalised": "cannibalized",
179
+ "cannibalises": "cannibalizes",
180
+ "cannibalising": "cannibalizing",
181
+ "canonise": "canonize",
182
+ "canonised": "canonized",
183
+ "canonises": "canonizes",
184
+ "canonising": "canonizing",
185
+ "capitalise": "capitalize",
186
+ "capitalised": "capitalized",
187
+ "capitalises": "capitalizes",
188
+ "capitalising": "capitalizing",
189
+ "caramelise": "caramelize",
190
+ "caramelised": "caramelized",
191
+ "caramelises": "caramelizes",
192
+ "caramelising": "caramelizing",
193
+ "carbonise": "carbonize",
194
+ "carbonised": "carbonized",
195
+ "carbonises": "carbonizes",
196
+ "carbonising": "carbonizing",
197
+ "carolled": "caroled",
198
+ "carolling": "caroling",
199
+ "catalogue": "catalog",
200
+ "catalogued": "cataloged",
201
+ "catalogues": "catalogs",
202
+ "cataloguing": "cataloging",
203
+ "catalyse": "catalyze",
204
+ "catalysed": "catalyzed",
205
+ "catalyses": "catalyzes",
206
+ "catalysing": "catalyzing",
207
+ "categorise": "categorize",
208
+ "categorised": "categorized",
209
+ "categorises": "categorizes",
210
+ "categorising": "categorizing",
211
+ "cauterise": "cauterize",
212
+ "cauterised": "cauterized",
213
+ "cauterises": "cauterizes",
214
+ "cauterising": "cauterizing",
215
+ "cavilled": "caviled",
216
+ "cavilling": "caviling",
217
+ "centigramme": "centigram",
218
+ "centigrammes": "centigrams",
219
+ "centilitre": "centiliter",
220
+ "centilitres": "centiliters",
221
+ "centimetre": "centimeter",
222
+ "centimetres": "centimeters",
223
+ "centralise": "centralize",
224
+ "centralised": "centralized",
225
+ "centralises": "centralizes",
226
+ "centralising": "centralizing",
227
+ "centre": "center",
228
+ "centred": "centered",
229
+ "centrefold": "centerfold",
230
+ "centrefolds": "centerfolds",
231
+ "centrepiece": "centerpiece",
232
+ "centrepieces": "centerpieces",
233
+ "centres": "centers",
234
+ "channelled": "channeled",
235
+ "channelling": "channeling",
236
+ "characterise": "characterize",
237
+ "characterised": "characterized",
238
+ "characterises": "characterizes",
239
+ "characterising": "characterizing",
240
+ "cheque": "check",
241
+ "chequebook": "checkbook",
242
+ "chequebooks": "checkbooks",
243
+ "chequered": "checkered",
244
+ "cheques": "checks",
245
+ "chilli": "chili",
246
+ "chimaera": "chimera",
247
+ "chimaeras": "chimeras",
248
+ "chiselled": "chiseled",
249
+ "chiselling": "chiseling",
250
+ "circularise": "circularize",
251
+ "circularised": "circularized",
252
+ "circularises": "circularizes",
253
+ "circularising": "circularizing",
254
+ "civilise": "civilize",
255
+ "civilised": "civilized",
256
+ "civilises": "civilizes",
257
+ "civilising": "civilizing",
258
+ "clamour": "clamor",
259
+ "clamoured": "clamored",
260
+ "clamouring": "clamoring",
261
+ "clamours": "clamors",
262
+ "clangour": "clangor",
263
+ "clarinettist": "clarinetist",
264
+ "clarinettists": "clarinetists",
265
+ "collectivise": "collectivize",
266
+ "collectivised": "collectivized",
267
+ "collectivises": "collectivizes",
268
+ "collectivising": "collectivizing",
269
+ "colonisation": "colonization",
270
+ "colonise": "colonize",
271
+ "colonised": "colonized",
272
+ "coloniser": "colonizer",
273
+ "colonisers": "colonizers",
274
+ "colonises": "colonizes",
275
+ "colonising": "colonizing",
276
+ "colour": "color",
277
+ "colourant": "colorant",
278
+ "colourants": "colorants",
279
+ "coloured": "colored",
280
+ "coloureds": "coloreds",
281
+ "colourful": "colorful",
282
+ "colourfully": "colorfully",
283
+ "colouring": "coloring",
284
+ "colourize": "colorize",
285
+ "colourized": "colorized",
286
+ "colourizes": "colorizes",
287
+ "colourizing": "colorizing",
288
+ "colourless": "colorless",
289
+ "colours": "colors",
290
+ "commercialise": "commercialize",
291
+ "commercialised": "commercialized",
292
+ "commercialises": "commercializes",
293
+ "commercialising": "commercializing",
294
+ "compartmentalise": "compartmentalize",
295
+ "compartmentalised": "compartmentalized",
296
+ "compartmentalises": "compartmentalizes",
297
+ "compartmentalising": "compartmentalizing",
298
+ "computerise": "computerize",
299
+ "computerised": "computerized",
300
+ "computerises": "computerizes",
301
+ "computerising": "computerizing",
302
+ "conceptualise": "conceptualize",
303
+ "conceptualised": "conceptualized",
304
+ "conceptualises": "conceptualizes",
305
+ "conceptualising": "conceptualizing",
306
+ "connexion": "connection",
307
+ "connexions": "connections",
308
+ "contextualise": "contextualize",
309
+ "contextualised": "contextualized",
310
+ "contextualises": "contextualizes",
311
+ "contextualising": "contextualizing",
312
+ "cosier": "cozier",
313
+ "cosies": "cozies",
314
+ "cosiest": "coziest",
315
+ "cosily": "cozily",
316
+ "cosiness": "coziness",
317
+ "cosy": "cozy",
318
+ "councillor": "councilor",
319
+ "councillors": "councilors",
320
+ "counselled": "counseled",
321
+ "counselling": "counseling",
322
+ "counsellor": "counselor",
323
+ "counsellors": "counselors",
324
+ "crenelated": "crenellated",
325
+ "criminalise": "criminalize",
326
+ "criminalised": "criminalized",
327
+ "criminalises": "criminalizes",
328
+ "criminalising": "criminalizing",
329
+ "criticise": "criticize",
330
+ "criticised": "criticized",
331
+ "criticises": "criticizes",
332
+ "criticising": "criticizing",
333
+ "crueller": "crueler",
334
+ "cruellest": "cruelest",
335
+ "crystallisation": "crystallization",
336
+ "crystallise": "crystallize",
337
+ "crystallised": "crystallized",
338
+ "crystallises": "crystallizes",
339
+ "crystallising": "crystallizing",
340
+ "cudgelled": "cudgeled",
341
+ "cudgelling": "cudgeling",
342
+ "customise": "customize",
343
+ "customised": "customized",
344
+ "customises": "customizes",
345
+ "customising": "customizing",
346
+ "cypher": "cipher",
347
+ "cyphers": "ciphers",
348
+ "decentralisation": "decentralization",
349
+ "decentralise": "decentralize",
350
+ "decentralised": "decentralized",
351
+ "decentralises": "decentralizes",
352
+ "decentralising": "decentralizing",
353
+ "decriminalisation": "decriminalization",
354
+ "decriminalise": "decriminalize",
355
+ "decriminalised": "decriminalized",
356
+ "decriminalises": "decriminalizes",
357
+ "decriminalising": "decriminalizing",
358
+ "defence": "defense",
359
+ "defenceless": "defenseless",
360
+ "defences": "defenses",
361
+ "dehumanisation": "dehumanization",
362
+ "dehumanise": "dehumanize",
363
+ "dehumanised": "dehumanized",
364
+ "dehumanises": "dehumanizes",
365
+ "dehumanising": "dehumanizing",
366
+ "demeanour": "demeanor",
367
+ "demilitarisation": "demilitarization",
368
+ "demilitarise": "demilitarize",
369
+ "demilitarised": "demilitarized",
370
+ "demilitarises": "demilitarizes",
371
+ "demilitarising": "demilitarizing",
372
+ "demobilisation": "demobilization",
373
+ "demobilise": "demobilize",
374
+ "demobilised": "demobilized",
375
+ "demobilises": "demobilizes",
376
+ "demobilising": "demobilizing",
377
+ "democratisation": "democratization",
378
+ "democratise": "democratize",
379
+ "democratised": "democratized",
380
+ "democratises": "democratizes",
381
+ "democratising": "democratizing",
382
+ "demonise": "demonize",
383
+ "demonised": "demonized",
384
+ "demonises": "demonizes",
385
+ "demonising": "demonizing",
386
+ "demoralisation": "demoralization",
387
+ "demoralise": "demoralize",
388
+ "demoralised": "demoralized",
389
+ "demoralises": "demoralizes",
390
+ "demoralising": "demoralizing",
391
+ "denationalisation": "denationalization",
392
+ "denationalise": "denationalize",
393
+ "denationalised": "denationalized",
394
+ "denationalises": "denationalizes",
395
+ "denationalising": "denationalizing",
396
+ "deodorise": "deodorize",
397
+ "deodorised": "deodorized",
398
+ "deodorises": "deodorizes",
399
+ "deodorising": "deodorizing",
400
+ "depersonalise": "depersonalize",
401
+ "depersonalised": "depersonalized",
402
+ "depersonalises": "depersonalizes",
403
+ "depersonalising": "depersonalizing",
404
+ "deputise": "deputize",
405
+ "deputised": "deputized",
406
+ "deputises": "deputizes",
407
+ "deputising": "deputizing",
408
+ "desensitisation": "desensitization",
409
+ "desensitise": "desensitize",
410
+ "desensitised": "desensitized",
411
+ "desensitises": "desensitizes",
412
+ "desensitising": "desensitizing",
413
+ "destabilisation": "destabilization",
414
+ "destabilise": "destabilize",
415
+ "destabilised": "destabilized",
416
+ "destabilises": "destabilizes",
417
+ "destabilising": "destabilizing",
418
+ "dialled": "dialed",
419
+ "dialling": "dialing",
420
+ "dialogue": "dialog",
421
+ "dialogues": "dialogs",
422
+ "diarrhoea": "diarrhea",
423
+ "digitise": "digitize",
424
+ "digitised": "digitized",
425
+ "digitises": "digitizes",
426
+ "digitising": "digitizing",
427
+ "disc": "disk",
428
+ "discolour": "discolor",
429
+ "discoloured": "discolored",
430
+ "discolouring": "discoloring",
431
+ "discolours": "discolors",
432
+ "discs": "disks",
433
+ "disembowelled": "disemboweled",
434
+ "disembowelling": "disemboweling",
435
+ "disfavour": "disfavor",
436
+ "dishevelled": "disheveled",
437
+ "dishonour": "dishonor",
438
+ "dishonourable": "dishonorable",
439
+ "dishonourably": "dishonorably",
440
+ "dishonoured": "dishonored",
441
+ "dishonouring": "dishonoring",
442
+ "dishonours": "dishonors",
443
+ "disorganisation": "disorganization",
444
+ "disorganised": "disorganized",
445
+ "distil": "distill",
446
+ "distils": "distills",
447
+ "dramatisation": "dramatization",
448
+ "dramatisations": "dramatizations",
449
+ "dramatise": "dramatize",
450
+ "dramatised": "dramatized",
451
+ "dramatises": "dramatizes",
452
+ "dramatising": "dramatizing",
453
+ "draught": "draft",
454
+ "draughtboard": "draftboard",
455
+ "draughtboards": "draftboards",
456
+ "draughtier": "draftier",
457
+ "draughtiest": "draftiest",
458
+ "draughts": "drafts",
459
+ "draughtsman": "draftsman",
460
+ "draughtsmanship": "draftsmanship",
461
+ "draughtsmen": "draftsmen",
462
+ "draughtswoman": "draftswoman",
463
+ "draughtswomen": "draftswomen",
464
+ "draughty": "drafty",
465
+ "drivelled": "driveled",
466
+ "drivelling": "driveling",
467
+ "duelled": "dueled",
468
+ "duelling": "dueling",
469
+ "economise": "economize",
470
+ "economised": "economized",
471
+ "economises": "economizes",
472
+ "economising": "economizing",
473
+ "edoema": "edema",
474
+ "editorialise": "editorialize",
475
+ "editorialised": "editorialized",
476
+ "editorialises": "editorializes",
477
+ "editorialising": "editorializing",
478
+ "empathise": "empathize",
479
+ "empathised": "empathized",
480
+ "empathises": "empathizes",
481
+ "empathising": "empathizing",
482
+ "emphasise": "emphasize",
483
+ "emphasised": "emphasized",
484
+ "emphasises": "emphasizes",
485
+ "emphasising": "emphasizing",
486
+ "enamelled": "enameled",
487
+ "enamelling": "enameling",
488
+ "enamoured": "enamored",
489
+ "encyclopaedia": "encyclopedia",
490
+ "encyclopaedias": "encyclopedias",
491
+ "encyclopaedic": "encyclopedic",
492
+ "endeavour": "endeavor",
493
+ "endeavoured": "endeavored",
494
+ "endeavouring": "endeavoring",
495
+ "endeavours": "endeavors",
496
+ "energise": "energize",
497
+ "energised": "energized",
498
+ "energises": "energizes",
499
+ "energising": "energizing",
500
+ "enrol": "enroll",
501
+ "enrols": "enrolls",
502
+ "enthral": "enthrall",
503
+ "enthrals": "enthralls",
504
+ "epaulette": "epaulet",
505
+ "epaulettes": "epaulets",
506
+ "epicentre": "epicenter",
507
+ "epicentres": "epicenters",
508
+ "epilogue": "epilog",
509
+ "epilogues": "epilogs",
510
+ "epitomise": "epitomize",
511
+ "epitomised": "epitomized",
512
+ "epitomises": "epitomizes",
513
+ "epitomising": "epitomizing",
514
+ "equalisation": "equalization",
515
+ "equalise": "equalize",
516
+ "equalised": "equalized",
517
+ "equaliser": "equalizer",
518
+ "equalisers": "equalizers",
519
+ "equalises": "equalizes",
520
+ "equalising": "equalizing",
521
+ "eulogise": "eulogize",
522
+ "eulogised": "eulogized",
523
+ "eulogises": "eulogizes",
524
+ "eulogising": "eulogizing",
525
+ "evangelise": "evangelize",
526
+ "evangelised": "evangelized",
527
+ "evangelises": "evangelizes",
528
+ "evangelising": "evangelizing",
529
+ "exorcise": "exorcize",
530
+ "exorcised": "exorcized",
531
+ "exorcises": "exorcizes",
532
+ "exorcising": "exorcizing",
533
+ "extemporisation": "extemporization",
534
+ "extemporise": "extemporize",
535
+ "extemporised": "extemporized",
536
+ "extemporises": "extemporizes",
537
+ "extemporising": "extemporizing",
538
+ "externalisation": "externalization",
539
+ "externalisations": "externalizations",
540
+ "externalise": "externalize",
541
+ "externalised": "externalized",
542
+ "externalises": "externalizes",
543
+ "externalising": "externalizing",
544
+ "factorise": "factorize",
545
+ "factorised": "factorized",
546
+ "factorises": "factorizes",
547
+ "factorising": "factorizing",
548
+ "faecal": "fecal",
549
+ "faeces": "feces",
550
+ "familiarisation": "familiarization",
551
+ "familiarise": "familiarize",
552
+ "familiarised": "familiarized",
553
+ "familiarises": "familiarizes",
554
+ "familiarising": "familiarizing",
555
+ "fantasise": "fantasize",
556
+ "fantasised": "fantasized",
557
+ "fantasises": "fantasizes",
558
+ "fantasising": "fantasizing",
559
+ "favour": "favor",
560
+ "favourable": "favorable",
561
+ "favourably": "favorably",
562
+ "favoured": "favored",
563
+ "favouring": "favoring",
564
+ "favourite": "favorite",
565
+ "favourites": "favorites",
566
+ "favouritism": "favoritism",
567
+ "favours": "favors",
568
+ "feminise": "feminize",
569
+ "feminised": "feminized",
570
+ "feminises": "feminizes",
571
+ "feminising": "feminizing",
572
+ "fertilisation": "fertilization",
573
+ "fertilise": "fertilize",
574
+ "fertilised": "fertilized",
575
+ "fertiliser": "fertilizer",
576
+ "fertilisers": "fertilizers",
577
+ "fertilises": "fertilizes",
578
+ "fertilising": "fertilizing",
579
+ "fervour": "fervor",
580
+ "fibre": "fiber",
581
+ "fibreglass": "fiberglass",
582
+ "fibres": "fibers",
583
+ "fictionalisation": "fictionalization",
584
+ "fictionalisations": "fictionalizations",
585
+ "fictionalise": "fictionalize",
586
+ "fictionalised": "fictionalized",
587
+ "fictionalises": "fictionalizes",
588
+ "fictionalising": "fictionalizing",
589
+ "fillet": "filet",
590
+ "filleted": "fileted",
591
+ "filleting": "fileting",
592
+ "fillets": "filets",
593
+ "finalisation": "finalization",
594
+ "finalise": "finalize",
595
+ "finalised": "finalized",
596
+ "finalises": "finalizes",
597
+ "finalising": "finalizing",
598
+ "flautist": "flutist",
599
+ "flautists": "flutists",
600
+ "flavour": "flavor",
601
+ "flavoured": "flavored",
602
+ "flavouring": "flavoring",
603
+ "flavourings": "flavorings",
604
+ "flavourless": "flavorless",
605
+ "flavours": "flavors",
606
+ "flavoursome": "flavorsome",
607
+ "flyer / flier": "flier / flyer",
608
+ "foetal": "fetal",
609
+ "foetid": "fetid",
610
+ "foetus": "fetus",
611
+ "foetuses": "fetuses",
612
+ "formalisation": "formalization",
613
+ "formalise": "formalize",
614
+ "formalised": "formalized",
615
+ "formalises": "formalizes",
616
+ "formalising": "formalizing",
617
+ "fossilisation": "fossilization",
618
+ "fossilise": "fossilize",
619
+ "fossilised": "fossilized",
620
+ "fossilises": "fossilizes",
621
+ "fossilising": "fossilizing",
622
+ "fraternisation": "fraternization",
623
+ "fraternise": "fraternize",
624
+ "fraternised": "fraternized",
625
+ "fraternises": "fraternizes",
626
+ "fraternising": "fraternizing",
627
+ "fulfil": "fulfill",
628
+ "fulfilment": "fulfillment",
629
+ "fulfils": "fulfills",
630
+ "funnelled": "funneled",
631
+ "funnelling": "funneling",
632
+ "galvanise": "galvanize",
633
+ "galvanised": "galvanized",
634
+ "galvanises": "galvanizes",
635
+ "galvanising": "galvanizing",
636
+ "gambolled": "gamboled",
637
+ "gambolling": "gamboling",
638
+ "gaol": "jail",
639
+ "gaolbird": "jailbird",
640
+ "gaolbirds": "jailbirds",
641
+ "gaolbreak": "jailbreak",
642
+ "gaolbreaks": "jailbreaks",
643
+ "gaoled": "jailed",
644
+ "gaoler": "jailer",
645
+ "gaolers": "jailers",
646
+ "gaoling": "jailing",
647
+ "gaols": "jails",
648
+ "gasses": "gases",
649
+ "gage": "gauge",
650
+ "gaged": "gauged",
651
+ "gages": "gauges",
652
+ "gaging": "gauging",
653
+ "generalisation": "generalization",
654
+ "generalisations": "generalizations",
655
+ "generalise": "generalize",
656
+ "generalised": "generalized",
657
+ "generalises": "generalizes",
658
+ "generalising": "generalizing",
659
+ "ghettoise": "ghettoize",
660
+ "ghettoised": "ghettoized",
661
+ "ghettoises": "ghettoizes",
662
+ "ghettoising": "ghettoizing",
663
+ "gipsies": "gypsies",
664
+ "glamorise": "glamorize",
665
+ "glamorised": "glamorized",
666
+ "glamorises": "glamorizes",
667
+ "glamorising": "glamorizing",
668
+ "glamor": "glamour",
669
+ "globalisation": "globalization",
670
+ "globalise": "globalize",
671
+ "globalised": "globalized",
672
+ "globalises": "globalizes",
673
+ "globalising": "globalizing",
674
+ "glueing": "gluing",
675
+ "goitre": "goiter",
676
+ "goitres": "goiters",
677
+ "gonorrhoea": "gonorrhea",
678
+ "gramme": "gram",
679
+ "grammes": "grams",
680
+ "gravelled": "graveled",
681
+ "grey": "gray",
682
+ "greyed": "grayed",
683
+ "greying": "graying",
684
+ "greyish": "grayish",
685
+ "greyness": "grayness",
686
+ "greys": "grays",
687
+ "grovelled": "groveled",
688
+ "grovelling": "groveling",
689
+ "groyne": "groin",
690
+ "groynes": "groins",
691
+ "gruelling": "grueling",
692
+ "gruellingly": "gruelingly",
693
+ "gryphon": "griffin",
694
+ "gryphons": "griffins",
695
+ "gynaecological": "gynecological",
696
+ "gynaecologist": "gynecologist",
697
+ "gynaecologists": "gynecologists",
698
+ "gynaecology": "gynecology",
699
+ "haematological": "hematological",
700
+ "haematologist": "hematologist",
701
+ "haematologists": "hematologists",
702
+ "haematology": "hematology",
703
+ "haemoglobin": "hemoglobin",
704
+ "haemophilia": "hemophilia",
705
+ "haemophiliac": "hemophiliac",
706
+ "haemophiliacs": "hemophiliacs",
707
+ "haemorrhage": "hemorrhage",
708
+ "haemorrhaged": "hemorrhaged",
709
+ "haemorrhages": "hemorrhages",
710
+ "haemorrhaging": "hemorrhaging",
711
+ "haemorrhoids": "hemorrhoids",
712
+ "harbour": "harbor",
713
+ "harboured": "harbored",
714
+ "harbouring": "harboring",
715
+ "harbours": "harbors",
716
+ "harmonisation": "harmonization",
717
+ "harmonise": "harmonize",
718
+ "harmonised": "harmonized",
719
+ "harmonises": "harmonizes",
720
+ "harmonising": "harmonizing",
721
+ "homoeopath": "homeopath",
722
+ "homoeopathic": "homeopathic",
723
+ "homoeopaths": "homeopaths",
724
+ "homoeopathy": "homeopathy",
725
+ "homogenise": "homogenize",
726
+ "homogenised": "homogenized",
727
+ "homogenises": "homogenizes",
728
+ "homogenising": "homogenizing",
729
+ "honour": "honor",
730
+ "honourable": "honorable",
731
+ "honourably": "honorably",
732
+ "honoured": "honored",
733
+ "honouring": "honoring",
734
+ "honours": "honors",
735
+ "hospitalisation": "hospitalization",
736
+ "hospitalise": "hospitalize",
737
+ "hospitalised": "hospitalized",
738
+ "hospitalises": "hospitalizes",
739
+ "hospitalising": "hospitalizing",
740
+ "humanise": "humanize",
741
+ "humanised": "humanized",
742
+ "humanises": "humanizes",
743
+ "humanising": "humanizing",
744
+ "humour": "humor",
745
+ "humoured": "humored",
746
+ "humouring": "humoring",
747
+ "humourless": "humorless",
748
+ "humours": "humors",
749
+ "hybridise": "hybridize",
750
+ "hybridised": "hybridized",
751
+ "hybridises": "hybridizes",
752
+ "hybridising": "hybridizing",
753
+ "hypnotise": "hypnotize",
754
+ "hypnotised": "hypnotized",
755
+ "hypnotises": "hypnotizes",
756
+ "hypnotising": "hypnotizing",
757
+ "hypothesise": "hypothesize",
758
+ "hypothesised": "hypothesized",
759
+ "hypothesises": "hypothesizes",
760
+ "hypothesising": "hypothesizing",
761
+ "idealisation": "idealization",
762
+ "idealise": "idealize",
763
+ "idealised": "idealized",
764
+ "idealises": "idealizes",
765
+ "idealising": "idealizing",
766
+ "idolise": "idolize",
767
+ "idolised": "idolized",
768
+ "idolises": "idolizes",
769
+ "idolising": "idolizing",
770
+ "immobilisation": "immobilization",
771
+ "immobilise": "immobilize",
772
+ "immobilised": "immobilized",
773
+ "immobiliser": "immobilizer",
774
+ "immobilisers": "immobilizers",
775
+ "immobilises": "immobilizes",
776
+ "immobilising": "immobilizing",
777
+ "immortalise": "immortalize",
778
+ "immortalised": "immortalized",
779
+ "immortalises": "immortalizes",
780
+ "immortalising": "immortalizing",
781
+ "immunisation": "immunization",
782
+ "immunise": "immunize",
783
+ "immunised": "immunized",
784
+ "immunises": "immunizes",
785
+ "immunising": "immunizing",
786
+ "impanelled": "impaneled",
787
+ "impanelling": "impaneling",
788
+ "imperilled": "imperiled",
789
+ "imperilling": "imperiling",
790
+ "individualise": "individualize",
791
+ "individualised": "individualized",
792
+ "individualises": "individualizes",
793
+ "individualising": "individualizing",
794
+ "industrialise": "industrialize",
795
+ "industrialised": "industrialized",
796
+ "industrialises": "industrializes",
797
+ "industrialising": "industrializing",
798
+ "inflexion": "inflection",
799
+ "inflexions": "inflections",
800
+ "initialise": "initialize",
801
+ "initialised": "initialized",
802
+ "initialises": "initializes",
803
+ "initialising": "initializing",
804
+ "initialled": "initialed",
805
+ "initialling": "initialing",
806
+ "instal": "install",
807
+ "instalment": "installment",
808
+ "instalments": "installments",
809
+ "instals": "installs",
810
+ "instil": "instill",
811
+ "instils": "instills",
812
+ "institutionalisation": "institutionalization",
813
+ "institutionalise": "institutionalize",
814
+ "institutionalised": "institutionalized",
815
+ "institutionalises": "institutionalizes",
816
+ "institutionalising": "institutionalizing",
817
+ "intellectualise": "intellectualize",
818
+ "intellectualised": "intellectualized",
819
+ "intellectualises": "intellectualizes",
820
+ "intellectualising": "intellectualizing",
821
+ "internalisation": "internalization",
822
+ "internalise": "internalize",
823
+ "internalised": "internalized",
824
+ "internalises": "internalizes",
825
+ "internalising": "internalizing",
826
+ "internationalisation": "internationalization",
827
+ "internationalise": "internationalize",
828
+ "internationalised": "internationalized",
829
+ "internationalises": "internationalizes",
830
+ "internationalising": "internationalizing",
831
+ "ionisation": "ionization",
832
+ "ionise": "ionize",
833
+ "ionised": "ionized",
834
+ "ioniser": "ionizer",
835
+ "ionisers": "ionizers",
836
+ "ionises": "ionizes",
837
+ "ionising": "ionizing",
838
+ "italicise": "italicize",
839
+ "italicised": "italicized",
840
+ "italicises": "italicizes",
841
+ "italicising": "italicizing",
842
+ "itemise": "itemize",
843
+ "itemised": "itemized",
844
+ "itemises": "itemizes",
845
+ "itemising": "itemizing",
846
+ "jeopardise": "jeopardize",
847
+ "jeopardised": "jeopardized",
848
+ "jeopardises": "jeopardizes",
849
+ "jeopardising": "jeopardizing",
850
+ "jewelled": "jeweled",
851
+ "jeweller": "jeweler",
852
+ "jewellers": "jewelers",
853
+ "jewellery": "jewelry",
854
+ "judgement": "judgment",
855
+ "kilogramme": "kilogram",
856
+ "kilogrammes": "kilograms",
857
+ "kilometre": "kilometer",
858
+ "kilometres": "kilometers",
859
+ "labelled": "labeled",
860
+ "labelling": "labeling",
861
+ "labour": "labor",
862
+ "laboured": "labored",
863
+ "labourer": "laborer",
864
+ "labourers": "laborers",
865
+ "labouring": "laboring",
866
+ "labours": "labors",
867
+ "lacklustre": "lackluster",
868
+ "legalisation": "legalization",
869
+ "legalise": "legalize",
870
+ "legalised": "legalized",
871
+ "legalises": "legalizes",
872
+ "legalising": "legalizing",
873
+ "legitimise": "legitimize",
874
+ "legitimised": "legitimized",
875
+ "legitimises": "legitimizes",
876
+ "legitimising": "legitimizing",
877
+ "leukaemia": "leukemia",
878
+ "levelled": "leveled",
879
+ "leveller": "leveler",
880
+ "levellers": "levelers",
881
+ "levelling": "leveling",
882
+ "libelled": "libeled",
883
+ "libelling": "libeling",
884
+ "libellous": "libelous",
885
+ "liberalisation": "liberalization",
886
+ "liberalise": "liberalize",
887
+ "liberalised": "liberalized",
888
+ "liberalises": "liberalizes",
889
+ "liberalising": "liberalizing",
890
+ "licence": "license",
891
+ "licenced": "licensed",
892
+ "licences": "licenses",
893
+ "licencing": "licensing",
894
+ "likeable": "likable",
895
+ "lionisation": "lionization",
896
+ "lionise": "lionize",
897
+ "lionised": "lionized",
898
+ "lionises": "lionizes",
899
+ "lionising": "lionizing",
900
+ "liquidise": "liquidize",
901
+ "liquidised": "liquidized",
902
+ "liquidiser": "liquidizer",
903
+ "liquidisers": "liquidizers",
904
+ "liquidises": "liquidizes",
905
+ "liquidising": "liquidizing",
906
+ "litre": "liter",
907
+ "litres": "liters",
908
+ "localise": "localize",
909
+ "localised": "localized",
910
+ "localises": "localizes",
911
+ "localising": "localizing",
912
+ "louvre": "louver",
913
+ "louvred": "louvered",
914
+ "louvres": "louvers",
915
+ "lustre": "luster",
916
+ "magnetise": "magnetize",
917
+ "magnetised": "magnetized",
918
+ "magnetises": "magnetizes",
919
+ "magnetising": "magnetizing",
920
+ "manoeuvrability": "maneuverability",
921
+ "manoeuvrable": "maneuverable",
922
+ "manoeuvre": "maneuver",
923
+ "manoeuvred": "maneuvered",
924
+ "manoeuvres": "maneuvers",
925
+ "manoeuvring": "maneuvering",
926
+ "manoeuvrings": "maneuverings",
927
+ "marginalisation": "marginalization",
928
+ "marginalise": "marginalize",
929
+ "marginalised": "marginalized",
930
+ "marginalises": "marginalizes",
931
+ "marginalising": "marginalizing",
932
+ "marshalled": "marshaled",
933
+ "marshalling": "marshaling",
934
+ "marvelled": "marveled",
935
+ "marvelling": "marveling",
936
+ "marvellous": "marvelous",
937
+ "marvellously": "marvelously",
938
+ "materialisation": "materialization",
939
+ "materialise": "materialize",
940
+ "materialised": "materialized",
941
+ "materialises": "materializes",
942
+ "materialising": "materializing",
943
+ "maximisation": "maximization",
944
+ "maximise": "maximize",
945
+ "maximised": "maximized",
946
+ "maximises": "maximizes",
947
+ "maximising": "maximizing",
948
+ "meagre": "meager",
949
+ "mechanisation": "mechanization",
950
+ "mechanise": "mechanize",
951
+ "mechanised": "mechanized",
952
+ "mechanises": "mechanizes",
953
+ "mechanising": "mechanizing",
954
+ "mediaeval": "medieval",
955
+ "memorialise": "memorialize",
956
+ "memorialised": "memorialized",
957
+ "memorialises": "memorializes",
958
+ "memorialising": "memorializing",
959
+ "memorise": "memorize",
960
+ "memorised": "memorized",
961
+ "memorises": "memorizes",
962
+ "memorising": "memorizing",
963
+ "mesmerise": "mesmerize",
964
+ "mesmerised": "mesmerized",
965
+ "mesmerises": "mesmerizes",
966
+ "mesmerising": "mesmerizing",
967
+ "metabolise": "metabolize",
968
+ "metabolised": "metabolized",
969
+ "metabolises": "metabolizes",
970
+ "metabolising": "metabolizing",
971
+ "metre": "meter",
972
+ "metres": "meters",
973
+ "micrometre": "micrometer",
974
+ "micrometres": "micrometers",
975
+ "militarise": "militarize",
976
+ "militarised": "militarized",
977
+ "militarises": "militarizes",
978
+ "militarising": "militarizing",
979
+ "milligramme": "milligram",
980
+ "milligrammes": "milligrams",
981
+ "millilitre": "milliliter",
982
+ "millilitres": "milliliters",
983
+ "millimetre": "millimeter",
984
+ "millimetres": "millimeters",
985
+ "miniaturisation": "miniaturization",
986
+ "miniaturise": "miniaturize",
987
+ "miniaturised": "miniaturized",
988
+ "miniaturises": "miniaturizes",
989
+ "miniaturising": "miniaturizing",
990
+ "minibusses": "minibuses",
991
+ "minimise": "minimize",
992
+ "minimised": "minimized",
993
+ "minimises": "minimizes",
994
+ "minimising": "minimizing",
995
+ "misbehaviour": "misbehavior",
996
+ "misdemeanour": "misdemeanor",
997
+ "misdemeanours": "misdemeanors",
998
+ "misspelt": "misspelled",
999
+ "mitre": "miter",
1000
+ "mitres": "miters",
1001
+ "mobilisation": "mobilization",
1002
+ "mobilise": "mobilize",
1003
+ "mobilised": "mobilized",
1004
+ "mobilises": "mobilizes",
1005
+ "mobilising": "mobilizing",
1006
+ "modelled": "modeled",
1007
+ "modeller": "modeler",
1008
+ "modellers": "modelers",
1009
+ "modelling": "modeling",
1010
+ "modernise": "modernize",
1011
+ "modernised": "modernized",
1012
+ "modernises": "modernizes",
1013
+ "modernising": "modernizing",
1014
+ "moisturise": "moisturize",
1015
+ "moisturised": "moisturized",
1016
+ "moisturiser": "moisturizer",
1017
+ "moisturisers": "moisturizers",
1018
+ "moisturises": "moisturizes",
1019
+ "moisturising": "moisturizing",
1020
+ "monologue": "monolog",
1021
+ "monologues": "monologs",
1022
+ "monopolisation": "monopolization",
1023
+ "monopolise": "monopolize",
1024
+ "monopolised": "monopolized",
1025
+ "monopolises": "monopolizes",
1026
+ "monopolising": "monopolizing",
1027
+ "moralise": "moralize",
1028
+ "moralised": "moralized",
1029
+ "moralises": "moralizes",
1030
+ "moralising": "moralizing",
1031
+ "motorised": "motorized",
1032
+ "mould": "mold",
1033
+ "moulded": "molded",
1034
+ "moulder": "molder",
1035
+ "mouldered": "moldered",
1036
+ "mouldering": "moldering",
1037
+ "moulders": "molders",
1038
+ "mouldier": "moldier",
1039
+ "mouldiest": "moldiest",
1040
+ "moulding": "molding",
1041
+ "mouldings": "moldings",
1042
+ "moulds": "molds",
1043
+ "mouldy": "moldy",
1044
+ "moult": "molt",
1045
+ "moulted": "molted",
1046
+ "moulting": "molting",
1047
+ "moults": "molts",
1048
+ "moustache": "mustache",
1049
+ "moustached": "mustached",
1050
+ "moustaches": "mustaches",
1051
+ "moustachioed": "mustachioed",
1052
+ "multicoloured": "multicolored",
1053
+ "nationalisation": "nationalization",
1054
+ "nationalisations": "nationalizations",
1055
+ "nationalise": "nationalize",
1056
+ "nationalised": "nationalized",
1057
+ "nationalises": "nationalizes",
1058
+ "nationalising": "nationalizing",
1059
+ "naturalisation": "naturalization",
1060
+ "naturalise": "naturalize",
1061
+ "naturalised": "naturalized",
1062
+ "naturalises": "naturalizes",
1063
+ "naturalising": "naturalizing",
1064
+ "neighbour": "neighbor",
1065
+ "neighbourhood": "neighborhood",
1066
+ "neighbourhoods": "neighborhoods",
1067
+ "neighbouring": "neighboring",
1068
+ "neighbourliness": "neighborliness",
1069
+ "neighbourly": "neighborly",
1070
+ "neighbours": "neighbors",
1071
+ "neutralisation": "neutralization",
1072
+ "neutralise": "neutralize",
1073
+ "neutralised": "neutralized",
1074
+ "neutralises": "neutralizes",
1075
+ "neutralising": "neutralizing",
1076
+ "normalisation": "normalization",
1077
+ "normalise": "normalize",
1078
+ "normalised": "normalized",
1079
+ "normalises": "normalizes",
1080
+ "normalising": "normalizing",
1081
+ "odour": "odor",
1082
+ "odourless": "odorless",
1083
+ "odours": "odors",
1084
+ "oesophagus": "esophagus",
1085
+ "oesophaguses": "esophaguses",
1086
+ "oestrogen": "estrogen",
1087
+ "offence": "offense",
1088
+ "offences": "offenses",
1089
+ "omelette": "omelet",
1090
+ "omelettes": "omelets",
1091
+ "optimise": "optimize",
1092
+ "optimised": "optimized",
1093
+ "optimises": "optimizes",
1094
+ "optimising": "optimizing",
1095
+ "organisation": "organization",
1096
+ "organisational": "organizational",
1097
+ "organisations": "organizations",
1098
+ "organise": "organize",
1099
+ "organised": "organized",
1100
+ "organiser": "organizer",
1101
+ "organisers": "organizers",
1102
+ "organises": "organizes",
1103
+ "organising": "organizing",
1104
+ "orthopaedic": "orthopedic",
1105
+ "orthopaedics": "orthopedics",
1106
+ "ostracise": "ostracize",
1107
+ "ostracised": "ostracized",
1108
+ "ostracises": "ostracizes",
1109
+ "ostracising": "ostracizing",
1110
+ "outmanoeuvre": "outmaneuver",
1111
+ "outmanoeuvred": "outmaneuvered",
1112
+ "outmanoeuvres": "outmaneuvers",
1113
+ "outmanoeuvring": "outmaneuvering",
1114
+ "overemphasise": "overemphasize",
1115
+ "overemphasised": "overemphasized",
1116
+ "overemphasises": "overemphasizes",
1117
+ "overemphasising": "overemphasizing",
1118
+ "oxidisation": "oxidization",
1119
+ "oxidise": "oxidize",
1120
+ "oxidised": "oxidized",
1121
+ "oxidises": "oxidizes",
1122
+ "oxidising": "oxidizing",
1123
+ "paederast": "pederast",
1124
+ "paederasts": "pederasts",
1125
+ "paediatric": "pediatric",
1126
+ "paediatrician": "pediatrician",
1127
+ "paediatricians": "pediatricians",
1128
+ "paediatrics": "pediatrics",
1129
+ "paedophile": "pedophile",
1130
+ "paedophiles": "pedophiles",
1131
+ "paedophilia": "pedophilia",
1132
+ "palaeolithic": "paleolithic",
1133
+ "palaeontologist": "paleontologist",
1134
+ "palaeontologists": "paleontologists",
1135
+ "palaeontology": "paleontology",
1136
+ "panelled": "paneled",
1137
+ "panelling": "paneling",
1138
+ "panellist": "panelist",
1139
+ "panellists": "panelists",
1140
+ "paralyse": "paralyze",
1141
+ "paralysed": "paralyzed",
1142
+ "paralyses": "paralyzes",
1143
+ "paralysing": "paralyzing",
1144
+ "parcelled": "parceled",
1145
+ "parcelling": "parceling",
1146
+ "parlour": "parlor",
1147
+ "parlours": "parlors",
1148
+ "particularise": "particularize",
1149
+ "particularised": "particularized",
1150
+ "particularises": "particularizes",
1151
+ "particularising": "particularizing",
1152
+ "passivisation": "passivization",
1153
+ "passivise": "passivize",
1154
+ "passivised": "passivized",
1155
+ "passivises": "passivizes",
1156
+ "passivising": "passivizing",
1157
+ "pasteurisation": "pasteurization",
1158
+ "pasteurise": "pasteurize",
1159
+ "pasteurised": "pasteurized",
1160
+ "pasteurises": "pasteurizes",
1161
+ "pasteurising": "pasteurizing",
1162
+ "patronise": "patronize",
1163
+ "patronised": "patronized",
1164
+ "patronises": "patronizes",
1165
+ "patronising": "patronizing",
1166
+ "patronisingly": "patronizingly",
1167
+ "pedalled": "pedaled",
1168
+ "pedalling": "pedaling",
1169
+ "pedestrianisation": "pedestrianization",
1170
+ "pedestrianise": "pedestrianize",
1171
+ "pedestrianised": "pedestrianized",
1172
+ "pedestrianises": "pedestrianizes",
1173
+ "pedestrianising": "pedestrianizing",
1174
+ "penalise": "penalize",
1175
+ "penalised": "penalized",
1176
+ "penalises": "penalizes",
1177
+ "penalising": "penalizing",
1178
+ "pencilled": "penciled",
1179
+ "pencilling": "penciling",
1180
+ "personalise": "personalize",
1181
+ "personalised": "personalized",
1182
+ "personalises": "personalizes",
1183
+ "personalising": "personalizing",
1184
+ "pharmacopoeia": "pharmacopeia",
1185
+ "pharmacopoeias": "pharmacopeias",
1186
+ "philosophise": "philosophize",
1187
+ "philosophised": "philosophized",
1188
+ "philosophises": "philosophizes",
1189
+ "philosophising": "philosophizing",
1190
+ "philtre": "filter",
1191
+ "philtres": "filters",
1192
+ "phoney": "phony",
1193
+ "plagiarise": "plagiarize",
1194
+ "plagiarised": "plagiarized",
1195
+ "plagiarises": "plagiarizes",
1196
+ "plagiarising": "plagiarizing",
1197
+ "plough": "plow",
1198
+ "ploughed": "plowed",
1199
+ "ploughing": "plowing",
1200
+ "ploughman": "plowman",
1201
+ "ploughmen": "plowmen",
1202
+ "ploughs": "plows",
1203
+ "ploughshare": "plowshare",
1204
+ "ploughshares": "plowshares",
1205
+ "polarisation": "polarization",
1206
+ "polarise": "polarize",
1207
+ "polarised": "polarized",
1208
+ "polarises": "polarizes",
1209
+ "polarising": "polarizing",
1210
+ "politicisation": "politicization",
1211
+ "politicise": "politicize",
1212
+ "politicised": "politicized",
1213
+ "politicises": "politicizes",
1214
+ "politicising": "politicizing",
1215
+ "popularisation": "popularization",
1216
+ "popularise": "popularize",
1217
+ "popularised": "popularized",
1218
+ "popularises": "popularizes",
1219
+ "popularising": "popularizing",
1220
+ "pouffe": "pouf",
1221
+ "pouffes": "poufs",
1222
+ "practise": "practice",
1223
+ "practised": "practiced",
1224
+ "practises": "practices",
1225
+ "practising": "practicing",
1226
+ "praesidium": "presidium",
1227
+ "praesidiums": "presidiums",
1228
+ "pressurisation": "pressurization",
1229
+ "pressurise": "pressurize",
1230
+ "pressurised": "pressurized",
1231
+ "pressurises": "pressurizes",
1232
+ "pressurising": "pressurizing",
1233
+ "pretence": "pretense",
1234
+ "pretences": "pretenses",
1235
+ "primaeval": "primeval",
1236
+ "prioritisation": "prioritization",
1237
+ "prioritise": "prioritize",
1238
+ "prioritised": "prioritized",
1239
+ "prioritises": "prioritizes",
1240
+ "prioritising": "prioritizing",
1241
+ "privatisation": "privatization",
1242
+ "privatisations": "privatizations",
1243
+ "privatise": "privatize",
1244
+ "privatised": "privatized",
1245
+ "privatises": "privatizes",
1246
+ "privatising": "privatizing",
1247
+ "professionalisation": "professionalization",
1248
+ "professionalise": "professionalize",
1249
+ "professionalised": "professionalized",
1250
+ "professionalises": "professionalizes",
1251
+ "professionalising": "professionalizing",
1252
+ "programme": "program",
1253
+ "programmes": "programs",
1254
+ "prologue": "prolog",
1255
+ "prologues": "prologs",
1256
+ "propagandise": "propagandize",
1257
+ "propagandised": "propagandized",
1258
+ "propagandises": "propagandizes",
1259
+ "propagandising": "propagandizing",
1260
+ "proselytise": "proselytize",
1261
+ "proselytised": "proselytized",
1262
+ "proselytiser": "proselytizer",
1263
+ "proselytisers": "proselytizers",
1264
+ "proselytises": "proselytizes",
1265
+ "proselytising": "proselytizing",
1266
+ "psychoanalyse": "psychoanalyze",
1267
+ "psychoanalysed": "psychoanalyzed",
1268
+ "psychoanalyses": "psychoanalyzes",
1269
+ "psychoanalysing": "psychoanalyzing",
1270
+ "publicise": "publicize",
1271
+ "publicised": "publicized",
1272
+ "publicises": "publicizes",
1273
+ "publicising": "publicizing",
1274
+ "pulverisation": "pulverization",
1275
+ "pulverise": "pulverize",
1276
+ "pulverised": "pulverized",
1277
+ "pulverises": "pulverizes",
1278
+ "pulverising": "pulverizing",
1279
+ "pummelled": "pummel",
1280
+ "pummelling": "pummeled",
1281
+ "pyjama": "pajama",
1282
+ "pyjamas": "pajamas",
1283
+ "pzazz": "pizzazz",
1284
+ "quarrelled": "quarreled",
1285
+ "quarrelling": "quarreling",
1286
+ "radicalise": "radicalize",
1287
+ "radicalised": "radicalized",
1288
+ "radicalises": "radicalizes",
1289
+ "radicalising": "radicalizing",
1290
+ "rancour": "rancor",
1291
+ "randomise": "randomize",
1292
+ "randomised": "randomized",
1293
+ "randomises": "randomizes",
1294
+ "randomising": "randomizing",
1295
+ "rationalisation": "rationalization",
1296
+ "rationalisations": "rationalizations",
1297
+ "rationalise": "rationalize",
1298
+ "rationalised": "rationalized",
1299
+ "rationalises": "rationalizes",
1300
+ "rationalising": "rationalizing",
1301
+ "ravelled": "raveled",
1302
+ "ravelling": "raveling",
1303
+ "realisable": "realizable",
1304
+ "realisation": "realization",
1305
+ "realisations": "realizations",
1306
+ "realise": "realize",
1307
+ "realised": "realized",
1308
+ "realises": "realizes",
1309
+ "realising": "realizing",
1310
+ "recognisable": "recognizable",
1311
+ "recognisably": "recognizably",
1312
+ "recognisance": "recognizance",
1313
+ "recognise": "recognize",
1314
+ "recognised": "recognized",
1315
+ "recognises": "recognizes",
1316
+ "recognising": "recognizing",
1317
+ "reconnoitre": "reconnoiter",
1318
+ "reconnoitred": "reconnoitered",
1319
+ "reconnoitres": "reconnoiters",
1320
+ "reconnoitring": "reconnoitering",
1321
+ "refuelled": "refueled",
1322
+ "refuelling": "refueling",
1323
+ "regularisation": "regularization",
1324
+ "regularise": "regularize",
1325
+ "regularised": "regularized",
1326
+ "regularises": "regularizes",
1327
+ "regularising": "regularizing",
1328
+ "remodelled": "remodeled",
1329
+ "remodelling": "remodeling",
1330
+ "remould": "remold",
1331
+ "remoulded": "remolded",
1332
+ "remoulding": "remolding",
1333
+ "remoulds": "remolds",
1334
+ "reorganisation": "reorganization",
1335
+ "reorganisations": "reorganizations",
1336
+ "reorganise": "reorganize",
1337
+ "reorganised": "reorganized",
1338
+ "reorganises": "reorganizes",
1339
+ "reorganising": "reorganizing",
1340
+ "revelled": "reveled",
1341
+ "reveller": "reveler",
1342
+ "revellers": "revelers",
1343
+ "revelling": "reveling",
1344
+ "revitalise": "revitalize",
1345
+ "revitalised": "revitalized",
1346
+ "revitalises": "revitalizes",
1347
+ "revitalising": "revitalizing",
1348
+ "revolutionise": "revolutionize",
1349
+ "revolutionised": "revolutionized",
1350
+ "revolutionises": "revolutionizes",
1351
+ "revolutionising": "revolutionizing",
1352
+ "rhapsodise": "rhapsodize",
1353
+ "rhapsodised": "rhapsodized",
1354
+ "rhapsodises": "rhapsodizes",
1355
+ "rhapsodising": "rhapsodizing",
1356
+ "rigour": "rigor",
1357
+ "rigours": "rigors",
1358
+ "ritualised": "ritualized",
1359
+ "rivalled": "rivaled",
1360
+ "rivalling": "rivaling",
1361
+ "romanticise": "romanticize",
1362
+ "romanticised": "romanticized",
1363
+ "romanticises": "romanticizes",
1364
+ "romanticising": "romanticizing",
1365
+ "rumour": "rumor",
1366
+ "rumoured": "rumored",
1367
+ "rumours": "rumors",
1368
+ "sabre": "saber",
1369
+ "sabres": "sabers",
1370
+ "saltpetre": "saltpeter",
1371
+ "sanitise": "sanitize",
1372
+ "sanitised": "sanitized",
1373
+ "sanitises": "sanitizes",
1374
+ "sanitising": "sanitizing",
1375
+ "satirise": "satirize",
1376
+ "satirised": "satirized",
1377
+ "satirises": "satirizes",
1378
+ "satirising": "satirizing",
1379
+ "saviour": "savior",
1380
+ "saviours": "saviors",
1381
+ "savour": "savor",
1382
+ "savoured": "savored",
1383
+ "savouries": "savories",
1384
+ "savouring": "savoring",
1385
+ "savours": "savors",
1386
+ "savoury": "savory",
1387
+ "scandalise": "scandalize",
1388
+ "scandalised": "scandalized",
1389
+ "scandalises": "scandalizes",
1390
+ "scandalising": "scandalizing",
1391
+ "sceptic": "skeptic",
1392
+ "sceptical": "skeptical",
1393
+ "sceptically": "skeptically",
1394
+ "scepticism": "skepticism",
1395
+ "sceptics": "skeptics",
1396
+ "sceptre": "scepter",
1397
+ "sceptres": "scepters",
1398
+ "scrutinise": "scrutinize",
1399
+ "scrutinised": "scrutinized",
1400
+ "scrutinises": "scrutinizes",
1401
+ "scrutinising": "scrutinizing",
1402
+ "secularisation": "secularization",
1403
+ "secularise": "secularize",
1404
+ "secularised": "secularized",
1405
+ "secularises": "secularizes",
1406
+ "secularising": "secularizing",
1407
+ "sensationalise": "sensationalize",
1408
+ "sensationalised": "sensationalized",
1409
+ "sensationalises": "sensationalizes",
1410
+ "sensationalising": "sensationalizing",
1411
+ "sensitise": "sensitize",
1412
+ "sensitised": "sensitized",
1413
+ "sensitises": "sensitizes",
1414
+ "sensitising": "sensitizing",
1415
+ "sentimentalise": "sentimentalize",
1416
+ "sentimentalised": "sentimentalized",
1417
+ "sentimentalises": "sentimentalizes",
1418
+ "sentimentalising": "sentimentalizing",
1419
+ "sepulchre": "sepulcher",
1420
+ "sepulchres": "sepulchers",
1421
+ "serialisation": "serialization",
1422
+ "serialisations": "serializations",
1423
+ "serialise": "serialize",
1424
+ "serialised": "serialized",
1425
+ "serialises": "serializes",
1426
+ "serialising": "serializing",
1427
+ "sermonise": "sermonize",
1428
+ "sermonised": "sermonized",
1429
+ "sermonises": "sermonizes",
1430
+ "sermonising": "sermonizing",
1431
+ "sheikh": "sheik",
1432
+ "shovelled": "shoveled",
1433
+ "shovelling": "shoveling",
1434
+ "shrivelled": "shriveled",
1435
+ "shrivelling": "shriveling",
1436
+ "signalise": "signalize",
1437
+ "signalised": "signalized",
1438
+ "signalises": "signalizes",
1439
+ "signalising": "signalizing",
1440
+ "signalled": "signaled",
1441
+ "signalling": "signaling",
1442
+ "smoulder": "smolder",
1443
+ "smouldered": "smoldered",
1444
+ "smouldering": "smoldering",
1445
+ "smoulders": "smolders",
1446
+ "snivelled": "sniveled",
1447
+ "snivelling": "sniveling",
1448
+ "snorkelled": "snorkeled",
1449
+ "snorkelling": "snorkeling",
1450
+ "snowplough": "snowplow",
1451
+ "snowploughs": "snowplow",
1452
+ "socialisation": "socialization",
1453
+ "socialise": "socialize",
1454
+ "socialised": "socialized",
1455
+ "socialises": "socializes",
1456
+ "socialising": "socializing",
1457
+ "sodomise": "sodomize",
1458
+ "sodomised": "sodomized",
1459
+ "sodomises": "sodomizes",
1460
+ "sodomising": "sodomizing",
1461
+ "solemnise": "solemnize",
1462
+ "solemnised": "solemnized",
1463
+ "solemnises": "solemnizes",
1464
+ "solemnising": "solemnizing",
1465
+ "sombre": "somber",
1466
+ "specialisation": "specialization",
1467
+ "specialisations": "specializations",
1468
+ "specialise": "specialize",
1469
+ "specialised": "specialized",
1470
+ "specialises": "specializes",
1471
+ "specialising": "specializing",
1472
+ "spectre": "specter",
1473
+ "spectres": "specters",
1474
+ "spiralled": "spiraled",
1475
+ "spiralling": "spiraling",
1476
+ "splendour": "splendor",
1477
+ "splendours": "splendors",
1478
+ "squirrelled": "squirreled",
1479
+ "squirrelling": "squirreling",
1480
+ "stabilisation": "stabilization",
1481
+ "stabilise": "stabilize",
1482
+ "stabilised": "stabilized",
1483
+ "stabiliser": "stabilizer",
1484
+ "stabilisers": "stabilizers",
1485
+ "stabilises": "stabilizes",
1486
+ "stabilising": "stabilizing",
1487
+ "standardisation": "standardization",
1488
+ "standardise": "standardize",
1489
+ "standardised": "standardized",
1490
+ "standardises": "standardizes",
1491
+ "standardising": "standardizing",
1492
+ "stencilled": "stenciled",
1493
+ "stencilling": "stenciling",
1494
+ "sterilisation": "sterilization",
1495
+ "sterilisations": "sterilizations",
1496
+ "sterilise": "sterilize",
1497
+ "sterilised": "sterilized",
1498
+ "steriliser": "sterilizer",
1499
+ "sterilisers": "sterilizers",
1500
+ "sterilises": "sterilizes",
1501
+ "sterilising": "sterilizing",
1502
+ "stigmatisation": "stigmatization",
1503
+ "stigmatise": "stigmatize",
1504
+ "stigmatised": "stigmatized",
1505
+ "stigmatises": "stigmatizes",
1506
+ "stigmatising": "stigmatizing",
1507
+ "storey": "story",
1508
+ "storeys": "stories",
1509
+ "subsidisation": "subsidization",
1510
+ "subsidise": "subsidize",
1511
+ "subsidised": "subsidized",
1512
+ "subsidiser": "subsidizer",
1513
+ "subsidisers": "subsidizers",
1514
+ "subsidises": "subsidizes",
1515
+ "subsidising": "subsidizing",
1516
+ "succour": "succor",
1517
+ "succoured": "succored",
1518
+ "succouring": "succoring",
1519
+ "succours": "succors",
1520
+ "sulphate": "sulfate",
1521
+ "sulphates": "sulfates",
1522
+ "sulphide": "sulfide",
1523
+ "sulphides": "sulfides",
1524
+ "sulphur": "sulfur",
1525
+ "sulphurous": "sulfurous",
1526
+ "summarise": "summarize",
1527
+ "summarised": "summarized",
1528
+ "summarises": "summarizes",
1529
+ "summarising": "summarizing",
1530
+ "swivelled": "swiveled",
1531
+ "swivelling": "swiveling",
1532
+ "symbolise": "symbolize",
1533
+ "symbolised": "symbolized",
1534
+ "symbolises": "symbolizes",
1535
+ "symbolising": "symbolizing",
1536
+ "sympathise": "sympathize",
1537
+ "sympathised": "sympathized",
1538
+ "sympathiser": "sympathizer",
1539
+ "sympathisers": "sympathizers",
1540
+ "sympathises": "sympathizes",
1541
+ "sympathising": "sympathizing",
1542
+ "synchronisation": "synchronization",
1543
+ "synchronise": "synchronize",
1544
+ "synchronised": "synchronized",
1545
+ "synchronises": "synchronizes",
1546
+ "synchronising": "synchronizing",
1547
+ "synthesise": "synthesize",
1548
+ "synthesised": "synthesized",
1549
+ "synthesiser": "synthesizer",
1550
+ "synthesisers": "synthesizers",
1551
+ "synthesises": "synthesizes",
1552
+ "synthesising": "synthesizing",
1553
+ "syphon": "siphon",
1554
+ "syphoned": "siphoned",
1555
+ "syphoning": "siphoning",
1556
+ "syphons": "siphons",
1557
+ "systematisation": "systematization",
1558
+ "systematise": "systematize",
1559
+ "systematised": "systematized",
1560
+ "systematises": "systematizes",
1561
+ "systematising": "systematizing",
1562
+ "tantalise": "tantalize",
1563
+ "tantalised": "tantalized",
1564
+ "tantalises": "tantalizes",
1565
+ "tantalising": "tantalizing",
1566
+ "tantalisingly": "tantalizingly",
1567
+ "tasselled": "tasseled",
1568
+ "technicolour": "technicolor",
1569
+ "temporise": "temporize",
1570
+ "temporised": "temporized",
1571
+ "temporises": "temporizes",
1572
+ "temporising": "temporizing",
1573
+ "tenderise": "tenderize",
1574
+ "tenderised": "tenderized",
1575
+ "tenderises": "tenderizes",
1576
+ "tenderising": "tenderizing",
1577
+ "terrorise": "terrorize",
1578
+ "terrorised": "terrorized",
1579
+ "terrorises": "terrorizes",
1580
+ "terrorising": "terrorizing",
1581
+ "theatre": "theater",
1582
+ "theatregoer": "theatergoer",
1583
+ "theatregoers": "theatergoers",
1584
+ "theatres": "theaters",
1585
+ "theorise": "theorize",
1586
+ "theorised": "theorized",
1587
+ "theorises": "theorizes",
1588
+ "theorising": "theorizing",
1589
+ "tonne": "ton",
1590
+ "tonnes": "tons",
1591
+ "towelled": "toweled",
1592
+ "towelling": "toweling",
1593
+ "toxaemia": "toxemia",
1594
+ "tranquillise": "tranquilize",
1595
+ "tranquillised": "tranquilized",
1596
+ "tranquilliser": "tranquilizer",
1597
+ "tranquillisers": "tranquilizers",
1598
+ "tranquillises": "tranquilizes",
1599
+ "tranquillising": "tranquilizing",
1600
+ "tranquillity": "tranquility",
1601
+ "tranquillize": "tranquilize",
1602
+ "tranquillized": "tranquilized",
1603
+ "tranquillizer": "tranquilizer",
1604
+ "tranquillizers": "tranquilizers",
1605
+ "tranquillizes": "tranquilizes",
1606
+ "tranquillizing": "tranquilizing",
1607
+ "tranquilly": "tranquility",
1608
+ "transistorised": "transistorized",
1609
+ "traumatise": "traumatize",
1610
+ "traumatised": "traumatized",
1611
+ "traumatises": "traumatizes",
1612
+ "traumatising": "traumatizing",
1613
+ "travelled": "traveled",
1614
+ "traveller": "traveler",
1615
+ "travellers": "travelers",
1616
+ "travelling": "traveling",
1617
+ "travelog": "travelogue",
1618
+ "travelogs": "travelogues",
1619
+ "trialled": "trialed",
1620
+ "trialling": "trialing",
1621
+ "tricolour": "tricolor",
1622
+ "tricolours": "tricolors",
1623
+ "trivialise": "trivialize",
1624
+ "trivialised": "trivialized",
1625
+ "trivialises": "trivializes",
1626
+ "trivialising": "trivializing",
1627
+ "tumour": "tumor",
1628
+ "tumours": "tumors",
1629
+ "tunnelled": "tunneled",
1630
+ "tunnelling": "tunneling",
1631
+ "tyrannise": "tyrannize",
1632
+ "tyrannised": "tyrannized",
1633
+ "tyrannises": "tyrannizes",
1634
+ "tyrannising": "tyrannizing",
1635
+ "tyre": "tire",
1636
+ "tyres": "tires",
1637
+ "unauthorised": "unauthorized",
1638
+ "uncivilised": "uncivilized",
1639
+ "underutilised": "underutilized",
1640
+ "unequalled": "unequaled",
1641
+ "unfavourable": "unfavorable",
1642
+ "unfavourably": "unfavorably",
1643
+ "unionisation": "unionization",
1644
+ "unionise": "unionize",
1645
+ "unionised": "unionized",
1646
+ "unionises": "unionizes",
1647
+ "unionising": "unionizing",
1648
+ "unorganised": "unorganized",
1649
+ "unravelled": "unraveled",
1650
+ "unravelling": "unraveling",
1651
+ "unrecognisable": "unrecognizable",
1652
+ "unrecognised": "unrecognized",
1653
+ "unrivalled": "unrivaled",
1654
+ "unsavoury": "unsavory",
1655
+ "untrammelled": "untrammeled",
1656
+ "urbanisation": "urbanization",
1657
+ "urbanise": "urbanize",
1658
+ "urbanised": "urbanized",
1659
+ "urbanises": "urbanizes",
1660
+ "urbanising": "urbanizing",
1661
+ "utilisable": "utilizable",
1662
+ "utilisation": "utilization",
1663
+ "utilise": "utilize",
1664
+ "utilised": "utilized",
1665
+ "utilises": "utilizes",
1666
+ "utilising": "utilizing",
1667
+ "valour": "valor",
1668
+ "vandalise": "vandalize",
1669
+ "vandalised": "vandalized",
1670
+ "vandalises": "vandalizes",
1671
+ "vandalising": "vandalizing",
1672
+ "vaporisation": "vaporization",
1673
+ "vaporise": "vaporize",
1674
+ "vaporised": "vaporized",
1675
+ "vaporises": "vaporizes",
1676
+ "vaporising": "vaporizing",
1677
+ "vapour": "vapor",
1678
+ "vapours": "vapors",
1679
+ "verbalise": "verbalize",
1680
+ "verbalised": "verbalized",
1681
+ "verbalises": "verbalizes",
1682
+ "verbalising": "verbalizing",
1683
+ "victimisation": "victimization",
1684
+ "victimise": "victimize",
1685
+ "victimised": "victimized",
1686
+ "victimises": "victimizes",
1687
+ "victimising": "victimizing",
1688
+ "videodisc": "videodisk",
1689
+ "videodiscs": "videodisks",
1690
+ "vigour": "vigor",
1691
+ "visualisation": "visualization",
1692
+ "visualisations": "visualizations",
1693
+ "visualise": "visualize",
1694
+ "visualised": "visualized",
1695
+ "visualises": "visualizes",
1696
+ "visualising": "visualizing",
1697
+ "vocalisation": "vocalization",
1698
+ "vocalisations": "vocalizations",
1699
+ "vocalise": "vocalize",
1700
+ "vocalised": "vocalized",
1701
+ "vocalises": "vocalizes",
1702
+ "vocalising": "vocalizing",
1703
+ "vulcanised": "vulcanized",
1704
+ "vulgarisation": "vulgarization",
1705
+ "vulgarise": "vulgarize",
1706
+ "vulgarised": "vulgarized",
1707
+ "vulgarises": "vulgarizes",
1708
+ "vulgarising": "vulgarizing",
1709
+ "waggon": "wagon",
1710
+ "waggons": "wagons",
1711
+ "watercolour": "watercolor",
1712
+ "watercolours": "watercolors",
1713
+ "weaselled": "weaseled",
1714
+ "weaselling": "weaseling",
1715
+ "westernisation": "westernization",
1716
+ "westernise": "westernize",
1717
+ "westernised": "westernized",
1718
+ "westernises": "westernizes",
1719
+ "westernising": "westernizing",
1720
+ "womanise": "womanize",
1721
+ "womanised": "womanized",
1722
+ "womaniser": "womanizer",
1723
+ "womanisers": "womanizers",
1724
+ "womanises": "womanizes",
1725
+ "womanising": "womanizing",
1726
+ "woollen": "woolen",
1727
+ "woollens": "woolens",
1728
+ "woollies": "woolies",
1729
+ "woolly": "wooly",
1730
+ "worshipped": "worshiped",
1731
+ "worshipping": "worshiping",
1732
+ "worshipper": "worshiper",
1733
+ "yodelled": "yodeled",
1734
+ "yodelling": "yodeling",
1735
+ "yoghourt": "yogurt",
1736
+ "yoghourts": "yogurts",
1737
+ "yoghurt": "yogurt",
1738
+ "yoghurts": "yogurts",
1739
+ "mhm": "hmm",
1740
+ "mmm": "hmm"
1741
+ }
MMaDA/eval_ASR_TTS/whisper_asr/normalizers/english.py ADDED
@@ -0,0 +1,550 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from fractions import Fraction
5
+ from typing import Iterator, List, Match, Optional, Union
6
+
7
+ from more_itertools import windowed
8
+
9
+ from .basic import remove_symbols_and_diacritics
10
+
11
+
12
+ class EnglishNumberNormalizer:
13
+ """
14
+ Convert any spelled-out numbers into arabic numbers, while handling:
15
+
16
+ - remove any commas
17
+ - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
18
+ - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
19
+ - spell out `one` and `ones`
20
+ - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
21
+ """
22
+
23
+ def __init__(self):
24
+ super().__init__()
25
+
26
+ self.zeros = {"o", "oh", "zero"}
27
+ self.ones = {
28
+ name: i
29
+ for i, name in enumerate(
30
+ [
31
+ "one",
32
+ "two",
33
+ "three",
34
+ "four",
35
+ "five",
36
+ "six",
37
+ "seven",
38
+ "eight",
39
+ "nine",
40
+ "ten",
41
+ "eleven",
42
+ "twelve",
43
+ "thirteen",
44
+ "fourteen",
45
+ "fifteen",
46
+ "sixteen",
47
+ "seventeen",
48
+ "eighteen",
49
+ "nineteen",
50
+ ],
51
+ start=1,
52
+ )
53
+ }
54
+ self.ones_plural = {
55
+ "sixes" if name == "six" else name + "s": (value, "s")
56
+ for name, value in self.ones.items()
57
+ }
58
+ self.ones_ordinal = {
59
+ "zeroth": (0, "th"),
60
+ "first": (1, "st"),
61
+ "second": (2, "nd"),
62
+ "third": (3, "rd"),
63
+ "fifth": (5, "th"),
64
+ "twelfth": (12, "th"),
65
+ **{
66
+ name + ("h" if name.endswith("t") else "th"): (value, "th")
67
+ for name, value in self.ones.items()
68
+ if value > 3 and value != 5 and value != 12
69
+ },
70
+ }
71
+ self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}
72
+
73
+ self.tens = {
74
+ "twenty": 20,
75
+ "thirty": 30,
76
+ "forty": 40,
77
+ "fifty": 50,
78
+ "sixty": 60,
79
+ "seventy": 70,
80
+ "eighty": 80,
81
+ "ninety": 90,
82
+ }
83
+ self.tens_plural = {
84
+ name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
85
+ }
86
+ self.tens_ordinal = {
87
+ name.replace("y", "ieth"): (value, "th")
88
+ for name, value in self.tens.items()
89
+ }
90
+ self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}
91
+
92
+ self.multipliers = {
93
+ "hundred": 100,
94
+ "thousand": 1_000,
95
+ "million": 1_000_000,
96
+ "billion": 1_000_000_000,
97
+ "trillion": 1_000_000_000_000,
98
+ "quadrillion": 1_000_000_000_000_000,
99
+ "quintillion": 1_000_000_000_000_000_000,
100
+ "sextillion": 1_000_000_000_000_000_000_000,
101
+ "septillion": 1_000_000_000_000_000_000_000_000,
102
+ "octillion": 1_000_000_000_000_000_000_000_000_000,
103
+ "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
104
+ "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
105
+ }
106
+ self.multipliers_plural = {
107
+ name + "s": (value, "s") for name, value in self.multipliers.items()
108
+ }
109
+ self.multipliers_ordinal = {
110
+ name + "th": (value, "th") for name, value in self.multipliers.items()
111
+ }
112
+ self.multipliers_suffixed = {
113
+ **self.multipliers_plural,
114
+ **self.multipliers_ordinal,
115
+ }
116
+ self.decimals = {*self.ones, *self.tens, *self.zeros}
117
+
118
+ self.preceding_prefixers = {
119
+ "minus": "-",
120
+ "negative": "-",
121
+ "plus": "+",
122
+ "positive": "+",
123
+ }
124
+ self.following_prefixers = {
125
+ "pound": "£",
126
+ "pounds": "£",
127
+ "euro": "€",
128
+ "euros": "€",
129
+ "dollar": "$",
130
+ "dollars": "$",
131
+ "cent": "¢",
132
+ "cents": "¢",
133
+ }
134
+ self.prefixes = set(
135
+ list(self.preceding_prefixers.values())
136
+ + list(self.following_prefixers.values())
137
+ )
138
+ self.suffixers = {
139
+ "per": {"cent": "%"},
140
+ "percent": "%",
141
+ }
142
+ self.specials = {"and", "double", "triple", "point"}
143
+
144
+ self.words = set(
145
+ [
146
+ key
147
+ for mapping in [
148
+ self.zeros,
149
+ self.ones,
150
+ self.ones_suffixed,
151
+ self.tens,
152
+ self.tens_suffixed,
153
+ self.multipliers,
154
+ self.multipliers_suffixed,
155
+ self.preceding_prefixers,
156
+ self.following_prefixers,
157
+ self.suffixers,
158
+ self.specials,
159
+ ]
160
+ for key in mapping
161
+ ]
162
+ )
163
+ self.literal_words = {"one", "ones"}
164
+
165
+ def process_words(self, words: List[str]) -> Iterator[str]:
166
+ prefix: Optional[str] = None
167
+ value: Optional[Union[str, int]] = None
168
+ skip = False
169
+
170
+ def to_fraction(s: str):
171
+ try:
172
+ return Fraction(s)
173
+ except ValueError:
174
+ return None
175
+
176
+ def output(result: Union[str, int]):
177
+ nonlocal prefix, value
178
+ result = str(result)
179
+ if prefix is not None:
180
+ result = prefix + result
181
+ value = None
182
+ prefix = None
183
+ return result
184
+
185
+ if len(words) == 0:
186
+ return
187
+
188
+ for prev, current, next in windowed([None] + words + [None], 3):
189
+ if skip:
190
+ skip = False
191
+ continue
192
+
193
+ next_is_numeric = next is not None and re.match(r"^\d+(\.\d+)?$", next)
194
+ has_prefix = current[0] in self.prefixes
195
+ current_without_prefix = current[1:] if has_prefix else current
196
+ if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
197
+ # arabic numbers (potentially with signs and fractions)
198
+ f = to_fraction(current_without_prefix)
199
+ assert f is not None
200
+ if value is not None:
201
+ if isinstance(value, str) and value.endswith("."):
202
+ # concatenate decimals / ip address components
203
+ value = str(value) + str(current)
204
+ continue
205
+ else:
206
+ yield output(value)
207
+
208
+ prefix = current[0] if has_prefix else prefix
209
+ if f.denominator == 1:
210
+ value = f.numerator # store integers as int
211
+ else:
212
+ value = current_without_prefix
213
+ elif current not in self.words:
214
+ # non-numeric words
215
+ if value is not None:
216
+ yield output(value)
217
+ yield output(current)
218
+ elif current in self.zeros:
219
+ value = str(value or "") + "0"
220
+ elif current in self.ones:
221
+ ones = self.ones[current]
222
+
223
+ if value is None:
224
+ value = ones
225
+ elif isinstance(value, str) or prev in self.ones:
226
+ if (
227
+ prev in self.tens and ones < 10
228
+ ): # replace the last zero with the digit
229
+ assert value[-1] == "0"
230
+ value = value[:-1] + str(ones)
231
+ else:
232
+ value = str(value) + str(ones)
233
+ elif ones < 10:
234
+ if value % 10 == 0:
235
+ value += ones
236
+ else:
237
+ value = str(value) + str(ones)
238
+ else: # eleven to nineteen
239
+ if value % 100 == 0:
240
+ value += ones
241
+ else:
242
+ value = str(value) + str(ones)
243
+ elif current in self.ones_suffixed:
244
+ # ordinal or cardinal; yield the number right away
245
+ ones, suffix = self.ones_suffixed[current]
246
+ if value is None:
247
+ yield output(str(ones) + suffix)
248
+ elif isinstance(value, str) or prev in self.ones:
249
+ if prev in self.tens and ones < 10:
250
+ assert value[-1] == "0"
251
+ yield output(value[:-1] + str(ones) + suffix)
252
+ else:
253
+ yield output(str(value) + str(ones) + suffix)
254
+ elif ones < 10:
255
+ if value % 10 == 0:
256
+ yield output(str(value + ones) + suffix)
257
+ else:
258
+ yield output(str(value) + str(ones) + suffix)
259
+ else: # eleven to nineteen
260
+ if value % 100 == 0:
261
+ yield output(str(value + ones) + suffix)
262
+ else:
263
+ yield output(str(value) + str(ones) + suffix)
264
+ value = None
265
+ elif current in self.tens:
266
+ tens = self.tens[current]
267
+ if value is None:
268
+ value = tens
269
+ elif isinstance(value, str):
270
+ value = str(value) + str(tens)
271
+ else:
272
+ if value % 100 == 0:
273
+ value += tens
274
+ else:
275
+ value = str(value) + str(tens)
276
+ elif current in self.tens_suffixed:
277
+ # ordinal or cardinal; yield the number right away
278
+ tens, suffix = self.tens_suffixed[current]
279
+ if value is None:
280
+ yield output(str(tens) + suffix)
281
+ elif isinstance(value, str):
282
+ yield output(str(value) + str(tens) + suffix)
283
+ else:
284
+ if value % 100 == 0:
285
+ yield output(str(value + tens) + suffix)
286
+ else:
287
+ yield output(str(value) + str(tens) + suffix)
288
+ elif current in self.multipliers:
289
+ multiplier = self.multipliers[current]
290
+ if value is None:
291
+ value = multiplier
292
+ elif isinstance(value, str) or value == 0:
293
+ f = to_fraction(value)
294
+ p = f * multiplier if f is not None else None
295
+ if f is not None and p.denominator == 1:
296
+ value = p.numerator
297
+ else:
298
+ yield output(value)
299
+ value = multiplier
300
+ else:
301
+ before = value // 1000 * 1000
302
+ residual = value % 1000
303
+ value = before + residual * multiplier
304
+ elif current in self.multipliers_suffixed:
305
+ multiplier, suffix = self.multipliers_suffixed[current]
306
+ if value is None:
307
+ yield output(str(multiplier) + suffix)
308
+ elif isinstance(value, str):
309
+ f = to_fraction(value)
310
+ p = f * multiplier if f is not None else None
311
+ if f is not None and p.denominator == 1:
312
+ yield output(str(p.numerator) + suffix)
313
+ else:
314
+ yield output(value)
315
+ yield output(str(multiplier) + suffix)
316
+ else: # int
317
+ before = value // 1000 * 1000
318
+ residual = value % 1000
319
+ value = before + residual * multiplier
320
+ yield output(str(value) + suffix)
321
+ value = None
322
+ elif current in self.preceding_prefixers:
323
+ # apply prefix (positive, minus, etc.) if it precedes a number
324
+ if value is not None:
325
+ yield output(value)
326
+
327
+ if next in self.words or next_is_numeric:
328
+ prefix = self.preceding_prefixers[current]
329
+ else:
330
+ yield output(current)
331
+ elif current in self.following_prefixers:
332
+ # apply prefix (dollars, cents, etc.) only after a number
333
+ if value is not None:
334
+ prefix = self.following_prefixers[current]
335
+ yield output(value)
336
+ else:
337
+ yield output(current)
338
+ elif current in self.suffixers:
339
+ # apply suffix symbols (percent -> '%')
340
+ if value is not None:
341
+ suffix = self.suffixers[current]
342
+ if isinstance(suffix, dict):
343
+ if next in suffix:
344
+ yield output(str(value) + suffix[next])
345
+ skip = True
346
+ else:
347
+ yield output(value)
348
+ yield output(current)
349
+ else:
350
+ yield output(str(value) + suffix)
351
+ else:
352
+ yield output(current)
353
+ elif current in self.specials:
354
+ if next not in self.words and not next_is_numeric:
355
+ # apply special handling only if the next word can be numeric
356
+ if value is not None:
357
+ yield output(value)
358
+ yield output(current)
359
+ elif current == "and":
360
+ # ignore "and" after hundreds, thousands, etc.
361
+ if prev not in self.multipliers:
362
+ if value is not None:
363
+ yield output(value)
364
+ yield output(current)
365
+ elif current == "double" or current == "triple":
366
+ if next in self.ones or next in self.zeros:
367
+ repeats = 2 if current == "double" else 3
368
+ ones = self.ones.get(next, 0)
369
+ value = str(value or "") + str(ones) * repeats
370
+ skip = True
371
+ else:
372
+ if value is not None:
373
+ yield output(value)
374
+ yield output(current)
375
+ elif current == "point":
376
+ if next in self.decimals or next_is_numeric:
377
+ value = str(value or "") + "."
378
+ else:
379
+ # should all have been covered at this point
380
+ raise ValueError(f"Unexpected token: {current}")
381
+ else:
382
+ # all should have been covered at this point
383
+ raise ValueError(f"Unexpected token: {current}")
384
+
385
+ if value is not None:
386
+ yield output(value)
387
+
388
+ def preprocess(self, s: str):
389
+ # replace "<number> and a half" with "<number> point five"
390
+ results = []
391
+
392
+ segments = re.split(r"\band\s+a\s+half\b", s)
393
+ for i, segment in enumerate(segments):
394
+ if len(segment.strip()) == 0:
395
+ continue
396
+ if i == len(segments) - 1:
397
+ results.append(segment)
398
+ else:
399
+ results.append(segment)
400
+ last_word = segment.rsplit(maxsplit=2)[-1]
401
+ if last_word in self.decimals or last_word in self.multipliers:
402
+ results.append("point five")
403
+ else:
404
+ results.append("and a half")
405
+
406
+ s = " ".join(results)
407
+
408
+ # put a space at number/letter boundary
409
+ s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
410
+ s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
411
+
412
+ # but remove spaces which could be a suffix
413
+ s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)
414
+
415
+ return s
416
+
417
+ def postprocess(self, s: str):
418
+ def combine_cents(m: Match):
419
+ try:
420
+ currency = m.group(1)
421
+ integer = m.group(2)
422
+ cents = int(m.group(3))
423
+ return f"{currency}{integer}.{cents:02d}"
424
+ except ValueError:
425
+ return m.string
426
+
427
+ def extract_cents(m: Match):
428
+ try:
429
+ return f"¢{int(m.group(1))}"
430
+ except ValueError:
431
+ return m.string
432
+
433
+ # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
434
+ s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
435
+ s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)
436
+
437
+ # write "one(s)" instead of "1(s)", just for the readability
438
+ s = re.sub(r"\b1(s?)\b", r"one\1", s)
439
+
440
+ return s
441
+
442
+ def __call__(self, s: str):
443
+ s = self.preprocess(s)
444
+ s = " ".join(word for word in self.process_words(s.split()) if word is not None)
445
+ s = self.postprocess(s)
446
+
447
+ return s
448
+
449
+
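A minimal usage sketch of the number normalizer defined above, assuming the whisper_asr/normalizers directory is importable as a package (the import path below is inferred from the file listing, not pinned by the repo):

    from whisper_asr.normalizers.english import EnglishNumberNormalizer

    norm = EnglishNumberNormalizer()
    print(norm("one oh one"))             # successive single digits -> "101"
    print(norm("two hundred and fifty"))  # tens and multipliers combine -> "250"
    print(norm("twenty second"))          # ordinal suffixes are kept -> "22nd"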
450
+ class EnglishSpellingNormalizer:
451
+ """
452
+ Applies British-American spelling mappings as listed in [1].
453
+
454
+ [1] https://www.tysto.com/uk-us-spelling-list.html
455
+ """
456
+
457
+ def __init__(self):
458
+ mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
459
+ self.mapping = json.load(open(mapping_path))
460
+
461
+ def __call__(self, s: str):
462
+ return " ".join(self.mapping.get(word, word) for word in s.split())
463
+
464
+
465
+ class EnglishTextNormalizer:
466
+ def __init__(self):
467
+ self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
468
+ self.replacers = {
469
+ # common contractions
470
+ r"\bwon't\b": "will not",
471
+ r"\bcan't\b": "can not",
472
+ r"\blet's\b": "let us",
473
+ r"\bain't\b": "aint",
474
+ r"\by'all\b": "you all",
475
+ r"\bwanna\b": "want to",
476
+ r"\bgotta\b": "got to",
477
+ r"\bgonna\b": "going to",
478
+ r"\bi'ma\b": "i am going to",
479
+ r"\bimma\b": "i am going to",
480
+ r"\bwoulda\b": "would have",
481
+ r"\bcoulda\b": "could have",
482
+ r"\bshoulda\b": "should have",
483
+ r"\bma'am\b": "madam",
484
+ # contractions in titles/prefixes
485
+ r"\bmr\b": "mister ",
486
+ r"\bmrs\b": "missus ",
487
+ r"\bst\b": "saint ",
488
+ r"\bdr\b": "doctor ",
489
+ r"\bprof\b": "professor ",
490
+ r"\bcapt\b": "captain ",
491
+ r"\bgov\b": "governor ",
492
+ r"\bald\b": "alderman ",
493
+ r"\bgen\b": "general ",
494
+ r"\bsen\b": "senator ",
495
+ r"\brep\b": "representative ",
496
+ r"\bpres\b": "president ",
497
+ r"\brev\b": "reverend ",
498
+ r"\bhon\b": "honorable ",
499
+ r"\basst\b": "assistant ",
500
+ r"\bassoc\b": "associate ",
501
+ r"\blt\b": "lieutenant ",
502
+ r"\bcol\b": "colonel ",
503
+ r"\bjr\b": "junior ",
504
+ r"\bsr\b": "senior ",
505
+ r"\besq\b": "esquire ",
506
+ # perfect tenses; ideally this would cover any past participle, but that is harder
507
+ r"'d been\b": " had been",
508
+ r"'s been\b": " has been",
509
+ r"'d gone\b": " had gone",
510
+ r"'s gone\b": " has gone",
511
+ r"'d done\b": " had done", # "'s done" is ambiguous
512
+ r"'s got\b": " has got",
513
+ # general contractions
514
+ r"n't\b": " not",
515
+ r"'re\b": " are",
516
+ r"'s\b": " is",
517
+ r"'d\b": " would",
518
+ r"'ll\b": " will",
519
+ r"'t\b": " not",
520
+ r"'ve\b": " have",
521
+ r"'m\b": " am",
522
+ }
523
+ self.standardize_numbers = EnglishNumberNormalizer()
524
+ self.standardize_spellings = EnglishSpellingNormalizer()
525
+
526
+ def __call__(self, s: str):
527
+ s = s.lower()
528
+
529
+ s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets
530
+ s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis
531
+ s = re.sub(self.ignore_patterns, "", s)
532
+ s = re.sub(r"\s+'", "'", s) # when there's a space before an apostrophe
533
+
534
+ for pattern, replacement in self.replacers.items():
535
+ s = re.sub(pattern, replacement, s)
536
+
537
+ s = re.sub(r"(\d),(\d)", r"\1\2", s) # remove commas between digits
538
+ s = re.sub(r"\.([^0-9]|$)", r" \1", s) # remove periods not followed by numbers
539
+ s = remove_symbols_and_diacritics(s, keep=".%$¢€£") # keep numeric symbols
540
+
541
+ s = self.standardize_numbers(s)
542
+ s = self.standardize_spellings(s)
543
+
544
+ # now remove prefix/suffix symbols that are not preceded/followed by numbers
545
+ s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
546
+ s = re.sub(r"([^0-9])%", r"\1 ", s)
547
+
548
+ s = re.sub(r"\s+", " ", s) # replace any successive whitespaces with a space
549
+
550
+ return s
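A short end-to-end sketch of the full normalizer above; the exact output also depends on remove_symbols_and_diacritics from basic.py, so the comment shows the expected result rather than a guaranteed one:

    from whisper_asr.normalizers.english import EnglishTextNormalizer

    normalizer = EnglishTextNormalizer()
    print(normalizer("Mr. Smith isn't twenty-two years old."))
    # expected: "mister smith is not 22 years old"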
MMaDA/eval_ASR_TTS/whisper_asr/whisper_asr.py ADDED
File without changes
MMaDA/eval_emova.py ADDED
@@ -0,0 +1,249 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 AIDAS Lab
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ import re
18
+ import logging
19
+ import editdistance
20
+ from functools import partial
21
+
22
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
23
+
24
+ from tqdm import tqdm
25
+ import torch
26
+ import torch.distributed as dist
27
+ from torch.utils.data import Dataset, DataLoader
28
+ from torch.utils.data.distributed import DistributedSampler
29
+ from torch.nn.parallel import DistributedDataParallel as DDP
30
+
31
+ import wandb
32
+ from datasets import load_dataset
33
+ from transformers import AutoModel, AutoProcessor
34
+
35
+ # --- Helper Functions (from your reference script) ---
36
+
37
+ def setup_logger(rank):
38
+ """Sets up a logger for each DDP process."""
39
+ logger = logging.getLogger(__name__)
40
+ if logger.hasHandlers():
41
+ logger.handlers.clear()
42
+
43
+ formatter = logging.Formatter(f'%(asctime)s - [RANK {rank}] - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
44
+ ch = logging.StreamHandler()
45
+ ch.setFormatter(formatter)
46
+ logger.addHandler(ch)
47
+
48
+ logger.setLevel(logging.INFO if rank == 0 else logging.WARNING)
49
+ return logger
50
+
51
+ def calculate_WER(recognized_text_list, groundtruth_text_list):
52
+ """Calculates the Word Error Rate (WER) between predicted and ground truth texts."""
53
+ word_num, scores = 0.0, 0.0
54
+ for recognized_text, groundtruth_text in zip(recognized_text_list, groundtruth_text_list):
55
+ recognized_text = re.sub(r"[^\w\s']", "", recognized_text.lower())
56
+ groundtruth_text = re.sub(r"[^\w\s']", "", groundtruth_text.lower())
57
+
58
+ recognized_word_list = recognized_text.split()
59
+ groundtruth_word_list = groundtruth_text.split()
60
+
61
+ current_word_num = len(groundtruth_word_list)
62
+ word_num += current_word_num
63
+
64
+ scores += editdistance.eval(recognized_word_list, groundtruth_word_list)
65
+
66
+ WER = scores / word_num if word_num > 0 else 0.0
67
+ return WER, scores, word_num
68
+
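A quick sanity check of the metric above, using the same editdistance call on a toy pair (one substitution plus one deletion over six reference words):

    import editdistance

    ref = "the cat sat on the mat".split()
    hyp = "the cat sit on mat".split()
    errors = editdistance.eval(hyp, ref)  # 2
    print(errors / len(ref))              # 0.333...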
69
+ def get_librispeech_dataset(logger, split="test.clean"):
70
+ """Loads the Librispeech ASR dataset from Hugging Face."""
71
+ logger.info(f"Loading librispeech_asr dataset ({split})...")
72
+ dataset = load_dataset("librispeech_asr", split=split, trust_remote_code=True)
73
+ logger.info("Dataset loaded successfully.")
74
+ return dataset
75
+
76
+ def setup_distributed(rank, world_size):
77
+ """Initializes the distributed process group."""
78
+ dist.init_process_group("nccl", rank=rank, world_size=world_size)
79
+
80
+ def cleanup_distributed():
81
+ """Cleans up the distributed process group."""
82
+ dist.destroy_process_group()
83
+
84
+ # --- Custom Dataset and Collate Function for EMOVA ---
85
+
86
+ class LibrispeechAudioDataset(Dataset):
87
+ """A simple dataset that returns audio file path and ground truth text."""
88
+ def __init__(self, hf_dataset):
89
+ self.hf_dataset = hf_dataset
90
+
91
+ def __len__(self):
92
+ return len(self.hf_dataset)
93
+
94
+ def __getitem__(self, idx):
95
+ example = self.hf_dataset[idx]
96
+ return {
97
+ "audio_path": example['file'],
98
+ "gt_text": example['text'],
99
+ "sample_id": example['id']
100
+ }
101
+
102
+ class EmovaS2TCollateFn:
103
+ """
104
+ Collate function to prepare batches for the EMOVA model using its processor.
105
+ """
106
+ def __init__(self, processor):
107
+ self.processor = processor
108
+ self.prompt_text = "Transcribe the given audio."
109
+
110
+ def __call__(self, batch):
111
+ audio_paths = [item["audio_path"] for item in batch]
112
+ gt_texts = [item["gt_text"] for item in batch]
113
+ sample_ids = [item["sample_id"] for item in batch]
114
+
115
+ # Construct the text input for each audio file in the batch
116
+ text_inputs = [
117
+ [
118
+ {"role": "user", "content": [{"type": "audio"}, {"type": "text", "text": self.prompt_text}]}
119
+ ]
120
+ for _ in audio_paths
121
+ ]
122
+
123
+ # Use the EMOVA processor to prepare the multimodal batch
124
+ inputs = self.processor(
125
+ text=text_inputs,
126
+ audios=audio_paths,
127
+ return_tensors="pt",
128
+ padding=True
129
+ )
130
+
131
+ inputs['gt_texts'] = gt_texts
132
+ inputs['sample_ids'] = sample_ids
133
+ return inputs
134
+
135
+ def main():
136
+ """Main function to run the distributed evaluation."""
137
+ rank = int(os.environ['RANK'])
138
+ world_size = int(os.environ['WORLD_SIZE'])
139
+ setup_distributed(rank, world_size)
140
+ device = torch.device(f"cuda:{rank}")
141
+ logger = setup_logger(rank)
142
+
143
+ if rank == 0:
144
+ wandb.init(project="emova-librispeech-eval")
145
+
146
+ # --- 1. Load EMOVA Models and Processors ---
147
+ logger.info("Loading EMOVA models and processors...")
148
+ model_name = "Emova-ollm/emova-qwen-2-5-7b-hf"
149
+
150
+ model = AutoModel.from_pretrained(
151
+ model_name,
152
+ torch_dtype=torch.bfloat16,
153
+ attn_implementation='flash_attention_2',
154
+ low_cpu_mem_usage=True,
155
+ trust_remote_code=True
156
+ ).to(device)
157
+
158
+ processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
159
+
160
+ speech_tokenizer = AutoModel.from_pretrained(
161
+ "Emova-ollm/emova_speech_tokenizer_hf",
162
+ torch_dtype=torch.float32,
163
+ trust_remote_code=True
164
+ ).to(device).eval()
165
+
166
+ processor.set_speech_tokenizer(speech_tokenizer)
167
+
168
+ # Wrap the main model with DDP
169
+ model = DDP(model, device_ids=[rank], find_unused_parameters=True)
170
+ logger.info("✅ Models loaded and wrapped with DDP successfully!")
171
+
172
+ # --- 2. Setup DataLoader ---
173
+ hf_dataset = get_librispeech_dataset(logger, split="test.clean")
174
+ eval_dataset = LibrispeechAudioDataset(hf_dataset)
175
+ sampler = DistributedSampler(eval_dataset, num_replicas=world_size, rank=rank, shuffle=False)
176
+
177
+ collate_fn = EmovaS2TCollateFn(processor)
178
+
179
+ dataloader = DataLoader(
180
+ eval_dataset,
181
+ batch_size=4, # Adjust batch size based on your GPU memory
182
+ sampler=sampler,
183
+ num_workers=4,
184
+ collate_fn=collate_fn,
185
+ pin_memory=True
186
+ )
187
+
188
+ # --- 3. Evaluation Loop ---
189
+ local_results = []
190
+ model.eval()
191
+
192
+ progress_bar = tqdm(dataloader, desc="Evaluating on Librispeech", disable=(rank != 0))
193
+ for batch in progress_bar:
194
+ gt_texts = batch.pop("gt_texts")
195
+ sample_ids = batch.pop("sample_ids")
196
+
197
+ # Move batch tensors to the correct device
198
+ inputs = {k: v.to(device) for k, v in batch.items()}
199
+
200
+ with torch.no_grad():
201
+ outputs = model.module.generate(**inputs, max_new_tokens=256, do_sample=False)
202
+ # Slice to get only the generated tokens
203
+ generated_ids = outputs[:, inputs['input_ids'].shape[1]:]
204
+ decoded_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
205
+
206
+ for i in range(len(decoded_texts)):
207
+ local_results.append({
208
+ "sample_id": sample_ids[i],
209
+ "gt_text": gt_texts[i],
210
+ "decoded_text": decoded_texts[i].strip()
211
+ })
212
+
213
+ if rank == 0 and i == 0 and len(local_results) % 10 == 1: # Log sample every 10 batches on rank 0
214
+ logger.info(f"\n--- Sample ---")
215
+ logger.info(f" ID: {sample_ids[i]}")
216
+ logger.info(f" GT: {gt_texts[i]}")
217
+ logger.info(f" PD: {decoded_texts[i].strip()}")
218
+ logger.info(f"----------------")
219
+
220
+ # --- 4. Gather Results and Calculate Final Score ---
221
+ all_results = [None] * world_size
222
+ dist.all_gather_object(all_results, local_results)
223
+
224
+ if rank == 0:
225
+ logger.info("Gathering and processing results from all GPUs...")
226
+ final_results = [item for sublist in all_results for item in sublist]
227
+
228
+ gt_list = [res["gt_text"] for res in final_results]
229
+ pred_list = [res["decoded_text"] for res in final_results]
230
+
231
+ results_table = wandb.Table(columns=["ID", "Ground Truth", "Prediction"])
232
+ for res in final_results:
233
+ results_table.add_data(res["sample_id"], res["gt_text"], res["decoded_text"])
234
+ wandb.log({"S2T Predictions": results_table})
235
+
236
+ wer, errors, words = calculate_WER(pred_list, gt_list)
237
+ logger.info(f"Final WER (Librispeech test.clean): {wer:.4f} | Word Errors: {errors} | Total Words: {words}")
238
+ wandb.log({"WER": wer, "Total Word Errors": errors, "Total Words": words})
239
+
240
+ # --- Cleanup ---
241
+ if rank == 0:
242
+ wandb.finish()
243
+ cleanup_distributed()
244
+
245
+ if __name__ == '__main__':
246
+ # Set master address and port for DDP
247
+ # os.environ['MASTER_ADDR'] = 'localhost'
248
+ # os.environ['MASTER_PORT'] = '12355'
249
+ main()
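Since main() reads RANK and WORLD_SIZE from the environment and initializes the NCCL process group itself, the script is expected to be started through a distributed launcher such as torchrun, for example `torchrun --nproc_per_node=8 MMaDA/eval_emova.py` (the GPU count is illustrative). torchrun exports RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT for every worker, which is why the manual MASTER_ADDR/MASTER_PORT assignments above are left commented out.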
MMaDA/generate.py ADDED
@@ -0,0 +1,146 @@
1
+ import torch
2
+ import numpy as np
3
+ import torch.nn.functional as F
4
+
5
+ from transformers import AutoTokenizer, AutoModel
6
+ from models import MMadaModelLM
7
+
8
+ def add_gumbel_noise(logits, temperature):
9
+ '''
10
+ The Gumbel max is a method for sampling categorical distributions.
11
+ According to arXiv:2409.02908, for MDM, low-precision Gumbel Max improves perplexity score but reduces generation quality.
12
+ Thus, we use float64.
13
+ '''
14
+ if temperature == 0:
15
+ return logits
16
+ logits = logits.to(torch.float64)
17
+ noise = torch.rand_like(logits, dtype=torch.float64)
18
+ gumbel_noise = (- torch.log(noise)) ** temperature
19
+ return logits.exp() / gumbel_noise
20
+
21
+
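Dividing exp(logits) by (-log u)^temperature is the Gumbel-max trick in disguise: taking the log (which preserves the argmax) gives logits - temperature * log(-log u), and after dividing by the positive temperature the argmax equals argmax(logits / temperature + g) with g a standard Gumbel sample, i.e. a draw from softmax(logits / temperature). A small Monte Carlo check of this equivalence:

    import torch

    logits = torch.tensor([2.0, 1.0, 0.5])
    temperature = 1.0
    noise = torch.rand(20000, 3)
    samples = torch.argmax(logits.exp() / (-noise.log()) ** temperature, dim=-1)
    print(torch.bincount(samples, minlength=3) / 20000.0)  # empirical frequencies
    print(torch.softmax(logits / temperature, dim=-1))     # should roughly match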
22
+ def get_num_transfer_tokens(mask_index, steps):
23
+ '''
24
+ In the reverse process, the interval [0, 1] is uniformly discretized into steps intervals.
25
+ Furthermore, because LLaDA employs a linear noise schedule (as defined in Eq. (8)),
26
+ the expected number of tokens transitioned at each step should be consistent.
27
+
28
+ This function is designed to precompute the number of tokens that need to be transitioned at each step.
29
+ '''
30
+ mask_num = mask_index.sum(dim=1, keepdim=True)
31
+
32
+ base = mask_num // steps
33
+ remainder = mask_num % steps
34
+
35
+ num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.int64) + base
36
+
37
+ for i in range(mask_num.size(0)):
38
+ num_transfer_tokens[i, :remainder[i]] += 1
39
+
40
+ return num_transfer_tokens
41
+
42
+
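For instance, 7 masked positions spread over steps=3 gives base = 7 // 3 = 2 per step plus a remainder of 1 added to the first step, i.e. a schedule of [3, 2, 2]. With the function above in scope:

    import torch

    mask_index = torch.tensor([[True] * 7 + [False] * 3])  # 7 masks in a length-10 row
    print(get_num_transfer_tokens(mask_index, steps=3))    # tensor([[3, 2, 2]])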
43
+ @ torch.no_grad()
44
+ def generate(model, prompt, steps=128, gen_length=128, block_length=128, temperature=0.,
45
+ cfg_scale=0., remasking='low_confidence', mask_id=126336, attention_mask=None):
46
+ '''
47
+ Args:
48
+ model: Mask predictor.
49
+ prompt: A tensor of shape (B, L), where B is batch size.
50
+ steps: Sampling steps, less than or equal to gen_length.
51
+ gen_length: Generated answer length.
52
+ block_length: Block length, less than or equal to gen_length. If less than gen_length, semi-autoregressive remasking is used.
53
+ temperature: Categorical distribution sampling temperature.
54
+ cfg_scale: Unsupervised classifier-free guidance scale.
55
+ remasking: Remasking strategy. 'low_confidence' or 'random'.
56
+ mask_id: The token id of [MASK] is 126336.
57
+ '''
58
+ if attention_mask is not None and 0.0 in attention_mask:
59
+ attention_bias = (attention_mask[:, :, None] & attention_mask[:, None, :]).bool().unsqueeze(1)
60
+ print(f"attention_bias: {attention_bias}")
61
+ else:
62
+ attention_bias = None
63
+ batch_size = prompt.shape[0]
64
+ x = torch.full((batch_size, prompt.shape[1] + gen_length), mask_id, dtype=torch.long).to(model.device)
65
+ x[:, :prompt.shape[1]] = prompt.clone()
66
+
67
+ prompt_index = (x != mask_id)
68
+
69
+ assert gen_length % block_length == 0
70
+ num_blocks = gen_length // block_length
71
+
72
+ assert steps % num_blocks == 0
73
+ steps = steps // num_blocks
74
+
75
+ for num_block in range(num_blocks):
76
+ block_mask_index = (x[:, prompt.shape[1] + num_block * block_length: prompt.shape[1] + (num_block + 1) * block_length:] == mask_id)
77
+ num_transfer_tokens = get_num_transfer_tokens(block_mask_index, steps)
78
+ for i in range(steps):
79
+ mask_index = (x == mask_id)
80
+ if cfg_scale > 0.:
81
+ un_x = x.clone()
82
+ un_x[prompt_index] = mask_id
83
+ x_ = torch.cat([x, un_x], dim=0)
84
+ logits = model(x_).logits
85
+ logits, un_logits = torch.chunk(logits, 2, dim=0)
86
+ logits = un_logits + (cfg_scale + 1) * (logits - un_logits)
87
+ else:
88
+ logits = model(x, attention_bias=attention_bias).logits
89
+
90
+ logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
91
+ x0 = torch.argmax(logits_with_noise, dim=-1) # b, l
92
+
93
+ if remasking == 'low_confidence':
94
+ p = F.softmax(logits.to(torch.float64), dim=-1)
95
+ x0_p = torch.squeeze(
96
+ torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1) # b, l
97
+ elif remasking == 'random':
98
+ x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
99
+ else:
100
+ raise NotImplementedError(remasking)
101
+
102
+ x0_p[:, prompt.shape[1] + (num_block + 1) * block_length:] = -np.inf
103
+
104
+ x0 = torch.where(mask_index, x0, x)
105
+ confidence = torch.where(mask_index, x0_p, -np.inf)
106
+ # print(confidence.shape)
107
+ transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
108
+ for j in range(confidence.shape[0]):
109
+ _, select_index = torch.topk(confidence[j], k=num_transfer_tokens[j, i])
110
+ transfer_index[j, select_index] = True
111
+ x[transfer_index] = x0[transfer_index]
112
+
113
+ return x
114
+
115
+
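The outer loop above is the semi-autoregressive remasking mentioned in the docstring: with gen_length=128 and block_length=32 there are 4 blocks, steps=128 becomes 128 // 4 = 32 denoising iterations per block, and setting x0_p to -inf beyond the current block guarantees each block is fully unmasked before the next one is touched. With block_length == gen_length (as in main() below) there is a single block and the whole answer is refined jointly.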
116
+ def main():
117
+ device = 'cuda'
118
+ # Load from HF
119
+
120
+ # model = MMadaModelLM.from_pretrained("Gen-Verse/MMaDA-8B-Base", trust_remote_code=True, torch_dtype=torch.bfloat16).to(device).eval()
121
+ # tokenizer = AutoTokenizer.from_pretrained("Gen-Verse/MMaDA-8B-Base", trust_remote_code=True)
122
+
123
+ train_step = 135000
124
+ trained_checkpoint_path = f"/home/work/AIDAS/ckpts/omada/omada-training-stage1/checkpoint-{train_step}/unwrapped_model/"
125
+
126
+ model = MMadaModelLM.from_pretrained(
127
+ trained_checkpoint_path,
128
+ trust_remote_code=True,
129
+ torch_dtype=torch.bfloat16,
130
+ config="/home/work/AIDAS/ckpts/omada/omada-training-stage1/config.json"
131
+ ).to(device)
132
+
133
+ tokenizer = AutoTokenizer.from_pretrained("Gen-Verse/MMaDA-8B-MixCoT", trust_remote_code=True)
134
+
135
+ tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}"
136
+ prompt = "Lily can run 12 kilometers per hour for 4 hours. After that, she runs 6 kilometers per hour. How many kilometers can she run in 8 hours?"
137
+ m = [{"role": "user", "content": prompt}, ]
138
+ prompt = tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
139
+ input_ids = tokenizer(text=prompt, return_tensors="pt", padding=True, padding_side="left")['input_ids']
140
+ input_ids = torch.tensor(input_ids).to(device)
141
+ out = generate(model, input_ids, steps=128, gen_length=128, block_length=128, temperature=1, cfg_scale=0., remasking='low_confidence')
142
+ print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True))
143
+
144
+
145
+ if __name__ == '__main__':
146
+ main()
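A minimal inference sketch reusing the pieces above; it assumes model, tokenizer and device have been prepared exactly as in main() (including the chat template assignment), and only swaps in a shorter semi-autoregressive block length:

    # with `generate`, `model`, `tokenizer`, and `device` set up as in main() above
    m = [{"role": "user", "content": "What is 12 * 7?"}]
    prompt = tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
    input_ids = tokenizer(text=prompt, return_tensors="pt")["input_ids"].to(device)

    # steps must divide evenly across gen_length // block_length blocks (here 4 x 32)
    out = generate(model, input_ids, steps=128, gen_length=128, block_length=32,
                   temperature=0.0, remasking="low_confidence")
    print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0])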