import json
import os

import librosa
import torch
from jiwer import wer
from tqdm import tqdm
from transformers import AutoTokenizer

from omni_speech.model import *


def process_asr_files(input_folder, output_folder, model_path, processor_path):
    os.makedirs(output_folder, exist_ok=True)

    # Symlink the processor's Python files into the model directory so that
    # trust_remote_code loading picks up the custom processor code.
    for filename in os.listdir(processor_path):
        if not filename.endswith(".py"):
            continue
        source_file = os.path.join(processor_path, filename)
        target_file = os.path.join(model_path, filename)
        try:
            os.remove(target_file)  # replace any stale file or symlink
        except FileNotFoundError:
            pass
        os.symlink(source_file, target_file)
        print(f"Created symlink for: {filename}")

    # Load model and tokenizer. Only the audio tower is initialized, since
    # this is an ASR-only evaluation (no vision, no TTS).
    model = MiniCPMO.from_pretrained(
        model_path,
        trust_remote_code=True,
        attn_implementation="sdpa",
        torch_dtype=torch.bfloat16,
        init_vision=False,
        init_audio=True,
        init_tts=False,
        processor_path=model_path,
    )
    model = model.eval().cuda()
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    mimick_prompt = "Please transcribe this audio into text."

    # Process each JSONL file in the input folder.
    for filename in os.listdir(input_folder):
        if not filename.endswith(".jsonl"):
            continue
        input_file = os.path.join(input_folder, filename)
        output_file = os.path.join(output_folder, filename)

        # JSONL holds one JSON object per line, so parse line by line.
        with open(input_file, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f if line.strip()]

        response_data = []
        all_predictions = []
        all_answers = []

        for item in tqdm(data, desc=f"Processing {filename}"):
            # The first conversation turn carries the audio path; the last
            # carries the reference transcript.
            audio_path = item["conversations"][0]["content"]
            audio_input, _ = librosa.load(audio_path, sr=16000, mono=True)

            msgs = [
                {
                    "role": "system",
                    "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
                },
                {"role": "user", "content": [audio_input, mimick_prompt]},
            ]

            response_dict = {
                "question_id": item["id"],
                "groundtruth": item["conversations"][-1]["content"],
            }

            # Greedy decoding (sampling=False), so temperature has no effect.
            res = model.chat(
                msgs=msgs,
                tokenizer=tokenizer,
                sampling=False,
                max_new_tokens=128,
                temperature=0.3,
                generate_audio=False,
            )
            response_dict["prediction"] = str(res)

            response_data.append(response_dict)
            all_predictions.append(response_dict["prediction"])
            all_answers.append(response_dict["groundtruth"])

        # Compute corpus-level WER over all utterances in this file
        # (jiwer takes references first, hypotheses second).
        overall_wer = wer(all_answers, all_predictions)
        print(f"{filename} - WER: {overall_wer:.4f}")

        # Save per-utterance predictions alongside ground truth.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(response_data, f, ensure_ascii=False, indent=2)


# Example usage
if __name__ == "__main__":
    input_folder = "/data1/speech/anhnmt2/dataset/s2s/minicpmo/asr/test/8k"
    output_folder = "/data1/speech/anhnmt2/dataset/s2s/minicpmo/asr/test/preds_8k"
    model_path = "/data1/speech/anhnmt2/Speech2Speech/LLaMA-Factory/exp/saves/minicpmo_2_6/full_sft_asr_mixed_16M"
    processor_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo"

    process_asr_files(input_folder, output_folder, model_path, processor_path)
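# ---------------------------------------------------------------------------
# Input format sketch. The loop above assumes each JSONL line is a record with
# an "id" field, the audio path in conversations[0]["content"], and the
# reference transcript in conversations[-1]["content"]. The record below is a
# hypothetical illustration of that layout: the "role" values, id, path, and
# transcript are assumptions for demonstration, not taken from the dataset.
#
# example_record = {
#     "id": "utt_0001",
#     "conversations": [
#         {"role": "user", "content": "/path/to/audio.wav"},
#         {"role": "assistant", "content": "the reference transcript"},
#     ],
# }
# with open("sample.jsonl", "w", encoding="utf-8") as f:
#     f.write(json.dumps(example_record, ensure_ascii=False) + "\n")
# ---------------------------------------------------------------------------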