Create app.py
app.py ADDED
@@ -0,0 +1,77 @@
+import gradio as gr
+import speech_recognition as sr
+from pydub import AudioSegment
+from sentence_transformers import SentenceTransformer, util
+import os
+
+# Load Sentence Transformer model
+model = SentenceTransformer('all-MiniLM-L6-v2')
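+# (all-MiniLM-L6-v2 is a small, fast model that produces 384-dimensional
+# sentence embeddings; a common default for semantic-similarity tasks.)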
+
+# Constants
+MAX_AUDIO_DURATION = 600  # maximum audio length in seconds (10 minutes)
+
+# --- Helper: Convert audio to wav ---
+def convert_audio_to_wav(file_path):
+    audio = AudioSegment.from_file(file_path)
+    # Swap the extension via splitext; a plain str.replace on the extension
+    # substring can corrupt paths that contain it elsewhere (e.g. "mp3/a.mp3").
+    wav_path = os.path.splitext(file_path)[0] + ".wav"
+    audio.export(wav_path, format="wav")
+    return wav_path
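+
+# NOTE: AudioSegment.from_file needs ffmpeg (or libav) available on the system
+# to decode non-wav formats such as mp3 or m4a.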
+
+# --- Helper: Transcribe audio in chunks ---
+def transcribe_audio_in_chunks(audio_path, chunk_duration=30):
+    recognizer = sr.Recognizer()
+    audio = AudioSegment.from_wav(audio_path)
+
+    # Truncate anything longer than the cap (pydub lengths are in milliseconds)
+    if len(audio) > MAX_AUDIO_DURATION * 1000:
+        audio = audio[:MAX_AUDIO_DURATION * 1000]
+
+    full_text = []
+    for i in range(0, len(audio), chunk_duration * 1000):
+        chunk = audio[i : i + chunk_duration * 1000]
+        chunk_path = "temp_chunk.wav"
+        chunk.export(chunk_path, format="wav")
+
+        with sr.AudioFile(chunk_path) as source:
+            audio_data = recognizer.record(source)
+            try:
+                text = recognizer.recognize_google(audio_data, language="en-IN")
+                full_text.append(text)
+            except sr.UnknownValueError:
+                full_text.append("[Unrecognized Audio]")
+            except sr.RequestError as e:
+                full_text.append(f"[Speech Error: {e}]")
+
+    # Clean up the temporary chunk file
+    if os.path.exists("temp_chunk.wav"):
+        os.remove("temp_chunk.wav")
+
+    return " ".join(full_text)
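+
+# NOTE: recognize_google uses Google's free Web Speech API, so transcription
+# requires internet access and may be rate-limited on the shared default key.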
+
+# --- Main Function ---
+def transcribe_and_evaluate(audio, reference_text):
+    if audio is None:
+        return "[No audio provided]", "0 %"
+    if not audio.endswith(".wav"):
+        audio = convert_audio_to_wav(audio)
+
+    transcription = transcribe_audio_in_chunks(audio)
+
+    # Calculate semantic similarity between transcription and reference
+    embeddings = model.encode([transcription, reference_text])
+    similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
+    # Cosine similarity lies in [-1, 1]; clamp at 0 so the score reads as a percentage
+    accuracy = round(max(similarity, 0.0) * 100, 2)
+
+    return transcription, f"{accuracy} %"
+
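+# Illustration of the metric (hypothetical snippet, not run at startup):
+# close paraphrases score high, unrelated sentences score low, e.g.
+#     e = model.encode(["the cat sat on the mat", "a cat sits on a mat"])
+#     util.cos_sim(e[0], e[1]).item()
+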
+# --- Gradio UI ---
+iface = gr.Interface(
+    fn=transcribe_and_evaluate,
+    inputs=[
+        gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input English Audio"),
+        gr.Textbox(lines=4, placeholder="Enter the reference English text here", label="Reference Text")
+    ],
+    outputs=[
+        gr.Textbox(label="Transcribed Text"),
+        gr.Textbox(label="Semantic Accuracy (%)")
+    ],
+    title="English Speech Recognition + Semantic Accuracy",
+    description="Upload or record English audio → Transcribe → Compare with reference text → Get semantic similarity accuracy.",
+    allow_flagging="never"
+)
+
+iface.launch(debug=True, share=True)
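
Note: this commit adds only app.py. For the Space to build, the repo would also
need its dependencies declared. A plausible sketch (assumed file contents, not
part of this commit):

requirements.txt
    gradio
    SpeechRecognition
    pydub
    sentence-transformers

packages.txt
    ffmpeg

SpeechRecognition is the PyPI package that provides the speech_recognition
module, and the ffmpeg system package lets pydub decode non-wav uploads.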