viditk commited on
Commit
0f0361e
·
verified ·
1 Parent(s): dd210a3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -0
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import torch
import speech_recognition as sr
from pydub import AudioSegment
from sentence_transformers import SentenceTransformer, util
import os

# Load Sentence Transformer model
# all-MiniLM-L6-v2 is a compact sentence-embedding model; it is used below
# to score semantic similarity between transcription and reference text.
# Downloaded from the Hugging Face hub on first run.
# NOTE(review): `torch` is not referenced directly in this file — presumably
# kept as an explicit dependency for sentence-transformers; verify before removing.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Constants
MAX_AUDIO_DURATION = 600 # in seconds # hard cap on processed audio (10 minutes)
13
+
14
# --- Helper: Convert audio to wav ---
def convert_audio_to_wav(file_path):
    """Convert any audio file supported by pydub/ffmpeg to WAV.

    Args:
        file_path: Path to the source audio file (mp3, m4a, ogg, ...).

    Returns:
        Path of the exported WAV file (same directory/stem, ".wav" suffix).

    Bug fix: the original computed the target path with
    ``file_path.replace(ext, "wav")``, which replaces *every* occurrence of
    the extension text anywhere in the path (e.g. "mp3_take.mp3" ->
    "wav_take.wav"). ``os.path.splitext`` swaps only the real extension.
    """
    audio = AudioSegment.from_file(file_path)
    # Replace only the trailing extension, never text inside the stem.
    wav_path = os.path.splitext(file_path)[0] + ".wav"
    audio.export(wav_path, format="wav")
    return wav_path
20
+
21
# --- Helper: Transcribe audio in chunks ---
def transcribe_audio_in_chunks(audio_path, chunk_duration=30):
    """Transcribe a WAV file with the Google Web Speech API, chunk by chunk.

    Args:
        audio_path: Path to a WAV file.
        chunk_duration: Size of each transcription chunk in seconds.
            The API rejects long clips, so the audio is split first.

    Returns:
        The concatenated transcription; unrecognized or failed chunks are
        represented by "[Unrecognized Audio]" / "[Speech Error: ...]" markers.
    """
    recognizer = sr.Recognizer()
    audio = AudioSegment.from_wav(audio_path)

    # Cap very long recordings (pydub lengths are in milliseconds).
    if len(audio) > MAX_AUDIO_DURATION * 1000:
        audio = audio[:MAX_AUDIO_DURATION * 1000]

    full_text = []
    chunk_path = "temp_chunk.wav"  # same scratch file reused for every chunk
    try:
        for start_ms in range(0, len(audio), chunk_duration * 1000):
            chunk = audio[start_ms : start_ms + chunk_duration * 1000]
            chunk.export(chunk_path, format="wav")

            with sr.AudioFile(chunk_path) as source:
                audio_data = recognizer.record(source)
            try:
                text = recognizer.recognize_google(audio_data, language="en-IN")
                full_text.append(text)
            except sr.UnknownValueError:
                # Speech present but not intelligible to the API.
                full_text.append("[Unrecognized Audio]")
            except sr.RequestError as e:
                # Network / API failure; keep going with remaining chunks.
                full_text.append(f"[Speech Error: {e}]")
    finally:
        # Bug fix: the original leaked temp_chunk.wav after every call.
        if os.path.exists(chunk_path):
            os.remove(chunk_path)

    return " ".join(full_text)
46
+
47
# --- Main Function ---
def transcribe_and_evaluate(audio, reference_text):
    """Transcribe an audio file and score it against a reference text.

    Args:
        audio: Filepath from the gr.Audio component, or None when the user
            submitted without recording/uploading anything.
        reference_text: Ground-truth English text to compare against.

    Returns:
        (transcription, accuracy_string) — accuracy is cosine similarity of
        the sentence embeddings, expressed as a percentage string.
    """
    # Bug fix: Gradio passes None when no audio was provided; the original
    # crashed with AttributeError on None.endswith(...).
    if not audio:
        return "", "0.0 %"

    # Bug fix: case-insensitive extension check, so ".WAV" uploads are not
    # pointlessly round-tripped through the converter.
    if not audio.lower().endswith(".wav"):
        audio = convert_audio_to_wav(audio)

    transcription = transcribe_audio_in_chunks(audio)

    # Calculate semantic similarity between transcription and reference.
    embeddings = model.encode([transcription, reference_text])
    similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
    accuracy = round(similarity * 100, 2)  # cosine similarity as percentage

    return transcription, f"{accuracy} %"
60
+
61
# --- Gradio UI ---
# Components are built up-front and named, then wired into the Interface.
audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="filepath",
    label="Input English Audio",
)
reference_input = gr.Textbox(
    lines=4,
    placeholder="Enter the reference English text here",
    label="Reference Text",
)
transcript_output = gr.Textbox(label="Transcribed Text")
accuracy_output = gr.Textbox(label="Semantic Accuracy (%)")

iface = gr.Interface(
    fn=transcribe_and_evaluate,
    inputs=[audio_input, reference_input],
    outputs=[transcript_output, accuracy_output],
    title="English Speech Recognition + Semantic Accuracy",
    description="Upload or record English audio → Transcribe → Compare with reference text → Get semantic similarity accuracy.",
    allow_flagging="never",
)

iface.launch(debug=True, share=True)