jeevitha-app committed on
Commit
34e4cc0
·
verified ·
1 Parent(s): fb5f2c8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -37
app.py CHANGED
@@ -1,49 +1,150 @@
 
 
 
 
 
 
 
1
  import gradio as gr
2
- from transformers import pipeline
3
- from langdetect import detect
 
4
 
5
# Translation pipeline created once at import time (change the model id to use another model).
translator = pipeline("translation", model="IndicTrans2")

# Supported language codes, each mapped to the Helsinki-NLP checkpoint name
# associated with that code.
language_models = dict(
    en="Helsinki-NLP/opus-mt-en-ROMANCE",
    fr="Helsinki-NLP/opus-mt-ROMANCE-en",
    de="Helsinki-NLP/opus-mt-de-en",
    es="Helsinki-NLP/opus-mt-es-en",
    hi="Helsinki-NLP/opus-mt-hi-en",
    ta="Helsinki-NLP/opus-mt-ta-en",
)
18
 
19
# Loaded translation pipelines keyed by model name, so each checkpoint is
# loaded at most once per process instead of on every request.
_pipeline_cache = {}


def translate_file(file_obj, target_lang):
    """Translate the text of an uploaded file into ``target_lang``.

    Args:
        file_obj: The upload. Either an object with a ``read()`` method or a
            filesystem path to the file. ``None`` means nothing was uploaded.
        target_lang: Language code; must be a key of ``language_models``.

    Returns:
        A human-readable string: either the translation prefixed with the
        detected source and target codes, or a message explaining why no
        translation was produced.
    """
    if file_obj is None:
        return "No file uploaded."

    # Accept both readable objects and plain paths.
    if hasattr(file_obj, "read"):
        raw = file_obj.read()
    else:
        with open(file_obj, "rb") as handle:
            raw = handle.read()

    try:
        text = raw.decode("utf-8") if isinstance(raw, (bytes, bytearray)) else str(raw)
    except UnicodeDecodeError:
        return "Could not decode file as UTF-8 text."

    # Blank content carries nothing to detect or translate.
    if not text.strip():
        return "Uploaded file is empty."

    source_lang = detect(text)
    if target_lang == source_lang:
        return "Source and target languages are the same. No translation needed."

    # Direct lookup replaces the former linear scan over the mapping.
    model_name = language_models.get(target_lang)
    if not model_name:
        return f"Unsupported target language: {target_lang}"

    model = _pipeline_cache.get(model_name)
    if model is None:
        model = _pipeline_cache[model_name] = pipeline("translation", model=model_name)

    translated = model(text, max_length=1000)[0]["translation_text"]
    return f"Detected source: {source_lang.upper()} → Translated to: {target_lang.upper()}\n\n{translated}"
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
# Build the input widgets first, then wire them into the interface and start it.
upload_input = gr.File(label="Upload Text File (.txt)")
target_input = gr.Dropdown(
    choices=list(language_models),
    label="Target Language (Code)",
)

gr.Interface(
    fn=translate_file,
    inputs=[upload_input, target_input],
    outputs="text",
    title="Multilingual File Translator",
    description="Upload a .txt file in any language. Choose a target language to translate.",
).launch()
 
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
2
+ from sentence_transformers import SentenceTransformer
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import faiss
6
+ import numpy as np
7
+ import matplotlib.pyplot as plt
8
  import gradio as gr
9
+ from sacrebleu import corpus_bleu
10
+ import os
11
+ import tempfile
12
 
 
 
13
 
14
# Load Models
# Each checkpoint id is named once so a model and its tokenizer cannot drift apart.
_LANG_DETECT_CHECKPOINT = "papluca/xlm-roberta-base-language-detection"
_TRANSLATION_CHECKPOINT = "facebook/nllb-200-distilled-600M"
_EMBEDDING_CHECKPOINT = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Language detection (sequence classification).
lang_detect_model = AutoModelForSequenceClassification.from_pretrained(_LANG_DETECT_CHECKPOINT)
lang_detect_tokenizer = AutoTokenizer.from_pretrained(_LANG_DETECT_CHECKPOINT)

# Translation (sequence-to-sequence).
trans_model = AutoModelForSeq2SeqLM.from_pretrained(_TRANSLATION_CHECKPOINT)
trans_tokenizer = AutoTokenizer.from_pretrained(_TRANSLATION_CHECKPOINT)

# Sentence embeddings for semantic search.
embed_model = SentenceTransformer(_EMBEDDING_CHECKPOINT)
20
 
21
# Language Mappings
# Class index -> language label, read from the detection model's config.
id2lang = lang_detect_model.config.id2label

# NLLB language code -> human-readable language name.
nllb_langs = {
    "eng_Latn": "English",
    "fra_Latn": "French",
    "hin_Deva": "Hindi",
    "spa_Latn": "Spanish",
    "deu_Latn": "German",
    "tam_Taml": "Tamil",
    "tel_Telu": "Telugu",
    "jpn_Jpan": "Japanese",
    "zho_Hans": "Chinese",
    "arb_Arab": "Arabic",
    "san_Deva": "Sanskrit",
}

# Detector language label -> NLLB language code.
# NOTE(review): confirm every key below is a label the detector can actually emit.
xlm_to_nllb = dict(
    en="eng_Latn",
    fr="fra_Latn",
    hi="hin_Deva",
    es="spa_Latn",
    de="deu_Latn",
    ta="tam_Taml",
    te="tel_Telu",
    ja="jpn_Jpan",
    zh="zho_Hans",
    ar="arb_Arab",
    sa="san_Deva",
)
36
+
37
# Static Corpus
# Fixed reference passages that queries are matched against.
corpus = [
    "धर्म एव हतो हन्ति धर्मो रक्षति रक्षितः",
    "Dharma when destroyed, destroys; when protected, protects.",
    "The moon affects tides and mood, according to Jyotisha",
    "One should eat according to the season – Rituacharya",
    "Balance of Tridosha is health – Ayurveda principle",
    "Ethics in Mahabharata reflect situational dharma",
    "Meditation improves memory and mental clarity",
    "Jyotisha links planetary motion with life patterns",
]

# Embed every passage once at import time and load the vectors into a flat
# L2 index; row i of the index corresponds to corpus[i].
corpus_embeddings = embed_model.encode(corpus, convert_to_numpy=True)
dimension = corpus_embeddings.shape[-1]
index = faiss.IndexFlatL2(dimension)
index.add(corpus_embeddings)
52
+
53
# Detect Language
def detect_language(text):
    """Return the language label the detection model assigns to ``text``.

    The text is tokenized (truncated to the tokenizer's limit), run through
    the classifier without gradient tracking, and the highest-probability
    class index is mapped to its label via ``id2lang``.

    Args:
        text: Input text to classify. The final ``.item()`` call requires a
            single prediction, so this should be one string, not a batch.

    Returns:
        The label stored in ``id2lang`` for the predicted class.
    """
    encoded = lang_detect_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = lang_detect_model(**encoded).logits
    class_probs = torch.softmax(logits, dim=1)
    best_class = class_probs.argmax(dim=1).item()
    return id2lang[best_class]
61
+
62
# Translate
def translate(text, src_code, tgt_code):
    """Translate ``text`` from ``src_code`` to ``tgt_code`` with the NLLB model.

    Args:
        text: Text to translate; it is truncated to the tokenizer's limit.
        src_code: Source language code set on the tokenizer (e.g. "eng_Latn").
        tgt_code: Target language code, used as the forced first generated token.

    Returns:
        The decoded translation, or "" when the target code is not in the
        tokenizer's vocabulary or generation fails. The failure is logged so
        it is no longer silent.
    """
    import logging  # local import keeps this block self-contained

    log = logging.getLogger(__name__)

    trans_tokenizer.src_lang = src_code
    encoded = trans_tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # A code missing from the vocabulary resolves to the unknown-token id;
    # forcing that as the first token would produce output in no particular
    # language, so treat it as a failure instead.
    target_lang_id = trans_tokenizer.convert_tokens_to_ids(tgt_code)
    if target_lang_id is None or target_lang_id == trans_tokenizer.unk_token_id:
        log.warning("Unknown target language code: %r", tgt_code)
        return ""

    try:
        generated = trans_model.generate(**encoded, forced_bos_token_id=target_lang_id)
        return trans_tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception:
        # Best effort: callers treat "" as "translation failed". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        log.exception("Translation %s -> %s failed", src_code, tgt_code)
        return ""
72
+
73
# Semantic Search
def search_semantic(query, top_k=3):
    """Return the corpus entries nearest to ``query`` in embedding space.

    Args:
        query: Text to embed and look up in the index.
        top_k: Maximum number of matches to return. Values larger than the
            number of indexed entries are clamped.

    Returns:
        A list of ``(corpus_text, distance)`` tuples in the order the index
        returns them. The index is an L2 index, so a smaller distance means
        a closer match. The list is empty when ``top_k`` is not positive or
        the index holds no vectors.
    """
    # Asking for more neighbours than exist makes the index pad its result
    # with -1, which would silently select corpus[-1] as a bogus match.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = embed_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k)
    return [
        (corpus[position], float(distance))
        for distance, position in zip(distances[0], indices[0])
        if position >= 0  # defensive: skip any padding entries
    ]
78
 
79
# Create downloadable output file
def save_output_to_file(detected_lang, translated, sem_results, bleu_score):
    """Write a plain-text report to a new temporary file and return its path.

    Args:
        detected_lang: Detected language label for the report header.
        translated: Translated text.
        sem_results: Iterable of ``(text, score)`` pairs; each is written as a
            numbered line with the score formatted to two decimals.
        bleu_score: BLEU score as text; the BLEU line is omitted when falsy.

    Returns:
        Path of the ``.txt`` file. It is created with ``delete=False``, so it
        remains on disk after this function returns.
    """
    report_parts = [
        f"Detected Language: {detected_lang}\n",
        f"Translated Text: {translated}\n\n",
        "Top Semantic Matches:\n",
    ]
    report_parts.extend(
        f"{rank}. {text} (Score: {score:.2f})\n"
        for rank, (text, score) in enumerate(sem_results, start=1)
    )
    if bleu_score:
        report_parts.append(f"\nBLEU Score: {bleu_score}")

    # Explicit UTF-8: the report holds non-ASCII text (e.g. Devanagari), which
    # a non-UTF-8 platform default encoding could fail to write.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".txt", encoding="utf-8"
    ) as f:
        f.writelines(report_parts)
    return f.name
90
+
91
# Longest accepted input, in characters.
MAX_INPUT_CHARS = 2048


def _plot_semantic_matches(sem_results):
    """Draw the match scores as a horizontal bar chart and return the PNG path."""
    labels = [str(rank) for rank in range(1, len(sem_results) + 1)]
    scores = [score for _, score in sem_results]

    # Reserve a unique file per call; a single fixed path would be overwritten
    # by concurrent requests and is not portable across platforms.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as handle:
        plot_path = handle.name

    # Explicit figure/axes objects avoid relying on pyplot's global "current figure".
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        bars = ax.barh(labels, scores, color="lightgreen")
        # NOTE(review): the scores come from search_semantic; if those are
        # distances (lower = closer), this axis label may mislead. Confirm.
        ax.set_xlabel("Similarity Score")
        ax.set_title("Top Semantic Matches")
        ax.invert_yaxis()  # rank 1 at the top
        for bar in bars:
            ax.text(bar.get_width() + 0.01, bar.get_y() + 0.1, f"{bar.get_width():.2f}", fontsize=8)
        fig.tight_layout()
        fig.savefig(plot_path)
    finally:
        plt.close(fig)  # always release the figure, even if drawing fails
    return plot_path


def full_pipeline(user_input_text, target_lang_code, human_ref=""):
    """Detect language, translate, search the corpus, plot, and build a report.

    Args:
        user_input_text: Text to process.
        target_lang_code: NLLB code of the language to translate into.
        human_ref: Optional reference translation; when non-blank, a BLEU
            score of the translation against it is computed.

    Returns:
        A 6-tuple in the order of the interface outputs:
        ``(detected_language, translated_text, matches_text, plot_path,
        bleu_score, report_path)``. On early exit the text slots carry a
        status message or "" and the image/file slots are ``None``.
    """
    # Placeholders match the output widgets: "" for textboxes, None for the
    # image and file outputs (no list or empty path is handed to them).
    if not user_input_text or not user_input_text.strip():
        return "Empty input", "", "", None, "", None

    if len(user_input_text) > MAX_INPUT_CHARS:
        return (
            " Input too long",
            f"Please enter shorter text (at most {MAX_INPUT_CHARS} characters).",
            "",
            None,
            "",
            None,
        )

    detected_lang = detect_language(user_input_text)
    # Detector labels without an NLLB mapping fall back to English as the source.
    src_nllb = xlm_to_nllb.get(detected_lang, "eng_Latn")

    translated = translate(user_input_text, src_nllb, target_lang_code)
    if not translated:
        return detected_lang, " Translation failed", "", None, "", None

    sem_results = search_semantic(translated)
    result_list = [
        f"{rank}. {txt} (Score: {score:.2f})"
        for rank, (txt, score) in enumerate(sem_results, start=1)
    ]

    plot_path = _plot_semantic_matches(sem_results)

    bleu_score = ""
    if human_ref and human_ref.strip():
        bleu = corpus_bleu([translated], [[human_ref]])
        bleu_score = f"{bleu.score:.2f}"

    download_file_path = save_output_to_file(detected_lang, translated, sem_results, bleu_score)
    return detected_lang, translated, "\n".join(result_list), plot_path, bleu_score, download_file_path
130
+
131
+
132
# Gradio Interface
# Input widgets, in the order full_pipeline receives its arguments.
input_components = [
    gr.Textbox(label="Input Text", lines=4, placeholder="Enter text to translate..."),
    gr.Dropdown(label="Target Language", choices=list(nllb_langs), value="eng_Latn"),
    gr.Textbox(
        label="(Optional) Human Reference Translation",
        lines=2,
        placeholder="Paste human translation here (for BLEU)...",
    ),
]

# Output widgets, in the order full_pipeline returns its values.
output_components = [
    gr.Textbox(label="Detected Language"),
    gr.Textbox(label="Translated Text"),
    gr.Textbox(label="Top Semantic Matches"),
    gr.Image(label="Semantic Similarity Plot"),
    gr.Textbox(label="BLEU Score"),
    gr.File(label="Download Translation Report"),  # downloadable report
]

gr.Interface(
    fn=full_pipeline,
    inputs=input_components,
    outputs=output_components,
    title=" Multilingual Translator + Semantic Search",
    description="Detects language → Translates → Finds related Sanskrit concepts → BLEU optional → Downloadable report.",
).launch()