fisherman611 committed on
Commit d8c1fb7 · verified · 1 Parent(s): f18959d

Update models/rule_based_mt.py

Files changed (1)
  1. models/rule_based_mt.py +471 -470
models/rule_based_mt.py CHANGED
@@ -1,470 +1,471 @@
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.parse import ChartParser, ViterbiParser
from nltk.grammar import CFG, PCFG, Nonterminal, ProbabilisticProduction
from nltk.tree import Tree
import contractions
import string
from collections import defaultdict
import spacy

+spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

import json

with open("data/en_vi_dictionary.json", "r", encoding='utf-8') as json_file:
    dictionary = json.load(json_file)

with open('grammar.txt', 'r', encoding='utf-8') as text_file:
    grammar = text_file.read()


class TransferBasedMT:

    def __init__(self) -> None:
        # English - Vietnamese dictionary
        self.dictionary = dictionary

        # Define the CFG grammar for English sentence structure
        self.grammar = grammar


    ################################################ STAGE 1: PREPROCESSING SOURCE SENTENCE ###################################################

    def preprocessing(self, sentence: str) -> str:
        """Preprocess the input sentence: handle named entities, lowercase, expand contractions, and tokenize and regroup."""
        # Handle named entities, e.g. New York -> New_York
        doc = nlp(sentence)
        entities = {ent.text: ent.label_ for ent in doc.ents}
        for ent_text in sorted(entities.keys(), key=len,reverse=True):
            ent_joined = ent_text.replace(" ", "_")
            sentence = sentence.replace(ent_text, ent_joined)

        # Lowercase and strip redundant space
        sentence = sentence.lower().strip()

        # Expand contractions, e.g. don't -> do not
        sentence = contractions.fix(sentence) #type: ignore

        # Tokenize and regroup tokens
        sentence = " ".join(word_tokenize(sentence))

        return sentence


    def safe_tag(self, tag):
        """Convert tags with special characters to safe nonterminal symbols."""
        return tag.replace("$", "S")


    ################################################ STAGE 2: ANALYZE SOURCE SENTENCE #########################################################

    def analyze_source(self, sentence: str):
        """Analyze the source sentence: tokenize, POS tag, and parse into a syntax tree."""
        doc = nlp(sentence)
        filtered_pos_tagged = []
        punctuation_marks = []

        for i, token in enumerate(doc):
            word = token.text
            tag = token.tag_
            if all(char in string.punctuation for char in word):
                punctuation_marks.append((i, word, tag))
            else:
                filtered_pos_tagged.append((token.lemma_.lower(), tag))

        grammar_str = self.grammar

        # Add terminal rule grammars
        for word, tag in filtered_pos_tagged:
            safe_tag = self.safe_tag(tag)
            escaped_word = word.replace('"', '\\"')
            grammar_str += f'\n{safe_tag} -> "{escaped_word}"'

        try:
            grammar = CFG.fromstring(grammar_str)
            parser = ChartParser(grammar)
            tagged_tokens_only = [word for word, _ in filtered_pos_tagged]

            parses = list(parser.parse(tagged_tokens_only)) # Generate parse trees

            tree = (parses[0] if parses else self._create_fallback_tree(filtered_pos_tagged)) # Use first parse or fallback
            tree = self._add_punctuation_to_tree(tree, punctuation_marks) # Reattach punctuation

            return tree

        except Exception as e:
            print(f"Grammar creation error: {e}")
            return self._create_fallback_tree(filtered_pos_tagged) # Fallback on error


    def _create_fallback_tree(self, pos_tagged):
        """Create a simple fallback tree when parsing fails."""
        children = [Tree(self.safe_tag(tag), [word]) for word, tag in pos_tagged] # Create leaf nodes for each token
        return Tree("S", children) # Wrap in a sentence node


    def _add_punctuation_to_tree(self, tree, punctuation_marks):
        """Add punctuation marks back to the syntax tree."""
        if not punctuation_marks:
            return tree
        if tree.label() == "S": # Only add to root sentence node
            for _, word, tag in sorted(punctuation_marks):
                tree.append(Tree(self.safe_tag(tag), [word]))
        return tree


    #################################################### STAGE 3: TRANSFER GRAMMAR ############################################################

    def transfer_grammar(self, tree):
        """Transfer the English parse tree to Vietnamese structure."""
        if not isinstance(tree, nltk.Tree):
            return tree

        # Sentence level: recurse through children
        if tree.label() == "S":
            return Tree("S", [self.transfer_grammar(child) for child in tree])

        # Verb Phrase: adjust word order
        elif tree.label() == "VP":
            children = [self.transfer_grammar(child) for child in tree]
            child_labels = [child.label() if isinstance(child, Tree) else child for child in children]

            if (len(children) >= 3 and "V" in child_labels and "To" in child_labels and "VP" in child_labels): # Remove TO from V TO VP
                return Tree("VP", [children[0], children[2]])

            return Tree("VP", children) # Default: preserve order

        # Noun Phrase: adjust word order
        elif tree.label() == "NP":
            children = [self.transfer_grammar(child) for child in tree]
            child_labels = [child.label() if isinstance(child, Tree) else child for child in children]

            if (len(children) >= 3 and 'Det' in child_labels and 'AdjP' in child_labels and 'N' in child_labels): # Reorder Det Adj N -> Det N Adj
                return Tree("NP", [children[0], children[2], children[1]])

            elif (len(children) >= 2 and 'PRPS' in child_labels and 'N' in child_labels): # Reorder PRPS N -> N PRPS
                return Tree("NP", [children[1], children[0]])

            elif (len(children) >= 2 and 'Det' in child_labels and 'N' in child_labels): # Remove Det from Det N
                return Tree("NP", [children[1]])

            return Tree("NP", children) # Default: preserve order

        # Prepositional Phrase: adjust word order
        elif tree.label() == "PP":
            children = [self.transfer_grammar(child) for child in tree]
            return Tree("PP", children) # Default: preserve order

        # Adverbial Phrase: adjust word order
        elif tree.label() == 'AdvP':
            children = [self.transfer_grammar(child) for child in tree]
            return Tree("AdvP", children) # Default: preserve order

        # Adjective Phrase: adjust word order
        elif tree.label() == 'AdjP':
            children = [self.transfer_grammar(child) for child in tree]
            return Tree("AdjP", children) # Default: preserve order

        # Wh-Question: adjust word order
        elif tree.label() == "WhQ":
            children = [self.transfer_grammar(child) for child in tree]
            child_labels = [child.label() if isinstance(child, Tree) else child for child in children]

            if len(children) >= 4 and "WH_Word" in child_labels and "AUX" in child_labels and "NP" in child_labels and "VP" in child_labels:
                return Tree("WhQ", [children[2], children[3], children[0]]) # Remove AUX from WH_Word AUX NP VP

            elif len(children) >= 3 and "WH_Word" in child_labels and "NP" in child_labels and "VP" in child_labels and "AUX" not in child_labels:
                return Tree("WhQ", [children[1], children[2], children[0]])

            elif len(children) >= 2 and "WH_Word" in child_labels and "VP" in child_labels:
                if len(children[1]) >= 2:
                    return Tree("WhQ", [children[1][1], children[1][0], children[0]]) # WH_Word VP -> WH_Word V NP

            else:
                return Tree("WhQ", children) # Default: preserve order

        # Yes/No-Question: adjust word order
        elif tree.label() == "YNQ":
            children = [self.transfer_grammar(child) for child in tree]
            child_labels = [child.label() if isinstance(child, Tree) else child for child in children]

            if len(children) >= 3 and "AUX" in child_labels and "NP" in child_labels and "VP" in child_labels:
                return Tree("YNQ", [children[1], children[2]])

            elif len(children) >= 3 and "DO" in child_labels and "NP" in child_labels and "VP" in child_labels:
                return Tree("YNQ", [children[1], children[2]])

            elif len(children) >= 3 and "MD" in child_labels and "NP" in child_labels and "VP" in child_labels:
                return Tree("YNQ", [children[1], children[2]])

            return Tree("YNQ", children)


        # Other labels: recurse through children
        else:
            return Tree(tree.label(), [self.transfer_grammar(child) for child in tree])


    #################################################### STAGE 4: GENERATION STAGE ############################################################

    def generate(self, tree):
        """Generate Vietnamese output from the transformed tree."""
        if not isinstance(tree, nltk.Tree):
            return self._lexical_transfer(tree) # Translate leaf nodes

        words = [self.generate(child) for child in tree if self.generate(child)] # Recurse

        # Handle questions specifically
        if tree.label() == "WhQ":
            words = self._process_wh_question(tree, words)
        elif tree.label() == "YNQ":
            words = self._process_yn_question(tree, words)
        elif tree.label() == "NP": # Add classifiers for nouns
            words = self._add_classifiers(tree, words)
        elif tree.label() == "VP": # Apply tense/aspect/mood markers
            words = self._apply_tam_mapping(tree, words)

        words = self._apply_agreement(tree, words) # Handle agreement (e.g., plurals)
        result = " ".join(words) # Join words into a string

        return result


    def _process_wh_question(self, tree, words):
        """Process a Wh-question structure for Vietnamese."""
        words = [w for w in words if w]

        wh_word = None
        for word in words:
            if word in ["cái gì", "ai", "ở đâu", "khi nào", "tại sao", "như thế nào", "cái nào", "của ai"]:
                wh_word = word
                break

        if wh_word == "tại sao":
            if words and words[0] != "tại sao":
                words.remove("tại sao")
                words.insert(0, "tại sao")
        elif wh_word == "như thế nào":
            if "vậy" not in words:
                words.append("vậy")

        question_particles = ["vậy", "thế", "à", "hả"]
        has_particle = any(particle in words for particle in question_particles)

        if not has_particle and wh_word != "tại sao":
            words.append("vậy")

        return words


    def _process_yn_question(self, tree, words):
        """Process a Yes/No question structure for Vietnamese."""

        words = [w for w in words if w not in ["", "do_vn", "does_vn", "did_vn"]]

        has_question_particle = any(w in ["không", "à", "hả", "nhỉ", "chứ"] or
                                    w in ["không_vn", "à_vn", "hả_vn", "nhỉ_vn", "chứ_vn"]
                                    for w in words)

        if not has_question_particle:
            if "đã" in words or "đã_vn" in words:
                words.append("phải không")
            else:
                words.append("không")

        return words


    def _lexical_transfer(self, word):
        """Translate English words to Vietnamese using the dictionary."""
        if word in self.dictionary:
            return self.dictionary[word] # Return translation if in dictionary
        return f"{word}_vn" # Mark untranslated words with _vn suffix


    def _add_classifiers(self, np_tree, words):
        """Add Vietnamese classifiers based on nouns."""
        # noun_indices = [
        #     i for i, child in enumerate(np_tree) if isinstance(child, Tree)
        #     and child.label() in ["N", "NN", "NNS", "NNP", "NNPS"]
        # ] # Find noun positions
        # for i in noun_indices:
        #     if len(words) > i and not any(words[i].startswith(prefix) for prefix in ["một_vn", "những_vn", "các_vn"]): # Check if classifier is needed
        #         if words[i].endswith("_vn"): # Add default classifier for untranslated nouns
        #             words.insert(i, "cái_vn")
        return words


    def _apply_tam_mapping(self, vp_tree, words):
        """Apply Vietnamese TAM (Tense, Aspect, Mood) markers to the word list.

        Args:
            vp_tree: A parse tree node representing the verb phrase.
            words: List of words to be modified with TAM markers.

        Returns:
            List of words with appropriate Vietnamese TAM markers inserted.
        """
        verb_tense = None
        mood = None

        # Identify verb tense and mood from the verb phrase tree
        for child in vp_tree:
            if isinstance(child, Tree):
                if child.label() in ["V", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
                    verb_tense = child.label()
                if child.label() == "MD": # Modal verbs indicating mood
                    mood = "indicative"
                elif child.label() == "TO": # Infinitive marker, often subjunctive
                    mood = "subjunctive"

        if not verb_tense:
            print("Warning: No verb tense identified in the verb phrase tree.")
            return words

        # Apply TAM markers based on verb tense
        if verb_tense == "VBD":
            words.insert(0, "đã_vn")
        elif verb_tense == "VB":
            if "will_vn" in words:
                words = [w for w in words if w != "will_vn"]
                words.insert(0, "sẽ_vn")
            elif "going_to_vn" in words:
                words = [w for w in words if w != "going_to_vn"]
                words.insert(0, "sẽ_vn")
        elif verb_tense == "VBG":
            words.insert(0, "đang_vn")
            if "đã_vn" in words:
                words.insert(0, "đã_vn")
        elif verb_tense == "VBN":
            words.insert(0, "đã_vn")
        elif verb_tense == "VBP" or verb_tense == "VBZ":
            pass

        # Handle future continuous (e.g., "will be running" -> "sẽ đang")
        if verb_tense == "VBG" and "will_vn" in words:
            words = [w for w in words if w != "will_vn"]
            words.insert(0, "đang_vn") # Continuous marker
            words.insert(0, "sẽ_vn") # Future marker

        # Apply mood markers if applicable
        if mood == "subjunctive":
            words.insert(0, "nếu_vn") # Subjunctive marker (e.g., "if" clause)
        elif mood == "indicative" and "must_vn" in words:
            words = [w for w in words if w != "must_vn"]
            words.insert(0, "phải_vn") # Necessity marker

        return words


    def _apply_agreement(self, tree, words):
        """Apply agreement rules for Vietnamese (e.g., pluralization)."""
        if tree.label() == "NP":
            for i, word in enumerate(words):
                if "_vn" in word and word.replace("_vn", "").endswith("s"): # Handle English plurals
                    base_word = word.replace("_vn", "")[:-1] + "_vn" # Remove 's'
                    words[i] = base_word
                    words.insert(i, "các_vn") # Add plural marker
        return words


    def _post_process_vietnamese(self, text):
        """Post-process the Vietnamese output: remove _vn, fix punctuation, capitalize."""
        text = text.replace("_vn", "") # Remove untranslated markers

        def fix_entities(word):
            if "_" in word:
                word = " ".join([w for w in word.split("_")])
                return word.title()
            return word.lower() # Lowercase non-entity words

        words = text.split()
        words = [fix_entities(word) for word in words]

        text = " ".join(words)
        for punct in [".", ",", "!", "?", ":", ";"]: # Attach punctuation directly
            text = text.replace(f" {punct}", punct)

        if text:
            words = text.split()
            words[0] = words[0].capitalize() # Capitalize first word
            text = ' '.join(words)
        return text


    def translate(self, english_sentence):
        """Main translation function that applies all stages of the process."""
        # Step 1: Preprocess input
        preprocessed = self.preprocessing(english_sentence)

        # Step 2: Parse English sentence
        source_tree = self.analyze_source(preprocessed)
        print("English parse tree:")
        source_tree.pretty_print() # Display English parse tree

        # Step 3: Transform to Vietnamese structure
        target_tree = self.transfer_grammar(source_tree)
        print("Vietnamese structure tree:")
        target_tree.pretty_print() # Display Vietnamese parse tree

        # Step 4: Generate final translation
        raw_output = self.generate(target_tree)
        vietnamese_output = self._post_process_vietnamese(raw_output)
        return vietnamese_output


if __name__ == "__main__":
    translator = TransferBasedMT()
    test_sentences = [
        "I read books.", "The student studies at school.",
        "She has a beautiful house.", "They want to buy a new car.",
        "This is a good computer.", "Are you ready to listen?",
        "I want to eat.", "This is my book.", "What is your name?",
        "Do you like books?",
        "Is she at school?",
        "Are you ready to listen?",
        "Can they buy a new car?",
        "Did he read the book yesterday?",
        "What is your name?",
        "Where do you live?",
        "Who is your teacher?",
        "When will you go to school?",
        "Why did he leave early?",
        "How do you feel today?",
        "I live in New York"
    ]

    test_sentences_2 = [
        # YNQ -> BE NP
        "Is the renowned astrophysicist still available for the conference?",
        "Are those adventurous explorers currently in the remote jungle?",
        "Was the mysterious stranger already gone by midnight?",
        # YNQ -> BE NP Adj
        "Is the vibrant annual festival exceptionally spectacular this season?",
        "Are the newly discovered species remarkably resilient to harsh climates?",
        "Were the ancient ruins surprisingly well-preserved after centuries?",
        # YNQ -> BE NP NP
        "Is she the brilliant leader of the innovative research team?",
        "Are they the enthusiastic organizers of the grand charity event?",
        "Was he the sole survivor of the perilous expedition?",
        # YNQ -> BE NP PP
        "Is the priceless artifact still hidden in the ancient underground chamber?",
        "Are the colorful tropical birds nesting high above the lush rainforest canopy?",
        "Was the historic manuscript carefully stored within the fortified library vault?"
    ]

    print("English to Vietnamese Translation Examples:")
    print("-" * 50)
    for sentence in test_sentences_2:
        print(f"English: {sentence}")
        translation = translator.translate(sentence)
        print(f"Vietnamese: {translation}")
        print()