File size: 20,132 Bytes
d8c1fb7
 
 
 
 
 
 
f6f1729
071631a
d8c1fb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
import os
import sys

# Make the parent directory of this file's directory importable.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

import re
import nltk
# Runs on every import of this module; fetches the NLTK resource named
# 'punkt_tab' (presumably the tokenizer data word_tokenize relies on -- confirm).
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.parse import ChartParser, ViterbiParser
from nltk.grammar import CFG, PCFG, Nonterminal, ProbabilisticProduction
from nltk.tree import Tree
import contractions
import string
from collections import defaultdict
import spacy

# Also executed at import time: request the small English spaCy model, then
# load it once into the module-level pipeline `nlp` shared by the translator.
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

import json

# English -> Vietnamese lexicon. The path is relative, so it is resolved
# against the current working directory, not against this file's location.
with open("data/en_vi_dictionary.json", "r", encoding='utf-8') as json_file:
    dictionary = json.load(json_file)

# Grammar rules as raw text; later passed to CFG.fromstring() together with
# per-sentence terminal rules. Also opened relative to the working directory.
with open('grammar.txt', 'r', encoding='utf-8') as text_file:
    grammar = text_file.read()


class TransferBasedMT:

    def __init__(self) -> None:
        """Bind the module-level translation resources to this instance."""
        # Grammar text describing English sentence structure
        self.grammar = grammar

        # English -> Vietnamese lexicon
        self.dictionary = dictionary
        

################################################ STAGE 1: PREPROCESSING SOURCE SENTENCE ###################################################

    def preprocessing(self, sentence: str) -> str:
        """Normalise raw English input before it is parsed.

        Steps, in order: join multi-word named entities with underscores
        (e.g. New York -> New_York), lowercase and strip the text, expand
        contractions (e.g. don't -> do not), then tokenize and re-join the
        tokens with single spaces.
        """
        # Unique entity surface forms in first-seen order. Longest ones are
        # rewritten first so a shorter entity cannot split a longer one.
        surface_forms = dict.fromkeys(ent.text for ent in nlp(sentence).ents)
        for surface in sorted(surface_forms, key=len, reverse=True):
            sentence = sentence.replace(surface, surface.replace(" ", "_"))

        normalised = contractions.fix(sentence.lower().strip())   #type: ignore

        return " ".join(word_tokenize(normalised))
    
    
    def safe_tag(self, tag):
        """Return ``tag`` with every "$" swapped for "S" (e.g. PRP$ -> PRPS),
        so the tag can serve as a grammar nonterminal symbol."""
        return "S".join(tag.split("$"))


################################################ STAGE 2: ANALYZE SOURCE SENTENCE #########################################################

    def analyze_source(self, sentence: str):
        """Tag the sentence with spaCy and parse it into an NLTK syntax tree.

        Punctuation-only tokens are set aside before parsing and re-attached
        afterwards. Every remaining token contributes a terminal rule
        ``TAG -> "lemma"`` that is appended to ``self.grammar`` so the chart
        parser can cover the exact words of this sentence.

        Args:
            sentence: The (preprocessed) English sentence to analyse.

        Returns:
            The first parse produced by the chart parser, or a flat fallback
            tree (see ``_create_fallback_tree``) when the grammar cannot be
            built, parsing raises, or no parse exists. Punctuation is
            re-attached on every one of these paths.
        """
        filtered_pos_tagged = []
        punctuation_marks = []

        for i, token in enumerate(nlp(sentence)):
            word, tag = token.text, token.tag_
            if all(char in string.punctuation for char in word):
                # Remember the position so marks can be restored in order
                punctuation_marks.append((i, word, tag))
            else:
                filtered_pos_tagged.append((token.lemma_.lower(), tag))

        # Add terminal rule grammars: one `TAG -> "word"` line per token
        # NOTE(review): confirm CFG.fromstring honours backslash-escaped
        # quotes inside a terminal; if not, such words end up in the fallback.
        terminal_rules = []
        for word, tag in filtered_pos_tagged:
            escaped_word = word.replace('"', '\\"')
            terminal_rules.append(f'{self.safe_tag(tag)} -> "{escaped_word}"')
        grammar_str = "\n".join([self.grammar, *terminal_rules])

        tree = None
        try:
            parser = ChartParser(CFG.fromstring(grammar_str))
            tokens = [word for word, _ in filtered_pos_tagged]
            # Only the first parse is used, so do not enumerate the rest
            tree = next(iter(parser.parse(tokens)), None)
        except Exception as e:
            # Best effort: a grammar/parsing failure must not abort translation
            print(f"Grammar creation error: {e}")

        if tree is None:
            tree = self._create_fallback_tree(filtered_pos_tagged)

        # Reattach punctuation on every path. Previously the exception
        # fallback returned a tree that had silently lost its punctuation.
        return self._add_punctuation_to_tree(tree, punctuation_marks)


    def _create_fallback_tree(self, pos_tagged):
        """Build a flat tree for when parsing fails: an "S" root holding one
        ``TAG -> word`` node per (word, tag) pair, in input order."""
        root = Tree("S", [])
        for word, tag in pos_tagged:
            root.append(Tree(self.safe_tag(tag), [word]))
        return root


    def _add_punctuation_to_tree(self, tree, punctuation_marks):
        """Add punctuation marks back to the syntax tree."""
        if not punctuation_marks:
            return tree
        if tree.label() == "S":  # Only add to root sentence node
            for _, word, tag in sorted(punctuation_marks): 
                tree.append(Tree(self.safe_tag(tag), [word]))
        return tree


#################################################### STAGE 3: TRANSFER GRAMMAR ############################################################

    def transfer_grammar(self, tree):
        """Recursively rewrite an English parse tree into Vietnamese structure.

        Children are transferred first; the node itself is then rebuilt
        according to its label:

        * VP with V, To and VP children: keep children 0 and 2 (drops "to").
        * NP with Det, AdjP and N: children 0, 2, 1 (Det N Adj).
        * NP with PRPS and N: children 1, 0 (N PRPS).
        * NP with Det and N: child 1 only (drops the determiner).
        * WhQ: the wh-word is moved to the end and an AUX is dropped.
        * YNQ with AUX/DO/MD, NP and VP: children 1 and 2 (drops the auxiliary).
        * Anything else, or no matching rule: same label, transferred children.

        NOTE(review): the rules test for the *presence* of labels but pick
        children by *position*, so they assume the child order produced by the
        grammar file -- confirm against grammar.txt.

        Args:
            tree: An ``nltk.Tree`` node, or a leaf token.

        Returns:
            A new ``Tree`` (never ``None``), or the leaf unchanged.
        """
        if not isinstance(tree, nltk.Tree):
            return tree  # Leaf token: nothing to restructure

        label = tree.label()
        children = [self.transfer_grammar(child) for child in tree]
        child_labels = [child.label() if isinstance(child, Tree) else child for child in children]

        # Verb Phrase: adjust word order
        if label == "VP":
            if len(children) >= 3 and "V" in child_labels and "To" in child_labels and "VP" in child_labels:
                return Tree("VP", [children[0], children[2]])  # Remove TO from V TO VP

        # Noun Phrase: adjust word order
        elif label == "NP":
            if len(children) >= 3 and 'Det' in child_labels and 'AdjP' in child_labels and 'N' in child_labels:
                return Tree("NP", [children[0], children[2], children[1]])  # Det Adj N -> Det N Adj
            if len(children) >= 2 and 'PRPS' in child_labels and 'N' in child_labels:
                return Tree("NP", [children[1], children[0]])  # PRPS N -> N PRPS
            if len(children) >= 2 and 'Det' in child_labels and 'N' in child_labels:
                return Tree("NP", [children[1]])  # Remove Det from Det N

        # Wh-Question: adjust word order
        elif label == "WhQ":
            has_wh = "WH_Word" in child_labels
            has_aux = "AUX" in child_labels
            has_np = "NP" in child_labels
            has_vp = "VP" in child_labels

            if len(children) >= 4 and has_wh and has_aux and has_np and has_vp:
                return Tree("WhQ", [children[2], children[3], children[0]])  # Remove AUX from WH_Word AUX NP VP
            if len(children) >= 3 and has_wh and has_np and has_vp and not has_aux:
                return Tree("WhQ", [children[1], children[2], children[0]])
            # WH_Word VP: swap the first two parts of the second child and put
            # the wh-word last. When that child has fewer than two parts this
            # rule does not apply and we fall through to the default below
            # (this case used to return None and corrupt the tree).
            if len(children) >= 2 and has_wh and has_vp and len(children[1]) >= 2:
                return Tree("WhQ", [children[1][1], children[1][0], children[0]])

        # Yes/No-Question: drop the leading auxiliary / do-support / modal
        elif label == "YNQ":
            has_leading_aux = any(aux in child_labels for aux in ("AUX", "DO", "MD"))
            if len(children) >= 3 and has_leading_aux and "NP" in child_labels and "VP" in child_labels:
                return Tree("YNQ", [children[1], children[2]])

        # S, PP, AdvP, AdjP, other labels, and unmatched rules: preserve order
        return Tree(label, children)


#################################################### STAGE 4: GENERATION STAGE ############################################################

    def generate(self, tree):
        """Generate Vietnamese output from the transformed tree.

        Leaves are translated through ``_lexical_transfer``. For an inner
        node the children are generated first, empty results are dropped,
        label-specific processing is applied (WhQ, YNQ, NP, VP), agreement is
        applied, and the words are joined with single spaces.

        Args:
            tree: An ``nltk.Tree`` node or a leaf token.

        Returns:
            The generated text for this subtree as one string.
        """
        if not isinstance(tree, nltk.Tree):
            return self._lexical_transfer(tree)  # Translate leaf nodes

        # Generate every child exactly once. The earlier comprehension called
        # generate() twice per child, which doubled the work at every tree
        # level and repeated any warnings printed further down.
        words = []
        for child in tree:
            rendered = self.generate(child)
            if rendered:
                words.append(rendered)

        label = tree.label()
        if label == "WhQ":  # Handle questions specifically
            words = self._process_wh_question(tree, words)
        elif label == "YNQ":
            words = self._process_yn_question(tree, words)
        elif label == "NP":  # Add classifiers for nouns
            words = self._add_classifiers(tree, words)
        elif label == "VP":  # Apply tense/aspect/mood markers
            words = self._apply_tam_mapping(tree, words)

        words = self._apply_agreement(tree, words)  # Handle agreement (e.g., plurals)

        return " ".join(words)


    def _process_wh_question(self, tree, words):
        """Process a Wh-question structure for Vietnamese."""
        words = [w for w in words if w]
  
        wh_word = None
        for word in words:
            if word in ["cái gì", "ai", "ở đâu", "khi nào", "tại sao", "như thế nào", "cái nào", "của ai"]:
                wh_word = word
                break
        
        if wh_word == "tại sao": 
            if words and words[0] != "tại sao":
                words.remove("tại sao")
                words.insert(0, "tại sao")
        elif wh_word == "như thế nào":
            if "vậy" not in words:
                words.append("vậy")
        
        question_particles = ["vậy", "thế", "à", "hả"]
        has_particle = any(particle in words for particle in question_particles)
        
        if not has_particle and wh_word != "tại sao": 
            words.append("vậy")
            
        return words


    def _process_yn_question(self, tree, words):
        """Process a Yes/No question structure for Vietnamese."""
        
        words = [w for w in words if w not in ["", "do_vn", "does_vn", "did_vn"]]
        
        has_question_particle = any(w in ["không", "à", "hả", "nhỉ", "chứ"] or 
                                   w in ["không_vn", "à_vn", "hả_vn", "nhỉ_vn", "chứ_vn"] 
                                   for w in words)
        
        if not has_question_particle:
            if "đã" in words or "đã_vn" in words:
                words.append("phải không")
            else:
                words.append("không")
                
        return words


    def _lexical_transfer(self, word):
        """Translate English words to Vietnamese using the dictionary."""
        if word in self.dictionary:
            return self.dictionary[word]  # Return translation if in dictionary
        return f"{word}_vn"  # Mark untranslated words with _vn suffix


    def _add_classifiers(self, np_tree, words):
        """Add Vietnamese classifiers based on nouns.

        Currently a no-op: the classifier logic below is commented out, so
        ``words`` is returned unchanged and ``np_tree`` is not inspected.
        """
        # Disabled draft, kept for reference: it would insert the default
        # classifier "cái_vn" before untranslated nouns that do not already
        # start with a quantifier marker.
        # noun_indices = [
        #     i for i, child in enumerate(np_tree) if isinstance(child, Tree)
        #     and child.label() in ["N", "NN", "NNS", "NNP", "NNPS"]
        # ]  # Find noun positions
        # for i in noun_indices:
        #     if len(words) > i and not any(words[i].startswith(prefix) for prefix in ["một_vn", "những_vn", "các_vn"]):  # Check if classifier is needed
        #         if words[i].endswith("_vn"):  # Add default classifier for untranslated nouns
        #             words.insert(i, "cái_vn")
        return words


    def _apply_tam_mapping(self, vp_tree, words):
        """Apply Vietnamese TAM (Tense, Aspect, Mood) markers to the word list.
        
        Args:
            vp_tree: A parse tree node representing the verb phrase.
            words: List of words to be modified with TAM markers.
        
        Returns:
            List of words with appropriate Vietnamese TAM markers inserted.
        """
        verb_tense = None
        mood = None

        # Identify verb tense and mood from the verb phrase tree
        for child in vp_tree:
            if isinstance(child, Tree):
                if child.label() in ["V", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
                    verb_tense = child.label()
                if child.label() == "MD":  # Modal verbs indicating mood
                    mood = "indicative"
                elif child.label() == "TO":  # Infinitive marker, often subjunctive
                    mood = "subjunctive"

        if not verb_tense:
            print("Warning: No verb tense identified in the verb phrase tree.")
            return words

        # Apply TAM markers based on verb tense
        if verb_tense == "VBD":  
            words.insert(0, "đã_vn") 
        elif verb_tense == "VB":
            if "will_vn" in words:  
                words = [w for w in words if w != "will_vn"]
                words.insert(0, "sẽ_vn")  
            elif "going_to_vn" in words:  
                words = [w for w in words if w != "going_to_vn"]
                words.insert(0, "sẽ_vn")
        elif verb_tense == "VBG":  
            words.insert(0, "đang_vn")  
            if "đã_vn" in words:
                words.insert(0, "đã_vn")  
        elif verb_tense == "VBN":  
            words.insert(0, "đã_vn")  
        elif verb_tense == "VBP" or verb_tense == "VBZ":
            pass

        # Handle future continuous (e.g., "will be running" -> "sẽ đang")
        if verb_tense == "VBG" and "will_vn" in words:
            words = [w for w in words if w != "will_vn"]
            words.insert(0, "đang_vn")  # Continuous marker
            words.insert(0, "sẽ_vn")    # Future marker

        # Apply mood markers if applicable
        if mood == "subjunctive":
            words.insert(0, "nếu_vn")  # Subjunctive marker (e.g., "if" clause)
        elif mood == "indicative" and "must_vn" in words:
            words = [w for w in words if w != "must_vn"]
            words.insert(0, "phải_vn")  # Necessity marker

        return words


    def _apply_agreement(self, tree, words):
        """Apply agreement rules for Vietnamese (e.g., pluralization)."""
        if tree.label() == "NP":
            for i, word in enumerate(words):
                if "_vn" in word and word.replace("_vn", "").endswith("s"):  # Handle English plurals
                    base_word = word.replace("_vn", "")[:-1] + "_vn"  # Remove 's'
                    words[i] = base_word
                    words.insert(i, "các_vn")  # Add plural marker
        return words


    def _post_process_vietnamese(self, text):
        """Post-process the Vietnamese output: remove _vn, fix punctuation, capitalize."""
        text = text.replace("_vn", "")  # Remove untranslated markers
        
        def fix_entities(word):
            if "_" in word:
                word = " ".join([w for w in word.split("_")])
                return word.title()
            return word.lower()  # Lowercase non-entity words

        words = text.split()
        words = [fix_entities(word) for word in words]
        
        text = " ".join(words)
        for punct in [".", ",", "!", "?", ":", ";"]:  # Attach punctuation directly
            text = text.replace(f" {punct}", punct)
        
        if text:
            words = text.split()
            words[0] = words[0].capitalize()  # Capitalize first word
            text = ' '.join(words)
        return text


    def translate(self, english_sentence):
        """Translate one English sentence by running every pipeline stage.

        Preprocess -> parse -> transfer -> generate -> post-process. The
        English parse tree and the transferred tree are printed on the way.
        """
        # Steps 1-2: normalise the input, then parse it
        source_tree = self.analyze_source(self.preprocessing(english_sentence))
        print("English parse tree:")
        source_tree.pretty_print()

        # Step 3: reshape the tree into Vietnamese structure
        target_tree = self.transfer_grammar(source_tree)
        print("Vietnamese structure tree:")
        target_tree.pretty_print()

        # Step 4: generate the words and tidy the final string
        return self._post_process_vietnamese(self.generate(target_tree))


if __name__ == "__main__":
    translator = TransferBasedMT()

    # General sample sentences (statements, yes/no and wh-questions).
    # Defined for manual experiments; the loop below does not iterate them.
    test_sentences = [
        "I read books.",
        "The student studies at school.",
        "She has a beautiful house.",
        "They want to buy a new car.",
        "This is a good computer.",
        "Are you ready to listen?",
        "I want to eat.",
        "This is my book.",
        "What is your name?",
        "Do you like books?",
        "Is she at school?",
        "Are you ready to listen?",
        "Can they buy a new car?",
        "Did he read the book yesterday?",
        "What is your name?",
        "Where do you live?",
        "Who is your teacher?",
        "When will you go to school?",
        "Why did he leave early?",
        "How do you feel today?",
        "I live in New York",
    ]

    # Yes/No questions headed by a form of "be", grouped by pattern.
    test_sentences_2 = [
        # YNQ -> BE NP
        "Is the renowned astrophysicist still available for the conference?",
        "Are those adventurous explorers currently in the remote jungle?",
        "Was the mysterious stranger already gone by midnight?",
        # YNQ -> BE NP Adj
        "Is the vibrant annual festival exceptionally spectacular this season?",
        "Are the newly discovered species remarkably resilient to harsh climates?",
        "Were the ancient ruins surprisingly well-preserved after centuries?",
        # YNQ -> BE NP NP
        "Is she the brilliant leader of the innovative research team?",
        "Are they the enthusiastic organizers of the grand charity event?",
        "Was he the sole survivor of the perilous expedition?",
        # YNQ -> BE NP PP
        "Is the priceless artifact still hidden in the ancient underground chamber?",
        "Are the colorful tropical birds nesting high above the lush rainforest canopy?",
        "Was the historic manuscript carefully stored within the fortified library vault?",
    ]

    print("English to Vietnamese Translation Examples:")
    print("-" * 50)
    for english in test_sentences_2:
        print(f"English: {english}")
        # translate() prints both trees before the result line is shown
        print(f"Vietnamese: {translator.translate(english)}")
        print()