import gc
import math
import os
import pickle
import random
from collections import defaultdict, Counter

import contractions
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import word_tokenize
from nltk.translate import AlignedSent
from nltk.translate.ibm1 import IBMModel1
from pyvi import ViTokenizer

BILINGUAL_DATA_PATH = "bilingual_cleaned_dataset.csv"
VIE_DATA_PATH = "vie_cleaned_dataset.csv"
VISUALIZATION_PATH = "visualizations"
BEAM_SIZE = 3
MAX_PHRASE_LENGTH = 7
LM_ORDER = 3
ALPHA = 0.7
BETA = 0.3
BATCH_SIZE = 1000
MIN_PHRASE_COUNT = 3
LIMIT_VOCAB = 100000
MODE_VISUALIZATION = False

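# The decoder combines the two component models log-linearly (see
# SMTExtended.infer below):
#
#     score(V, E) = ALPHA * log P(V | E) + BETA * log P(V)
#
# where P(V | E) comes from the phrase table and P(V) from the n-gram
# language model. The weights are fixed by hand here; tuning them on a
# held-out set would be the usual refinement, but is not done in this script.
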
class LanguageModel:
    """Memory-optimized n-gram language model."""

    def __init__(self, order=LM_ORDER, MODE_VISUALIZATION=MODE_VISUALIZATION):
        self.order = order
        self.lm = None
        self.vocab_size = 0
        self.MODE_VISUALIZATION = MODE_VISUALIZATION

    def preprocess(self, text):
        """Lowercase and word-segment Vietnamese text."""
        return ViTokenizer.tokenize(text.lower()).split()

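    # Note: ViTokenizer joins the syllables of a Vietnamese compound word with
    # underscores (e.g. "việt nam" -> "việt_nam"), so one token can span
    # several written syllables; SMT.post_process later replaces the
    # underscores with spaces for the final output.
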
    def visualize_iterations(self, word_freq, iteration, output_dir=VISUALIZATION_PATH):
        """Visualize word frequencies for a given training iteration."""
        if not self.MODE_VISUALIZATION:
            return

        if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
            output_dir = "/kaggle/working/visualizations"
        os.makedirs(output_dir, exist_ok=True)

        print(f"\nIteration {iteration} - Word Frequency (Top 5):")
        for word, count in word_freq.most_common(5):
            print(f"  {word}: {count}")

        words, counts = zip(*word_freq.most_common(10)) if word_freq else ([], [])
        if words:
            plt.figure(figsize=(8, 6))
            plt.bar(words, counts, color='purple', alpha=0.7)
            plt.title(f'Word Frequency - Iteration {iteration}')
            plt.xlabel('Words')
            plt.ylabel('Frequency')
            plt.xticks(rotation=45)
            plt.grid(True, axis='y')
            plt.savefig(os.path.join(output_dir, f'word_freq_iter_{iteration}.png'))
            plt.close()

    def get_probability(self, tokens):
        """Return the log-probability log P(V) of a Vietnamese token sequence."""
        if not tokens or not self.lm:
            return 0.0

        tokens = ['<s>'] * (self.order - 1) + tokens
        log_prob = 0.0

        for i in range(self.order - 1, len(tokens)):
            context = tokens[i - self.order + 1:i]
            word = tokens[i]
            # MLE assigns zero probability to unseen n-grams; floor it to
            # keep the sum finite.
            prob = self.lm.score(word, context) or 1e-10
            log_prob += math.log(prob)

        return log_prob

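    # For a trigram model (order=3) the method above computes, for
    # tokens = [w1, w2, w3]:
    #
    #   log P(w1 | <s> <s>) + log P(w2 | <s> w1) + log P(w3 | w1 w2)
    #
    # i.e. the usual chain-rule decomposition with start-of-sentence padding.
    # No </s> term is appended, which matches its use in SMTExtended.infer,
    # where growing prefixes are rescored as hypotheses are extended.
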
    def visualize_log_probabilities(self, sentences, max_sentences=100, output_dir=VISUALIZATION_PATH):
        """Visualize the log-probability distribution of a sample of sentences."""
        if not self.MODE_VISUALIZATION:
            return
        if not self.lm:
            print("Cannot visualize log probabilities: Language model not trained.")
            return

        if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
            output_dir = "/kaggle/working/visualizations"
        os.makedirs(output_dir, exist_ok=True)

        sample_size = min(len(sentences), max_sentences)
        sample_sentences = random.sample(sentences, sample_size) if len(sentences) > max_sentences else sentences

        log_probs = []
        for sent in sample_sentences:
            tokens = self.preprocess(sent)
            log_probs.append(self.get_probability(tokens))

        print(f"\nLog Probabilities for {len(log_probs)} sentences:")
        print(f"  Mean Log Probability: {np.mean(log_probs):.2f}")
        print(f"  Min Log Probability: {min(log_probs):.2f}")
        print(f"  Max Log Probability: {max(log_probs):.2f}")

        plt.figure(figsize=(8, 6))
        plt.hist(log_probs, bins=30, color='blue', alpha=0.7)
        plt.title('Distribution of Log Probabilities for Sentences')
        plt.xlabel('Log Probability')
        plt.ylabel('Frequency')
        plt.grid(True)
        plt.savefig(os.path.join(output_dir, 'log_probabilities.png'))
        plt.close()
        print(f"Log probabilities visualization saved to {output_dir}/log_probabilities.png")

    def train(self, vietnamese_sentences, max_sentences=200000):
        """Train the language model with memory optimizations."""
        print(f"Training Language Model on {min(len(vietnamese_sentences), max_sentences)} sentences...")

        if len(vietnamese_sentences) > max_sentences:
            print(f"Sampling {max_sentences} sentences from {len(vietnamese_sentences)} for LM training")
            vietnamese_sentences = random.sample(vietnamese_sentences, max_sentences)

        all_tokens = []
        batch_size = 10000
        word_freq = Counter()
        iteration = 0

        for i in range(0, len(vietnamese_sentences), batch_size):
            batch = vietnamese_sentences[i:i + batch_size]
            batch_tokens = [self.preprocess(sent) for sent in batch]
            all_tokens.extend(batch_tokens)

            # Only visualize the first two batches to keep output short.
            if self.MODE_VISUALIZATION and iteration < 2:
                for tokens in batch_tokens:
                    word_freq.update(tokens)
                self.visualize_iterations(word_freq, iteration + 1)
                iteration += 1

            if i % (batch_size * 5) == 0:
                gc.collect()

        vocab = set()
        for tokens in all_tokens:
            vocab.update(tokens)

        # Cap the vocabulary at the LIMIT_VOCAB most frequent words.
        if len(vocab) > LIMIT_VOCAB:
            word_freq = Counter()
            for tokens in all_tokens:
                word_freq.update(tokens)
            vocab = set(word for word, _ in word_freq.most_common(LIMIT_VOCAB))
            print(f"Limited vocabulary to {len(vocab)} most frequent words")

        self.vocab_size = len(vocab)

        # Drop out-of-vocabulary tokens, keeping only non-empty sentences.
        filtered_sentences = []
        for tokens in all_tokens:
            filtered_tokens = [token for token in tokens if token in vocab]
            if filtered_tokens:
                filtered_sentences.append(filtered_tokens)

        del all_tokens
        gc.collect()

        train_data, padded_sents = padded_everygram_pipeline(self.order, filtered_sentences)
        self.lm = MLE(self.order)
        self.lm.fit(train_data, padded_sents)

        if self.MODE_VISUALIZATION:
            self.visualize_log_probabilities(vietnamese_sentences)

        del filtered_sentences, train_data, padded_sents
        gc.collect()

        return {"vocab_size": self.vocab_size, "ngram_order": self.order}

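# A minimal usage sketch for LanguageModel (hypothetical data):
#
#   lm = LanguageModel(order=LM_ORDER)
#   lm.train(["tôi yêu việt nam", "hôm nay trời đẹp"], max_sentences=100)
#   lm.get_probability(lm.preprocess("tôi yêu việt nam"))  # log-prob, <= 0.0
#
# Because the model is an unsmoothed MLE, any unseen n-gram scores 0 and is
# floored to 1e-10 inside get_probability.
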
class TranslationModel:
    """Memory-optimized translation model."""

    def __init__(self, max_phrase_length=MAX_PHRASE_LENGTH, MODE_VISUALIZATION=MODE_VISUALIZATION):
        self.max_phrase_length = max_phrase_length
        self.phrase_table = {}
        self.word_alignments = []
        self.ibm_model = None  # held only during training; checked by visualize_alignments
        self.MODE_VISUALIZATION = MODE_VISUALIZATION

    def preprocess(self, text, lang):
        """Preprocess text for either language."""
        text = text.lower()
        if lang == 'eng':
            text = contractions.fix(text)
            return word_tokenize(text)
        elif lang == 'vie':
            return ViTokenizer.tokenize(text).split()
        else:
            return text.split()

    def load_bilingual_data_batch(self, file_path, batch_size=BATCH_SIZE):
        """Load bilingual data in batches to reduce memory usage."""
        print(f"Loading bilingual data from {file_path} in batches")

        try:
            df = pd.read_csv(file_path)
        except FileNotFoundError:
            file_path = os.path.join('datatest', BILINGUAL_DATA_PATH)
            df = pd.read_csv(file_path)
        total_rows = len(df)
        print(f"Total rows: {total_rows}")

        for start_idx in range(0, total_rows, batch_size):
            end_idx = min(start_idx + batch_size, total_rows)
            batch_df = df.iloc[start_idx:end_idx]

            aligned_sentences = []
            for _, row in batch_df.iterrows():
                eng_tokens = self.preprocess(row['en'], 'eng')
                vie_tokens = self.preprocess(row['vi'], 'vie')

                # Skip overly long sentence pairs to bound IBM Model 1 training cost.
                if len(eng_tokens) <= 50 and len(vie_tokens) <= 50:
                    aligned_sentences.append(AlignedSent(eng_tokens, vie_tokens))

            yield aligned_sentences

            del batch_df, aligned_sentences
            gc.collect()

    def visualize_alignments(self, aligned_sentences, max_sentences=2, output_dir=VISUALIZATION_PATH):
        """Visualize word alignments for a sample of sentence pairs."""
        if not self.MODE_VISUALIZATION:
            return
        if not self.ibm_model:
            print("Cannot visualize alignments: IBM Model 1 not trained.")
            return

        if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
            output_dir = "/kaggle/working/visualizations"
        os.makedirs(output_dir, exist_ok=True)

        sample_size = min(len(aligned_sentences), max_sentences)
        sample_sentences = random.sample(aligned_sentences, sample_size) if len(aligned_sentences) > max_sentences else aligned_sentences

        for idx, sent in enumerate(sample_sentences):
            src_words = sent.words
            tgt_words = sent.mots
            alignment = sent.alignment

            matrix = np.zeros((len(tgt_words), len(src_words)))
            for src_idx, tgt_idx in alignment:
                if tgt_idx is not None and src_idx < len(src_words) and tgt_idx < len(tgt_words):
                    matrix[tgt_idx, src_idx] = 1

            plt.figure(figsize=(8, 6))
            plt.imshow(matrix, cmap='Blues', interpolation='nearest')
            plt.title(f'Alignment Matrix - Sentence Pair {idx + 1}')
            plt.xlabel('English Words')
            plt.ylabel('Vietnamese Words')
            plt.xticks(range(len(src_words)), src_words, rotation=45, ha='right')
            plt.yticks(range(len(tgt_words)), tgt_words)
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, f'alignment_matrix_{idx + 1}.png'))
            plt.close()

            print(f"\nSentence Pair {idx + 1}:")
            print(f"  English: {' '.join(src_words)}")
            print(f"  Vietnamese: {' '.join(tgt_words)}")
            print(f"  Alignments: {[(src_words[src], tgt_words[tgt]) for src, tgt in alignment if tgt is not None]}")

        print(f"Alignment visualizations saved to {output_dir}/")

    def _extract_alignments_memory_efficient(self, aligned_sentences, ibm_model):
        """Memory-efficient alignment extraction."""
        alignments = []

        batch_size = 5000
        for i in range(0, len(aligned_sentences), batch_size):
            batch_alignments = []
            batch_sentences = aligned_sentences[i:i + batch_size]

            for sent_pair in batch_sentences:
                eng_tokens = sent_pair.words
                vie_tokens = sent_pair.mots

                # Greedily align each English word to the Vietnamese word
                # with the highest lexical translation probability.
                alignment = []
                for eng_i, eng_word in enumerate(eng_tokens):
                    best_prob = 0
                    best_vie_i = -1

                    for vie_i, vie_word in enumerate(vie_tokens):
                        prob = ibm_model.translation_table.get(eng_word, {}).get(vie_word, 0)
                        if prob > best_prob:
                            best_prob = prob
                            best_vie_i = vie_i

                    # Discard very weak links.
                    if best_prob > 0.01:
                        alignment.append((eng_i, best_vie_i))

                batch_alignments.append(alignment)

            alignments.extend(batch_alignments)

            if i % (batch_size * 10) == 0:
                gc.collect()

        return alignments

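    # For IBM Model 1 this per-word argmax coincides with the model's own
    # Viterbi alignment, since IBM-1 has no distortion component; the 0.01
    # threshold additionally leaves weakly supported English words unaligned
    # instead of forcing a bad link.
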
    def extract_phrases_memory_efficient(self, aligned_sentences):
        """Memory-efficient phrase extraction."""
        print("Extracting phrase pairs with memory optimization...")

        phrase_counts = defaultdict(lambda: defaultdict(int))

        batch_size = 5000
        for i in range(0, len(aligned_sentences), batch_size):
            batch_sentences = aligned_sentences[i:i + batch_size]
            batch_alignments = self.word_alignments[i:i + batch_size]

            for sent_pair, alignments in zip(batch_sentences, batch_alignments):
                if not alignments:
                    continue

                eng_tokens = sent_pair.words
                vie_tokens = sent_pair.mots
                alignment_set = set(alignments)

                # Count single-word translation pairs.
                for eng_i, vie_i in alignments:
                    if eng_i < len(eng_tokens) and vie_i < len(vie_tokens):
                        phrase_counts[eng_tokens[eng_i]][vie_tokens[vie_i]] += 1

                # Count longer phrase pairs that are consistent with the alignment.
                max_len = min(3, self.max_phrase_length)
                consistent_phrases = self._extract_consistent_phrases(
                    eng_tokens, vie_tokens, alignment_set, max_len
                )
                for eng_phrase, vie_phrase in consistent_phrases:
                    phrase_counts[eng_phrase][vie_phrase] += 1

            if i % (batch_size * 5) == 0:
                gc.collect()
                print(f"Processed {min(i + batch_size, len(aligned_sentences))} sentences...")

        # Convert counts to relative frequencies, keeping only frequent pairs
        # and at most the top 3 translations per English phrase.
        self.phrase_table = {}
        for eng_phrase, vie_phrases in phrase_counts.items():
            total_count = sum(vie_phrases.values())
            if total_count >= MIN_PHRASE_COUNT:
                sorted_phrases = sorted(vie_phrases.items(), key=lambda x: x[1], reverse=True)[:3]

                filtered_phrases = {}
                for vie_phrase, count in sorted_phrases:
                    if count >= MIN_PHRASE_COUNT:
                        filtered_phrases[vie_phrase] = count / total_count

                if filtered_phrases:
                    self.phrase_table[eng_phrase] = filtered_phrases

        print(f"Extracted {len(self.phrase_table)} phrase pairs (filtered)")

        if self.MODE_VISUALIZATION:
            self.visualize_phrase_table()

        return self.phrase_table

    def _extract_consistent_phrases(self, eng_tokens, vie_tokens, alignments, max_length):
        """Extract consistent phrase pairs up to a maximum length."""
        consistent_phrases = []
        eng_len = len(eng_tokens)

        for e_start in range(eng_len):
            for e_end in range(e_start, min(eng_len, e_start + max_length)):
                # Collect all Vietnamese positions aligned to the English span.
                vie_positions = set()
                for e_pos in range(e_start, e_end + 1):
                    for (eng_idx, vie_idx) in alignments:
                        if eng_idx == e_pos:
                            vie_positions.add(vie_idx)

                if not vie_positions:
                    continue

                v_start, v_end = min(vie_positions), max(vie_positions)

                if v_end - v_start + 1 <= max_length:
                    if self._is_consistent_phrase_pair(e_start, e_end, v_start, v_end, alignments):
                        eng_phrase = ' '.join(eng_tokens[e_start:e_end + 1])
                        vie_phrase = ' '.join(vie_tokens[v_start:v_end + 1])
                        consistent_phrases.append((eng_phrase, vie_phrase))

        return consistent_phrases

    def _is_consistent_phrase_pair(self, e_start, e_end, v_start, v_end, alignments):
        """Check that no alignment link crosses out of the candidate phrase pair."""
        for (eng_idx, vie_idx) in alignments:
            if (e_start <= eng_idx <= e_end) and not (v_start <= vie_idx <= v_end):
                return False
            if (v_start <= vie_idx <= v_end) and not (e_start <= eng_idx <= e_end):
                return False
        return True

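    # Worked example (hypothetical links {(0, 0), (1, 2)}): the pair covering
    # English span [0, 1] and Vietnamese span [0, 2] is consistent, since
    # every link that starts inside one span lands inside the other. English
    # [0, 0] with Vietnamese [0, 0] is also consistent. But English [0, 1]
    # with Vietnamese [0, 1] is not, because the link (1, 2) leaves the
    # Vietnamese side of the pair.
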
    def train_ibm_model_incremental(self, file_path="/kaggle/input/general-data/bilingual_cleaned_dataset.csv", iterations=5):
        """Train IBM Model 1 on batch-loaded data, capped at 300k sentence pairs."""
        if not os.path.exists(file_path):
            file_path = os.path.join('datatest', BILINGUAL_DATA_PATH)
        print(f"Training IBM Model 1 incrementally with {iterations} iterations...")

        all_aligned_sentences = []
        eng_vocab = set()
        vie_vocab = set()

        for batch in self.load_bilingual_data_batch(file_path):
            for sent_pair in batch:
                eng_vocab.update(sent_pair.words)
                vie_vocab.update(sent_pair.mots)
                all_aligned_sentences.append(sent_pair)

            if len(all_aligned_sentences) >= 300000:
                print(f"Limited training to {len(all_aligned_sentences)} sentences")
                break

        print(f"Training on {len(all_aligned_sentences)} aligned sentences")
        print(f"English vocab: {len(eng_vocab)}, Vietnamese vocab: {len(vie_vocab)}")

        ibm_model = IBMModel1(all_aligned_sentences, iterations)
        self.ibm_model = ibm_model

        if self.MODE_VISUALIZATION:
            self.visualize_alignments(all_aligned_sentences)

        self.word_alignments = self._extract_alignments_memory_efficient(all_aligned_sentences, ibm_model)

        # Free the dense translation table once alignments are extracted.
        self.ibm_model = None
        del ibm_model
        gc.collect()

        return all_aligned_sentences

    def visualize_phrase_table(self, max_phrases=10, output_dir=VISUALIZATION_PATH):
        """Visualize the phrase table as a heatmap (English columns, Vietnamese rows)."""
        if not self.MODE_VISUALIZATION:
            return
        if not self.phrase_table:
            print("Cannot visualize phrase table: Phrase table is empty.")
            return

        if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
            output_dir = "/kaggle/working/visualizations"
        os.makedirs(output_dir, exist_ok=True)

        eng_phrases = sorted(self.phrase_table.keys(), key=lambda x: sum(self.phrase_table[x].values()), reverse=True)[:max_phrases]
        vie_phrases = set()
        for eng in eng_phrases:
            vie_phrases.update(self.phrase_table[eng].keys())
        vie_phrases = sorted(vie_phrases)[:max_phrases]

        matrix = np.zeros((len(vie_phrases), len(eng_phrases)))
        for i, vie in enumerate(vie_phrases):
            for j, eng in enumerate(eng_phrases):
                matrix[i, j] = self.phrase_table.get(eng, {}).get(vie, 0)

        plt.figure(figsize=(12, 8))
        plt.imshow(matrix, cmap='Blues', interpolation='nearest')
        plt.title('Phrase Table Translation Probabilities')
        plt.xlabel('English Phrases')
        plt.ylabel('Vietnamese Phrases')
        plt.xticks(range(len(eng_phrases)), eng_phrases, rotation=45, ha='right')
        plt.yticks(range(len(vie_phrases)), vie_phrases)
        plt.colorbar(label='Translation Probability')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'phrase_table.png'))
        plt.close()

        print("\nSample Phrase Table Entries (Top 5 English phrases):")
        for eng in eng_phrases[:5]:
            print(f"  English: {eng}")
            for vie, prob in sorted(self.phrase_table[eng].items(), key=lambda x: x[1], reverse=True)[:3]:
                print(f"    -> Vietnamese: {vie}, Probability: {prob:.4f}")

        print(f"Phrase table visualization saved to {output_dir}/phrase_table.png")

class Decoder:
    """Memory-optimized decoder."""

    def __init__(self, phrase_table, language_model, beam_size=BEAM_SIZE):
        self.phrase_table = phrase_table
        self.lm = language_model
        self.beam_size = beam_size

    def translate(self, sentence):
        """Translate a sentence with memory optimization."""
        tokens = sentence.lower().split()
        if not tokens:
            return ""
        return self._greedy_translate(tokens)

    def _greedy_translate(self, tokens):
        """Greedy longest-match-first translation to save memory."""
        translation = []
        i = 0

        while i < len(tokens):
            # Default: copy the source word through untranslated.
            best_phrase_len = 1
            best_translation = tokens[i]

            # Prefer the longest English phrase (up to 3 words) in the table.
            for phrase_len in range(min(3, len(tokens) - i), 0, -1):
                eng_phrase = ' '.join(tokens[i:i + phrase_len])

                if eng_phrase in self.phrase_table:
                    vie_translations = self.phrase_table[eng_phrase]
                    if vie_translations:
                        best_vie_phrase = max(vie_translations.items(), key=lambda x: x[1])
                        best_translation = best_vie_phrase[0]
                        best_phrase_len = phrase_len
                        break

            translation.append(best_translation)
            i += best_phrase_len

        return ' '.join(translation)

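# A minimal sketch of the greedy decoder on hypothetical data:
#
#   table = {"hello": {"xin_chào": 0.9}, "hello world": {"chào thế_giới": 0.8}}
#   Decoder(table, language_model=None).translate("hello world")
#   # -> "chào thế_giới"   (the 2-word match wins over the 1-word one)
#
# Note that the greedy path never consults the language model; only
# SMTExtended.infer combines translation and LM scores.
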
class Hypothesis:
    """Lightweight hypothesis record (currently unused by the greedy decoder)."""

    def __init__(self, translation, coverage, score, last_phrase_end):
        self.translation = translation
        self.coverage = coverage
        self.score = score
        self.last_phrase_end = last_phrase_end

class SMT:
    """Memory-optimized SMT system."""

    def __init__(self):
        self.lm = LanguageModel(order=LM_ORDER)
        self.tm = TranslationModel(max_phrase_length=MAX_PHRASE_LENGTH)
        self.decoder = None

    def post_process(self, text):
        """Replaces underscores with spaces in the translated text."""
        return text.replace("_", " ")

    def train(self):
        bilingual_path = "/kaggle/input/general-data/bilingual_cleaned_dataset.csv"
        vie_path = "/kaggle/input/general-data/vie_cleaned_dataset.csv"

        if not os.path.exists(bilingual_path):
            bilingual_path = os.path.join("datatest", BILINGUAL_DATA_PATH)
            vie_path = os.path.join("datatest", VIE_DATA_PATH)

        print("=== Training Translation Model ===")
        aligned_sentences = self.tm.train_ibm_model_incremental(bilingual_path)
        phrase_table = self.tm.extract_phrases_memory_efficient(aligned_sentences)

        del aligned_sentences
        gc.collect()

        print("\n=== Training Language Model ===")
        vie_df = pd.read_csv(vie_path)
        vietnamese_sentences = vie_df['vi'].tolist()
        del vie_df
        gc.collect()

        lm_stats = self.lm.train(vietnamese_sentences, max_sentences=50000)
        del vietnamese_sentences
        gc.collect()

        self.decoder = Decoder(phrase_table, self.lm)

        self.save_model()

        return {
            "phrase_pairs": len(phrase_table),
            "lm_stats": lm_stats
        }

    def translate_sentence(self, sentence):
        """Translate a single sentence."""
        if self.decoder is None:
            raise ValueError("Model not trained or loaded.")
        translated_text_with_underscores = self.decoder.translate(sentence)
        return self.post_process(translated_text_with_underscores)

    def save_model(self):
        """Save the trained model."""
        if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
            model_dir = "/kaggle/working/checkpoints"
        else:
            model_dir = "checkpoints"

        os.makedirs(model_dir, exist_ok=True)

        with open(os.path.join(model_dir, "phrase_table.pkl"), 'wb') as f:
            pickle.dump(self.tm.phrase_table, f, protocol=pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(model_dir, "lm_object.pkl"), 'wb') as f:
            pickle.dump(self.lm, f, protocol=pickle.HIGHEST_PROTOCOL)

        print(f"Model saved to {model_dir}")

    def load_model(self, model_dir='checkpoints'):
        """Load a pre-trained model."""
        with open(os.path.join(model_dir, "phrase_table.pkl"), 'rb') as f:
            phrase_table = pickle.load(f)
        with open(os.path.join(model_dir, "lm_object.pkl"), 'rb') as f:
            self.lm = pickle.load(f)

        self.decoder = Decoder(phrase_table, self.lm, BEAM_SIZE)
        self.tm.phrase_table = phrase_table

        print(f"Model loaded from {model_dir}")

    def evaluate(self, test_file='/kaggle/input/general-data/test_cleaned_dataset.csv', sample_size=5):
        """Evaluate the model on a random sample of the test set."""
        try:
            df = pd.read_csv(test_file)
        except FileNotFoundError:
            test_file = 'datatest/test_cleaned_dataset.csv'
            df = pd.read_csv(test_file)
        sample_size = min(sample_size, len(df))
        sample_indices = random.sample(range(len(df)), sample_size)

        results = []
        for idx in sample_indices:
            try:
                source = df.iloc[idx]['en']
                reference = df.iloc[idx]['vi']
                translation = self.translate_sentence(source)

                results.append({
                    "source": source,
                    "reference": reference,
                    "translation": translation
                })
            except Exception as e:
                print(f"Error translating sentence {idx}: {e}")
                results.append({
                    "source": df.iloc[idx]['en'],
                    "reference": df.iloc[idx]['vi'],
                    "translation": "Translation failed"
                })

        return results

    def save_predictions_batch(self, test_file="/kaggle/input/general-data/test_cleaned_dataset.csv", output_file="/kaggle/working/predicted.csv", batch_size=1000):
        """Save predictions in batches to avoid memory issues."""
        if not os.path.exists(test_file):
            test_file = "datatest/test_cleaned_dataset.csv"
            output_file = "datatest/predicted1.csv"
        print(f"Output file will be saved to: {output_file}")

        total_rows = len(pd.read_csv(test_file))
        print(f"Processing {total_rows} sentences in batches of {batch_size}")

        first_batch = True
        for start_idx in tqdm(range(0, total_rows, batch_size), desc="Processing batches"):
            # Re-read only this batch; skiprows keeps the header row intact.
            batch_df = pd.read_csv(test_file, skiprows=range(1, start_idx + 1), nrows=batch_size)

            batch_predictions = []
            for _, row in batch_df.iterrows():
                try:
                    source = row['en']
                    reference = row['vi']
                    translation = self.translate_sentence(source)

                    batch_predictions.append({
                        "en": source,
                        "vi": reference,
                        "pre": translation
                    })
                except Exception:
                    batch_predictions.append({
                        "en": row['en'],
                        "vi": row['vi'],
                        "pre": "Translation failed"
                    })

            batch_pred_df = pd.DataFrame(batch_predictions)

            if first_batch:
                batch_pred_df.to_csv(output_file, index=False)
                first_batch = False
            else:
                batch_pred_df.to_csv(output_file, mode='a', header=False, index=False)

            del batch_df, batch_predictions, batch_pred_df
            gc.collect()

        print(f"Predictions saved to {output_file}")
        return output_file

def main():
    print("Starting Memory-Optimized SMT System...")
    smt = SMT()
    model_dir = "checkpoints"
    if os.path.exists(model_dir) and os.path.isfile(os.path.join(model_dir, "phrase_table.pkl")):
        print("Loading existing model...")
        smt.load_model()
    else:
        print("Training new model...")
        stats = smt.train()
        print(f"Training complete: {stats}")

    print("\nEvaluating model...")
    results = smt.evaluate(sample_size=1)
    print("\nExample translations:")
    for i, result in enumerate(results):
        print(f"\nExample {i + 1}:")
        print(f"English: {result['source']}")
        print(f"Reference: {result['reference']}")
        print(f"Translation: {result['translation']}")

    print("\nSaving predictions in batches...")
    output_file = smt.save_predictions_batch(batch_size=500)
    print(f"All predictions saved to: {output_file}")

    gc.collect()
    print("Processing complete!")

class SMTExtended(SMT):
    def infer(self, sentence):
        """Translate a single arbitrary English sentence into Vietnamese using beam search."""
        if self.decoder is None:
            raise ValueError("Model not trained or loaded.")

        tokens = self.tm.preprocess(sentence, 'eng')
        if not tokens:
            return ""

        # Each hypothesis: (score, translated tokens, next source position,
        # set of covered source positions).
        beam = [(0.0, [], 0, set())]
        best_score = float('-inf')
        best_translation = []

        while beam:
            new_beam = []
            for score, trans_tokens, last_pos, covered in beam:
                # Completed hypothesis: record it and drop it from the beam.
                if len(covered) == len(tokens):
                    if score > best_score:
                        best_score = score
                        best_translation = trans_tokens
                    continue

                # Advance to the first uncovered source position.
                next_pos = last_pos
                while next_pos in covered and next_pos < len(tokens):
                    next_pos += 1

                if next_pos >= len(tokens):
                    if score > best_score:
                        best_score = score
                        best_translation = trans_tokens
                    continue

                # Expand with every phrase starting at next_pos.
                for phrase_len in range(1, min(self.tm.max_phrase_length + 1, len(tokens) - next_pos + 1)):
                    eng_phrase = ' '.join(tokens[next_pos:next_pos + phrase_len])

                    vie_translations = self.tm.phrase_table.get(eng_phrase, {})
                    if not vie_translations and phrase_len == 1:
                        # Out-of-vocabulary word: copy it through unchanged.
                        vie_translations = {tokens[next_pos]: 1.0}

                    for vie_phrase, trans_prob in vie_translations.items():
                        vie_tokens = vie_phrase.split()

                        # Log-linear combination of translation and LM scores;
                        # note the LM term rescores the entire prefix at each expansion.
                        log_trans_prob = math.log(trans_prob) if trans_prob > 0 else math.log(1e-10)
                        lm_score = self.lm.get_probability(trans_tokens + vie_tokens)
                        new_score = ALPHA * log_trans_prob + BETA * lm_score

                        new_covered = covered | set(range(next_pos, next_pos + phrase_len))
                        new_beam.append((score + new_score, trans_tokens + vie_tokens, next_pos + phrase_len, new_covered))

            # Keep only the top-scoring hypotheses.
            new_beam.sort(key=lambda x: x[0], reverse=True)
            beam = new_beam[:self.decoder.beam_size]

        return ' '.join(best_translation) if best_translation else "Translation failed"

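# A minimal usage sketch (assumes a saved checkpoint exists under "checkpoints/"):
#
#   smt = SMTExtended()
#   smt.load_model()
#   print(smt.post_process(smt.infer("i love you")))
#
# infer() returns pyvi-segmented tokens (with underscores), so the same
# post_process step used by translate_sentence applies here too.
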
if __name__ == "__main__":
    main()