tommytracx committed
Commit 4da61ae · verified · 1 Parent(s): 2a9218c

Add tokenization_nqlm.py

Files changed (1): tokenization_nqlm.py (+116, -0)
tokenization_nqlm.py ADDED
@@ -0,0 +1,116 @@
+"""
+NeuralQuantum NQLM Tokenizer for Hugging Face Transformers
+"""
+
+import json
+import os
+from transformers import PreTrainedTokenizer
+
+
+class NeuralQuantumTokenizer(PreTrainedTokenizer):
+    """Tokenizer for NeuralQuantum NQLM model"""
+
+    def __init__(
+        self,
+        vocab_file=None,
+        merges_file=None,
+        tokenizer_file=None,
+        unk_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        eos_token="<|endoftext|>",
+        pad_token="<|endoftext|>",
+        quantum_token="<|quantum|>",
+        classical_token="<|classical|>",
+        add_prefix_space=False,
+        **kwargs
+    ):
+        # Simple vocabulary for demonstration
+        vocab = {
+            "<|endoftext|>": 0,
+            "<|quantum|>": 1,
+            "<|classical|>": 2,
+        }
+
+        # Add basic vocabulary
+        for i, char in enumerate("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,!?;:'\"-()[]{}"):
+            vocab[char] = i + 3
+
+        # Set vocab before calling super().__init__
+        self._vocab = vocab
+        self._ids_to_tokens = {v: k for k, v in vocab.items()}
+
+        super().__init__(
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            add_prefix_space=add_prefix_space,
+            **kwargs
+        )
+
+        self.quantum_token = quantum_token
+        self.classical_token = classical_token
+
+    @property
+    def vocab_size(self):
+        return len(self._vocab)
+
+    def get_vocab(self):
+        return dict(self._vocab)
+
+    def _tokenize(self, text):
+        """Basic tokenization: split on whitespace, then fall back to characters"""
+        tokens = []
+
+        for word in text.split():
+            if word in self._vocab:
+                # Special tokens and single characters already present in the
+                # vocabulary are kept whole.
+                tokens.append(word)
+            else:
+                # Anything else is split into characters so it maps onto the
+                # character-level vocabulary instead of collapsing to unk_token.
+                tokens.extend(word)
+
+        return tokens
+
+    def _convert_token_to_id(self, token):
+        """Convert token to ID"""
+        return self._vocab.get(token, self._vocab[self.unk_token])
+
+    def _convert_id_to_token(self, index):
+        """Convert ID to token"""
+        return self._ids_to_tokens.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens):
+        """Convert tokens back to string"""
+        return " ".join(tokens)
+
+    def save_vocabulary(self, save_directory, filename_prefix=None):
+        """Save the vocabulary to a JSON file"""
+        vocab_file = f"{filename_prefix}-vocab.json" if filename_prefix else "vocab.json"
+        vocab_path = os.path.join(save_directory, vocab_file)
+
+        with open(vocab_path, 'w') as f:
+            json.dump(self._vocab, f, indent=2)
+
+        return (vocab_path,)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """Build input with special tokens"""
+        if token_ids_1 is None:
+            return token_ids_0 + [self.eos_token_id]
+        return token_ids_0 + token_ids_1 + [self.eos_token_id]
+
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+        """Get special tokens mask"""
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        # Mirror build_inputs_with_special_tokens: only a trailing eos token is added.
+        if token_ids_1 is not None:
+            return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + [1]
+        return ([0] * len(token_ids_0)) + [1]
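For context, a minimal usage sketch of the tokenizer added above. It is illustrative only and not part of the commit; it assumes the file is importable as tokenization_nqlm (i.e. it sits on the Python path) and that a recent transformers release providing PreTrainedTokenizer is installed.

from tokenization_nqlm import NeuralQuantumTokenizer

# Instantiate with the built-in demonstration vocabulary; no vocab files are required.
tokenizer = NeuralQuantumTokenizer()

text = "hello quantum world"
tokens = tokenizer.tokenize(text)   # tokens produced by _tokenize
ids = tokenizer.encode(text)        # ids plus the trailing <|endoftext|> from build_inputs_with_special_tokens

print(tokens)
print(ids)
print(tokenizer.decode(ids, skip_special_tokens=True))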