"""NeuralQuantum NQLM tokenizer for Hugging Face Transformers."""

import json
import os
from typing import List, Optional, Tuple

from transformers import PreTrainedTokenizer


class NeuralQuantumTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer for the NeuralQuantum NQLM model."""

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        quantum_token="<|quantum|>",
        classical_token="<|classical|>",
        add_prefix_space=False,
        **kwargs
    ):
        # vocab_file, merges_file and tokenizer_file are accepted for
        # compatibility with from_pretrained(); the vocabulary itself is
        # built in memory below.
        vocab = {
            "<|endoftext|>": 0,
            "<|quantum|>": 1,
            "<|classical|>": 2,
        }

        # Character-level vocabulary: letters, digits, space, and common punctuation.
        for i, char in enumerate("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,!?;:'\"-()[]{}"):
            vocab[char] = i + 3

        # The vocab must exist before super().__init__ runs, because the base
        # class may call the token/id conversion methods for special tokens.
        self._vocab = vocab
        self._ids_to_tokens = {v: k for k, v in vocab.items()}

        # Register the quantum/classical markers as special tokens so they are
        # never split apart by the character-level tokenizer.
        kwargs.setdefault("additional_special_tokens", [quantum_token, classical_token])

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            **kwargs
        )

        self.quantum_token = quantum_token
        self.classical_token = classical_token

    @property
    def vocab_size(self):
        return len(self._vocab)

    def get_vocab(self):
        return dict(self._vocab)

    def _tokenize(self, text):
        """Split text into character-level tokens.

        The vocabulary built in __init__ maps single characters to ids, so the
        text is split into individual characters. Special tokens are separated
        out by the base class before this method is called.
        """
        return list(text)

    def _convert_token_to_id(self, token):
        """Convert a token to its vocabulary ID."""
        return self._vocab.get(token, self._vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Convert a vocabulary ID back to its token."""
        return self._ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Join character tokens back into a string."""
        return "".join(tokens)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Save the vocabulary to a JSON file and return the file path."""
        vocab_file = f"{filename_prefix}-vocab.json" if filename_prefix else "vocab.json"
        vocab_path = os.path.join(save_directory, vocab_file)

        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, indent=2, ensure_ascii=False)

        return (vocab_path,)

def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): |
|
|
"""Build input with special tokens""" |
|
|
if token_ids_1 is None: |
|
|
return token_ids_0 + [self.eos_token_id] |
|
|
return token_ids_0 + token_ids_1 + [self.eos_token_id] |
|
|
|
|
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): |
|
|
"""Get special tokens mask""" |
|
|
if already_has_special_tokens: |
|
|
return super().get_special_tokens_mask( |
|
|
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True |
|
|
) |
|
|
|
|
|
if token_ids_1 is not None: |
|
|
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] |
|
|
return [1] + ([0] * len(token_ids_0)) + [1] |
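

# A minimal usage sketch (an illustration, not part of the NQLM library API):
# it round-trips an arbitrary example string through the tokenizer to show
# character-level encoding, the appended EOS token, and decoding.
if __name__ == "__main__":
    tokenizer = NeuralQuantumTokenizer()

    encoded = tokenizer("Hello, quantum world!")
    print(encoded["input_ids"])  # character ids followed by the EOS id

    # skip_special_tokens drops the trailing <|endoftext|> added during encoding.
    print(tokenizer.decode(encoded["input_ids"], skip_special_tokens=True))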