magnetar-50m-id / tokenization_helix.py
Veenn's picture
Upload MAGNETAR-50M-ID checkpoint step 6000
5bde5a1 verified
"""
HELIX Tokenizer — HuggingFace PreTrainedTokenizer wrapper
Bahasa Indonesia · Unigram SentencePiece 32k
"""
import os
from shutil import copyfile
from typing import Dict, List, Optional, Tuple
from transformers import PreTrainedTokenizer
# Maps the tokenizer's vocab-file argument name to its on-disk file name.
VOCAB_FILES_NAMES = dict(vocab_file="tokenizer.model")

# Short names of the HELIX control tokens. A name's position in this tuple
# is the token id noted in the trailing comment.
_HELIX_TOKEN_NAMES = (
    "pad",                # 0 PAD
    "unk",                # 1 UNK
    "start",              # 2 BOS
    "end",                # 3 EOS
    "user",               # 4
    "assistant",          # 5
    "system",             # 6
    "think.open",         # 7
    "think.close",        # 8
    "tool.call.open",     # 9
    "tool.call.close",    # 10
    "tool.result.open",   # 11
    "tool.result.close",  # 12
    "tool.name",          # 13
    "tool.args",          # 14
    "doc.open",           # 15
    "doc.close",          # 16
    "ctx.open",           # 17
    "ctx.close",          # 18
    "sep",                # 19
    "retrieved.open",     # 20
    "retrieved.close",    # 21
    "source",             # 22
    "code.open",          # 23
    "code.close",         # 24
    "memory.open",        # 25
    "memory.close",       # 26
    "persona",            # 27
    "image.open",         # 28
    "image.close",        # 29
    "audio.open",         # 30
    "audio.close",        # 31
    "cite.open",          # 32
    "cite.close",         # 33
    "reserved.0",         # 34
    "reserved.1",         # 35
    "reserved.2",         # 36
    "reserved.3",         # 37
    "reserved.4",         # 38
    "reserved.5",         # 39
    "reserved.6",         # 40
    "reserved.7",         # 41
    "reserved.8",         # 42
    "reserved.9",         # 43
)

# Full token strings: each short name wrapped in the fixed HELIX frame.
SPECIAL_TOKENS = [rf"\HELIX→{name}←HELIX/" for name in _HELIX_TOKEN_NAMES]
class HelixTokenizer(PreTrainedTokenizer):
    """
    HELIX Tokenizer — SentencePiece Unigram 32k, optimized for Indonesian.

    Supports formal, informal and slang text as well as Indonesian-English
    code-switching.

    The ids ``0 .. len(SPECIAL_TOKENS) - 1`` are always reported as the
    HELIX control-token strings from ``SPECIAL_TOKENS``; every other id/piece
    is resolved through the loaded SentencePiece model.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: str,
        bos_token=r"\HELIX→start←HELIX/",
        eos_token=r"\HELIX→end←HELIX/",
        unk_token=r"\HELIX→unk←HELIX/",
        pad_token=r"\HELIX→pad←HELIX/",
        sp_model_kwargs: Optional[dict] = None,
        **kwargs,
    ):
        """Load the SentencePiece model and register the HELIX special tokens.

        Args:
            vocab_file: Path to the SentencePiece model file.
            bos_token: Begin-of-sequence token string.
            eos_token: End-of-sequence token string.
            unk_token: Unknown-token string.
            pad_token: Padding token string.
            sp_model_kwargs: Keyword arguments forwarded to
                ``SentencePieceProcessor``; ``None`` means no extra arguments.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        self.sp_model_kwargs = sp_model_kwargs if sp_model_kwargs is not None else {}

        # Imported lazily so that importing this module does not itself
        # require sentencepiece.
        import sentencepiece as spm

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        self.vocab_file = vocab_file

        # Special token id <-> string mappings (override SentencePiece).
        # These must exist before super().__init__() runs, because the
        # token/id conversion methods below read them.
        self._special_tokens_map_by_id: Dict[int, str] = dict(enumerate(SPECIAL_TOKENS))
        self._special_tokens_map_by_str: Dict[str, int] = {
            tok: i for i, tok in enumerate(SPECIAL_TOKENS)
        }

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of pieces in the loaded SentencePiece model."""
        return self.sp_model.GetPieceSize()

    def get_vocab(self) -> Dict[str, int]:
        """Return the token -> id mapping, including tokens added at runtime."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into SentencePiece pieces."""
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token string to its id; HELIX special tokens take precedence."""
        special_id = self._special_tokens_map_by_str.get(token)
        if special_id is not None:
            return special_id
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id to its token string; HELIX special ids take precedence."""
        special_token = self._special_tokens_map_by_id.get(index)
        if special_token is not None:
            return special_token
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Decode pieces back to text.

        HELIX special tokens are always dropped from the output here.
        """
        # Filter out special tokens before decoding.
        filtered = [t for t in tokens if t not in self._special_tokens_map_by_str]
        return self.sp_model.DecodePieces(filtered)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Wrap each sequence as ``BOS + ids + EOS`` and concatenate them."""
        bos = [self.bos_token_id]
        eos = [self.eos_token_id]
        if token_ids_1 is None:
            return bos + token_ids_0 + eos
        return bos + token_ids_0 + eos + bos + token_ids_1 + eos

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return a mask with 1 at special-token positions and 0 elsewhere.

        When ``already_has_special_tokens`` is true the base-class
        implementation is used; otherwise the mask mirrors the layout
        produced by ``build_inputs_with_special_tokens``.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return segment ids: 0 for the first wrapped sequence, 1 for the second."""
        bos, eos = [self.bos_token_id], [self.eos_token_id]
        if token_ids_1 is None:
            return len(bos + token_ids_0 + eos) * [0]
        return len(bos + token_ids_0 + eos) * [0] + len(bos + token_ids_1 + eos) * [1]

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """Write the SentencePiece model file into ``save_directory``.

        Args:
            save_directory: Existing directory to write into.
            filename_prefix: Optional prefix; when given the file is named
                ``"<prefix>-tokenizer.model"``.

        Returns:
            A one-element tuple holding the path of the written file.

        Raises:
            ValueError: If ``save_directory`` is not a directory.
        """
        if not os.path.isdir(save_directory):
            raise ValueError(f"Vocabulary path ({save_directory}) should be a directory")
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
        )
        if not os.path.isfile(self.vocab_file):
            # The file this tokenizer was loaded from is gone (e.g. a cleaned
            # temporary directory): re-serialize the in-memory model instead
            # of failing inside copyfile.
            with open(out_vocab_file, "wb") as handle:
                handle.write(self.sp_model.serialized_model_proto())
        elif os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        return (out_vocab_file,)