""" HELIX Tokenizer — HuggingFace PreTrainedTokenizer wrapper Bahasa Indonesia · Unigram SentencePiece 32k """ import os from shutil import copyfile from typing import Dict, List, Optional, Tuple from transformers import PreTrainedTokenizer VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} SPECIAL_TOKENS = [ r"\HELIX→pad←HELIX/", # 0 PAD r"\HELIX→unk←HELIX/", # 1 UNK r"\HELIX→start←HELIX/", # 2 BOS r"\HELIX→end←HELIX/", # 3 EOS r"\HELIX→user←HELIX/", # 4 r"\HELIX→assistant←HELIX/", # 5 r"\HELIX→system←HELIX/", # 6 r"\HELIX→think.open←HELIX/", # 7 r"\HELIX→think.close←HELIX/", # 8 r"\HELIX→tool.call.open←HELIX/", # 9 r"\HELIX→tool.call.close←HELIX/", # 10 r"\HELIX→tool.result.open←HELIX/", # 11 r"\HELIX→tool.result.close←HELIX/",# 12 r"\HELIX→tool.name←HELIX/", # 13 r"\HELIX→tool.args←HELIX/", # 14 r"\HELIX→doc.open←HELIX/", # 15 r"\HELIX→doc.close←HELIX/", # 16 r"\HELIX→ctx.open←HELIX/", # 17 r"\HELIX→ctx.close←HELIX/", # 18 r"\HELIX→sep←HELIX/", # 19 r"\HELIX→retrieved.open←HELIX/", # 20 r"\HELIX→retrieved.close←HELIX/", # 21 r"\HELIX→source←HELIX/", # 22 r"\HELIX→code.open←HELIX/", # 23 r"\HELIX→code.close←HELIX/", # 24 r"\HELIX→memory.open←HELIX/", # 25 r"\HELIX→memory.close←HELIX/", # 26 r"\HELIX→persona←HELIX/", # 27 r"\HELIX→image.open←HELIX/", # 28 r"\HELIX→image.close←HELIX/", # 29 r"\HELIX→audio.open←HELIX/", # 30 r"\HELIX→audio.close←HELIX/", # 31 r"\HELIX→cite.open←HELIX/", # 32 r"\HELIX→cite.close←HELIX/", # 33 r"\HELIX→reserved.0←HELIX/", # 34 r"\HELIX→reserved.1←HELIX/", # 35 r"\HELIX→reserved.2←HELIX/", # 36 r"\HELIX→reserved.3←HELIX/", # 37 r"\HELIX→reserved.4←HELIX/", # 38 r"\HELIX→reserved.5←HELIX/", # 39 r"\HELIX→reserved.6←HELIX/", # 40 r"\HELIX→reserved.7←HELIX/", # 41 r"\HELIX→reserved.8←HELIX/", # 42 r"\HELIX→reserved.9←HELIX/", # 43 ] class HelixTokenizer(PreTrainedTokenizer): """ HELIX Tokenizer — SentencePiece Unigram 32k, dioptimasi untuk Bahasa Indonesia. Mendukung teks formal, informal, slang, dan code-switching Indonesia-Inggris. """ vocab_files_names = VOCAB_FILES_NAMES model_input_names = ["input_ids", "attention_mask"] def __init__( self, vocab_file, bos_token = r"\HELIX→start←HELIX/", eos_token = r"\HELIX→end←HELIX/", unk_token = r"\HELIX→unk←HELIX/", pad_token = r"\HELIX→pad←HELIX/", sp_model_kwargs = None, **kwargs, ): self.sp_model_kwargs = sp_model_kwargs if sp_model_kwargs is not None else {} import sentencepiece as spm self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) self.vocab_file = vocab_file # Special token id → string mapping (override SentencePiece) self._special_tokens_map_by_id: Dict[int, str] = { i: tok for i, tok in enumerate(SPECIAL_TOKENS) } self._special_tokens_map_by_str: Dict[str, int] = { tok: i for i, tok in enumerate(SPECIAL_TOKENS) } super().__init__( bos_token = bos_token, eos_token = eos_token, unk_token = unk_token, pad_token = pad_token, sp_model_kwargs = self.sp_model_kwargs, **kwargs, ) @property def vocab_size(self) -> int: return self.sp_model.GetPieceSize() def get_vocab(self) -> Dict[str, int]: vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def _tokenize(self, text: str) -> List[str]: return self.sp_model.EncodeAsPieces(text) def _convert_token_to_id(self, token: str) -> int: if token in self._special_tokens_map_by_str: return self._special_tokens_map_by_str[token] return self.sp_model.PieceToId(token) def _convert_id_to_token(self, index: int) -> str: if index in self._special_tokens_map_by_id: return self._special_tokens_map_by_id[index] return self.sp_model.IdToPiece(index) def convert_tokens_to_string(self, tokens: List[str]) -> str: # Filter out special tokens sebelum decode filtered = [t for t in tokens if t not in self._special_tokens_map_by_str] return self.sp_model.DecodePieces(filtered) def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: bos = [self.bos_token_id] eos = [self.eos_token_id] if token_ids_1 is None: return bos + token_ids_0 + eos return bos + token_ids_0 + eos + bos + token_ids_1 + eos def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False, ) -> List[int]: if already_has_special_tokens: return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True, ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: bos, eos = [self.bos_token_id], [self.eos_token_id] if token_ids_1 is None: return len(bos + token_ids_0 + eos) * [0] return len(bos + token_ids_0 + eos) * [0] + len(bos + token_ids_1 + eos) * [1] def save_vocabulary( self, save_directory: str, filename_prefix: Optional[str] = None ) -> Tuple[str]: if not os.path.isdir(save_directory): raise ValueError(f"Vocabulary path ({save_directory}) should be a directory") out_vocab_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"], ) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,)