diff --git a/research/internlm2/internlm2_tokenizer.py b/research/internlm2/internlm2_tokenizer.py index 181f942b0bd374beb91e8d90b9353c4582cf45a2..e9fd30536a3ef3c9d81509455c5c5e8ae4778499 100644 --- a/research/internlm2/internlm2_tokenizer.py +++ b/research/internlm2/internlm2_tokenizer.py @@ -69,6 +69,16 @@ class InternLM2Tokenizer(PreTrainedTokenizer): self.sp_model.Load(vocab_file) self._no_prefix_space_tokens = None + if kwargs.get("chat_template") is None: + kwargs["chat_template"] = "{{ bos_token }}" \ + "{% for message in messages %}" \ + "{{'<|im_start|>' + message['role'] + '\n' + " \ + "message['content'] + '<|im_end|>' + '\n'}}" \ + "{% endfor %}" \ + "{% if add_generation_prompt %}" \ + "{{ '<|im_start|>assistant\n' }}" \ + "{% endif %}" + if kwargs.get("added_tokens_decoder") is None: kwargs["added_tokens_decoder"] = {} @@ -103,7 +113,7 @@ class InternLM2Tokenizer(PreTrainedTokenizer): def no_prefix_space_tokens(self): if self._no_prefix_space_tokens is None: vocab = self.convert_ids_to_tokens(list(range(self.vocab_size))) - self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")} + self._no_prefix_space_tokens = {tok for tok in vocab if not tok.startswith("▁")} return self._no_prefix_space_tokens @property @@ -147,22 +157,17 @@ class InternLM2Tokenizer(PreTrainedTokenizer): """Converts a sequence of tokens (string) in a single string.""" current_sub_tokens = [] out_string = "" - prev_is_special = False for token in tokens: # make sure that special tokens are not decoded using sentencepiece model if token in self.all_special_tokens: - if not prev_is_special: - out_string += " " out_string += self.sp_model.decode(current_sub_tokens) + token - prev_is_special = True current_sub_tokens = [] else: current_sub_tokens.append(token) - prev_is_special = False out_string += self.sp_model.decode(current_sub_tokens) out_string = self.clean_up_tokenization(out_string) out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string) - return out_string[1:] + return out_string def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: """