Skip to content

使用ChatGLM2-6B分词报错 #57

@hykyle

Description

@hykyle

(tuning) [yons@Ubuntu 17:54:44] ~/work/tuning/LLM-Tuning
$ python3 tokenize_dataset_rows.py --model_checkpoint /home/yons/work/glm/ChatGLM2-6B/THUDM/chatglm2-6b --input_file CMeiE-train.json --prompt_key q --target_key a --save_name simple_math_4op --max_seq_length 2000 --skip_overlength False
Downloading and preparing dataset generator/default to file:///home/yons/.cache/huggingface/datasets/generator/default-35c7964d6cacead3/0.0.0...
Traceback (most recent call last):
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/builder.py", line 1608, in _prepare_split_single
for key, record in generator:
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/packaged_modules/generator/generator.py", line 30, in _generate_examples
for idx, ex in enumerate(self.config.generator(**gen_kwargs)):
File "/home/yons/work/tuning/LLM-Tuning/tokenize_dataset_rows.py", line 40, in read_jsonl
tokenizer = AutoTokenizer.from_pretrained(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py", line 738, in from_pretrained
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2017, in from_pretrained
return cls._from_pretrained(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2249, in _from_pretrained
tokenizer = cls(*init_inputs, **init_kwargs)
File "/home/yons/.cache/huggingface/modules/transformers_modules/chatglm2-6b/tokenization_chatglm.py", line 69, in __init__
super().__init__(padding_side=padding_side, **kwargs)
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/transformers/tokenization_utils.py", line 367, in __init__
self._add_tokens(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/transformers/tokenization_utils.py", line 467, in _add_tokens
current_vocab = self.get_vocab().copy()
File "/home/yons/.cache/huggingface/modules/transformers_modules/chatglm2-6b/tokenization_chatglm.py", line 108, in get_vocab
vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
File "/home/yons/.cache/huggingface/modules/transformers_modules/chatglm2-6b/tokenization_chatglm.py", line 104, in vocab_size
return self.tokenizer.n_words
AttributeError: 'ChatGLMTokenizer' object has no attribute 'tokenizer'. Did you mean: 'tokenize'?

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "/home/yons/work/tuning/LLM-Tuning/tokenize_dataset_rows.py", line 58, in <module>
dataset = datasets.Dataset.from_generator(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 1058, in from_generator
).read()
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/io/generator.py", line 47, in read
self.builder.download_and_prepare(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/builder.py", line 890, in download_and_prepare
self._download_and_prepare(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/builder.py", line 1649, in _download_and_prepare
super()._download_and_prepare(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/builder.py", line 985, in _download_and_prepare
self._prepare_split(split_generator, **prepare_split_kwargs)
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/builder.py", line 1487, in _prepare_split
for job_id, done, content in self._prepare_split_single(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/builder.py", line 1644, in _prepare_split_single
raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.builder.DatasetGenerationError: An error occurred while generating the dataset

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions