huggingface/accelerate#1753

Description
System Info
- transformers version: 4.29.0
- Platform: Linux-3.10.0-1160.92.1.el7.x86_64-x86_64-with-glibc2.31
- Python version: 3.10.9
- Huggingface_hub version: 0.15.1
- Safetensors version: 0.3.1
- PyTorch version (GPU?): 2.0.1+cu117 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?:
- Using distributed or parallel set-up in script?:
Who can help?
No response
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
## Here is my code.
import os
import logging
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence

import torch
import transformers
from datasets import load_dataset, load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    DataCollatorForSeq2Seq,
)

# Label value that PyTorch's CrossEntropyLoss ignores when computing the loss.
IGNORE_INDEX = -100

# Alpaca-style prompt templates; the Chinese section headers mean
# "Instruction", "Input" and "Response" respectively.
PROMPT_DICT = {
    "prompt_input": (
        "### 指令:\n{instruction}\n\n### 输入:\n{input}\n\n### 回答:"
    ),
    "prompt_no_input": (
        "### 指令:\n{instruction}\n\n### 回答:"
    ),
}


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    model_name_or_path: Optional[str] = field(default=None, metadata={"help": "model name or path"})
    cache_dir: Optional[str] = field(default=None, metadata={"help": "model cache directory"})
    data_path: str = field(default=None, metadata={"help": "path to the training data"})
    mask_input: bool = field(default=True, metadata={"help": "mask the prompt and compute the loss only on the response"})
    model_max_length: int = field(default=512, metadata={"help": "maximum sequence length"})
    optim: str = field(default="adamw_torch", metadata={"help": "optimizer"})


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([torch.tensor(instance[key]) for instance in instances]
                                  for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )


def train():
    local_rank = int(os.environ["LOCAL_RANK"])
    parser = transformers.HfArgumentParser(TrainingArguments)
    training_args, = parser.parse_args_into_dataclasses()
    if local_rank == 0:
        print(training_args)

    tokenizer = AutoTokenizer.from_pretrained(
        training_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right"
    )
    model = AutoModelForCausalLM.from_pretrained(
        training_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        # torch_dtype=torch.float16
    )

    def generate_and_tokenize(sample):
        prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
        source = prompt_input.format_map(sample) if sample.get("input", "") != "" \
            else prompt_no_input.format_map(sample)
        target = f"\n{sample['output']}{tokenizer.eos_token}"
        complete = source + target
        # </s> 1 2 3 : a b </s>
        complete_tokenized = tokenizer(complete,
                                       truncation=True,
                                       max_length=training_args.model_max_length)
        # </s> 1 2 3 :
        source_tokenized = tokenizer(source,
                                     truncation=True,
                                     max_length=training_args.model_max_length)
        if training_args.mask_input:
            # Mask the prompt tokens so the loss is computed only on the response.
            source_len = len(source_tokenized['input_ids'])
            complete_tokenized['labels'] = [IGNORE_INDEX] * source_len + complete_tokenized['input_ids'][source_len:]
        else:
            complete_tokenized['labels'] = complete_tokenized['input_ids'].copy()
        return complete_tokenized

    tokenized_path = os.path.join(os.path.dirname(training_args.data_path),
                                  f"{training_args.model_name_or_path.split('/')[-1]}_tokenized")
    if not os.path.exists(tokenized_path):
        logging.warning("tokenized data does not exist, tokenizing data...")
        data = load_dataset("json", data_files=training_args.data_path)
        train_dataset = data['train'].shuffle().map(generate_and_tokenize,
                                                    batched=False,
                                                    remove_columns=["instruction", "input", "output"])
        if local_rank == 0:
            train_dataset.save_to_disk(tokenized_path)
    else:
        logging.warning("tokenized data exists, loading from disk...")
        train_dataset = load_from_disk(tokenized_path)

    # The custom collator above is left unused; DataCollatorForSeq2Seq pads
    # labels with IGNORE_INDEX in the same way.
    # data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,
                                           label_pad_token_id=IGNORE_INDEX,
                                           pad_to_multiple_of=8)

    logging.warning("training...")
    trainer = Trainer(model=model,
                      tokenizer=tokenizer,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=None,
                      data_collator=data_collator)
    trainer.train()
    trainer.save_state()
    trainer.save_model(output_dir=training_args.output_dir)
    tokenizer.save_pretrained(save_directory=training_args.output_dir)


if __name__ == '__main__':
    train()
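
(Not part of the original script.) For what it's worth, the commented-out DataCollatorForSupervisedDataset and the DataCollatorForSeq2Seq that is actually passed to the Trainer should produce equivalently padded batches. A minimal standalone check, using the gpt2 tokenizer only as a stand-in:

from transformers import AutoTokenizer, DataCollatorForSeq2Seq

IGNORE_INDEX = -100

# Placeholder tokenizer purely for illustration; the real script uses
# training_args.model_name_or_path.
tok = AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token

collator = DataCollatorForSeq2Seq(tokenizer=tok,
                                  label_pad_token_id=IGNORE_INDEX,
                                  pad_to_multiple_of=8)

batch = collator([
    {"input_ids": [1, 2, 3], "labels": [IGNORE_INDEX, IGNORE_INDEX, 3]},
    {"input_ids": [1, 2, 3, 4, 5], "labels": [IGNORE_INDEX, IGNORE_INDEX, 4, 5, 6]},
])

# input_ids are padded with tok.pad_token_id, labels with IGNORE_INDEX, and both
# sequences are padded up to a multiple of 8 along the sequence dimension.
print(batch["input_ids"].shape, batch["labels"].shape)
print(batch["labels"])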
Expected behavior
Has anyone encountered this problem? I used the same instruction fine-tuning code: it runs successfully with transformers 4.29.0, but after upgrading to 4.30.2 it fails with an OOM (out-of-memory) error. Does anyone know the reason behind this?
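
If I recall correctly, 4.30 switched the Trainer internals over to Accelerate (which is presumably why huggingface/accelerate#1753 is linked above), so the same launch command can have different memory behaviour than under 4.29. The snippet below is only a sketch of memory-reduction knobs one might toggle while bisecting; the names match the script above, but the changes themselves are assumptions, not a confirmed fix:

# Hedged sketch only: settings to try when narrowing down the 4.29 -> 4.30 OOM.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    training_args.model_name_or_path,   # same argument as in the script above
    cache_dir=training_args.cache_dir,
    torch_dtype=torch.float16,          # the original script keeps this commented out
    low_cpu_mem_usage=True,             # avoid materialising a second full-precision copy in CPU RAM
)
model.gradient_checkpointing_enable()   # trade compute for activation memory

# Inspect GPU memory right after loading, before trainer.train(), to see whether
# the regression appears at load time or during the first training step.
print(f"{torch.cuda.memory_allocated() / 2**30:.2f} GiB allocated, "
      f"{torch.cuda.memory_reserved() / 2**30:.2f} GiB reserved")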