When training with LLaMA-Factory, I enabled DeepSpeed acceleration. With the DeepSpeed parameter set to none, training runs normally, but selecting stage 2 or 3 reports a training error and the run won't start. Does DeepSpeed need any special configuration? The instructor didn't mention anything in particular in class. Please advise.
Error message:
Traceback (most recent call last):
File "/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in <module>
launch()
File "/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/LLaMA-Factory/src/llamafactory/train/tuner.py", line 110, in run_exp
_training_function(config={"args": args, "callbacks": callbacks})
File "/LLaMA-Factory/src/llamafactory/train/tuner.py", line 55, in _training_function
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 208, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 186, in _parse_train_args
return _parse_args(parser, args, allow_extra_keys=allow_extra_keys)
File "/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 80, in _parse_args
return parser.parse_dict(args, allow_extra_keys=allow_extra_keys)
File "/hy-tmp/myenv/llamafactory/lib/python3.10/site-packages/transformers/hf_argparser.py", line 393, in parse_dict
obj = dtype(**inputs)
File "<string>", line 143, in __init__
File "/LLaMA-Factory/src/llamafactory/hparams/training_args.py", line 81, in __post_init__
Seq2SeqTrainingArguments.__post_init__(self)
File "/hy-tmp/myenv/llamafactory/lib/python3.10/site-packages/transformers/training_args.py", line 1738, in __post_init__
self.device
File "/hy-tmp/myenv/llamafactory/lib/python3.10/site-packages/transformers/training_args.py", line 2268, in device
return self._setup_devices
File "/hy-tmp/myenv/llamafactory/lib/python3.10/site-packages/transformers/utils/generic.py", line 67, in __get__
cached = self.fget(obj)
File "/hy-tmp/myenv/llamafactory/lib/python3.10/site-packages/transformers/training_args.py", line 2195, in _setup_devices
self.distributed_state = PartialState(**accelerator_state_kwargs)
File "/hy-tmp/myenv/llamafactory/lib/python3.10/site-packages/accelerate/state.py", line 208, in __init__
raise ImportError(
ImportError: DeepSpeed is not available => install it using `pip3 install deepspeed` or build it from source
E0619 09:11:10.152398 3663 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 3740) of binary: /hy-tmp/myenv/llamafactory/bin/python3.10
Traceback (most recent call last):
File "/hy-tmp/myenv/llamafactory/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/hy-tmp/myenv/llamafactory/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
return f(*args, **kwargs)
File "/hy-tmp/myenv/llamafactory/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in main
run(args)
File "/hy-tmp/myenv/llamafactory/lib/python3.10/site-packages/torch/distributed/run.py", line 883, in run
elastic_launch(
File "/hy-tmp/myenv/llamafactory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 139, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/hy-tmp/myenv/llamafactory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
/LLaMA-Factory/src/llamafactory/launcher.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2025-06-19_09:11:10
host : I220270aafa02801e83
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 3741)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2025-06-19_09:11:10
host : I220270aafa02801e83
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 3740)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
Traceback (most recent call last):
File "/hy-tmp/myenv/llamafactory/bin/llamafactory-cli", line 8, in <module>
sys.exit(main())
File "/LLaMA-Factory/src/llamafactory/cli.py", line 130, in main
process = subprocess.run(
File "/hy-tmp/myenv/llamafactory/lib/python3.10/subprocess.py", line 524, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['torchrun', '--nnodes', '1', '--node_rank', '0', '--nproc_per_node', '2', '--master_addr', '127.0.0.1', '--master_port', '48357', '/LLaMA-Factory/src/llamafactory/launcher.py', 'saves/Qwen1.5-0.5B-Chat/lora/train_2025-06-19-08-49-02/training_args.yaml']' returned non-zero exit status 1.
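For reference, the traceback itself names the root cause: accelerate raises `ImportError: DeepSpeed is not available` as soon as a ZeRO stage (2 or 3) is selected, because the `deepspeed` package is not installed in the `/hy-tmp/myenv/llamafactory` virtualenv. The `none` setting works because it falls back to plain DDP and never imports DeepSpeed. Installing the package as the message suggests (`pip3 install deepspeed`, run inside that venv) is the likely fix; no extra DeepSpeed config file should be needed for the WebUI, and for CLI runs LLaMA-Factory ships ready-made ZeRO configs (if I recall correctly, under `examples/deepspeed/`, e.g. `ds_z2_config.json`). Below is a minimal sanity check, assuming the same venv; the script itself is illustrative and not part of LLaMA-Factory:

```python
# check_deepspeed.py -- run with the interpreter that launches training,
# e.g. /hy-tmp/myenv/llamafactory/bin/python3.10 check_deepspeed.py
import importlib.util

spec = importlib.util.find_spec("deepspeed")
if spec is None:
    # Exactly the condition that makes accelerate raise
    # "DeepSpeed is not available" once ZeRO stage 2/3 is requested.
    print("deepspeed is NOT installed -> pip3 install deepspeed")
else:
    import deepspeed
    print(f"deepspeed {deepspeed.__version__} found at {spec.origin}")
```

If the check passes but training still fails, verify that `llamafactory-cli` and `torchrun` are launched from the same environment (`which torchrun` should resolve to `/hy-tmp/myenv/llamafactory/bin/torchrun`).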