-
Notifications
You must be signed in to change notification settings - Fork 3.1k
Description
System Info
Traceback (most recent call last):
File "verl/recipe/fully_async_policy/fully_async_main.py", line 307, in main
run_ppo(config, task_runner_class=FullyAsyncTaskRunner)
File "verl/verl/trainer/main_ppo.py", line 96, in run_ppo
ray.get(runner.run.remote(config))
File "/usr/local/lib/python3.12/dist-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.12/dist-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 2967, in get
values, debugger_breakpoint = worker.get_objects(
File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 1015, in get_objects
raise value.as_instanceof_cause()
File "verl/recipe/fully_async_policy/fully_async_main.py", line 139, in run
self._run_training_loop()
File "verl/recipe/fully_async_policy/fully_async_main.py", line 283, in _run_training_loop
raise e
File "verl/recipe/fully_async_policy/fully_async_main.py", line 277, in _run_training_loop
ray.get(future)
File "verl/recipe/fully_async_policy/fully_async_trainer.py", line 309, in fit
self._check_save_checkpoint(True, timing_raw) # TODO: check checkpoint
File "verl/recipe/fully_async_policy/ray_trainer.py", line 526, in _check_save_checkpoint
self._save_checkpoint()
File "verl/verl/trainer/ppo/ray_trainer.py", line 833, in _save_checkpoint
dataloader_state_dict = self.train_dataloader.state_dict()
AttributeError: 'FullyAsyncTrainer' object has no attribute 'train_dataloader'. Did you mean: '_create_dataloader'?
The FullyAsyncTrainer class does not call/inherit RayPPOTrainer's __init__, so it never creates the train_dataloader attribute that _save_checkpoint expects.
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
# Reproduction script: launch verl's fully-async PPO recipe (GSM8K, Qwen2.5-0.5B).
#
# NOTE(review): train_prompt_bsz=0 means data.train_batch_size=0, which looks
# suspicious on its own — confirm whether the checkpoint crash also reproduces
# with a nonzero train batch size.
# NOTE(review): rollout.test_freq uses ${test_freq}=10 while trainer.test_freq
# is hard-coded to 100 — presumably intentional, but worth double-checking.
data_path=/dataset
model_path=/model
rollout_mode="async"
rollout_name="vllm"
train_prompt_bsz=0
gen_prompt_bsz=1
n_resp_per_prompt=1
train_prompt_mini_bsz=32
test_freq=10
staleness_threshold=0.1
trigger_parameter_sync_step=4
partial_rollout=True
# actor_rollout_ref.rollout.name was originally passed twice (a hard-coded
# "vllm" immediately overridden by ${rollout_name}); the dead duplicate is
# removed and the variable form kept as the single source of truth.
# ppo_mini_batch_size now uses ${train_prompt_mini_bsz} (same value, 32),
# which was previously declared but never referenced.
python3 -m recipe.fully_async_policy.fully_async_main \
data.train_batch_size=${train_prompt_bsz} \
data.gen_batch_size=${gen_prompt_bsz} \
data.return_raw_chat=True \
data.train_files=${data_path}/gsm8k/train.parquet \
data.val_files=${data_path}/gsm8k/test.parquet \
data.max_prompt_length=512 \
data.max_response_length=512 \
data.seed=100 \
actor_rollout_ref.model.path=${model_path}/Qwen2.5-0.5B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.free_cache_engine=False \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
actor_rollout_ref.actor.strategy=fsdp \
critic.strategy=fsdp \
actor_rollout_ref.hybrid_engine=False \
actor_rollout_ref.rollout.name=${rollout_name} \
actor_rollout_ref.rollout.mode=${rollout_mode} \
actor_rollout_ref.rollout.calculate_log_probs=True \
critic.optim.lr=1e-5 \
critic.model.path=${model_path}/Qwen2.5-0.5B-Instruct \
critic.ppo_micro_batch_size_per_gpu=4 \
trainer.nnodes=1 \
trainer.n_gpus_per_node=4 \
rollout.nnodes=1 \
rollout.n_gpus_per_node=4 \
rollout.test_freq="${test_freq}" \
trainer.logger=['console','tensorboard'] \
trainer.val_before_train=False \
trainer.project_name="h20_fully_async_policy" \
trainer.experiment_name="test_async" \
trainer.save_freq=100 \
trainer.test_freq=100 \
trainer.total_epochs=3 \
async_training.staleness_threshold="${staleness_threshold}" \
async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
async_training.partial_rollout="${partial_rollout}"
Running the command above produces the error below. The cause appears to be that FullyAsyncTrainer does not inherit/invoke RayPPOTrainer's __init__, and therefore lacks the train_dataloader attribute.
Traceback (most recent call last):
File "verl/recipe/fully_async_policy/fully_async_main.py", line 307, in main
run_ppo(config, task_runner_class=FullyAsyncTaskRunner)
File "verl/verl/trainer/main_ppo.py", line 96, in run_ppo
ray.get(runner.run.remote(config))
File "/usr/local/lib/python3.12/dist-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.12/dist-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 2967, in get
values, debugger_breakpoint = worker.get_objects(
File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 1015, in get_objects
raise value.as_instanceof_cause()
File "verl/recipe/fully_async_policy/fully_async_main.py", line 139, in run
self._run_training_loop()
File "verl/recipe/fully_async_policy/fully_async_main.py", line 283, in _run_training_loop
raise e
File "verl/recipe/fully_async_policy/fully_async_main.py", line 277, in _run_training_loop
ray.get(future)
File "verl/recipe/fully_async_policy/fully_async_trainer.py", line 309, in fit
self._check_save_checkpoint(True, timing_raw) # TODO: check checkpoint
File "verl/recipe/fully_async_policy/ray_trainer.py", line 526, in _check_save_checkpoint
self._save_checkpoint()
File "verl/verl/trainer/ppo/ray_trainer.py", line 833, in _save_checkpoint
dataloader_state_dict = self.train_dataloader.state_dict()
AttributeError: 'FullyAsyncTrainer' object has no attribute 'train_dataloader'. Did you mean: '_create_dataloader'?
Expected behavior
no error