
AttributeError: 'FullyAsyncTrainer' object has no attribute 'train_dataloader' #4794

@Silentssss

Description

System Info

Traceback (most recent call last):
  File "verl/recipe/fully_async_policy/fully_async_main.py", line 307, in main
    run_ppo(config, task_runner_class=FullyAsyncTaskRunner)
  File "verl/verl/trainer/main_ppo.py", line 96, in run_ppo
    ray.get(runner.run.remote(config))
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 2967, in get
    values, debugger_breakpoint = worker.get_objects(
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 1015, in get_objects
    raise value.as_instanceof_cause()
  File "verl/recipe/fully_async_policy/fully_async_main.py", line 139, in run
    self._run_training_loop()
  File "verl/recipe/fully_async_policy/fully_async_main.py", line 283, in _run_training_loop
    raise e
  File "verl/recipe/fully_async_policy/fully_async_main.py", line 277, in _run_training_loop
    ray.get(future)
  File "verl/recipe/fully_async_policy/fully_async_trainer.py", line 309, in fit
    self._check_save_checkpoint(True, timing_raw)  # TODO: check checkpoint
  File "verl/recipe/fully_async_policy/ray_trainer.py", line 526, in _check_save_checkpoint
    self._save_checkpoint()
  File "verl/verl/trainer/ppo/ray_trainer.py", line 833, in _save_checkpoint
    dataloader_state_dict = self.train_dataloader.state_dict()

AttributeError: 'FullyAsyncTrainer' object has no attribute 'train_dataloader'. Did you mean: '_create_dataloader'?. Did you mean: '_return_value'?

The FullyAsyncTrainer class never runs RayPPOTrainer's __init__, so the train_dataloader attribute is missing, while the inherited _save_checkpoint still expects it.
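
The mechanism can be reproduced with a minimal, self-contained sketch (the class names below are illustrative only, not verl's actual code): a subclass whose __init__ does not call the parent's __init__ never creates the attributes that inherited methods rely on, so calling such a method raises AttributeError.

class BaseTrainer:
    def __init__(self):
        # Attribute created only in the parent constructor.
        self.train_dataloader = ["batch_0", "batch_1"]

    def save_checkpoint(self):
        # Inherited method assumes the parent constructor has run.
        return list(self.train_dataloader)

class AsyncTrainer(BaseTrainer):
    def __init__(self):
        # Does NOT call super().__init__(), so train_dataloader is never set,
        # mirroring FullyAsyncTrainer not running RayPPOTrainer.__init__.
        self.other_state = {}

AsyncTrainer().save_checkpoint()  # AttributeError: 'AsyncTrainer' object has no attribute 'train_dataloader'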

Information

  • The official example scripts
  • My own modified scripts

Tasks

  • An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
  • My own task or dataset (give details below)

Reproduction

data_path=/dataset
model_path=/model
rollout_mode="async"
rollout_name="vllm"
train_prompt_bsz=0
gen_prompt_bsz=1
n_resp_per_prompt=1
train_prompt_mini_bsz=32
test_freq=10
staleness_threshold=0.1
trigger_parameter_sync_step=4
partial_rollout=True

python3 -m recipe.fully_async_policy.fully_async_main \
          data.train_batch_size=${train_prompt_bsz} \
          data.gen_batch_size=${gen_prompt_bsz} \
          data.return_raw_chat=True \
          data.train_files=${data_path}/gsm8k/train.parquet \
          data.val_files=${data_path}/gsm8k/test.parquet \
          data.max_prompt_length=512 \
          data.max_response_length=512 \
          data.seed=100 \
          actor_rollout_ref.model.path=${model_path}/Qwen2.5-0.5B-Instruct \
          actor_rollout_ref.actor.optim.lr=1e-6 \
          actor_rollout_ref.actor.ppo_mini_batch_size=32 \
          actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
          actor_rollout_ref.rollout.name=vllm \
          actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
          actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
          actor_rollout_ref.rollout.free_cache_engine=False \
          actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
          actor_rollout_ref.actor.strategy=fsdp \
          critic.strategy=fsdp \
          actor_rollout_ref.hybrid_engine=False \
          actor_rollout_ref.rollout.name=${rollout_name} \
          actor_rollout_ref.rollout.mode=${rollout_mode} \
          actor_rollout_ref.rollout.calculate_log_probs=True \
          critic.optim.lr=1e-5 \
          critic.model.path=${model_path}/Qwen2.5-0.5B-Instruct \
          critic.ppo_micro_batch_size_per_gpu=4 \
          trainer.nnodes=1 \
          trainer.n_gpus_per_node=4 \
          rollout.nnodes=1 \
          rollout.n_gpus_per_node=4 \
          rollout.test_freq="${test_freq}" \
          trainer.logger=['console','tensorboard'] \
          trainer.val_before_train=False \
          trainer.project_name="h20_fully_async_policy" \
          trainer.experiment_name="test_async" \
          trainer.save_freq=100 \
          trainer.test_freq=100 \
          trainer.total_epochs=3 \
          async_training.staleness_threshold="${staleness_threshold}" \
          async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
          async_training.partial_rollout="${partial_rollout}"

When I run the command above, it fails with the traceback already shown under System Info. The likely cause is that FullyAsyncTrainer does not run RayPPOTrainer's __init__, so the train_dataloader attribute that the inherited _save_checkpoint expects is never created.
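
One possible workaround, sketched under the assumption that the fully-async recipe has no dataloader state worth checkpointing (this is not an official verl fix), is to guard the dataloader-state lookup instead of calling self.train_dataloader.state_dict() directly:

# Hypothetical helper, not part of verl: returns the dataloader state if present.
def get_dataloader_state(trainer):
    dataloader = getattr(trainer, "train_dataloader", None)
    if dataloader is None or not hasattr(dataloader, "state_dict"):
        return None  # nothing to checkpoint; avoids the AttributeError above
    return dataloader.state_dict()

Inside _save_checkpoint this would replace the direct self.train_dataloader.state_dict() call; alternatively, FullyAsyncTrainer could build its own dataloader before fit() reaches the checkpoint step (the error message itself hints at an existing _create_dataloader method).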

Expected behavior

The checkpoint save should complete without raising an AttributeError.
