Skip to content

Using FP16 #195

@JunJ-H-hub

Description

@JunJ-H-hub

My hardware environment is a Tesla V100, which does not support BF16 acceleration. Currently, it seems that torch.bfloat16 or related configurations are hardcoded in the code, causing it to fail to run. I have already made some modifications (listed below). What else do I need to change in the code?
Modifications I made: I have attempted to force FP16 by doing the following:

  1. Running with command: accelerate launch --multi_gpu --mixed_precision fp16 ...
  2. Code Modification in models/TimeLLM.py: Changed the input casting in forecast method:
    Before
    enc_out, n_vars = self.patch_embedding(x_enc.to(torch.bfloat16))
    After
    enc_out, n_vars = self.patch_embedding(x_enc.to(torch.float16))
  3. DeepSpeed Config (ds_config_zero2.json): Enabled fp16 and disabled bf16:
    {
    "fp16": {
    "enabled": true,
    "auto_cast": true,
    "loss_scale": 0,
    "initial_scale_power": 16,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
    },
    "bf16": {
    "enabled": false
    },
    "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 2e8,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 2e8,
    "contiguous_gradients": true,
    "sub_group_size": 1e9
    },
    "gradient_accumulation_steps": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "steps_per_print": 10,
    "wall_clock_breakdown": false
    }
    The current error is as follows:
    Traceback (most recent call last):
    File "/root/Time-LLM/run_main.py", line 211, in <module>
    outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
    ret_val = func(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1852, in forward
    loss = self.module(*inputs, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
    ^^^^^Traceback (most recent call last):
    ^^^^^^^ File "/root/Time-LLM/run_main.py", line 211, in <module>
    ^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
    return forward_call(*args, **kwargs)
    ^ ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ^^^ File "/root/Time-LLM/models/TimeLLM.py", line 238, in forward
    ^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^return self._call_impl(*args, **kwargs)^
    ^^
    File "/root/Time-LLM/models/TimeLLM.py", line 282, in forecast
    ^^^^^^^^^^^ ^enc_out, n_vars = self.patch_embedding(x_enc.to(torch.bfloat16))^
    ^^^^^^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^ ^return self._call_impl(*args, **kwargs)^
    ^^^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^^^^^^^^^^
    ^^^^^^ File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
    ^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    ret_val = func(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^
    return forward_call(*args, **kwargs)
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1852, in forward
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/Time-LLM/layers/Embed.py", line 184, in forward
    x = self.value_embedding(x)
    ^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    loss = self.module(*inputs, **kwargs)
    ^^^^^ ^return self._call_impl(*args, **kwargs)^
    ^^^^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^^^^^^^^^^^
    ^^^ File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    ^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
    ^ ^return self._call_impl(*args, **kwargs)^^
    ^^^^^^^^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^^^
    ^^^ File "/root/Time-LLM/layers/Embed.py", line 42, in forward
    ^^^^^^^^^^^^^ ^x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2)^
    ^^^^^^ ^ ^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^ ^return self._call_impl(*args, **kwargs)^
    ^^^^^^ ^ ^
    File "/root/Time-LLM/models/TimeLLM.py", line 238, in forward
    ^^^^^^^^^^^^^^^^^^ ^dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)^
    ^^^^^^^ ^ ^ ^ ^ ^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^return forward_call(*args, **kwargs)^
    ^^^^^ ^ ^
    File "/root/Time-LLM/models/TimeLLM.py", line 282, in forecast
    ^^^^^^^^^^^^^^^^^^^^^^^ ^enc_out, n_vars = self.patch_embedding(x_enc.to(torch.bfloat16))^
    ^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 310, in forward
    ^^^^ ^return self._conv_forward(input, self.weight, self.bias)^
    ^^^^^^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ^^ File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    ^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 303, in _conv_forward
    return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
    return self._call_impl(*args, **kwargs)
    ^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ^^ File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    RuntimeError: expected scalar type BFloat16 but found Half
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/Time-LLM/layers/Embed.py", line 184, in forward
    x = self.value_embedding(x)
    ^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/Time-LLM/layers/Embed.py", line 42, in forward
    x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 310, in forward
    return self._conv_forward(input, self.weight, self.bias)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 303, in _conv_forward
    return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    RuntimeError: expected scalar type BFloat16 but found Half
    0it [00:00, ?it/s]
    Traceback (most recent call last):
    File "/root/Time-LLM/run_main.py", line 211, in <module>
    outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
    ret_val = func(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1852, in forward
    loss = self.module(*inputs, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/Time-LLM/models/TimeLLM.py", line 238, in forward
    dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/Time-LLM/models/TimeLLM.py", line 282, in forecast
    enc_out, n_vars = self.patch_embedding(x_enc.to(torch.bfloat16))
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/Time-LLM/layers/Embed.py", line 184, in forward
    x = self.value_embedding(x)
    ^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/Time-LLM/layers/Embed.py", line 42, in forward
    x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 310, in forward
    return self._conv_forward(input, self.weight, self.bias)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 303, in _conv_forward
    return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    RuntimeError: expected scalar type BFloat16 but found Half
    0it [00:00, ?it/s]
    Traceback (most recent call last):
    File "/root/Time-LLM/run_main.py", line 211, in <module>
    outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
    ret_val = func(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1852, in forward
    loss = self.module(*inputs, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/Time-LLM/models/TimeLLM.py", line 238, in forward
    dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/Time-LLM/models/TimeLLM.py", line 282, in forecast
    enc_out, n_vars = self.patch_embedding(x_enc.to(torch.bfloat16))
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/Time-LLM/layers/Embed.py", line 184, in forward
    x = self.value_embedding(x)
    ^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/Time-LLM/layers/Embed.py", line 42, in forward
    x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 310, in forward
    return self._conv_forward(input, self.weight, self.bias)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 303, in _conv_forward
    return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    RuntimeError: expected scalar type BFloat16 but found Half
    [2025-12-10 11:31:11,623] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 265569) of binary: /root/anaconda3/envs/timellm/bin/python3.11
    Traceback (most recent call last):
    File "/root/anaconda3/envs/timellm/bin/accelerate", line 7, in <module>
    sys.exit(main())
    ^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 46, in main
    args.func(args)
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1048, in launch_command
    multi_gpu_launcher(args)
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/accelerate/commands/launch.py", line 702, in multi_gpu_launcher
    distrib_run.run(args)
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/distributed/run.py", line 803, in run
    elastic_launch(
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 135, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
    raise ChildFailedError(
    torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
    ============================================================
    run_main.py FAILED

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions