-
Notifications
You must be signed in to change notification settings - Fork 449
Open
Description
My hardware is a Tesla V100, which does not support BF16 acceleration. torch.bfloat16 (or a related configuration) appears to be hardcoded in the code, which causes it to fail to run on this GPU. I have already made the modifications listed below. What else do I need to change to run in FP16?
Modifications I made: I have attempted to force FP16 by doing the following:
- Running with command: accelerate launch --multi_gpu --mixed_precision fp16 ...
- Code Modification in models/TimeLLM.py: Changed the input casting in forecast method:
Before
enc_out, n_vars = self.patch_embedding(x_enc.to(torch.bfloat16))
After
enc_out, n_vars = self.patch_embedding(x_enc.to(torch.float16))
- DeepSpeed Config (ds_config_zero2.json): Enabled fp16 and disabled bf16:
{
"fp16": {
"enabled": true,
"auto_cast": true,
"loss_scale": 0,
"initial_scale_power": 16,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": false
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true,
"sub_group_size": 1e9
},
"gradient_accumulation_steps": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"steps_per_print": 10,
"wall_clock_breakdown": false
}
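The current error is as follows (note that the traceback still shows the original torch.bfloat16 cast at models/TimeLLM.py line 282, not the modified torch.float16 line):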
Traceback (most recent call last):
File "/root/Time-LLM/run_main.py", line 211, in <module>
outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1852, in forward
loss = self.module(*inputs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^Traceback (most recent call last):
^^^^^^^ File "/root/Time-LLM/run_main.py", line 211, in
^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
return forward_call(*args, **kwargs)
^ ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^ File "/root/Time-LLM/models/TimeLLM.py", line 238, in forward
^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^return self._call_impl(*args, **kwargs)^
^^
File "/root/Time-LLM/models/TimeLLM.py", line 282, in forecast
^^^^^^^^^^^ ^enc_out, n_vars = self.patch_embedding(x_enc.to(torch.bfloat16))^
^^^^^^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return forward_call(*args, **kwargs)
^^^^^^^^ ^return self._call_impl(*args, **kwargs)^
^^^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^^^^^^^^^^
^^^^^^ File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
ret_val = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
return forward_call(*args, **kwargs)
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1852, in forward
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Time-LLM/layers/Embed.py", line 184, in forward
x = self.value_embedding(x)
^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
loss = self.module(*inputs, **kwargs)
^^^^^ ^return self._call_impl(*args, **kwargs)^
^^^^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^^^^^^^^^^^
^^^ File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^ ^return self._call_impl(*args, **kwargs)^^
^^^^^^^^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^^^
^^^ File "/root/Time-LLM/layers/Embed.py", line 42, in forward
^^^^^^^^^^^^^ ^x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2)^
^^^^^^ ^ ^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^ ^return self._call_impl(*args, **kwargs)^
^^^^^^ ^ ^
File "/root/Time-LLM/models/TimeLLM.py", line 238, in forward
^^^^^^^^^^^^^^^^^^ ^dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)^
^^^^^^^ ^ ^ ^ ^ ^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^return forward_call(*args, **kwargs)^
^^^^^ ^ ^
File "/root/Time-LLM/models/TimeLLM.py", line 282, in forecast
^^^^^^^^^^^^^^^^^^^^^^^ ^enc_out, n_vars = self.patch_embedding(x_enc.to(torch.bfloat16))^
^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 310, in forward
^^^^ ^return self._conv_forward(input, self.weight, self.bias)^
^^^^^^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^ File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 303, in _conv_forward
return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
return self._call_impl(*args, **kwargs)
^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^ File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: expected scalar type BFloat16 but found Half
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Time-LLM/layers/Embed.py", line 184, in forward
x = self.value_embedding(x)
^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Time-LLM/layers/Embed.py", line 42, in forward
x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 310, in forward
return self._conv_forward(input, self.weight, self.bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 303, in _conv_forward
return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: expected scalar type BFloat16 but found Half
0it [00:00, ?it/s]
Traceback (most recent call last):
File "/root/Time-LLM/run_main.py", line 211, in <module>
outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1852, in forward
loss = self.module(*inputs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Time-LLM/models/TimeLLM.py", line 238, in forward
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Time-LLM/models/TimeLLM.py", line 282, in forecast
enc_out, n_vars = self.patch_embedding(x_enc.to(torch.bfloat16))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Time-LLM/layers/Embed.py", line 184, in forward
x = self.value_embedding(x)
^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Time-LLM/layers/Embed.py", line 42, in forward
x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 310, in forward
return self._conv_forward(input, self.weight, self.bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 303, in _conv_forward
return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: expected scalar type BFloat16 but found Half
0it [00:00, ?it/s]
Traceback (most recent call last):
File "/root/Time-LLM/run_main.py", line 211, in <module>
outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1852, in forward
loss = self.module(*inputs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Time-LLM/models/TimeLLM.py", line 238, in forward
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Time-LLM/models/TimeLLM.py", line 282, in forecast
enc_out, n_vars = self.patch_embedding(x_enc.to(torch.bfloat16))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Time-LLM/layers/Embed.py", line 184, in forward
x = self.value_embedding(x)
^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Time-LLM/layers/Embed.py", line 42, in forward
x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 310, in forward
return self._conv_forward(input, self.weight, self.bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/nn/modules/conv.py", line 303, in _conv_forward
return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: expected scalar type BFloat16 but found Half
[2025-12-10 11:31:11,623] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 265569) of binary: /root/anaconda3/envs/timellm/bin/python3.11
Traceback (most recent call last):
File "/root/anaconda3/envs/timellm/bin/accelerate", line 7, in <module>
sys.exit(main())
^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 46, in main
args.func(args)
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1048, in launch_command
multi_gpu_launcher(args)
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/accelerate/commands/launch.py", line 702, in multi_gpu_launcher
distrib_run.run(args)
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/distributed/run.py", line 803, in run
elastic_launch(
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 135, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/timellm/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
run_main.py FAILED
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels