! bash ./scripts/TimeLLM_ETTh1.sh
The contents of TimeLLM_ETTh1.sh are as follows:
model_name=TimeLLM
train_epochs=100
learning_rate=0.01
llama_layers=32
master_port=00097
num_process=1
batch_size=24
d_model=32
d_ff=128
comment='TimeLLM-ETTh1'
accelerate launch --mixed_precision bf16 --num_processes $num_process run_main.py \
  --task_name long_term_forecast \
  --is_training 1 \
  --root_path ./dataset/ETT-small/ \
  --data_path ETTh1.csv \
  --model_id ETTh1_512_96 \
  --model $model_name \
  --data ETTh1 \
  --features M \
  --seq_len 512 \
  --label_len 48 \
  --pred_len 96 \
  --factor 3 \
  --enc_in 7 \
  --dec_in 7 \
  --c_out 7 \
  --des 'Exp' \
  --itr 1 \
  --d_model $d_model \
  --d_ff $d_ff \
  --batch_size $batch_size \
  --learning_rate $learning_rate \
  --llm_layers $llama_layers \
  --train_epochs $train_epochs \
  --model_comment $comment
accelerate launch --mixed_precision bf16 --num_processes $num_process run_main.py \
  --task_name long_term_forecast \
  --is_training 1 \
  --root_path ./dataset/ETT-small/ \
  --data_path ETTh1.csv \
  --model_id ETTh1_512_192 \
  --model $model_name \
  --data ETTh1 \
  --features M \
  --seq_len 512 \
  --label_len 48 \
  --pred_len 192 \
  --factor 3 \
  --enc_in 7 \
  --dec_in 7 \
  --c_out 7 \
  --des 'Exp' \
  --itr 1 \
  --d_model 32 \
  --d_ff 128 \
  --batch_size $batch_size \
  --learning_rate 0.02 \
  --llm_layers $llama_layers \
  --train_epochs $train_epochs \
  --model_comment $comment
accelerate launch --mixed_precision bf16 --num_processes $num_process run_main.py \
  --task_name long_term_forecast \
  --is_training 1 \
  --root_path ./dataset/ETT-small/ \
  --data_path ETTh1.csv \
  --model_id ETTh1_512_336 \
  --model $model_name \
  --data ETTh1 \
  --features M \
  --seq_len 512 \
  --label_len 48 \
  --pred_len 336 \
  --factor 3 \
  --enc_in 7 \
  --dec_in 7 \
  --c_out 7 \
  --des 'Exp' \
  --itr 1 \
  --d_model $d_model \
  --d_ff $d_ff \
  --batch_size $batch_size \
  --lradj 'COS' \
  --learning_rate 0.001 \
  --llm_layers $llama_layers \
  --train_epochs $train_epochs \
  --model_comment $comment
accelerate launch --mixed_precision bf16 --num_processes $num_process run_main.py \
  --task_name long_term_forecast \
  --is_training 1 \
  --root_path ./dataset/ETT-small/ \
  --data_path ETTh1.csv \
  --model_id ETTh1_512_720 \
  --model $model_name \
  --data ETTh1 \
  --features M \
  --seq_len 512 \
  --label_len 48 \
  --pred_len 720 \
  --factor 3 \
  --enc_in 7 \
  --dec_in 7 \
  --c_out 7 \
  --des 'Exp' \
  --itr 1 \
  --d_model $d_model \
  --d_ff $d_ff \
  --batch_size $batch_size \
  --learning_rate $learning_rate \
  --llm_layers $llama_layers \
  --train_epochs $train_epochs \
  --model_comment $comment
Execution output:
Loading checkpoint shards: 100% 2/2 [00:05<00:00, 2.72s/it]
[2025-01-07 12:50:13,988] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-01-07 12:50:14,267] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.14.0, git-hash=unknown, git-branch=unknown
[2025-01-07 12:50:14,267] [INFO] [comm.py:637:init_distributed] cdb=None
[2025-01-07 12:50:14,267] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2025-01-07 12:50:15,350] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=172.28.0.12, master_port=29500
[2025-01-07 12:50:15,351] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2025-01-07 12:50:30,478] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
[2025-01-07 12:50:30,480] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
[2025-01-07 12:50:30,480] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
[2025-01-07 12:50:30,481] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = Adam
[2025-01-07 12:50:30,481] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=Adam type=<class 'torch.optim.adam.Adam'>
[2025-01-07 12:50:30,482] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer
[2025-01-07 12:50:30,482] [INFO] [stage_1_and_2.py:149:__init__] Reduce bucket size 200000000
[2025-01-07 12:50:30,482] [INFO] [stage_1_and_2.py:150:__init__] Allgather bucket size 200000000
[2025-01-07 12:50:30,482] [INFO] [stage_1_and_2.py:151:__init__] CPU Offload: False
[2025-01-07 12:50:30,482] [INFO] [stage_1_and_2.py:152:__init__] Round robin gradient partitioning: False
[2025-01-07 12:50:30,845] [INFO] [utils.py:800:see_memory_usage] Before initializing optimizer states
[2025-01-07 12:50:30,846] [INFO] [utils.py:801:see_memory_usage] MA 12.59 GB Max_MA 12.68 GB CA 12.68 GB Max_CA 13 GB
[2025-01-07 12:50:30,846] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 3.43 GB, percent = 4.1%
[2025-01-07 12:50:31,014] [INFO] [utils.py:800:see_memory_usage] After initializing optimizer states
[2025-01-07 12:50:31,015] [INFO] [utils.py:801:see_memory_usage] MA 12.59 GB Max_MA 12.76 GB CA 12.85 GB Max_CA 13 GB
[2025-01-07 12:50:31,015] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 3.43 GB, percent = 4.1%
[2025-01-07 12:50:31,015] [INFO] [stage_1_and_2.py:539:__init__] optimizer state initialized
[2025-01-07 12:50:31,181] [INFO] [utils.py:800:see_memory_usage] After initializing ZeRO optimizer
[2025-01-07 12:50:31,182] [INFO] [utils.py:801:see_memory_usage] MA 12.59 GB Max_MA 12.59 GB CA 12.85 GB Max_CA 13 GB
[2025-01-07 12:50:31,182] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 3.43 GB, percent = 4.1%
[2025-01-07 12:50:31,183] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = Adam
[2025-01-07 12:50:31,183] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
[2025-01-07 12:50:31,183] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
[2025-01-07 12:50:31,183] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0003999999999999993], mom=[(0.95, 0.999)]
[2025-01-07 12:50:31,184] [INFO] [config.py:996:print] DeepSpeedEngine configuration:
[2025-01-07 12:50:31,185] [INFO] [config.py:1000:print] activation_checkpointing_config {
"partition_activations": false,
"contiguous_memory_optimization": false,
"cpu_checkpointing": false,
"number_checkpoints": null,
"synchronize_checkpoint_boundary": false,
"profile": false
}
[2025-01-07 12:50:31,185] [INFO] [config.py:1000:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
[2025-01-07 12:50:31,185] [INFO] [config.py:1000:print] amp_enabled .................. False
[2025-01-07 12:50:31,185] [INFO] [config.py:1000:print] amp_params ................... False
[2025-01-07 12:50:31,185] [INFO] [config.py:1000:print] autotuning_config ............ {
"enabled": false,
"start_step": null,
"end_step": null,
"metric_path": null,
"arg_mappings": null,
"metric": "throughput",
"model_info": null,
"results_dir": "autotuning_results",
"exps_dir": "autotuning_exps",
"overwrite": true,
"fast": true,
"start_profile_step": 3,
"end_profile_step": 5,
"tuner_type": "gridsearch",
"tuner_early_stopping": 5,
"tuner_num_trials": 50,
"model_info_path": null,
"mp_size": 1,
"max_train_batch_size": null,
"min_train_batch_size": 1,
"max_train_micro_batch_size_per_gpu": 1.024000e+03,
"min_train_micro_batch_size_per_gpu": 1,
"num_tuning_micro_batch_sizes": 3
}
[2025-01-07 12:50:31,185] [INFO] [config.py:1000:print] bfloat16_enabled ............. True
[2025-01-07 12:50:31,185] [INFO] [config.py:1000:print] bfloat16_immediate_grad_update False
[2025-01-07 12:50:31,185] [INFO] [config.py:1000:print] checkpoint_parallel_write_pipeline False
[2025-01-07 12:50:31,185] [INFO] [config.py:1000:print] checkpoint_tag_validation_enabled True
[2025-01-07 12:50:31,185] [INFO] [config.py:1000:print] checkpoint_tag_validation_fail False
[2025-01-07 12:50:31,185] [INFO] [config.py:1000:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7995509fa140>
[2025-01-07 12:50:31,185] [INFO] [config.py:1000:print] communication_data_type ...... None
[2025-01-07 12:50:31,185] [INFO] [config.py:1000:print] compile_config ............... enabled=False backend='inductor' kwargs={}
[2025-01-07 12:50:31,185] [INFO] [config.py:1000:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] curriculum_enabled_legacy .... False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] curriculum_params_legacy ..... False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] data_efficiency_enabled ...... False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] dataloader_drop_last ......... False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] disable_allgather ............ False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] dump_state ................... False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] dynamic_loss_scale_args ...... None
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] eigenvalue_enabled ........... False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] eigenvalue_gas_boundary_resolution 1
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] eigenvalue_layer_name ........ bert.encoder.layer
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] eigenvalue_layer_num ......... 0
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] eigenvalue_max_iter .......... 100
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] eigenvalue_stability ......... 1e-06
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] eigenvalue_tol ............... 0.01
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] eigenvalue_verbose ........... False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] elasticity_enabled ........... False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] flops_profiler_config ........ {
"enabled": false,
"recompute_fwd_factor": 0.0,
"profile_step": 1,
"module_depth": -1,
"top_modules": 1,
"detailed": true,
"output_file": null
}
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] fp16_auto_cast ............... None
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] fp16_enabled ................. False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] fp16_master_weights_and_gradients False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] global_rank .................. 0
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] grad_accum_dtype ............. None
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] gradient_accumulation_steps .. 1
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] gradient_clipping ............ 0.0
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] gradient_predivide_factor .... 1.0
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] graph_harvesting ............. False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] initial_dynamic_scale ........ 1
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] load_universal_checkpoint .... False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] loss_scale ................... 1.0
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] memory_breakdown ............. False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] mics_hierarchial_params_gather False
[2025-01-07 12:50:31,186] [INFO] [config.py:1000:print] mics_shard_size .............. -1
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] nebula_config ................ {
"enabled": false,
"persistent_storage_path": null,
"persistent_time_interval": 100,
"num_of_version_in_retention": 2,
"enable_nebula_load": true,
"load_path": null
}
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] optimizer_legacy_fusion ...... False
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] optimizer_name ............... None
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] optimizer_params ............. None
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] pld_enabled .................. False
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] pld_params ................... False
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] prescale_gradients ........... False
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] scheduler_name ............... None
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] scheduler_params ............. None
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] seq_parallel_communication_data_type torch.float32
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] sparse_attention ............. None
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] sparse_gradients_enabled ..... False
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] steps_per_print .............. inf
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] train_batch_size ............. 24
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] train_micro_batch_size_per_gpu 24
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] use_data_before_expert_parallel_ False
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] use_node_local_storage ....... False
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] wall_clock_breakdown ......... False
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] weight_quantization_config ... None
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] world_size ................... 1
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] zero_allow_untested_optimizer True
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=200000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=200000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] zero_enabled ................. True
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] zero_force_ds_cpu_optimizer .. True
[2025-01-07 12:50:31,187] [INFO] [config.py:1000:print] zero_optimization_stage ...... 2
[2025-01-07 12:50:31,187] [INFO] [config.py:986:print_user_config] json = {
"bf16": {
"enabled": true,
"auto_cast": true
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2.000000e+08,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2.000000e+08,
"contiguous_gradients": true,
"sub_group_size": 1.000000e+09
},
"gradient_accumulation_steps": 1,
"train_batch_size": 24,
"train_micro_batch_size_per_gpu": 24,
"steps_per_print": inf,
"wall_clock_breakdown": false,
"fp16": {
"enabled": false
},
"zero_allow_untested_optimizer": true
}
0it [00:01, ?it/s]
Traceback (most recent call last):
File "/content/drive/MyDrive/time-llm/Time-LLM/run_main.py", line 211, in
outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1852, in forward
loss = self.module(*inputs, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/content/drive/MyDrive/time-llm/Time-LLM/models/TimeLLM.py", line 196, in forward
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
File "/content/drive/MyDrive/time-llm/Time-LLM/models/TimeLLM.py", line 243, in forecast
dec_out = self.llm_model(inputs_embeds=llama_enc_out).last_hidden_state
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 693, in forward
layer_outputs = decoder_layer(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 408, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 346, in forward
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py", line 1860, in softmax
ret = input.softmax(dim, dtype=dtype)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 170.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 144.81 MiB is free. Process 152763 has 39.41 GiB memory in use. Of the allocated memory 38.58 GiB is allocated by PyTorch, and 12.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (CUDA semantics — PyTorch 2.5 documentation)
Traceback (most recent call last):
File "/usr/local/bin/accelerate", line 8, in
sys.exit(main())
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/accelerate_cli.py", line 46, in main
args.func(args)
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/launch.py", line 1057, in launch_command
simple_launcher(args)
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/launch.py", line 673, in simple_launcher
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
subprocess.CalledProcessError: Command '['/usr/bin/python3', 'run_main.py', '--task_name', 'long_term_forecast', '--is_training', '1', '--root_path', './dataset/ETT-small/', '--data_path', 'ETTh1.csv', '--model_id', 'ETTh1_512_96', '--model', 'TimeLLM', '--data', 'ETTh1', '--features', 'M', '--seq_len', '512', '--label_len', '48', '--pred_len', '96', '--factor', '3', '--enc_in', '7', '--dec_in', '7', '--c_out', '7', '--des', 'Exp', '--itr', '1', '--d_model', '32', '--d_ff', '128', '--batch_size', '24', '--learning_rate', '0.01', '--llm_layers', '32', '--train_epochs', '100', '--model_comment', 'TimeLLM-ETTh1']' returned non-zero exit status 1.
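
Every launch in this log fails the same way: with all 32 LLaMA layers, seq_len 512, and a micro-batch of 24, the ~40 GB GPU runs out of memory inside the LLaMA self-attention. A minimal sketch of one way to retry with a smaller footprint follows; the reduced llama_layers and batch_size values are illustrative assumptions rather than values from the original script, and the allocator setting is the hint quoted in the OOM message itself.

# Illustrative memory-reduction changes inside scripts/TimeLLM_ETTh1.sh
# (the smaller values below are assumptions, not taken from the original script):
llama_layers=6     # was 32: use fewer LLaMA decoder layers
batch_size=8       # was 24: smaller micro-batch per GPU

# Re-run with the allocator hint quoted in the OOM message:
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
bash ./scripts/TimeLLM_ETTh1.sh
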
Loading checkpoint shards: 100% 2/2 [00:05<00:00, 2.79s/it]
[2025-01-07 12:51:42,380] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-01-07 12:51:42,648] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.14.0, git-hash=unknown, git-branch=unknown
[2025-01-07 12:51:42,648] [INFO] [comm.py:637:init_distributed] cdb=None
[2025-01-07 12:51:42,648] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2025-01-07 12:51:43,524] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=172.28.0.12, master_port=29500
[2025-01-07 12:51:43,524] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2025-01-07 12:51:58,542] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
[2025-01-07 12:51:58,543] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
[2025-01-07 12:51:58,543] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
[2025-01-07 12:51:58,545] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = Adam
[2025-01-07 12:51:58,545] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=Adam type=<class 'torch.optim.adam.Adam'>
[2025-01-07 12:51:58,545] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer
[2025-01-07 12:51:58,545] [INFO] [stage_1_and_2.py:149:__init__] Reduce bucket size 200000000
[2025-01-07 12:51:58,545] [INFO] [stage_1_and_2.py:150:__init__] Allgather bucket size 200000000
[2025-01-07 12:51:58,545] [INFO] [stage_1_and_2.py:151:__init__] CPU Offload: False
[2025-01-07 12:51:58,545] [INFO] [stage_1_and_2.py:152:__init__] Round robin gradient partitioning: False
[2025-01-07 12:51:58,902] [INFO] [utils.py:800:see_memory_usage] Before initializing optimizer states
[2025-01-07 12:51:58,903] [INFO] [utils.py:801:see_memory_usage] MA 12.6 GB Max_MA 12.69 GB CA 12.69 GB Max_CA 13 GB
[2025-01-07 12:51:58,903] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 3.45 GB, percent = 4.1%
[2025-01-07 12:51:59,064] [INFO] [utils.py:800:see_memory_usage] After initializing optimizer states
[2025-01-07 12:51:59,065] [INFO] [utils.py:801:see_memory_usage] MA 12.6 GB Max_MA 12.77 GB CA 12.86 GB Max_CA 13 GB
[2025-01-07 12:51:59,065] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 3.45 GB, percent = 4.1%
[2025-01-07 12:51:59,065] [INFO] [stage_1_and_2.py:539:__init__] optimizer state initialized
[2025-01-07 12:51:59,222] [INFO] [utils.py:800:see_memory_usage] After initializing ZeRO optimizer
[2025-01-07 12:51:59,223] [INFO] [utils.py:801:see_memory_usage] MA 12.6 GB Max_MA 12.6 GB CA 12.86 GB Max_CA 13 GB
[2025-01-07 12:51:59,223] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 3.45 GB, percent = 4.1%
[2025-01-07 12:51:59,224] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = Adam
[2025-01-07 12:51:59,224] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
[2025-01-07 12:51:59,224] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
[2025-01-07 12:51:59,224] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0007999999999999986], mom=[(0.95, 0.999)]
[2025-01-07 12:51:59,225] [INFO] [config.py:996:print] DeepSpeedEngine configuration:
[2025-01-07 12:51:59,225] [INFO] [config.py:1000:print] activation_checkpointing_config {
"partition_activations": false,
"contiguous_memory_optimization": false,
"cpu_checkpointing": false,
"number_checkpoints": null,
"synchronize_checkpoint_boundary": false,
"profile": false
}
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] amp_enabled .................. False
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] amp_params ................... False
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] autotuning_config ............ {
"enabled": false,
"start_step": null,
"end_step": null,
"metric_path": null,
"arg_mappings": null,
"metric": "throughput",
"model_info": null,
"results_dir": "autotuning_results",
"exps_dir": "autotuning_exps",
"overwrite": true,
"fast": true,
"start_profile_step": 3,
"end_profile_step": 5,
"tuner_type": "gridsearch",
"tuner_early_stopping": 5,
"tuner_num_trials": 50,
"model_info_path": null,
"mp_size": 1,
"max_train_batch_size": null,
"min_train_batch_size": 1,
"max_train_micro_batch_size_per_gpu": 1.024000e+03,
"min_train_micro_batch_size_per_gpu": 1,
"num_tuning_micro_batch_sizes": 3
}
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] bfloat16_enabled ............. True
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] bfloat16_immediate_grad_update False
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] checkpoint_parallel_write_pipeline False
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] checkpoint_tag_validation_enabled True
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] checkpoint_tag_validation_fail False
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7d92b2526290>
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] communication_data_type ...... None
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] compile_config ............... enabled=False backend='inductor' kwargs={}
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] curriculum_enabled_legacy .... False
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] curriculum_params_legacy ..... False
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] data_efficiency_enabled ...... False
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] dataloader_drop_last ......... False
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] disable_allgather ............ False
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] dump_state ................... False
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] dynamic_loss_scale_args ...... None
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] eigenvalue_enabled ........... False
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] eigenvalue_gas_boundary_resolution 1
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] eigenvalue_layer_name ........ bert.encoder.layer
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] eigenvalue_layer_num ......... 0
[2025-01-07 12:51:59,226] [INFO] [config.py:1000:print] eigenvalue_max_iter .......... 100
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] eigenvalue_stability ......... 1e-06
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] eigenvalue_tol ............... 0.01
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] eigenvalue_verbose ........... False
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] elasticity_enabled ........... False
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] flops_profiler_config ........ {
"enabled": false,
"recompute_fwd_factor": 0.0,
"profile_step": 1,
"module_depth": -1,
"top_modules": 1,
"detailed": true,
"output_file": null
}
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] fp16_auto_cast ............... None
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] fp16_enabled ................. False
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] fp16_master_weights_and_gradients False
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] global_rank .................. 0
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] grad_accum_dtype ............. None
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] gradient_accumulation_steps .. 1
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] gradient_clipping ............ 0.0
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] gradient_predivide_factor .... 1.0
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] graph_harvesting ............. False
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] initial_dynamic_scale ........ 1
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] load_universal_checkpoint .... False
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] loss_scale ................... 1.0
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] memory_breakdown ............. False
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] mics_hierarchial_params_gather False
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] mics_shard_size .............. -1
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] nebula_config ................ {
"enabled": false,
"persistent_storage_path": null,
"persistent_time_interval": 100,
"num_of_version_in_retention": 2,
"enable_nebula_load": true,
"load_path": null
}
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] optimizer_legacy_fusion ...... False
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] optimizer_name ............... None
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] optimizer_params ............. None
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] pld_enabled .................. False
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] pld_params ................... False
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] prescale_gradients ........... False
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] scheduler_name ............... None
[2025-01-07 12:51:59,227] [INFO] [config.py:1000:print] scheduler_params ............. None
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] seq_parallel_communication_data_type torch.float32
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] sparse_attention ............. None
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] sparse_gradients_enabled ..... False
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] steps_per_print .............. inf
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] train_batch_size ............. 24
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] train_micro_batch_size_per_gpu 24
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] use_data_before_expert_parallel_ False
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] use_node_local_storage ....... False
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] wall_clock_breakdown ......... False
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] weight_quantization_config ... None
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] world_size ................... 1
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] zero_allow_untested_optimizer True
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=200000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=200000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] zero_enabled ................. True
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] zero_force_ds_cpu_optimizer .. True
[2025-01-07 12:51:59,228] [INFO] [config.py:1000:print] zero_optimization_stage ...... 2
[2025-01-07 12:51:59,228] [INFO] [config.py:986:print_user_config] json = {
"bf16": {
"enabled": true,
"auto_cast": true
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2.000000e+08,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2.000000e+08,
"contiguous_gradients": true,
"sub_group_size": 1.000000e+09
},
"gradient_accumulation_steps": 1,
"train_batch_size": 24,
"train_micro_batch_size_per_gpu": 24,
"steps_per_print": inf,
"wall_clock_breakdown": false,
"fp16": {
"enabled": false
},
"zero_allow_untested_optimizer": true
}
0it [00:01, ?it/s]
Traceback (most recent call last):
File "/content/drive/MyDrive/time-llm/Time-LLM/run_main.py", line 211, in
outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1852, in forward
loss = self.module(*inputs, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/content/drive/MyDrive/time-llm/Time-LLM/models/TimeLLM.py", line 196, in forward
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
File "/content/drive/MyDrive/time-llm/Time-LLM/models/TimeLLM.py", line 243, in forecast
dec_out = self.llm_model(inputs_embeds=llama_enc_out).last_hidden_state
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 693, in forward
layer_outputs = decoder_layer(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 408, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 346, in forward
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py", line 1860, in softmax
ret = input.softmax(dim, dtype=dtype)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 170.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 64.81 MiB is free. Process 159212 has 39.49 GiB memory in use. Of the allocated memory 38.65 GiB is allocated by PyTorch, and 11.97 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (CUDA semantics — PyTorch 2.5 documentation)
Traceback (most recent call last):
File "/usr/local/bin/accelerate", line 8, in
sys.exit(main())
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/accelerate_cli.py", line 46, in main
args.func(args)
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/launch.py", line 1057, in launch_command
simple_launcher(args)
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/launch.py", line 673, in simple_launcher
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
subprocess.CalledProcessError: Command '['/usr/bin/python3', 'run_main.py', '--task_name', 'long_term_forecast', '--is_training', '1', '--root_path', './dataset/ETT-small/', '--data_path', 'ETTh1.csv', '--model_id', 'ETTh1_512_192', '--model', 'TimeLLM', '--data', 'ETTh1', '--features', 'M', '--seq_len', '512', '--label_len', '48', '--pred_len', '192', '--factor', '3', '--enc_in', '7', '--dec_in', '7', '--c_out', '7', '--des', 'Exp', '--itr', '1', '--d_model', '32', '--d_ff', '128', '--batch_size', '24', '--learning_rate', '0.02', '--llm_layers', '32', '--train_epochs', '100', '--model_comment', 'TimeLLM-ETTh1']' returned non-zero exit status 1.
Loading checkpoint shards: 100% 2/2 [00:05<00:00, 2.77s/it]
[2025-01-07 12:53:10,060] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-01-07 12:53:10,331] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.14.0, git-hash=unknown, git-branch=unknown
[2025-01-07 12:53:10,331] [INFO] [comm.py:637:init_distributed] cdb=None
[2025-01-07 12:53:10,331] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2025-01-07 12:53:11,207] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=172.28.0.12, master_port=29500
[2025-01-07 12:53:11,208] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2025-01-07 12:53:26,145] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
[2025-01-07 12:53:26,147] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
[2025-01-07 12:53:26,147] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
[2025-01-07 12:53:26,148] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = Adam
[2025-01-07 12:53:26,148] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=Adam type=<class 'torch.optim.adam.Adam'>
[2025-01-07 12:53:26,149] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer
[2025-01-07 12:53:26,149] [INFO] [stage_1_and_2.py:149:__init__] Reduce bucket size 200000000
[2025-01-07 12:53:26,149] [INFO] [stage_1_and_2.py:150:__init__] Allgather bucket size 200000000
[2025-01-07 12:53:26,149] [INFO] [stage_1_and_2.py:151:__init__] CPU Offload: False
[2025-01-07 12:53:26,149] [INFO] [stage_1_and_2.py:152:__init__] Round robin gradient partitioning: False
[2025-01-07 12:53:26,517] [INFO] [utils.py:800:see_memory_usage] Before initializing optimizer states
[2025-01-07 12:53:26,517] [INFO] [utils.py:801:see_memory_usage] MA 12.61 GB Max_MA 12.7 GB CA 12.7 GB Max_CA 13 GB
[2025-01-07 12:53:26,518] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 3.46 GB, percent = 4.1%
[2025-01-07 12:53:26,687] [INFO] [utils.py:800:see_memory_usage] After initializing optimizer states
[2025-01-07 12:53:26,688] [INFO] [utils.py:801:see_memory_usage] MA 12.61 GB Max_MA 12.78 GB CA 12.88 GB Max_CA 13 GB
[2025-01-07 12:53:26,688] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 3.46 GB, percent = 4.1%
[2025-01-07 12:53:26,688] [INFO] [stage_1_and_2.py:539:__init__] optimizer state initialized
[2025-01-07 12:53:26,854] [INFO] [utils.py:800:see_memory_usage] After initializing ZeRO optimizer
[2025-01-07 12:53:26,855] [INFO] [utils.py:801:see_memory_usage] MA 12.61 GB Max_MA 12.61 GB CA 12.88 GB Max_CA 13 GB
[2025-01-07 12:53:26,855] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 3.46 GB, percent = 4.1%
[2025-01-07 12:53:26,856] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = Adam
[2025-01-07 12:53:26,856] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
[2025-01-07 12:53:26,856] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
[2025-01-07 12:53:26,856] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2025-01-07 12:53:26,857] [INFO] [config.py:996:print] DeepSpeedEngine configuration:
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] activation_checkpointing_config {
"partition_activations": false,
"contiguous_memory_optimization": false,
"cpu_checkpointing": false,
"number_checkpoints": null,
"synchronize_checkpoint_boundary": false,
"profile": false
}
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] amp_enabled .................. False
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] amp_params ................... False
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] autotuning_config ............ {
"enabled": false,
"start_step": null,
"end_step": null,
"metric_path": null,
"arg_mappings": null,
"metric": "throughput",
"model_info": null,
"results_dir": "autotuning_results",
"exps_dir": "autotuning_exps",
"overwrite": true,
"fast": true,
"start_profile_step": 3,
"end_profile_step": 5,
"tuner_type": "gridsearch",
"tuner_early_stopping": 5,
"tuner_num_trials": 50,
"model_info_path": null,
"mp_size": 1,
"max_train_batch_size": null,
"min_train_batch_size": 1,
"max_train_micro_batch_size_per_gpu": 1.024000e+03,
"min_train_micro_batch_size_per_gpu": 1,
"num_tuning_micro_batch_sizes": 3
}
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] bfloat16_enabled ............. True
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] bfloat16_immediate_grad_update False
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] checkpoint_parallel_write_pipeline False
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] checkpoint_tag_validation_enabled True
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] checkpoint_tag_validation_fail False
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7cc73af1a170>
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] communication_data_type ...... None
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] compile_config ............... enabled=False backend='inductor' kwargs={}
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] curriculum_enabled_legacy .... False
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] curriculum_params_legacy ..... False
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] data_efficiency_enabled ...... False
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] dataloader_drop_last ......... False
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] disable_allgather ............ False
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] dump_state ................... False
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] dynamic_loss_scale_args ...... None
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] eigenvalue_enabled ........... False
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] eigenvalue_gas_boundary_resolution 1
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] eigenvalue_layer_name ........ bert.encoder.layer
[2025-01-07 12:53:26,858] [INFO] [config.py:1000:print] eigenvalue_layer_num ......... 0
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] eigenvalue_max_iter .......... 100
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] eigenvalue_stability ......... 1e-06
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] eigenvalue_tol ............... 0.01
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] eigenvalue_verbose ........... False
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] elasticity_enabled ........... False
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] flops_profiler_config ........ {
"enabled": false,
"recompute_fwd_factor": 0.0,
"profile_step": 1,
"module_depth": -1,
"top_modules": 1,
"detailed": true,
"output_file": null
}
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] fp16_auto_cast ............... None
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] fp16_enabled ................. False
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] fp16_master_weights_and_gradients False
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] global_rank .................. 0
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] grad_accum_dtype ............. None
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] gradient_accumulation_steps .. 1
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] gradient_clipping ............ 0.0
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] gradient_predivide_factor .... 1.0
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] graph_harvesting ............. False
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] initial_dynamic_scale ........ 1
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] load_universal_checkpoint .... False
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] loss_scale ................... 1.0
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] memory_breakdown ............. False
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] mics_hierarchial_params_gather False
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] mics_shard_size .............. -1
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] nebula_config ................ {
"enabled": false,
"persistent_storage_path": null,
"persistent_time_interval": 100,
"num_of_version_in_retention": 2,
"enable_nebula_load": true,
"load_path": null
}
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] optimizer_legacy_fusion ...... False
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] optimizer_name ............... None
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] optimizer_params ............. None
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] pld_enabled .................. False
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] pld_params ................... False
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] prescale_gradients ........... False
[2025-01-07 12:53:26,859] [INFO] [config.py:1000:print] scheduler_name ............... None
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] scheduler_params ............. None
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] seq_parallel_communication_data_type torch.float32
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] sparse_attention ............. None
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] sparse_gradients_enabled ..... False
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] steps_per_print .............. inf
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] train_batch_size ............. 24
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] train_micro_batch_size_per_gpu 24
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] use_data_before_expert_parallel_ False
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] use_node_local_storage ....... False
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] wall_clock_breakdown ......... False
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] weight_quantization_config ... None
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] world_size ................... 1
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] zero_allow_untested_optimizer True
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=200000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=200000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] zero_enabled ................. True
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] zero_force_ds_cpu_optimizer .. True
[2025-01-07 12:53:26,860] [INFO] [config.py:1000:print] zero_optimization_stage ...... 2
[2025-01-07 12:53:26,860] [INFO] [config.py:986:print_user_config] json = {
"bf16": {
"enabled": true,
"auto_cast": true
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2.000000e+08,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2.000000e+08,
"contiguous_gradients": true,
"sub_group_size": 1.000000e+09
},
"gradient_accumulation_steps": 1,
"train_batch_size": 24,
"train_micro_batch_size_per_gpu": 24,
"steps_per_print": inf,
"wall_clock_breakdown": false,
"fp16": {
"enabled": false
},
"zero_allow_untested_optimizer": true
}
0it [00:01, ?it/s]
Traceback (most recent call last):
File "/content/drive/MyDrive/time-llm/Time-LLM/run_main.py", line 211, in
outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1852, in forward
loss = self.module(*inputs, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/content/drive/MyDrive/time-llm/Time-LLM/models/TimeLLM.py", line 196, in forward
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
File "/content/drive/MyDrive/time-llm/Time-LLM/models/TimeLLM.py", line 243, in forecast
dec_out = self.llm_model(inputs_embeds=llama_enc_out).last_hidden_state
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 693, in forward
layer_outputs = decoder_layer(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 408, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 330, in forward
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 80.81 MiB is free. Process 165528 has 39.48 GiB memory in use. Of the allocated memory 38.64 GiB is allocated by PyTorch, and 13.69 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Traceback (most recent call last):
File "/usr/local/bin/accelerate", line 8, in
sys.exit(main())
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/accelerate_cli.py", line 46, in main
args.func(args)
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/launch.py", line 1057, in launch_command
simple_launcher(args)
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/launch.py", line 673, in simple_launcher
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
subprocess.CalledProcessError: Command '['/usr/bin/python3', 'run_main.py', '--task_name', 'long_term_forecast', '--is_training', '1', '--root_path', './dataset/ETT-small/', '--data_path', 'ETTh1.csv', '--model_id', 'ETTh1_512_336', '--model', 'TimeLLM', '--data', 'ETTh1', '--features', 'M', '--seq_len', '512', '--label_len', '48', '--pred_len', '336', '--factor', '3', '--enc_in', '7', '--dec_in', '7', '--c_out', '7', '--des', 'Exp', '--itr', '1', '--d_model', '32', '--d_ff', '128', '--batch_size', '24', '--lradj', 'COS', '--learning_rate', '0.001', '--llm_layers', '32', '--train_epochs', '100', '--model_comment', 'TimeLLM-ETTh1']' returned non-zero exit status 1.
Loading checkpoint shards: 100% 2/2 [00:05<00:00, 2.77s/it]
[2025-01-07 12:54:37,812] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-01-07 12:54:38,083] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.14.0, git-hash=unknown, git-branch=unknown
[2025-01-07 12:54:38,083] [INFO] [comm.py:637:init_distributed] cdb=None
[2025-01-07 12:54:38,083] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2025-01-07 12:54:38,959] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=172.28.0.12, master_port=29500
[2025-01-07 12:54:38,960] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2025-01-07 12:54:53,894] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
[2025-01-07 12:54:53,896] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
[2025-01-07 12:54:53,896] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
[2025-01-07 12:54:53,898] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = Adam
[2025-01-07 12:54:53,898] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=Adam type=<class 'torch.optim.adam.Adam'>
[2025-01-07 12:54:53,898] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer
[2025-01-07 12:54:53,898] [INFO] [stage_1_and_2.py:149:__init__] Reduce bucket size 200000000
[2025-01-07 12:54:53,898] [INFO] [stage_1_and_2.py:150:__init__] Allgather bucket size 200000000
[2025-01-07 12:54:53,898] [INFO] [stage_1_and_2.py:151:__init__] CPU Offload: False
[2025-01-07 12:54:53,898] [INFO] [stage_1_and_2.py:152:__init__] Round robin gradient partitioning: False
[2025-01-07 12:54:54,277] [INFO] [utils.py:800:see_memory_usage] Before initializing optimizer states
[2025-01-07 12:54:54,277] [INFO] [utils.py:801:see_memory_usage] MA 12.62 GB Max_MA 12.72 GB CA 12.72 GB Max_CA 13 GB
[2025-01-07 12:54:54,278] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 3.5 GB, percent = 4.2%
[2025-01-07 12:54:54,445] [INFO] [utils.py:800:see_memory_usage] After initializing optimizer states
[2025-01-07 12:54:54,446] [INFO] [utils.py:801:see_memory_usage] MA 12.62 GB Max_MA 12.81 GB CA 12.91 GB Max_CA 13 GB
[2025-01-07 12:54:54,446] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 3.5 GB, percent = 4.2%
[2025-01-07 12:54:54,446] [INFO] [stage_1_and_2.py:539:__init__] optimizer state initialized
[2025-01-07 12:54:54,612] [INFO] [utils.py:800:see_memory_usage] After initializing ZeRO optimizer
[2025-01-07 12:54:54,613] [INFO] [utils.py:801:see_memory_usage] MA 12.62 GB Max_MA 12.62 GB CA 12.91 GB Max_CA 13 GB
[2025-01-07 12:54:54,613] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 3.5 GB, percent = 4.2%
[2025-01-07 12:54:54,614] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = Adam
[2025-01-07 12:54:54,614] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
[2025-01-07 12:54:54,614] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
[2025-01-07 12:54:54,614] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0003999999999999993], mom=[(0.95, 0.999)]
[2025-01-07 12:54:54,615] [INFO] [config.py:996:print] DeepSpeedEngine configuration:
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] activation_checkpointing_config {
"partition_activations": false,
"contiguous_memory_optimization": false,
"cpu_checkpointing": false,
"number_checkpoints": null,
"synchronize_checkpoint_boundary": false,
"profile": false
}
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] amp_enabled .................. False
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] amp_params ................... False
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] autotuning_config ............ {
"enabled": false,
"start_step": null,
"end_step": null,
"metric_path": null,
"arg_mappings": null,
"metric": "throughput",
"model_info": null,
"results_dir": "autotuning_results",
"exps_dir": "autotuning_exps",
"overwrite": true,
"fast": true,
"start_profile_step": 3,
"end_profile_step": 5,
"tuner_type": "gridsearch",
"tuner_early_stopping": 5,
"tuner_num_trials": 50,
"model_info_path": null,
"mp_size": 1,
"max_train_batch_size": null,
"min_train_batch_size": 1,
"max_train_micro_batch_size_per_gpu": 1.024000e+03,
"min_train_micro_batch_size_per_gpu": 1,
"num_tuning_micro_batch_sizes": 3
}
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] bfloat16_enabled ............. True
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] bfloat16_immediate_grad_update False
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] checkpoint_parallel_write_pipeline False
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] checkpoint_tag_validation_enabled True
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] checkpoint_tag_validation_fail False
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7a2eccdfa0b0>
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] communication_data_type ...... None
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] compile_config ............... enabled=False backend='inductor' kwargs={}
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] curriculum_enabled_legacy .... False
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] curriculum_params_legacy ..... False
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] data_efficiency_enabled ...... False
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] dataloader_drop_last ......... False
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] disable_allgather ............ False
[2025-01-07 12:54:54,616] [INFO] [config.py:1000:print] dump_state ................... False
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] dynamic_loss_scale_args ...... None
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] eigenvalue_enabled ........... False
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] eigenvalue_gas_boundary_resolution 1
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] eigenvalue_layer_name ........ bert.encoder.layer
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] eigenvalue_layer_num ......... 0
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] eigenvalue_max_iter .......... 100
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] eigenvalue_stability ......... 1e-06
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] eigenvalue_tol ............... 0.01
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] eigenvalue_verbose ........... False
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] elasticity_enabled ........... False
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] flops_profiler_config ........ {
"enabled": false,
"recompute_fwd_factor": 0.0,
"profile_step": 1,
"module_depth": -1,
"top_modules": 1,
"detailed": true,
"output_file": null
}
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] fp16_auto_cast ............... None
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] fp16_enabled ................. False
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] fp16_master_weights_and_gradients False
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] global_rank .................. 0
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] grad_accum_dtype ............. None
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] gradient_accumulation_steps .. 1
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] gradient_clipping ............ 0.0
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] gradient_predivide_factor .... 1.0
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] graph_harvesting ............. False
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] initial_dynamic_scale ........ 1
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] load_universal_checkpoint .... False
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] loss_scale ................... 1.0
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] memory_breakdown ............. False
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] mics_hierarchial_params_gather False
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] mics_shard_size .............. -1
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] nebula_config ................ {
"enabled": false,
"persistent_storage_path": null,
"persistent_time_interval": 100,
"num_of_version_in_retention": 2,
"enable_nebula_load": true,
"load_path": null
}
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] optimizer_legacy_fusion ...... False
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] optimizer_name ............... None
[2025-01-07 12:54:54,617] [INFO] [config.py:1000:print] optimizer_params ............. None
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] pld_enabled .................. False
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] pld_params ................... False
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] prescale_gradients ........... False
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] scheduler_name ............... None
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] scheduler_params ............. None
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] seq_parallel_communication_data_type torch.float32
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] sparse_attention ............. None
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] sparse_gradients_enabled ..... False
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] steps_per_print .............. inf
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] train_batch_size ............. 24
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] train_micro_batch_size_per_gpu 24
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] use_data_before_expert_parallel False
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] use_node_local_storage ....... False
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] wall_clock_breakdown ......... False
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] weight_quantization_config ... None
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] world_size ................... 1
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] zero_allow_untested_optimizer True
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=200000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=200000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] zero_enabled ................. True
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] zero_force_ds_cpu_optimizer .. True
[2025-01-07 12:54:54,618] [INFO] [config.py:1000:print] zero_optimization_stage ...... 2
[2025-01-07 12:54:54,618] [INFO] [config.py:986:print_user_config] json = {
"bf16": {
"enabled": true,
"auto_cast": true
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2.000000e+08,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2.000000e+08,
"contiguous_gradients": true,
"sub_group_size": 1.000000e+09
},
"gradient_accumulation_steps": 1,
"train_batch_size": 24,
"train_micro_batch_size_per_gpu": 24,
"steps_per_print": inf,
"wall_clock_breakdown": false,
"fp16": {
"enabled": false
},
"zero_allow_untested_optimizer": true
}
0it [00:01, ?it/s]
Traceback (most recent call last):
File "/content/drive/MyDrive/time-llm/Time-LLM/run_main.py", line 211, in
outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1852, in forward
loss = self.module(*inputs, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/content/drive/MyDrive/time-llm/Time-LLM/models/TimeLLM.py", line 196, in forward
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
File "/content/drive/MyDrive/time-llm/Time-LLM/models/TimeLLM.py", line 243, in forecast
dec_out = self.llm_model(inputs_embeds=llama_enc_out).last_hidden_state
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 693, in forward
layer_outputs = decoder_layer(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 408, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 330, in forward
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 56.81 MiB is free. Process 171661 has 39.50 GiB memory in use. Of the allocated memory 38.66 GiB is allocated by PyTorch, and 13.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Traceback (most recent call last):
File "/usr/local/bin/accelerate", line 8, in
sys.exit(main())
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/accelerate_cli.py", line 46, in main
args.func(args)
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/launch.py", line 1057, in launch_command
simple_launcher(args)
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/launch.py", line 673, in simple_launcher
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
subprocess.CalledProcessError: Command '['/usr/bin/python3', 'run_main.py', '--task_name', 'long_term_forecast', '--is_training', '1', '--root_path', './dataset/ETT-small/', '--data_path', 'ETTh1.csv', '--model_id', 'ETTh1_512_720', '--model', 'TimeLLM', '--data', 'ETTh1', '--features', 'M', '--seq_len', '512', '--label_len', '48', '--pred_len', '720', '--factor', '3', '--enc_in', '7', '--dec_in', '7', '--c_out', '7', '--des', 'Exp', '--itr', '1', '--d_model', '32', '--d_ff', '128', '--batch_size', '24', '--learning_rate', '0.01', '--llm_layers', '32', '--train_epochs', '100', '--model_comment', 'TimeLLM-ETTh1']' returned non-zero exit status 1.
This is the output I get. From what I've looked into, it is supposed to be a memory problem, but the Colab resource panel shows plenty of memory free, and the GPU RAM gauge even stays at 0/40GB the whole time.
The runtime type is already set to A100 under "Change runtime type".
Please help ㅜㅜ
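For reference, below is a minimal check (my own sketch, not part of TimeLLM_ETTh1.sh) that can be run in a separate Colab cell while the script is training. It lists GPU memory per process, so it shows whether the accelerate-launched python3 process from the OOM message (PID 165528 / 171661, roughly 39.5 GiB in use) is the one actually holding the memory, even while the notebook's own GPU RAM gauge still reads 0/40GB. The second part only exports the allocator setting that the OOM message itself suggests, so that a re-run of the script inherits it; this is a diagnostic sketch, not a confirmed fix.

# Run in another Colab cell while run_main.py is training;
# prints one line per process using the GPU, with its memory footprint.
! nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv

# Optional: the allocator setting suggested by the OOM message, set in the
# notebook environment so the accelerate-launched subprocess inherits it
# the next time the training script is run.
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True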