Hello.
I'm a PyTorch beginner who is still getting used to the WSL environment.
I'm running an i7-14700K with an RTX 4090, and I keep hitting the following error:
RuntimeError Traceback (most recent call last)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:44, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
43 return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
---> 44 return trainer_fn(*args, **kwargs)
46 except _TunerExitException:
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:580, in Trainer._fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
574 ckpt_path = self._checkpoint_connector._select_ckpt_path(
575 self.state.fn,
576 ckpt_path,
577 model_provided=True,
578 model_connected=self.lightning_module is not None,
579 )
--> 580 self._run(model, ckpt_path=ckpt_path)
582 assert self.state.stopped
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:989, in Trainer._run(self, model, ckpt_path)
986 # ----------------------------
987 # RUN THE TRAINER
988 # ----------------------------
--> 989 results = self._run_stage()
991 # ----------------------------
992 # POST-Training CLEAN UP
993 # ----------------------------
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:1035, in Trainer._run_stage(self)
1034 with torch.autograd.set_detect_anomaly(self._detect_anomaly):
-> 1035 self.fit_loop.run()
1036 return None
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:202, in _FitLoop.run(self)
201 self.on_advance_start()
--> 202 self.advance()
203 self.on_advance_end()
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:359, in _FitLoop.advance(self)
358 assert self._data_fetcher is not None
--> 359 self.epoch_loop.run(self._data_fetcher)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/loops/training_epoch_loop.py:136, in _TrainingEpochLoop.run(self, data_fetcher)
135 try:
--> 136 self.advance(data_fetcher)
137 self.on_advance_end(data_fetcher)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/loops/training_epoch_loop.py:240, in _TrainingEpochLoop.advance(self, data_fetcher)
238 if trainer.lightning_module.automatic_optimization:
239 # in automatic optimization, there can only be one optimizer
--> 240 batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
241 else:
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py:187, in _AutomaticOptimization.run(self, optimizer, batch_idx, kwargs)
182 # ------------------------------
183 # BACKWARD PASS
184 # ------------------------------
185 # gradient update with accumulated gradients
186 else:
--> 187 self._optimizer_step(batch_idx, closure)
189 result = closure.consume_result()
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py:265, in _AutomaticOptimization._optimizer_step(self, batch_idx, train_step_and_backward_closure)
264 # model hook
--> 265 call._call_lightning_module_hook(
266 trainer,
267 "optimizer_step",
268 trainer.current_epoch,
269 batch_idx,
270 optimizer,
271 train_step_and_backward_closure,
272 )
274 if not should_accumulate:
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:157, in _call_lightning_module_hook(trainer, hook_name, pl_module, *args, **kwargs)
156 with trainer.profiler.profile(f"[LightningModule]{pl_module.__class__.__name__}.{hook_name}"):
--> 157 output = fn(*args, **kwargs)
159 # restore current_fx when nested context
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/core/module.py:1291, in LightningModule.optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure)
1259 r"""Override this method to adjust the default way the :class:~lightning.pytorch.trainer.trainer.Trainer
calls
1260 the optimizer.
1261
(...)
1289
1290 """
-> 1291 optimizer.step(closure=optimizer_closure)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/core/optimizer.py:151, in LightningOptimizer.step(self, closure, **kwargs)
150 assert self._strategy is not None
--> 151 step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
153 self._on_after_step()
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py:230, in Strategy.optimizer_step(self, optimizer, closure, model, **kwargs)
229 assert isinstance(model, pl.LightningModule)
--> 230 return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/amp.py:74, in MixedPrecision.optimizer_step(self, optimizer, model, closure, **kwargs)
72 if self.scaler is None:
73 # skip scaler logic, as bfloat16 does not require scaler
---> 74 return super().optimizer_step(optimizer, model=model, closure=closure, **kwargs)
75 if isinstance(optimizer, LBFGS):
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/precision.py:117, in Precision.optimizer_step(self, optimizer, model, closure, **kwargs)
116 closure = partial(self._wrap_closure, model, optimizer, closure)
--> 117 return optimizer.step(closure=closure, **kwargs)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/torch/optim/optimizer.py:373, in Optimizer.profile_hook_step.<locals>.wrapper(*args, **kwargs)
369 raise RuntimeError(
370 f"{func} must return None or a tuple of (new_args, new_kwargs), but got {result}."
371 )
--> 373 out = func(*args, **kwargs)
374 self._optimizer_step_code()
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/torch/optim/optimizer.py:76, in _use_grad_for_differentiable.<locals>._use_grad(self, *args, **kwargs)
75 torch._dynamo.graph_break()
---> 76 ret = func(self, *args, **kwargs)
77 finally:
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/torch/optim/adamw.py:161, in AdamW.step(self, closure)
160 with torch.enable_grad():
--> 161 loss = closure()
163 for group in self.param_groups:
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/precision.py:104, in Precision._wrap_closure(self, model, optimizer, closure)
97 """This double-closure allows makes sure the closure
is executed before the on_before_optimizer_step
98 hook is called.
99
(...)
102
103 """
--> 104 closure_result = closure()
105 self._after_closure(model, optimizer)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py:140, in Closure.__call__(self, *args, **kwargs)
139 def __call__(self, *args: Any, **kwargs: Any) -> Optional[Tensor]:
--> 140 self._result = self.closure(*args, **kwargs)
141 return self._result.loss
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py:135, in Closure.closure(self, *args, **kwargs)
134 if self._backward_fn is not None and step_output.closure_loss is not None:
--> 135 self._backward_fn(step_output.closure_loss)
137 return step_output
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py:236, in _AutomaticOptimization._make_backward_fn.<locals>.backward_fn(loss)
235 def backward_fn(loss: Tensor) -> None:
--> 236 call._call_strategy_hook(self.trainer, "backward", loss, optimizer)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:309, in _call_strategy_hook(trainer, hook_name, *args, **kwargs)
308 with trainer.profiler.profile(f"[Strategy]{trainer.strategy.__class__.__name__}.{hook_name}"):
--> 309 output = fn(*args, **kwargs)
311 # restore current_fx when nested context
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py:204, in Strategy.backward(self, closure_loss, optimizer, *args, **kwargs)
202 closure_loss = self.precision_plugin.pre_backward(closure_loss, self.lightning_module)
--> 204 self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)
206 closure_loss = self.precision_plugin.post_backward(closure_loss, self.lightning_module)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/precision.py:69, in Precision.backward(self, tensor, model, optimizer, *args, **kwargs)
58 r"""Performs the actual backpropagation.
59
60 Args:
(...)
67
68 """
---> 69 model.backward(tensor, *args, **kwargs)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/core/module.py:1078, in LightningModule.backward(self, loss, *args, **kwargs)
1077 else:
-> 1078 loss.backward(*args, **kwargs)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/torch/_tensor.py:492, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
483 return handle_torch_function(
484 Tensor.backward,
485 (self,),
(...)
490 inputs=inputs,
491 )
--> 492 torch.autograd.backward(
493 self, gradient, retain_graph, create_graph, inputs=inputs
494 )
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/torch/autograd/__init__.py:251, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
248 # The reason we repeat the same comment below is that
249 # some Python versions print out the first line of a multi-line function
250 # calls in the traceback and some print out the last line
--> 251 Variable.execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
252 tensors,
253 grad_tensors,
254 retain_graph,
255 create_graph,
256 inputs,
257 allow_unreachable=True,
258 accumulate_grad=True,
259 )
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
Cell In[36], line 1
----> 1 trainer.fit(lit_jigsaw_electra, train_dataloader, val_dataloader)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:544, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
542 self.state.status = TrainerStatus.RUNNING
543 self.training = True
--> 544 call._call_and_handle_interrupt(
545 self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
546 )
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:68, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
66 for logger in trainer.loggers:
67 logger.finalize("failed")
---> 68 trainer._teardown()
69 # teardown might access the stage so we reset it after
70 trainer.state.stage = None
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:1012, in Trainer._teardown(self)
1009 def _teardown(self) -> None:
1010 """This is the Trainer's internal teardown, unrelated to the teardown
hooks in LightningModule and Callback;
1011 those are handled by :meth:_call_teardown_hook
."""
-> 1012 self.strategy.teardown()
1013 loop = self._active_loop
1014 # loop should never be None here but it can because we don't know the trainer stage with ddp_spawn
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py:524, in Strategy.teardown(self)
518 def teardown(self) -> None:
519 """This method is called to teardown the training process.
520
521 It is the right place to release memory and free other resources.
522
523 """
--> 524 _optimizers_to_device(self.optimizers, torch.device("cpu"))
526 if self.lightning_module is not None:
527 log.debug(f"{self.__class__.__name__}: moving model to CPU")
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/fabric/utilities/optimizer.py:28, in _optimizers_to_device(optimizers, device)
26 """Moves optimizer states for a sequence of optimizers to the device."""
27 for opt in optimizers:
---> 28 _optimizer_to_device(opt, device)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/fabric/utilities/optimizer.py:34, in _optimizer_to_device(optimizer, device)
32 """Moves the state of a single optimizer to the device."""
33 for p, v in optimizer.state.items():
---> 34 optimizer.state[p] = apply_to_collection(v, Tensor, move_data_to_device, device, allow_frozen=True)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning_utilities/core/apply_func.py:52, in apply_to_collection(data, dtype, function, wrong_dtype, include_none, allow_frozen, *args, **kwargs)
33 """Recursively applies a function to all elements of a certain dtype.
34
35 Args:
(...)
48
49 """
50 if include_none is False or wrong_dtype is not None or allow_frozen is True:
51 # not worth implementing these on the fast path: go with the slower option
---> 52 return _apply_to_collection_slow(
53 data,
54 dtype,
55 function,
56 *args,
57 wrong_dtype=wrong_dtype,
58 include_none=include_none,
59 allow_frozen=allow_frozen,
60 **kwargs,
61 )
62 # fast path for the most common cases:
63 if isinstance(data, dtype): # single element
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning_utilities/core/apply_func.py:104, in _apply_to_collection_slow(data, dtype, function, wrong_dtype, include_none, allow_frozen, *args, **kwargs)
102 out = []
103 for k, v in data.items():
--> 104 v = _apply_to_collection_slow(
105 v,
106 dtype,
107 function,
108 *args,
109 wrong_dtype=wrong_dtype,
110 include_none=include_none,
111 allow_frozen=allow_frozen,
112 **kwargs,
113 )
114 if include_none or v is not None:
115 out.append((k, v))
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning_utilities/core/apply_func.py:96, in _apply_to_collection_slow(data, dtype, function, wrong_dtype, include_none, allow_frozen, *args, **kwargs)
84 def _apply_to_collection_slow(
85 data: Any,
86 dtype: Union[type, Any, Tuple[Union[type, Any]]],
(...)
93 ) -> Any:
94 # Breaking condition
95 if isinstance(data, dtype) and (wrong_dtype is None or not isinstance(data, wrong_dtype)):
---> 96 return function(data, *args, **kwargs)
98 elem_type = type(data)
100 # Recursively apply to collection items
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/fabric/utilities/apply_func.py:102, in move_data_to_device(batch, device)
99 # user wrongly implemented the `_TransferableDataType` and forgot to return `self`.
100 return data
--> 102 return apply_to_collection(batch, dtype=_TransferableDataType, function=batch_to)
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning_utilities/core/apply_func.py:64, in apply_to_collection(data, dtype, function, wrong_dtype, include_none, allow_frozen, *args, **kwargs)
62 # fast path for the most common cases:
63 if isinstance(data, dtype): # single element
---> 64 return function(data, *args, **kwargs)
65 if data.__class__ is list and all(isinstance(x, dtype) for x in data): # 1d homogeneous list
66 return [function(x, *args, **kwargs) for x in data]
File ~/anaconda3/envs/py11/lib/python3.11/site-packages/lightning/fabric/utilities/apply_func.py:96, in move_data_to_device.<locals>.batch_to(data)
94 if isinstance(data, Tensor) and isinstance(device, torch.device) and device.type not in _BLOCKING_DEVICE_TYPES:
95 kwargs["non_blocking"] = True
---> 96 data_output = data.to(device, **kwargs)
97 if data_output is not None:
98 return data_output
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
That's the error. At first I assumed it was a memory problem,
so I figured the batch size might be the issue and cut it way down, but the same error message still appears,
and training stops unpredictably partway through, which is driving me crazy.
I know this is a PyTorch Lightning setup, but since the cause isn't clear, I'm asking here.
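For reference, this is roughly how I plan to rerun it with synchronous kernel launches, as the error message itself suggests (CUDA_LAUNCH_BLOCKING=1). The model and dataloader objects are the same ones from my notebook, shown here only as placeholders:

import os

# Must be set before anything initializes CUDA (i.e., at the very top of the notebook),
# so the reported stack trace points at the kernel that actually faults
# instead of a later, unrelated API call.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch

# ... build lit_jigsaw_electra, train_dataloader, val_dataloader and the Trainer as before ...
trainer.fit(lit_jigsaw_electra, train_dataloader, val_dataloader)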
One more question: I'm working in WSL Ubuntu so I can use multiprocessing. Regarding how CUDA should be set up under WSL,
is installing CUDA 12.1.0 on Windows and also installing 12.1.0 inside WSL Ubuntu the correct approach?
I'm currently using torch 2.1.2.
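For what it's worth, this is the quick sanity check I run inside WSL to see which CUDA build torch is actually using (just a check, not part of the failing run):

import torch

print(torch.__version__)              # 2.1.2 in my case
print(torch.version.cuda)             # CUDA version the installed torch wheel was built against
print(torch.cuda.is_available())      # whether WSL can see the GPU at all
print(torch.cuda.get_device_name(0))  # should report the RTX 4090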
Occasionally I also get
RuntimeError: CUDA error: misaligned address
and I'm wondering whether the CUDA version is the problem.
I'm on an RTX 4090, and the graphics driver is up to date.