You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Can someone help me take a look at the errors I have encountered? Here are the error messages I encountered.
Training with 1 GPUs.
Using random seed 0
Make folder logs/example_group/example_name
wandb_scalar_iter: 100
cudnn benchmark: True
cudnn deterministic: False
Setup trainer.
Using random seed 0
Traceback (most recent call last):
File "train.py", line 104, in <module>
main()
File "train.py", line 79, in main
trainer = get_trainer(cfg, is_inference=False, seed=args.seed)
File "/home/intel/neuralangelo/imaginaire/trainers/utils/get_trainer.py", line 32, in get_trainer
trainer = trainer_lib.Trainer(cfg, is_inference=is_inference, seed=seed)
File "/home/intel/neuralangelo/projects/neuralangelo/trainer.py", line 26, in __init__
super().__init__(cfg, is_inference=is_inference, seed=seed)
File "/home/intel/neuralangelo/projects/nerf/trainers/base.py", line 28, in __init__
super().__init__(cfg, is_inference=is_inference, seed=seed)
File "/home/intel/neuralangelo/imaginaire/trainers/base.py", line 50, in __init__
self.model = self.setup_model(cfg, seed=seed)
File "/home/intel/neuralangelo/imaginaire/trainers/base.py", line 116, in setup_model
lib_model = importlib.import_module(cfg.model.type)
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 975, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 671, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 843, in exec_module
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "/home/intel/neuralangelo/projects/neuralangelo/model.py", line 21, in <module>
from projects.neuralangelo.utils.modules import NeuralSDF, NeuralRGB, BackgroundNeRF
File "/home/intel/neuralangelo/projects/neuralangelo/utils/modules.py", line 16, in <module>
import tinycudann as tcnn
ModuleNotFoundError: No module named 'tinycudann'
[2024-03-25 20:11:48,544] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 37238) of binary: /home/intel/miniconda3/envs/neuralangelo/bin/python
Traceback (most recent call last):
File "/home/intel/miniconda3/envs/neuralangelo/bin/torchrun", line 10, in <module>
sys.exit(main())
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
train.py FAILED
Can someone help me take a look at the errors I have encountered? Here are the error messages I encountered.
Training with 1 GPUs.
Using random seed 0
Make folder logs/example_group/example_name
cudnn benchmark: True
cudnn deterministic: False
Setup trainer.
Using random seed 0
Traceback (most recent call last):
File "train.py", line 104, in <module>
main()
File "train.py", line 79, in main
trainer = get_trainer(cfg, is_inference=False, seed=args.seed)
File "/home/intel/neuralangelo/imaginaire/trainers/utils/get_trainer.py", line 32, in get_trainer
trainer = trainer_lib.Trainer(cfg, is_inference=is_inference, seed=seed)
File "/home/intel/neuralangelo/projects/neuralangelo/trainer.py", line 26, in __init__
super().__init__(cfg, is_inference=is_inference, seed=seed)
File "/home/intel/neuralangelo/projects/nerf/trainers/base.py", line 28, in __init__
super().__init__(cfg, is_inference=is_inference, seed=seed)
File "/home/intel/neuralangelo/imaginaire/trainers/base.py", line 50, in __init__
self.model = self.setup_model(cfg, seed=seed)
File "/home/intel/neuralangelo/imaginaire/trainers/base.py", line 116, in setup_model
lib_model = importlib.import_module(cfg.model.type)
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 975, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 671, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 843, in exec_module
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "/home/intel/neuralangelo/projects/neuralangelo/model.py", line 21, in <module>
from projects.neuralangelo.utils.modules import NeuralSDF, NeuralRGB, BackgroundNeRF
File "/home/intel/neuralangelo/projects/neuralangelo/utils/modules.py", line 16, in <module>
import tinycudann as tcnn
ModuleNotFoundError: No module named 'tinycudann'
[2024-03-25 20:11:48,544] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 37238) of binary: /home/intel/miniconda3/envs/neuralangelo/bin/python
Traceback (most recent call last):
File "/home/intel/miniconda3/envs/neuralangelo/bin/torchrun", line 10, in <module>
sys.exit(main())
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
train.py FAILED
Failures:
<NO_OTHER_FAILURES>
Root Cause (first observed failure):
[0]:
time : 2024-03-25_20:11:48
host : intel-MD72-HB1-00
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 37238)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
The text was updated successfully, but these errors were encountered: