Distributed training with Optimum Habana

Optimum Habana lets you run training across several HPUs. There are two ways to launch a distributed run: with the gaudi_spawn.py script, or directly from Python with the DistributedRunner class.

To launch a distributed run with the gaudi_spawn.py script:
python gaudi_spawn.py \
    --world_size number_of_hpu_you_have --use_mpi \
    path_to_script.py --args1 --args2 ... --argsN

where --world_size is the number of HPUs to use and --args1 through --argsN are the arguments of the training script itself.
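For a concrete illustration, launching a hypothetical training script examples/run_training.py on the 8 HPUs of a single node could look like the following (the script path and its arguments are placeholders for your own script, not part of Optimum Habana):

python gaudi_spawn.py \
    --world_size 8 --use_mpi \
    examples/run_training.py --output_dir /tmp/distributed_run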
Alternatively, the same kind of job can be launched from Python with the DistributedRunner class:

from optimum.habana.distributed import DistributedRunner
from optimum.utils import logging

world_size = 8  # Number of HPUs to use (1 or 8)
# define distributed runner
distributed_runner = DistributedRunner(
    command_list=["scripts/train.py --args1 --args2 ... --argsN"],
    world_size=world_size,
    use_mpi=True,
)
# start job
ret_code = distributed_runner.run()
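If you want the launching process to reflect the outcome of the distributed job, a minimal sketch could check the returned value, assuming the usual convention that a nonzero return code signals failure (this convention is an assumption, not stated above):

import sys

# Exit with the job's return code so a scheduler or CI run can detect failures.
# Assumes a nonzero ret_code indicates failure; adjust if the runner reports differently.
if ret_code:
    sys.exit(ret_code)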