Training CIFAR-100 by DeepSpeed

source link: https://donghao.org/2024/01/19/training-cifar-100-by-deepspeed/

The training job is launched with the DeepSpeed launcher, with elastic training enabled so it can run on anywhere from one to two nodes:

deepspeed \
  --master_addr=rogpt1 \
  --elastic_training \
  --min_elastic_nodes=1 \
  --max_elastic_nodes=2 \
  --hostfile=hostfile \
  train.py \
  --deepspeed_config ds_config.json
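The hostfile named in the command lists each node and the number of GPU slots it offers, one node per line in DeepSpeed's "hostname slots=N" format. A minimal sketch, assuming the second node is called rogpt2 and each node exposes a single GPU (both are assumptions; only rogpt1 appears in the original command):

rogpt1 slots=1
rogpt2 slots=1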

Because the two nodes do not necessarily share a disk, checkpoint saving needs some care. There are two options:

  1. Use a shared file system for the cluster (Filestore on GCP, EFS on AWS, or plain NFS) and let only the master node save the checkpoint. The saved checkpoint is then visible to all other nodes through the shared file system.
  2. Or set "use_node_local_storage" to true, so every node saves the checkpoints itself (see the sketch after this list).
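As a rough sketch of what option 2 looks like inside the training script: the engine returned by deepspeed.initialize() exposes save_checkpoint() and load_checkpoint(). The function name, directory, and tag below are placeholders, not code from the article.

def checkpoint_round_trip(model_engine, step):
    # model_engine is the engine returned by deepspeed.initialize().
    # Placeholder path; with "use_node_local_storage": true every node
    # writes this directory on its own local disk.
    ckpt_dir = "checkpoints/cifar100"
    # save_checkpoint() is a collective call, so every rank must reach it,
    # not only rank 0 on the master node.
    model_engine.save_checkpoint(ckpt_dir, tag=f"step_{step}")
    # On a later run, each node restores from its own local copy (or from
    # the shared file system when using option 1).
    load_path, client_state = model_engine.load_checkpoint(ckpt_dir)
    return load_path, client_state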

Here is the ds_config.json. It turns on node-local checkpoint storage, enables elasticity with micro-batch sizes of 64, 128, or 256 under a maximum train batch size of 1024, and configures an Adam optimizer with a WarmupLR scheduler:
{
   "steps_per_print": 2000,
   "checkpoint": {
     "use_node_local_storage": true
   },
   "elasticity": {
     "enabled": true,
     "micro_batch_sizes": [64,128,256],
     "max_train_batch_size": 1024
   },
   "optimizer": {
     "type": "Adam",
     "params": {
       "lr": 0.001,
       "betas": [
         0.8,
         0.999
       ],
       "eps": 1e-8,
       "weight_decay": 3e-7
     }
   },
   "scheduler": {
     "type": "WarmupLR",
     "params": {
       "warmup_min_lr": 0,
       "warmup_max_lr": 0.001,
       "warmup_num_steps": 1000
     }
   },
   "wall_clock_breakdown": false
}
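For context, here is a minimal sketch of what a train.py consuming this config could look like. The model (a ResNet-18 with 100 output classes), the data directory, and the loop structure are assumptions rather than the article's actual script; the optimizer, scheduler, and batch sizes all come from ds_config.json, so only the model and its parameters are handed to deepspeed.initialize():

import argparse

import deepspeed
import torch
import torchvision
import torchvision.transforms as transforms


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int, default=-1)
    parser = deepspeed.add_config_arguments(parser)  # adds --deepspeed_config and friends
    args = parser.parse_args()

    # Assumed model: a stock ResNet-18 resized to CIFAR-100's 100 classes.
    net = torchvision.models.resnet18(num_classes=100)

    train_set = torchvision.datasets.CIFAR100(
        root="./data", train=True, download=True,
        transform=transforms.ToTensor())

    # Optimizer, LR scheduler, and (elastic) batch sizes are read from
    # ds_config.json; DeepSpeed also builds the distributed data loader.
    model_engine, _, train_loader, _ = deepspeed.initialize(
        args=args,
        model=net,
        model_parameters=net.parameters(),
        training_data=train_set)

    criterion = torch.nn.CrossEntropyLoss()
    for step, (images, labels) in enumerate(train_loader):
        images = images.to(model_engine.device)
        labels = labels.to(model_engine.device)
        loss = criterion(model_engine(images), labels)
        model_engine.backward(loss)  # DeepSpeed owns gradient scaling and all-reduce
        model_engine.step()


if __name__ == "__main__":
    main()

Roughly, with the elasticity block enabled, DeepSpeed picks a micro-batch size from [64, 128, 256] and a gradient accumulation factor so that the effective train batch size stays at or below 1024 as the number of nodes changes between one and two.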

January 19, 2024 - 6:12 RobinDong machine learning
DeepSpeed, PyTorch