#!/usr/bin/env bash
#
# Launches training via `accelerate launch main.py` with a DeepSpeed ZeRO-2
# config, resuming from checkpoint-50000 inside $OUTPUT_DIR.
#
# Edit the exported variables below to match the local machine before running.

# --- NCCL communication settings --------------------------------------------
# InfiniBand HCA/port list, socket interface and debug level used by NCCL.
# These values are machine-specific.
export NCCL_IB_HCA="mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1"
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME=eth1
export NCCL_DEBUG=INFO
export NCCL_NVLS_ENABLE=0

# --- Model / output locations -----------------------------------------------
export TEXT_ENCODER_NAME="google/t5-v1_1-xxl"
export VISION_ENCODER_NAME="google/siglip-so400m-patch14-384"
export OUTPUT_DIR="./checkpoints/aloha_box_into_pot"

# --- Build-related paths ----------------------------------------------------
# NOTE(review): presumably consumed when native extensions are compiled at
# runtime -- confirm against the environment setup.
export CFLAGS="-I/usr/include"
export LDFLAGS="-L/usr/lib/x86_64-linux-gnu"
export CUTLASS_PATH="/home/jellyho/cutlass"

# Weights & Biases project name (training reports to wandb, see --report_to).
export WANDB_PROJECT="RDT"

# Make sure the output directory exists. `-p` also creates missing parent
# directories (e.g. ./checkpoints on a fresh checkout); a plain `mkdir` would
# fail in that case.
if [ ! -d "$OUTPUT_DIR" ]; then
  if mkdir -p -- "$OUTPUT_DIR"; then
    echo "Folder '$OUTPUT_DIR' created"
  else
    echo "Failed to create folder '$OUTPUT_DIR'" >&2
  fi
else
  echo "Folder '$OUTPUT_DIR' already exists"
fi

# To run on a single node/machine:
# accelerate launch main.py \
#     --deepspeed="./configs/zero2.json" \
#     ...

# Optional argument (currently disabled) to start from the pretrained model:
#     --pretrained_model_name_or_path="robotics-diffusion-transformer/rdt-1b" \

accelerate launch main.py \
  --deepspeed="./configs/zero2.json" \
  --pretrained_text_encoder_name_or_path="$TEXT_ENCODER_NAME" \
  --pretrained_vision_encoder_name_or_path="$VISION_ENCODER_NAME" \
  --output_dir="$OUTPUT_DIR" \
  --train_batch_size=8 \
  --sample_batch_size=8 \
  --max_train_steps=100000 \
  --checkpointing_period=5000 \
  --sample_period=500 \
  --checkpoints_total_limit=10 \
  --lr_scheduler="constant" \
  --learning_rate=1e-4 \
  --mixed_precision="bf16" \
  --dataloader_num_workers=8 \
  --dataset_type="pretrain" \
  --report_to=wandb \
  --gradient_accumulation_steps 1 \
  --resume_from_checkpoint="checkpoint-50000"

# To resume training from a different checkpoint, change the last argument
# above, for example:
#     --resume_from_checkpoint="checkpoint-1000"