diff --git a/yamls/hydra-yamls/SD-2-base-256.yaml b/yamls/hydra-yamls/SD-2-base-256.yaml
index b466fa98..d0ad1470 100644
--- a/yamls/hydra-yamls/SD-2-base-256.yaml
+++ b/yamls/hydra-yamls/SD-2-base-256.yaml
@@ -1,8 +1,6 @@
 project: # Insert wandb project name
-batch_size: 2048
-seed: 17
-scale_schedule_ratio: 1.0
 name: # Insert wandb run name
+seed: 17
 eval_first: false
 algorithms:
   low_precision_groupnorm:
@@ -22,15 +20,14 @@ model:
   val_guidance_scales: []
   loss_bins: []
 dataset:
-  train_batch_size: ${batch_size}
-  eval_batch_size: 1024 # Should be 8 per device
+  train_batch_size: 2048 # Global training batch size
+  eval_batch_size: 1024 # Global evaluation batch size
   train_dataset:
     _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader
     remote:
       # Path to object store bucket(s)
     local:
       # Path to corresponding local dataset(s)
-    batch_size: ${batch_size}
     tokenizer_name_or_path: stabilityai/stable-diffusion-2-base
     caption_drop_prob: 0.1
     resize_size: 256
@@ -46,7 +43,6 @@ dataset:
     _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader
     remote: # Path to object store bucket
     local: # Path to local dataset cache
-    batch_size: 8
     resize_size: 256
     prefetch_factor: 2
     num_workers: 8
@@ -57,10 +53,9 @@ optimizer:
   lr: 1.0e-4
   weight_decay: 0.01
 scheduler:
-  _target_: composer.optim.MultiStepWithWarmupScheduler
+  _target_: composer.optim.LinearWithWarmupScheduler
   t_warmup: 10000ba
-  milestones:
-  - 200ep
+  alpha_f: 1.0
 logger:
   wandb:
     _target_: composer.loggers.wandb_logger.WandBLogger
@@ -87,7 +82,6 @@ trainer:
   device_train_microbatch_size: 16
   run_name: ${name}
   seed: ${seed}
-  scale_schedule_ratio: ${scale_schedule_ratio}
   save_folder: # Insert path to save folder or bucket
   save_interval: 10000ba
   save_overwrite: true
diff --git a/yamls/hydra-yamls/SD-2-base-512.yaml b/yamls/hydra-yamls/SD-2-base-512.yaml
index ab254624..39a1e659 100644
--- a/yamls/hydra-yamls/SD-2-base-512.yaml
+++ b/yamls/hydra-yamls/SD-2-base-512.yaml
@@ -1,8 +1,6 @@
 project: # Insert wandb project name
-batch_size: 2048
-seed: 17
-scale_schedule_ratio: 1.0
 name: # Insert wandb run name
+seed: 17
 eval_first: false
 algorithms:
   ema:
@@ -28,15 +26,14 @@ model:
   val_guidance_scales: []
   loss_bins: []
 dataset:
-  train_batch_size: ${batch_size}
-  eval_batch_size: 1024 # Should be 8 per device
+  train_batch_size: 2048 # Global training batch size
+  eval_batch_size: 1024 # Global evaluation batch size
   train_dataset:
     _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader
     remote:
       # Path to object store bucket(s)
     local:
       # Path to corresponding local dataset(s)
-    batch_size: ${batch_size}
     tokenizer_name_or_path: stabilityai/stable-diffusion-2-base
     caption_drop_prob: 0.1
     resize_size: 512
@@ -52,7 +49,6 @@ dataset:
     _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader
     remote: # Path to object store bucket
     local: # Path to local dataset cache
-    batch_size: 8
     resize_size: 512
     prefetch_factor: 2
     num_workers: 8
@@ -63,10 +59,9 @@ optimizer:
   lr: 1.0e-4
   weight_decay: 0.01
 scheduler:
-  _target_: composer.optim.MultiStepWithWarmupScheduler
+  _target_: composer.optim.LinearWithWarmupScheduler
   t_warmup: 10000ba
-  milestones:
-  - 200ep
+  alpha_f: 1.0
 logger:
   wandb:
     _target_: composer.loggers.wandb_logger.WandBLogger
@@ -93,7 +88,6 @@ trainer:
   device_train_microbatch_size: 16
   run_name: ${name}
   seed: ${seed}
-  scale_schedule_ratio: ${scale_schedule_ratio}
   save_folder: # Insert path to save folder or bucket
   save_interval: 10000ba
   save_overwrite: true
diff --git a/yamls/mosaic-yamls/SD-2-base-256.yaml b/yamls/mosaic-yamls/SD-2-base-256.yaml
index 99755738..d6bbe901 100644
--- a/yamls/mosaic-yamls/SD-2-base-256.yaml
+++ b/yamls/mosaic-yamls/SD-2-base-256.yaml
@@ -1,24 +1,30 @@
 run_name: SD2-base-256
-cluster: # Insert cluster here
-gpu_num: # Insert number of GPUs
 image: mosaicml/pytorch_vision:1.13.1_cu117-python3.10-ubuntu20.04
+compute:
+  gpus: # Number of GPUs to use
+
+  ## These configurations are optional
+  # cluster: TODO # Name of the cluster to use for this run
+  # gpu_type: a100_80gb # Type of GPU to use. We use a100_80gb in our experiments
+
 integrations:
 - integration_type: "git_repo"
-  git_repo: mosaicml/diffusion2
+  git_repo: mosaicml/diffusion
   git_branch: main
   pip_install: .[all]
 - integration_type: "wandb"
   project: # Insert wandb project name
   entity: # Insert wandb entity name
+env_variables:
+- key: HYDRA_FULL_ERROR
+  value: '1' # Set to '0' to limit Hydra tracebacks
 command: |
-  cd diffusion2
-  HYDRA_FULL_ERROR=1 composer run.py
+  cd diffusion
+  composer run.py --config-path /mnt/config --config-name parameters
 parameters:
   project: # Insert wandb project name
-  batch_size: 2048
-  seed: 17
-  scale_schedule_ratio: 1.0
   name: # Insert wandb run name
+  seed: 17
   eval_first: false
   algorithms:
     low_precision_groupnorm:
@@ -38,15 +44,14 @@ parameters:
     val_guidance_scales: []
     loss_bins: []
   dataset:
-    train_batch_size: ${batch_size}
-    eval_batch_size: 1024 # Should be 8 per device
+    train_batch_size: 2048 # Global training batch size
+    eval_batch_size: 1024 # Global evaluation batch size
     train_dataset:
       _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader
       remote:
         # Path to object store bucket(s)
       local:
         # Path to corresponding local dataset(s)
-      batch_size: ${batch_size}
       tokenizer_name_or_path: stabilityai/stable-diffusion-2-base
       caption_drop_prob: 0.1
       resize_size: 256
@@ -62,7 +67,6 @@ parameters:
       _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader
       remote: # Path to object store bucket
       local: # Path to local dataset cache
-      batch_size: 8
      resize_size: 256
       prefetch_factor: 2
       num_workers: 8
@@ -73,10 +77,9 @@ parameters:
     lr: 1.0e-4
     weight_decay: 0.01
   scheduler:
-    _target_: composer.optim.MultiStepWithWarmupScheduler
+    _target_: composer.optim.LinearWithWarmupScheduler
     t_warmup: 10000ba
-    milestones:
-    - 200ep
+    alpha_f: 1.0
   logger:
     wandb:
       _target_: composer.loggers.wandb_logger.WandBLogger
@@ -103,7 +106,6 @@ parameters:
     device_train_microbatch_size: 16
     run_name: ${name}
     seed: ${seed}
-    scale_schedule_ratio: ${scale_schedule_ratio}
     save_folder: # Insert path to save folder or bucket
     save_interval: 10000ba
     save_overwrite: true
diff --git a/yamls/mosaic-yamls/SD-2-base-512.yaml b/yamls/mosaic-yamls/SD-2-base-512.yaml
index df82250d..ac26c68a 100644
--- a/yamls/mosaic-yamls/SD-2-base-512.yaml
+++ b/yamls/mosaic-yamls/SD-2-base-512.yaml
@@ -1,7 +1,7 @@
 name: SD2-base-512
 image: mosaicml/pytorch_vision:1.13.1_cu117-python3.10-ubuntu20.04
 compute:
-  gpus: 8 # Number of GPUs to use
+  gpus: # Number of GPUs to use
 
   ## These configurations are optional
   # cluster: TODO # Name of the cluster to use for this run
@@ -9,21 +9,22 @@ compute:
 
 integrations:
 - integration_type: "git_repo"
-  git_repo: mosaicml/diffusion2
+  git_repo: mosaicml/diffusion
   git_branch: main
   pip_install: .[all]
 - integration_type: "wandb"
   project: # Insert wandb project name
   entity: # Insert wandb entity name
+env_variables:
+- key: HYDRA_FULL_ERROR
+  value: '1' # Set to '0' to limit Hydra tracebacks
 command: |
-  cd diffusion2
-  HYDRA_FULL_ERROR=1 composer run.py
+  cd diffusion
+  composer run.py --config-path /mnt/config --config-name parameters
 parameters:
   project: # Insert wandb project name
-  batch_size: 2048
-  seed: 17
-  scale_schedule_ratio: 1.0
   name: # Insert wandb run name
+  seed: 17
   eval_first: false
   algorithms:
     ema:
@@ -49,15 +50,14 @@ parameters:
     val_guidance_scales: []
     loss_bins: []
   dataset:
-    train_batch_size: ${batch_size}
-    eval_batch_size: 1024 # Should be 8 per device
+    train_batch_size: 2048 # Global training batch size
+    eval_batch_size: 1024 # Global evaluation batch size
     train_dataset:
       _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader
      remote:
         # Path to object store bucket(s)
      local:
         # Path to corresponding local dataset(s)
-      batch_size: ${batch_size}
       tokenizer_name_or_path: stabilityai/stable-diffusion-2-base
       caption_drop_prob: 0.1
       resize_size: 512
@@ -73,7 +73,6 @@ parameters:
       _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader
       remote: # Path to object store bucket
       local: # Path to local dataset cache
-      batch_size: 8
       resize_size: 512
       prefetch_factor: 2
       num_workers: 8
@@ -84,10 +83,9 @@ parameters:
     lr: 1.0e-4
     weight_decay: 0.01
   scheduler:
-    _target_: composer.optim.MultiStepWithWarmupScheduler
+    _target_: composer.optim.LinearWithWarmupScheduler
     t_warmup: 10000ba
-    milestones:
-    - 200ep
+    alpha_f: 1.0
   logger:
     wandb:
       _target_: composer.loggers.wandb_logger.WandBLogger
@@ -114,7 +112,6 @@ parameters:
     device_train_microbatch_size: 16
     run_name: ${name}
     seed: ${seed}
-    scale_schedule_ratio: ${scale_schedule_ratio}
     save_folder: # Insert path to save folder or bucket
     save_interval: 10000ba
     save_overwrite: true