#!/bin/bash
#
# Launch multi-GPU AMP flow-matching training (4 GPUs, full dataset).
# Verifies model/embedding prerequisites, then runs
# amp_flow_training_multi_gpu.py under torchrun.

# Fail fast: abort on command errors, unset variables, and pipeline failures.
set -euo pipefail

echo "=== Launching Multi-GPU AMP Flow Matching Training with FULL DATA ==="
echo "Using 4 H100 GPUs for distributed training"
echo "Using ALL available peptide embeddings and UniProt data"
echo "EXTENDED TRAINING: 5000 iterations with CFG support"
echo ""
|
|
| |
echo "Checking required files..."

# require_path FLAG PATH HINT
#   FLAG - a test(1) predicate: -f for a regular file, -d for a directory
#   PATH - path that must satisfy FLAG
#   HINT - remediation message printed before exiting
# Prints an error and exits 1 when PATH fails the FLAG test.
# (Replaces three copy-pasted if-blocks; mojibake glyphs in the original
# messages replaced with a readable "ERROR:" prefix.)
require_path() {
  local flag=$1 path=$2 hint=$3
  if [ ! "$flag" "$path" ]; then
    echo "ERROR: Missing $path" >&2
    echo "$hint" >&2
    exit 1
  fi
}

require_path -f "final_compressor_model.pth" "Please run compressor_with_embeddings.py first"
require_path -f "final_decompressor_model.pth" "Please run compressor_with_embeddings.py first"
require_path -d "/data2/edwardsun/flow_project/peptide_embeddings/" "Please run final_sequence_encoder.py first"
|
|
| |
# The combined embeddings file is optional: training can fall back to the
# individual per-peptide embedding files when the aggregate .pt is absent.
# (Garbled warning/success glyphs replaced with WARNING:/OK: prefixes.)
if [ ! -f "/data2/edwardsun/flow_project/peptide_embeddings/all_peptide_embeddings.pt" ]; then
    echo "WARNING: all_peptide_embeddings.pt not found"
    echo "Will use individual embedding files instead"
else
    echo "OK: Found all_peptide_embeddings.pt (4.3GB - ALL peptide data)"
fi
|
|
| |
# Ensure at least one embedding tensor exists. Glob into an array instead of
# parsing `ls` output (ShellCheck SC2012: ls parsing is fragile with unusual
# filenames); nullglob makes an unmatched pattern expand to an empty array.
shopt -s nullglob
embedding_files=(/data2/edwardsun/flow_project/peptide_embeddings/*.pt)
shopt -u nullglob
if [ "${#embedding_files[@]}" -eq 0 ]; then
    echo "ERROR: No .pt files found in /data2/edwardsun/flow_project/peptide_embeddings/ directory" >&2
    echo "Please run final_sequence_encoder.py first" >&2
    exit 1
fi
|
|
echo "OK: All required files found!"
echo ""

# NCCL settings for multi-GPU collective communication:
#   NCCL_DEBUG=INFO     - verbose NCCL logging, useful when debugging collectives
#   NCCL_IB_DISABLE=0   - keep the InfiniBand transport enabled
#   NCCL_P2P_DISABLE=0  - keep direct GPU peer-to-peer transport enabled
export NCCL_DEBUG=INFO
export NCCL_IB_DISABLE=0
export NCCL_P2P_DISABLE=0
|
|
| |
# Announce the launch parameters for this full-data training run.
printf '%s\n' \
  "Starting distributed training with torchrun..." \
  "Configuration (FULL DATA TRAINING):" \
  " - Number of GPUs: 4" \
  " - Batch size per GPU: 64" \
  " - Total batch size: 256" \
  " - Total iterations: 5,000" \
  " - Data: ALL peptide embeddings + ALL UniProt data" \
  " - Estimated time: ~30-45 minutes (4x faster than single GPU)" \
  ""
|
|
| |
# Launch one worker process per GPU on this single node; the training script
# reads rank/world-size from the environment variables torchrun provides.
# Capture the exit status (|| pattern keeps this working under `set -e`) so
# the success banner below is only printed when training actually succeeded.
status=0
torchrun \
    --nproc_per_node=4 \
    --nnodes=1 \
    --node_rank=0 \
    --master_addr=localhost \
    --master_port=29500 \
    amp_flow_training_multi_gpu.py || status=$?

if [ "$status" -ne 0 ]; then
    echo "ERROR: torchrun exited with status $status - training did not complete" >&2
    exit "$status"
fi

echo ""
echo "=== Training Complete with FULL DATA ==="
echo "Check for output files:"
echo " - amp_flow_model_final_full_data.pth (final model with full data)"
echo " - amp_flow_checkpoint_full_data_step_*.pth (checkpoints)"
echo ""
echo "Next steps:"
echo "1. Test the model: python generate_amps.py"
echo "2. If successful, increase iterations for full training"
echo "3. Implement reflow for 1-step generation"
echo "4. Add conditioning for toxicity"