Upload folder using huggingface_hub
- DEPLOY_CHECKLIST.md +42 -97
- app.py +3 -12
- run_transformers_training.py +244 -110
- transformers_config.json +3 -0
DEPLOY_CHECKLIST.md
CHANGED
@@ -1,107 +1,52 @@
-# Phi-4 Training
-
-##
-
-- [ ]
-
-- [ ]
-
-- [ ]
-- [ ]
-- [ ]
-- [ ]
-- [ ]
-
-- [ ]
-
-- [ ]
-- [ ]
-- [ ]
-- [ ] Sequential sampler used in dataloader (no shuffling)
-- [ ] Max sequence length of 2048 applied
-- [ ] Format validation for first few examples enabled
-
-### 4. Dependency Management ✓
-
-- [ ] requirements.txt includes all necessary packages:
-  - [ ] unsloth
-  - [ ] peft
-  - [ ] bitsandbytes
-  - [ ] einops
-  - [ ] sentencepiece
-  - [ ] datasets
-  - [ ] transformers
-- [ ] Optional packages marked as such (e.g., flash-attn)
-- [ ] Dependency version constraints avoid known conflicts
-
-### 5. Error Handling & Logging ✓
-
-- [ ] Proper error catching for dataset loading
-- [ ] Fallback mechanisms for chat template application
-- [ ] Clear, concise log messages that work with HF Space interface
-- [ ] Memory usage tracking at key points (start, end, periodic)
-- [ ] Third-party loggers set to WARNING to reduce noise
-- [ ] Low-verbosity log format for better HF Space compatibility
-
-### 6. Training Setup ✓
-
-- [ ] Number of epochs properly configured (default: 3)
-- [ ] Learning rate appropriate (default: 2e-5)
-- [ ] Warmup ratio set (default: 0.05)
-- [ ] Checkpointing frequency set to reasonable value (default: 100 steps)
-- [ ] Output directory correctly configured
-- [ ] HuggingFace Hub parameters set correctly if pushing models
-
-### 7. Pre-Flight Verification ✓
-
-- [ ] No linting errors or indentation issues
-- [ ] Updated config values are consistent across files
-- [ ] Batch size × gradient accumulation × GPUs gives reasonable total batch
-- [ ] Verified that requirements.txt matches actual imports in code
-- [ ] Confirmed tokenizer settings match the model requirements
 
 ---
 
-##
-
-If you've made any configuration changes, record them here before deployment:
 
 ---
 
-**
-
-**Expected Training Speed**: ~XXX examples/second with current configuration
-
-**Memory Requirements**: Peak usage expected to be ~20GB per GPU
-
-**Common Issues to Watch For**:
-- OOM errors on GPU 0: If seen, reduce batch size by 2 and increase grad accumulation by 1
-- Imbalanced GPU usage: Check device mapping and FSDP configuration
-- Slow training: Verify that all GPUs are being utilized efficiently
-- Log flooding: Reduce verbosity of component logs (transformers, datasets, etc.)
-
----
 
 *Last Updated: 2025-03-09*
+# Phi-4 Training Critical Deployment Checklist
+
+## Essential Configuration Requirements
+
+### 1. Model Configuration
+- [ ] Model name: `unsloth/phi-4-unsloth-bnb-4bit`
+- [ ] BF16 precision enabled, FP16 disabled
+- [ ] Appropriate sequence length (2048)
+- [ ] LoRA parameters correctly configured (r: 32, alpha: 16)
+
+### 2. Hardware & Resource Management
+- [ ] Per-device batch size ≤ 16
+- [ ] Gradient accumulation steps ≥ 3
+- [ ] Gradient checkpointing enabled
+- [ ] Memory usage limits properly set (85% of GPU capacity)
+
+### 3. Critical Dataset Handling Rules
+- [ ] **NO REORDERING of dataset entries** - original order must be preserved
+- [ ] **NO COMBINING of separate entries** - each entry must remain distinct
+- [ ] **SEQUENTIAL PROCESSING required** - entries must be processed one after another
+- [ ] `sort_by_id` and `maintain_paper_order` flags properly set to preserve data sequence
+- [ ] Sequential sampler used with no shuffling (`"shuffle": false`)
+- [ ] Dataset sequential integrity verified with validation samples
+- [ ] Conversation structure preserved (original format maintained)
+
+### 4. Essential Error Handling
+- [ ] Clear error catching for dataset loading issues
+- [ ] Memory tracking at key training points
+- [ ] Low-verbosity logging for HF Space compatibility
+
+### 5. Training Core Requirements
+- [ ] Appropriate learning rate (2e-5)
+- [ ] Proper checkpointing frequency
+- [ ] Hub settings correctly configured for model saving
 
 ---
 
+## Pre-Deployment Verification
 
+| Requirement | Status | Notes |
+|-------------|--------|-------|
+| Data sequential integrity | | Confirm entries processed in order |
+| GPU memory within limits | | Check peak memory doesn't exceed 20GB per GPU |
+| Training batch verification | | Verify first few batches maintain proper order |
 
 ---
 
+**Current Hardware**: 4× NVIDIA L4 GPUs (24GB VRAM each)
 
+**CRITICAL REMINDER**: Data sequence preservation is the highest priority - any shuffling, reordering, or combining of entries will compromise model quality.
 
 *Last Updated: 2025-03-09*
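The Section 3 flags above live in `transformers_config.json`. A minimal pre-flight sketch (a hypothetical helper, not part of this commit) that fails fast if any of them drifts, searching the loaded config recursively since the exact nesting is not reproduced here:

```python
import json

# Hypothetical pre-flight check; the flag names come from the checklist above.
REQUIRED_TRUE = {"sort_by_id", "maintain_paper_order", "preserve_entry_sequence", "sequential_processing"}
REQUIRED_FALSE = {"shuffle"}

def find_flag(node, key):
    """Return the first value found for `key` anywhere in a nested dict/list structure."""
    if isinstance(node, dict):
        if key in node:
            return node[key]
        for value in node.values():
            found = find_flag(value, key)
            if found is not None:
                return found
    elif isinstance(node, list):
        for item in node:
            found = find_flag(item, key)
            if found is not None:
                return found
    return None

with open("transformers_config.json") as f:
    config = json.load(f)

for key in REQUIRED_TRUE:
    assert find_flag(config, key) is True, f"{key} must be true to preserve dataset order"
for key in REQUIRED_FALSE:
    assert find_flag(config, key) is False, f"{key} must be false to preserve dataset order"
print("Sequence-preservation flags look correct.")
```

Running something like this locally before pushing to the Space covers the first row of the verification table above.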
app.py
CHANGED
@@ -109,18 +109,9 @@ def display_config():
 def start_training():
     """Start the training process."""
     try:
-        #
-        log_info("
-
-        try:
-            result = subprocess.run(verify_cmd, shell=True, check=True, capture_output=True, text=True)
-            if "All critical checks passed!" not in result.stdout:
-                log_info("Verification found issues. Please review:")
-                log_info(result.stdout)
-                return "Verification detected potential issues. Please review the logs before proceeding."
-        except subprocess.CalledProcessError as e:
-            log_info(f"Verification failed: {e.stderr}")
-            return "Verification failed. Please check the logs for details."
 
         # Start training
         log_info("Starting training process...")
 def start_training():
     """Start the training process."""
     try:
+        # Log configuration check
+        log_info("Preparing to start training process...")
+        log_info("Using consolidated configuration from transformers_config.json")
 
         # Start training
         log_info("Starting training process...")
run_transformers_training.py
CHANGED
@@ -8,6 +8,14 @@ import argparse
 import logging
 from datetime import datetime
 import time
 
 # Import Unsloth first, before other ML imports
 try:
@@ -19,7 +27,6 @@ except ImportError:
     logger = logging.getLogger(__name__)
     logger.warning("Unsloth not available. Please install with: pip install unsloth")
 
-import torch
 from datasets import load_dataset
 from transformers import (
     AutoModelForCausalLM,
@@ -46,6 +53,9 @@ logging.getLogger("accelerate").setLevel(logging.WARNING)
 logging.getLogger("torch").setLevel(logging.WARNING)
 logging.getLogger("bitsandbytes").setLevel(logging.WARNING)
 
 # Define a clean logging function for HF Space compatibility
 def log_info(message):
     """Log information in a format compatible with Hugging Face Spaces"""
@@ -336,6 +346,45 @@ def load_dataset_with_mapping(dataset_config):
     # Note: Explicitly NOT sorting the dataset to preserve original order
     logger.info("Preserving original dataset order (no sorting)")
 
     # Log examples without printing full content
     if "conversations" in dataset.column_names:
         sample_ids = [example['id'] for example in dataset.select(range(min(5, len(dataset))))]
@@ -532,37 +581,107 @@ class SimpleDataCollator:
 
 class LoggingCallback(TrainerCallback):
     def __init__(self):
         self.last_log_time = time.time()
-        self.
 
     def on_step_end(self, args, state, control, **kwargs):
         # Log every 50 steps or every 5 minutes, whichever comes first
         current_time = time.time()
 
-        #
-        if
 
     def on_train_begin(self, args, state, control, **kwargs):
         log_info("=== Training is starting ===")
 
         # Log important training parameters for visibility
@@ -571,9 +690,9 @@ class LoggingCallback(TrainerCallback):
         log_info(f"Epochs: {args.num_train_epochs}")
 
         # Log memory information in compact format
-        if
             memory_info = []
-            for i in range(
                 allocated = torch.cuda.memory_allocated(i) / 1024**2
                 max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
                 memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
@@ -581,15 +700,18 @@ class LoggingCallback(TrainerCallback):
             log_info(f"Initial memory usage - {', '.join(memory_info)}")
 
     def on_train_end(self, args, state, control, **kwargs):
 
         log_info(f"Total steps: {state.global_step}")
         log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
@@ -627,6 +749,15 @@ def main():
     # Set up logging
     log_info("Starting Phi-4 fine-tuning process")
 
     # Parse arguments
     args = parse_args()
 
@@ -645,64 +776,66 @@ def main():
     else:
         log_info("Running in non-distributed mode (single process)")
 
-    # Load all configurations
     try:
         configs = load_configs(args.config_dir)
 
-        # Extract specific configs
         if not configs:
             logger.error("Failed to load configuration")
             return 1
 
         # Verify configuration sections exist
-        if
             logger.error("transformers_config.json not found or invalid")
             return 1
 
-        if
             logger.warning("Hardware configuration section not found in transformers_config.json. Using default hardware configuration.")
 
-        if
             logger.error("Dataset configuration section not found in transformers_config.json")
             return 1
 
         # Validate model configuration
             logger.error("Model name not specified in configuration")
             logger.error("Please ensure 'name' is specified under 'model' in transformers_config.json")
             return 1
 
-        model_name = model_config.get("model", {}).get("name") or model_config.get("model_name_or_path") or model_config.get("model_name")
         log_info(f"Using model: {model_name}")
         log_info("All configurations loaded successfully")
 
-        # Extract specific configs
-        model_config = configs["transformers"]
-        hardware_config = configs.get("hardware", {})
-        dataset_config = configs["dataset"]
-
         # Apply hardware-specific settings if available
         if hardware_config:
             # Get training optimizations from hardware config
             training_opts = hardware_config.get("training_optimizations", {})
 
             # Apply batch size and gradient accumulation settings
-            if training_opts.get("per_device_batch_size") and
                 batch_size = training_opts.get("per_device_batch_size")
                 log_info(f"Applied hardware-optimized batch size: {batch_size}")
 
-            if training_opts.get("gradient_accumulation_steps") and
                 grad_steps = training_opts.get("gradient_accumulation_steps")
                 log_info(f"Applied hardware-optimized gradient accumulation: {grad_steps}")
 
             # Apply memory optimizations
             memory_opts = training_opts.get("memory_optimizations", {})
-            if memory_opts.get("use_gradient_checkpointing") is not None and
                 grad_ckpt = memory_opts.get("use_gradient_checkpointing")
                 log_info(f"Applied hardware-optimized gradient checkpointing: {grad_ckpt}")
 
             # Apply system settings
@@ -720,38 +853,27 @@ def main():
             return 1
 
         # Set random seed for reproducibility
-        seed =
         set_seed(seed)
         log_info(f"Set random seed to {seed} for reproducibility")
 
-        #
-        if
-            # Empty CUDA cache
             torch.cuda.empty_cache()
 
-            # Get memory fraction from hardware config
-            cuda_memory_fraction = hardware_config.get("system_settings", {}).get("cuda_memory_fraction", 0.85)
-
-            # Log initial memory information in a compact form
-            gpu_info = []
-            for i in range(torch.cuda.device_count()):
-                name = torch.cuda.get_device_name(i)
-                allocated = torch.cuda.memory_allocated(i) / 1024**3
-                total = torch.cuda.get_device_properties(i).total_memory / 1024**3
-                reserved_memory = total * cuda_memory_fraction
-                gpu_info.append(f"GPU {i}: {name} ({allocated:.1f}GB/{reserved_memory:.1f}GB)")
-
-            log_info(f"Hardware: {torch.cuda.device_count()} GPUs detected")
-            log_info(f"GPU details: {', '.join(gpu_info)}")
-        else:
-            log_info("No GPU detected, using CPU (training will be very slow)")
 
         try:
             log_info("Loading model and tokenizer...")
-            model, tokenizer = load_model_and_tokenizer(
             log_info("Model and tokenizer loaded successfully")
 
             # Load dataset with proper mapping
@@ -781,25 +903,21 @@ def main():
                 log_info("Using FP16 precision from hardware config")
             else:
                 # Fall back to transformers config
-                use_bf16 =
-                use_fp16 =
                 log_info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
 
             # Get per device batch size - from transformers config, but possibly overridden by hardware config
-            per_device_batch_size =
-            gradient_accumulation_steps =
 
             # For multi-GPU setup, adjust for better balance
-            if
-                log_info(f"Multi-GPU setup
-                log_info(f"Training config: {per_device_batch_size} samples/GPU × {gradient_accumulation_steps} accumulation steps")
-
-            # Determine multi-GPU strategy from hardware config
-            multi_gpu_strategy = hardware_config.get("training_optimizations", {}).get("multi_gpu_strategy", "data_parallel")
 
             # Set up FSDP for multi-GPU training if specified and in distributed mode
             fsdp_config = None
-            if multi_gpu_strategy == "fsdp" and is_distributed and
                 try:
                     from torch.distributed.fsdp import (
                         FullyShardedDataParallel as FSDP,
@@ -845,33 +963,33 @@ def main():
             # Set up training arguments
             log_info("Setting up training arguments")
             training_args = TrainingArguments(
-                output_dir=
-                num_train_epochs=
                 per_device_train_batch_size=per_device_batch_size,
                 gradient_accumulation_steps=gradient_accumulation_steps,
-                learning_rate=
-                weight_decay=
-                warmup_ratio=
-                lr_scheduler_type=
-                logging_steps=
-                save_strategy=
-                save_steps=
-                save_total_limit=
                 fp16=use_fp16,
                 bf16=use_bf16,
-                max_grad_norm=
-                push_to_hub=
-                hub_model_id=
                 hub_token=os.environ.get("HF_TOKEN", None),
                 report_to="tensorboard",
                 remove_unused_columns=False,  # Keep all columns
-                gradient_checkpointing=
                 dataloader_pin_memory=pin_memory,
-                optim=
                 ddp_find_unused_parameters=False,  # Improve distributed training efficiency
                 dataloader_drop_last=False,  # Process all examples
                 dataloader_num_workers=dataloader_workers,
-                no_cuda=False if
                 # Only add FSDP if we're in distributed mode with FSDP strategy
                 fsdp=fsdp_config if is_distributed and multi_gpu_strategy == "fsdp" else None,
             )
@@ -894,11 +1012,27 @@ def main():
                 """Custom dataloader that preserves original dataset order"""
                 log_info("Creating sequential dataloader to maintain original dataset order")
 
                 # Calculate batch size based on device availability
                 if getattr(training_args, "no_cuda", False):
                     batch_size = training_args.per_device_train_batch_size
                 else:
-                    batch_size = max(training_args.per_device_train_batch_size * max(1,
 
                 log_info(f"Using sequential sampler with batch size {batch_size}")
 
@@ -920,12 +1054,12 @@ def main():
             log_info("=== Starting Training ===")
             try:
                 # Empty cache again right before training
-                if
                     torch.cuda.empty_cache()
                     log_info("Cleared CUDA cache before training")
 
                 # Display compact training info
-                total_steps = int(len(dataset) / (per_device_batch_size *
                 log_info(f"Training plan: {len(dataset)} examples over {training_args.num_train_epochs} epochs ≈ {total_steps} steps")
 
                 trainer.train()
@@ -937,8 +1071,8 @@ def main():
                 log_info(f"Model saved to {training_args.output_dir}")
 
                 # Push to hub if enabled
-                if
-                    hub_id =
                     log_info(f"Pushing model to Hugging Face Hub as {hub_id}...")
                     trainer.push_to_hub()
                     log_info("Model successfully pushed to Hub")
@@ -947,9 +1081,9 @@ def main():
             except Exception as e:
                 logger.error(f"Training failed with error: {str(e)}")
                 # Log CUDA memory info if available in compact format
-                if
                     memory_info = []
-                    for i in range(
                         allocated = torch.cuda.memory_allocated(i) / 1024**2
                         reserved = torch.cuda.memory_reserved(i) / 1024**2
                         max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
@@ -8,6 +8,14 @@ import argparse
 import logging
 from datetime import datetime
 import time
+import warnings
+import torch
+from importlib.util import find_spec
+
+# Global variables for hardware detection
+CUDA_AVAILABLE = torch.cuda.is_available()
+NUM_GPUS = torch.cuda.device_count() if CUDA_AVAILABLE else 0
+DEVICE_TYPE = "cuda" if CUDA_AVAILABLE else "cpu"
 
 # Import Unsloth first, before other ML imports
 try:
@@ -19,7 +27,6 @@ except ImportError:
     logger = logging.getLogger(__name__)
     logger.warning("Unsloth not available. Please install with: pip install unsloth")
 
 from datasets import load_dataset
 from transformers import (
     AutoModelForCausalLM,
@@ -46,6 +53,9 @@ logging.getLogger("accelerate").setLevel(logging.WARNING)
 logging.getLogger("torch").setLevel(logging.WARNING)
 logging.getLogger("bitsandbytes").setLevel(logging.WARNING)
 
+# Check availability of libraries
+peft_available = find_spec("peft") is not None
+
 # Define a clean logging function for HF Space compatibility
 def log_info(message):
     """Log information in a format compatible with Hugging Face Spaces"""
@@ -336,6 +346,45 @@ def load_dataset_with_mapping(dataset_config):
     # Note: Explicitly NOT sorting the dataset to preserve original order
     logger.info("Preserving original dataset order (no sorting)")
 
+    # Check data ordering requirements
+    processing_config = dataset_config.get("dataset", {}).get("processing", {})
+    data_loading_config = dataset_config.get("data_loading", {})
+
+    # Flag consolidation - we only need one flag to control sequence preservation
+    # Default to True to ensure safety
+    preserve_sequence = processing_config.get("preserve_entry_sequence", True)
+    shuffle_disabled = not data_loading_config.get("shuffle", False)
+
+    if not preserve_sequence:
+        logger.warning("CRITICAL: preserve_entry_sequence is set to False. This is NOT RECOMMENDED!")
+        logger.warning("Data sequence integrity is essential for proper model training.")
+
+    if not shuffle_disabled:
+        logger.error("CRITICAL: shuffle is enabled in the dataset config!")
+        logger.error("This will RANDOMIZE your dataset and break sequential order.")
+        logger.error("Please set shuffle: false in your data_loading configuration.")
+        # Actually enforce sequence preservation by raising an error
+        raise ValueError("Dataset shuffling is enabled but preserve_entry_sequence is required. " +
+                         "Please disable shuffling in your configuration.")
+
+    # Verify the IDs are in sequential order if they're numeric
+    try:
+        if len(dataset) > 1 and all(isinstance(example.get('id', ''), (int, str)) for example in dataset.select(range(min(10, len(dataset))))):
+            sample_ids = [example['id'] for example in dataset.select(range(min(10, len(dataset))))]
+            logger.info(f"Verifying sequential integrity with first few IDs: {sample_ids}")
+
+            # Check if IDs are numeric and ordered
+            if all(isinstance(id, int) or id.isdigit() for id in sample_ids):
+                numeric_ids = [int(id) if isinstance(id, str) else id for id in sample_ids]
+                is_ordered = all(numeric_ids[i] <= numeric_ids[i+1] for i in range(len(numeric_ids)-1))
+                if not is_ordered:
+                    logger.warning("WARNING: Sample IDs are not in sequential order.")
+                    logger.warning("This may indicate that data sequence is not preserved.")
+                else:
+                    logger.info("Sample IDs appear to be in sequential order.")
+    except Exception as e:
+        logger.warning(f"Could not verify sequential integrity: {e}")
+
     # Log examples without printing full content
     if "conversations" in dataset.column_names:
         sample_ids = [example['id'] for example in dataset.select(range(min(5, len(dataset))))]
@@ -532,37 +581,107 @@ class SimpleDataCollator:
 
 class LoggingCallback(TrainerCallback):
     def __init__(self):
+        super().__init__()
+        self.training_started = time.time()
         self.last_log_time = time.time()
+        self.last_step = 0
+        self.verify_sequence = None
+        self.sequence_samples = None
+        self.sample_indices = None
 
     def on_step_end(self, args, state, control, **kwargs):
        # Log every 50 steps or every 5 minutes, whichever comes first
        current_time = time.time()
 
+        # Perform actual sequence integrity verification if enabled
+        if self.verify_sequence is True and state.global_step % 100 == 0 and self.sequence_samples:
+            try:
+                # Get a batch of data without disturbing the training
+                batch = next(iter(trainer.get_train_dataloader()))
+                if 'input_ids' in batch and 'labels' in batch:
+                    log_info("Verifying data sequence integrity...")
+
+                    # Check if we can access some of our reference samples
+                    current_indices = list(range(min(3, len(trainer.train_dataset))))
+                    current_samples = [trainer.train_dataset[i] for i in current_indices]
+
+                    # Compare current samples with our reference samples from training start
+                    is_sequence_maintained = True
+                    for i, (orig_idx, orig_sample) in enumerate(zip(self.sample_indices, self.sequence_samples)):
+                        # Check if sample IDs still match our reference
+                        if orig_idx < len(current_samples):
+                            current_sample = current_samples[i]
+
+                            # Compare IDs if available
+                            if 'id' in orig_sample and 'id' in current_sample:
+                                if orig_sample['id'] != current_sample['id']:
+                                    log_info(f"WARNING: Sequence integrity compromised! Sample {i} ID changed from {orig_sample['id']} to {current_sample['id']}")
+                                    is_sequence_maintained = False
+
+                            # Compare input fingerprints
+                            if 'conversations' in orig_sample and 'conversations' in current_sample:
+                                orig_len = len(orig_sample['conversations'])
+                                curr_len = len(current_sample['conversations'])
+                                if orig_len != curr_len:
+                                    log_info(f"WARNING: Sequence integrity compromised! Sample {i} conversation length changed from {orig_len} to {curr_len}")
+                                    is_sequence_maintained = False
+
+                    if is_sequence_maintained:
+                        log_info("Data sequence integrity check: OK")
+                    else:
+                        log_info("CRITICAL WARNING: Data sequence integrity check FAILED!")
+            except Exception as e:
+                log_info(f"Warning: Couldn't verify sequence integrity: {e}")
 
+        time_interval = current_time - self.last_log_time
+        step_interval = state.global_step - self.last_step
+
+        if step_interval >= 50 or time_interval >= 300:  # 5 minutes = 300 seconds
+            # Calculate throughput
+            examples_per_second = step_interval * args.per_device_train_batch_size * args.gradient_accumulation_steps / max(time_interval, 1e-6)
+
+            elapsed_total = time.strftime("%H:%M:%S", time.gmtime(current_time - self.training_started))
+
+            # Log progress
+            log_info(f"Step: {state.global_step}, Loss: {state.log_history[-1]['loss']:.4f}, "
+                     f"Rate: {examples_per_second:.2f} examples/sec, Elapsed: {elapsed_total}")
 
+            # Report memory usage if CUDA is available
+            if CUDA_AVAILABLE:
+                log_info(f"GPU Memory: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB allocated, "
+                         f"{torch.cuda.max_memory_reserved() / 1024**3:.2f} GB reserved")
+
+            # Reset for next interval
+            self.last_log_time = current_time
+            self.last_step = state.global_step
+
     def on_train_begin(self, args, state, control, **kwargs):
+        log_info(f"=== Training started at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")
+        log_info(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")
+
+        # Set up sequence verification with actual sample capturing
+        try:
+            self.verify_sequence = dataset_config.get("validation", {}).get("verify_sequence_integrity", False)
+            if self.verify_sequence:
+                log_info("Sequence integrity verification enabled during training")
+
+                # Save actual samples for later verification
+                if trainer and trainer.train_dataset:
+                    # Get some reference samples from the beginning of the dataset
+                    self.sample_indices = list(range(min(5, len(trainer.train_dataset))))
+                    self.sequence_samples = [trainer.train_dataset[i] for i in self.sample_indices]
+                    log_info(f"Captured {len(self.sequence_samples)} reference samples for sequence integrity verification")
+
+                    # Log sample IDs for debugging
+                    if len(self.sequence_samples) > 0 and 'id' in self.sequence_samples[0]:
+                        sample_ids = [s.get('id') for s in self.sequence_samples if 'id' in s]
+                        log_info(f"Reference sample IDs: {sample_ids}")
+                else:
+                    log_info("Warning: Could not capture reference samples - verification will be limited")
+        except Exception as e:
+            log_info(f"Warning: Could not set up sequence integrity verification: {e}")
+            self.verify_sequence = False
+
         log_info("=== Training is starting ===")
 
         # Log important training parameters for visibility
@@ -571,9 +690,9 @@ class LoggingCallback(TrainerCallback):
         log_info(f"Epochs: {args.num_train_epochs}")
 
         # Log memory information in compact format
+        if CUDA_AVAILABLE:
             memory_info = []
+            for i in range(NUM_GPUS):
                 allocated = torch.cuda.memory_allocated(i) / 1024**2
                 max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
                 memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
@@ -581,15 +700,18 @@ class LoggingCallback(TrainerCallback):
             log_info(f"Initial memory usage - {', '.join(memory_info)}")
 
     def on_train_end(self, args, state, control, **kwargs):
+        training_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - self.training_started))
+        log_info(f"=== Training completed in {training_time} ===")
+
+        # Log final memory usage
+        if CUDA_AVAILABLE:
+            for i in range(NUM_GPUS):
+                max_mem = torch.cuda.max_memory_allocated(i) / 1024**3  # GB
+                log_info(f"GPU {i} max memory: {max_mem:.2f} GB")
 
+            # Clear GPU memory
+            torch.cuda.empty_cache()
+            log_info("GPU memory cleared")
 
         log_info(f"Total steps: {state.global_step}")
         log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
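To make the callback's rate line concrete, here is the same arithmetic worked through with the batch settings from the checklist; the 50-step and 300-second figures are just the logging thresholds used above, and the numbers are illustrative rather than from a real run.

```python
# Illustrative only: the callback's throughput formula with example numbers.
step_interval = 50                    # steps since the last log line
per_device_train_batch_size = 16      # per-device batch size from the checklist
gradient_accumulation_steps = 3
time_interval = 300.0                 # seconds since the last log line

examples_per_second = (step_interval * per_device_train_batch_size *
                       gradient_accumulation_steps) / max(time_interval, 1e-6)
print(f"{examples_per_second:.2f} examples/sec")  # 8.00; no GPU-count factor, so this is per device
```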
@@ -627,6 +749,15 @@ def main():
     # Set up logging
     log_info("Starting Phi-4 fine-tuning process")
 
+    # Log hardware information
+    log_info(f"Hardware detection: CUDA {'available' if CUDA_AVAILABLE else 'not available'}")
+    if CUDA_AVAILABLE:
+        log_info(f"Found {NUM_GPUS} GPUs")
+        for i in range(NUM_GPUS):
+            log_info(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
+    else:
+        log_info("Running on CPU (training will be very slow)")
+
     # Parse arguments
     args = parse_args()
 
@@ -645,64 +776,66 @@ def main():
     else:
         log_info("Running in non-distributed mode (single process)")
 
+    # Load all configurations - do this once
     try:
         configs = load_configs(args.config_dir)
 
+        # Extract specific configs immediately after loading
         if not configs:
             logger.error("Failed to load configuration")
             return 1
+
+        # Store configurations in clear variables
+        transformers_config = configs.get("transformers", {})
+        hardware_config = configs.get("hardware", {})
+        dataset_config = configs.get("dataset", {})
 
         # Verify configuration sections exist
+        if not transformers_config:
             logger.error("transformers_config.json not found or invalid")
             return 1
 
+        if not hardware_config:
             logger.warning("Hardware configuration section not found in transformers_config.json. Using default hardware configuration.")
 
+        if not dataset_config:
             logger.error("Dataset configuration section not found in transformers_config.json")
             return 1
 
         # Validate model configuration
+        model_name = (transformers_config.get("model", {}).get("name") or
+                      transformers_config.get("model_name_or_path") or
+                      transformers_config.get("model_name"))
+
+        if not model_name:
             logger.error("Model name not specified in configuration")
             logger.error("Please ensure 'name' is specified under 'model' in transformers_config.json")
             return 1
 
         log_info(f"Using model: {model_name}")
         log_info("All configurations loaded successfully")
 
         # Apply hardware-specific settings if available
         if hardware_config:
             # Get training optimizations from hardware config
             training_opts = hardware_config.get("training_optimizations", {})
 
             # Apply batch size and gradient accumulation settings
+            if training_opts.get("per_device_batch_size") and transformers_config.get("training"):
                 batch_size = training_opts.get("per_device_batch_size")
+                transformers_config["training"]["per_device_train_batch_size"] = batch_size
                 log_info(f"Applied hardware-optimized batch size: {batch_size}")
 
+            if training_opts.get("gradient_accumulation_steps") and transformers_config.get("training"):
                 grad_steps = training_opts.get("gradient_accumulation_steps")
+                transformers_config["training"]["gradient_accumulation_steps"] = grad_steps
                 log_info(f"Applied hardware-optimized gradient accumulation: {grad_steps}")
 
             # Apply memory optimizations
             memory_opts = training_opts.get("memory_optimizations", {})
+            if memory_opts.get("use_gradient_checkpointing") is not None and transformers_config.get("training"):
                 grad_ckpt = memory_opts.get("use_gradient_checkpointing")
+                transformers_config["training"]["gradient_checkpointing"] = grad_ckpt
                 log_info(f"Applied hardware-optimized gradient checkpointing: {grad_ckpt}")
 
             # Apply system settings
@@ -720,38 +853,27 @@ def main():
             return 1
 
         # Set random seed for reproducibility
+        seed = transformers_config.get("seed", 42)
         set_seed(seed)
         log_info(f"Set random seed to {seed} for reproducibility")
 
+        # Empty CUDA cache to ensure clean state
+        if CUDA_AVAILABLE:
             torch.cuda.empty_cache()
+            log_info("Cleared CUDA cache")
+
+        # Setup environment variable for CUDA memory allocation
+        if CUDA_AVAILABLE:
+            system_settings = hardware_config.get("system_settings", {})
+            cuda_memory_fraction = system_settings.get("cuda_memory_fraction", 0.85)
 
+            if cuda_memory_fraction < 1.0:
+                os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
+                log_info("Set CUDA memory allocation limit to expandable with max_split_size_mb:128")
 
         try:
             log_info("Loading model and tokenizer...")
+            model, tokenizer = load_model_and_tokenizer(transformers_config)
             log_info("Model and tokenizer loaded successfully")
 
             # Load dataset with proper mapping
@@ -781,25 +903,21 @@ def main():
                 log_info("Using FP16 precision from hardware config")
             else:
                 # Fall back to transformers config
+                use_bf16 = transformers_config.get("bf16", False) or transformers_config.get("torch_dtype", "") == "bfloat16"
+                use_fp16 = transformers_config.get("fp16", False) and not use_bf16  # Only use fp16 if bf16 is not set
                 log_info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
 
             # Get per device batch size - from transformers config, but possibly overridden by hardware config
+            per_device_batch_size = transformers_config.get("training", {}).get("per_device_train_batch_size", 16)
+            gradient_accumulation_steps = transformers_config.get("training", {}).get("gradient_accumulation_steps", 3)
 
             # For multi-GPU setup, adjust for better balance
+            if CUDA_AVAILABLE and NUM_GPUS > 1:
+                log_info(f"Multi-GPU setup: Adjusting for {NUM_GPUS} GPUs")
 
             # Set up FSDP for multi-GPU training if specified and in distributed mode
             fsdp_config = None
+            if multi_gpu_strategy == "fsdp" and is_distributed and NUM_GPUS > 1:
                 try:
                     from torch.distributed.fsdp import (
                         FullyShardedDataParallel as FSDP,
@@ -845,33 +963,33 @@ def main():
             # Set up training arguments
             log_info("Setting up training arguments")
             training_args = TrainingArguments(
+                output_dir=transformers_config.get("output_dir", "./results") or transformers_config.get("checkpointing", {}).get("output_dir", "./results"),
+                num_train_epochs=transformers_config.get("training", {}).get("num_train_epochs", 3),
                 per_device_train_batch_size=per_device_batch_size,
                 gradient_accumulation_steps=gradient_accumulation_steps,
+                learning_rate=transformers_config.get("training", {}).get("learning_rate", 2e-5),
+                weight_decay=transformers_config.get("training", {}).get("weight_decay", 0.01),
+                warmup_ratio=transformers_config.get("training", {}).get("warmup_ratio", 0.05),
+                lr_scheduler_type=transformers_config.get("training", {}).get("lr_scheduler_type", "cosine"),
+                logging_steps=transformers_config.get("training", {}).get("logging_steps", 10),
+                save_strategy=transformers_config.get("checkpointing", {}).get("save_strategy", "steps"),
+                save_steps=transformers_config.get("checkpointing", {}).get("save_steps", 100),
+                save_total_limit=transformers_config.get("checkpointing", {}).get("save_total_limit", 3),
                 fp16=use_fp16,
                 bf16=use_bf16,
+                max_grad_norm=transformers_config.get("training", {}).get("max_grad_norm", 1.0),
+                push_to_hub=transformers_config.get("huggingface_hub", {}).get("push_to_hub", False),
+                hub_model_id=transformers_config.get("huggingface_hub", {}).get("hub_model_id", None),
                 hub_token=os.environ.get("HF_TOKEN", None),
                 report_to="tensorboard",
                 remove_unused_columns=False,  # Keep all columns
+                gradient_checkpointing=transformers_config.get("training", {}).get("gradient_checkpointing", True),
                 dataloader_pin_memory=pin_memory,
+                optim=transformers_config.get("training", {}).get("optim", "adamw_torch"),
                 ddp_find_unused_parameters=False,  # Improve distributed training efficiency
                 dataloader_drop_last=False,  # Process all examples
                 dataloader_num_workers=dataloader_workers,
+                no_cuda=False if CUDA_AVAILABLE else True,  # Use CUDA if available
                 # Only add FSDP if we're in distributed mode with FSDP strategy
                 fsdp=fsdp_config if is_distributed and multi_gpu_strategy == "fsdp" else None,
             )
@@ -894,11 +1012,27 @@ def main():
                 """Custom dataloader that preserves original dataset order"""
                 log_info("Creating sequential dataloader to maintain original dataset order")
 
+                # Verification of sequence preservation flags - consolidated
+                data_loading_config = dataset_config.get("data_loading", {})
+                sequential_processing = data_loading_config.get("sequential_processing", True)
+                shuffle_disabled = not data_loading_config.get("shuffle", False)
+
+                if not sequential_processing:
+                    log_info("CRITICAL WARNING: sequential_processing flag is disabled! This may affect data order.")
+                    log_info("Data sequence integrity is essential - using sequential sampler regardless of flag.")
+                    # Force sequential processing regardless of flag
+
+                if not shuffle_disabled:
+                    log_info("CRITICAL ERROR: Shuffle is not disabled! This will randomize data entry order!")
+                    # Actually handle the error rather than just logging it
+                    raise ValueError("Dataset shuffling is enabled but sequential processing is required. " +
+                                     "Please disable shuffling in your configuration.")
+
                 # Calculate batch size based on device availability
                 if getattr(training_args, "no_cuda", False):
                     batch_size = training_args.per_device_train_batch_size
                 else:
+                    batch_size = max(training_args.per_device_train_batch_size * max(1, NUM_GPUS), 1)
 
                 log_info(f"Using sequential sampler with batch size {batch_size}")
 
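The body of the custom dataloader itself is elided from this hunk; below is a minimal sketch of the pattern it describes, assuming a plain PyTorch `DataLoader` driven by `SequentialSampler` (the function name and its arguments are illustrative, not the actual implementation in this commit).

```python
from torch.utils.data import DataLoader, SequentialSampler

def build_sequential_dataloader(train_dataset, batch_size, data_collator, num_workers=4, pin_memory=True):
    """Illustrative sketch: iterate the dataset strictly in its original order."""
    sampler = SequentialSampler(train_dataset)  # yields indices 0, 1, 2, ... with no shuffling
    return DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=sampler,          # mutually exclusive with shuffle=True
        collate_fn=data_collator,
        drop_last=False,          # keep every example, mirroring dataloader_drop_last=False
        num_workers=num_workers,
        pin_memory=pin_memory,
    )
```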
@@ -920,12 +1054,12 @@ def main():
             log_info("=== Starting Training ===")
             try:
                 # Empty cache again right before training
+                if CUDA_AVAILABLE:
                     torch.cuda.empty_cache()
                     log_info("Cleared CUDA cache before training")
 
                 # Display compact training info
+                total_steps = int(len(dataset) / (per_device_batch_size * NUM_GPUS * gradient_accumulation_steps) * training_args.num_train_epochs)
                 log_info(f"Training plan: {len(dataset)} examples over {training_args.num_train_epochs} epochs ≈ {total_steps} steps")
 
                 trainer.train()
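As a sanity check on that estimate, the same formula with the checklist's hardware numbers and a made-up 10,000-example dataset:

```python
# Illustrative arithmetic only; the dataset size here is invented.
dataset_len = 10_000
per_device_batch_size = 16
num_gpus = 4
gradient_accumulation_steps = 3
num_train_epochs = 3

effective_batch = per_device_batch_size * num_gpus * gradient_accumulation_steps  # 192 examples per optimizer step
total_steps = int(dataset_len / effective_batch * num_train_epochs)               # ~156 steps
print(effective_batch, total_steps)
```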
@@ -937,8 +1071,8 @@ def main():
                 log_info(f"Model saved to {training_args.output_dir}")
 
                 # Push to hub if enabled
+                if transformers_config.get("huggingface_hub", {}).get("push_to_hub", False):
+                    hub_id = transformers_config.get("huggingface_hub", {}).get("hub_model_id", "model")
                     log_info(f"Pushing model to Hugging Face Hub as {hub_id}...")
                     trainer.push_to_hub()
                     log_info("Model successfully pushed to Hub")
@@ -947,9 +1081,9 @@ def main():
             except Exception as e:
                 logger.error(f"Training failed with error: {str(e)}")
                 # Log CUDA memory info if available in compact format
+                if CUDA_AVAILABLE:
                     memory_info = []
+                    for i in range(NUM_GPUS):
                         allocated = torch.cuda.memory_allocated(i) / 1024**2
                         reserved = torch.cuda.memory_reserved(i) / 1024**2
                         max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
transformers_config.json
CHANGED
@@ -139,6 +139,7 @@
       "processing": {
         "sort_by_id": true,
         "maintain_paper_order": true,
         "max_seq_length": 2048
       }
     },
@@ -159,6 +160,7 @@
     "data_loading": {
       "batch_size": 24,
       "shuffle": false,
       "drop_last": false,
       "num_workers": 4,
       "pin_memory": true,
@@ -167,6 +169,7 @@
     "validation": {
       "log_samples": 3,
       "log_interval": 50,
       "metrics": ["processed", "skipped", "avg_tokens", "unique_papers"]
     }
   }
@@ -139,6 +139,7 @@
       "processing": {
         "sort_by_id": true,
         "maintain_paper_order": true,
+        "preserve_entry_sequence": true,
         "max_seq_length": 2048
       }
     },
@@ -159,6 +160,7 @@
     "data_loading": {
       "batch_size": 24,
       "shuffle": false,
+      "sequential_processing": true,
       "drop_last": false,
       "num_workers": 4,
       "pin_memory": true,
@@ -167,6 +169,7 @@
     "validation": {
       "log_samples": 3,
       "log_interval": 50,
+      "verify_sequence_integrity": true,
      "metrics": ["processed", "skipped", "avg_tokens", "unique_papers"]
     }
   }
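For reference, the three new keys are consumed at different levels of the dataset config; a small sketch using the accessor paths shown in the run_transformers_training.py hunks above (the literal dict here is only a stand-in for the loaded section, and its outer nesting is an assumption):

```python
# Sketch only: how the new flags are read; the dict stands in for the dataset config section.
dataset_config = {
    "dataset": {"processing": {"sort_by_id": True, "maintain_paper_order": True,
                               "preserve_entry_sequence": True, "max_seq_length": 2048}},
    "data_loading": {"batch_size": 24, "shuffle": False, "sequential_processing": True},
    "validation": {"log_samples": 3, "log_interval": 50, "verify_sequence_integrity": True},
}

preserve_sequence = dataset_config["dataset"]["processing"].get("preserve_entry_sequence", True)
sequential_processing = dataset_config["data_loading"].get("sequential_processing", True)
verify_sequence = dataset_config["validation"].get("verify_sequence_integrity", False)
assert preserve_sequence and sequential_processing and verify_sequence
print("All sequence-preservation flags are enabled.")
```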