# =============================================================================
# LTX-2 Audio-Video LoRA Training Configuration
# =============================================================================
#
# This configuration is for training LoRA adapters on the LTX-2 model for
# text-to-video generation. It supports both video-only and joint audio-video
# training modes.
#
# Use this configuration when you want to:
# - Fine-tune LTX-2 on your own video dataset
# - Train with or without audio generation
# - Create custom video generation styles or audiovisual concepts
#
# Dataset structure for text-to-video training:
#   preprocessed_data_root/
#   ├── latents/         # Video latents (VAE-encoded videos)
#   ├── conditions/      # Text embeddings for each video
#   └── audio_latents/   # Audio latents (only if with_audio: true)
#
# =============================================================================
# -----------------------------------------------------------------------------
# Model Configuration
# -----------------------------------------------------------------------------
# Specifies the base model to fine-tune and the training mode.
model:
  # Path to the LTX-2 model checkpoint (.safetensors file)
  # This should be a local path to your downloaded model
  model_path: "path/to/ltx-2-model.safetensors"

  # Path to the text encoder model directory
  # For LTX-2, this is typically the Gemma-based text encoder
  text_encoder_path: "path/to/gemma-text-encoder"

  # Training mode: "lora" for efficient adapter training, "full" for full fine-tuning
  # LoRA is recommended for most use cases (faster, less memory, less prone to overfitting)
  training_mode: "lora"

  # Optional: Path to resume training from a checkpoint
  # Can be a checkpoint file (.safetensors) or a directory (uses the latest checkpoint)
  load_checkpoint: null
# -----------------------------------------------------------------------------
# LoRA Configuration
# -----------------------------------------------------------------------------
# Controls the Low-Rank Adaptation parameters for efficient fine-tuning.
lora:
  # Rank of the LoRA matrices (higher = more capacity but more parameters)
  # Typical values: 8, 16, 32, 64. Start with 32 for general fine-tuning.
  rank: 32

  # Alpha scaling factor (usually set equal to rank)
  # The effective scaling is alpha/rank, so alpha=rank means scaling of 1.0
  alpha: 32

  # Dropout probability for LoRA layers (0.0 = no dropout)
  # Can help with regularization if overfitting occurs
  dropout: 0.0

  # Which transformer modules to apply LoRA to
  # The LTX-2 transformer has separate attention and FFN blocks for video and audio:
  #
  # VIDEO MODULES:
  #   - attn1.to_k, attn1.to_q, attn1.to_v, attn1.to_out.0 (video self-attention)
  #   - attn2.to_k, attn2.to_q, attn2.to_v, attn2.to_out.0 (video cross-attention to text)
  #   - ff.net.0.proj, ff.net.2 (video feed-forward)
  #
  # AUDIO MODULES:
  #   - audio_attn1.to_k, audio_attn1.to_q, audio_attn1.to_v, audio_attn1.to_out.0 (audio self-attention)
  #   - audio_attn2.to_k, audio_attn2.to_q, audio_attn2.to_v, audio_attn2.to_out.0 (audio cross-attention to text)
  #   - audio_ff.net.0.proj, audio_ff.net.2 (audio feed-forward)
  #
  # AUDIO-VIDEO CROSS-ATTENTION MODULES (for cross-modal interaction):
  #   - audio_to_video_attn.to_k, audio_to_video_attn.to_q, audio_to_video_attn.to_v, audio_to_video_attn.to_out.0
  #     (Q from video, K/V from audio - allows video to attend to audio features)
  #   - video_to_audio_attn.to_k, video_to_audio_attn.to_q, video_to_audio_attn.to_v, video_to_audio_attn.to_out.0
  #     (Q from audio, K/V from video - allows audio to attend to video features)
  #
  # A short pattern such as "to_k" matches ALL attention modules (video, audio, and cross-modal).
  # For audio-video training, this is the recommended approach.
  target_modules:
    # Attention layers (matches both video and audio branches)
    - "to_k"
    - "to_q"
    - "to_v"
    - "to_out.0"
    # Uncomment below to also train feed-forward layers (can increase the LoRA's capacity):
    # - "ff.net.0.proj"
    # - "ff.net.2"
    # - "audio_ff.net.0.proj"
    # - "audio_ff.net.2"
# -----------------------------------------------------------------------------
# Training Strategy Configuration
# -----------------------------------------------------------------------------
# Defines the text-to-video training approach.
training_strategy:
  # Strategy name: "text_to_video" for standard text-to-video training
  name: "text_to_video"

  # Probability of conditioning on the first frame during training
  # Higher values train the model to perform better in image-to-video (I2V) mode,
  # where a clean first frame is provided and the model generates the rest of the video
  first_frame_conditioning_p: 0.5

  # Enable joint audio-video training
  # Set to true if your dataset includes audio and you want to train the audio branch
  with_audio: true

  # Directory name (within preprocessed_data_root) containing audio latents
  # Only used when with_audio is true
  audio_latents_dir: "audio_latents"
# -----------------------------------------------------------------------------
# Optimization Configuration
# -----------------------------------------------------------------------------
# Controls the training optimization parameters.
optimization:
  # Learning rate for the optimizer
  # Typical range for LoRA: 1e-5 to 1e-4
  # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML) read
  # a bare "1e-4" as a string, whereas "1.0e-4" is a float under YAML 1.1 and 1.2.
  learning_rate: 1.0e-4

  # Total number of training steps
  steps: 2000

  # Batch size per GPU
  # Reduce if running out of memory
  batch_size: 1

  # Number of gradient accumulation steps
  # Effective batch size = batch_size * gradient_accumulation_steps * num_gpus
  gradient_accumulation_steps: 1

  # Maximum gradient norm for clipping (helps training stability)
  max_grad_norm: 1.0

  # Optimizer type: "adamw" (standard) or "adamw8bit" (memory-efficient)
  optimizer_type: "adamw"

  # Learning rate scheduler type
  # Options: "constant", "linear", "cosine", "cosine_with_restarts", "polynomial"
  scheduler_type: "linear"

  # Additional scheduler parameters (depends on scheduler_type)
  scheduler_params: {}

  # Enable gradient checkpointing to reduce memory usage
  # Recommended for training with limited GPU memory
  enable_gradient_checkpointing: true
# -----------------------------------------------------------------------------
# Acceleration Configuration
# -----------------------------------------------------------------------------
# Hardware acceleration and memory optimization settings.
acceleration:
  # Mixed precision training mode
  # Options: "no" (fp32), "fp16" (half precision), "bf16" (bfloat16, recommended)
  mixed_precision_mode: "bf16"

  # Model quantization for reduced memory usage
  # Options: null (none), "int8-quanto", "int4-quanto", "int2-quanto", "fp8-quanto", "fp8uz-quanto"
  quantization: null

  # Load text encoder in 8-bit precision to save memory
  # Useful when GPU memory is limited
  load_text_encoder_in_8bit: false
# -----------------------------------------------------------------------------
# Data Configuration
# -----------------------------------------------------------------------------
# Specifies the training data location and loading parameters.
data:
  # Root directory containing preprocessed training data
  # Should contain: latents/, conditions/, and optionally audio_latents/
  preprocessed_data_root: "/path/to/preprocessed/data"

  # Number of worker processes for data loading
  # Workers load data in parallel to speed up training input
  num_dataloader_workers: 2
# -----------------------------------------------------------------------------
# Validation Configuration
# -----------------------------------------------------------------------------
# Controls validation video generation during training.
validation:
  # Text prompts for validation video generation
  # Provide prompts representative of your training data
  # LTX-2 prefers longer, detailed prompts that describe both visual content and audio
  prompts:
    - "A woman with long brown hair sits at a wooden desk in a cozy home office, typing on a laptop while occasionally glancing at notes beside her. Soft natural light streams through a large window, casting warm shadows across the room. She pauses to take a sip from a ceramic mug, then continues working with focused concentration. The audio captures the gentle clicking of keyboard keys, the soft rustle of papers, and ambient room tone with occasional distant bird chirps from outside."
    - "A chef in a white uniform stands in a professional kitchen, carefully plating a gourmet dish with precise movements. Steam rises from freshly cooked vegetables as he arranges them with tweezers. The stainless steel surfaces gleam under bright overhead lights, and various pots simmer on the stove behind him. The audio features the sizzling of pans, the clinking of utensils against plates, and the ambient hum of kitchen ventilation."

  # Negative prompt to avoid unwanted artifacts
  negative_prompt: "worst quality, inconsistent motion, blurry, jittery, distorted"

  # Optional: First frame images for image-to-video validation
  # If provided, must have one image per prompt
  images: null

  # Output video dimensions [width, height, frames]
  # Width and height must be divisible by 32
  # Frames must satisfy: frames % 8 == 1 (e.g., 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, ...)
  video_dims: [576, 576, 89]

  # Frame rate for generated videos
  frame_rate: 25.0

  # Random seed for reproducible validation outputs
  seed: 42

  # Number of denoising steps for validation inference
  # Higher values = better quality but slower generation
  inference_steps: 30

  # Generate validation videos every N training steps
  # Set to null to disable validation during training
  interval: 100

  # Number of videos to generate per prompt
  videos_per_prompt: 1

  # Classifier-free guidance scale
  # Higher values = stronger adherence to prompt but may introduce artifacts
  guidance_scale: 3.0

  # STG (Spatio-Temporal Guidance) parameters for improved video quality
  # STG is combined with CFG for better temporal coherence
  stg_scale: 1.0  # Recommended: 1.0 (0.0 disables STG)
  stg_blocks: [29]  # Recommended: single block 29
  stg_mode: "stg_av"  # "stg_av" perturbs both audio and video, "stg_v" video only

  # Whether to generate audio in validation samples
  # Independent of training_strategy.with_audio - you can generate audio
  # in validation even when not training the audio branch
  generate_audio: true

  # Skip validation at the beginning of training (step 0)
  skip_initial_validation: false
# -----------------------------------------------------------------------------
# Checkpoint Configuration
# -----------------------------------------------------------------------------
# Controls model checkpoint saving during training.
checkpoints:
  # Save a checkpoint every N steps
  # Set to null to disable intermediate checkpoints
  interval: 250

  # Number of most recent checkpoints to keep
  # Set to -1 to keep all checkpoints
  keep_last_n: -1
# -----------------------------------------------------------------------------
# Flow Matching Configuration
# -----------------------------------------------------------------------------
# Parameters for the flow matching training objective.
flow_matching:
  # Timestep sampling mode
  # "shifted_logit_normal" is recommended for LTX-2 models
  timestep_sampling_mode: "shifted_logit_normal"

  # Additional parameters for timestep sampling
  timestep_sampling_params: {}
# -----------------------------------------------------------------------------
# Hugging Face Hub Configuration
# -----------------------------------------------------------------------------
# Settings for uploading trained models to the Hugging Face Hub.
hub:
  # Whether to push the trained model to the Hub
  push_to_hub: false

  # Repository ID on Hugging Face Hub (e.g., "username/my-lora-model")
  # Required if push_to_hub is true
  hub_model_id: null
# -----------------------------------------------------------------------------
# Weights & Biases Configuration
# -----------------------------------------------------------------------------
# Settings for experiment tracking with W&B.
wandb:
  # Enable W&B logging
  enabled: false

  # W&B project name
  project: "ltx-2-trainer"

  # W&B username or team (null uses default account)
  entity: null

  # Tags to help organize runs
  tags: ["ltx2", "lora"]

  # Log validation videos to W&B
  log_validation_videos: true
# -----------------------------------------------------------------------------
# General Configuration
# -----------------------------------------------------------------------------
# Global settings for the training run.

# Random seed for reproducibility
seed: 42

# Directory to save outputs (checkpoints, validation videos, logs)
output_dir: "outputs/ltx2_av_lora"