linoy
inital commit
ebfc6b3
# =============================================================================
# LTX-2 Audio-Video LoRA Training Configuration
# =============================================================================
#
# This configuration is for training LoRA adapters on the LTX-2 model for
# text-to-video generation. It supports both video-only and joint audio-video
# training modes.
#
# Use this configuration when you want to:
# - Fine-tune LTX-2 on your own video dataset
# - Train with or without audio generation
# - Create custom video generation styles or audiovisual concepts
#
# Dataset structure for text-to-video training:
#   preprocessed_data_root/
#   ├── latents/          # Video latents (VAE-encoded videos)
#   ├── conditions/       # Text embeddings for each video
#   └── audio_latents/    # Audio latents (only if with_audio: true)
#
# =============================================================================
# -----------------------------------------------------------------------------
# Model Configuration
# -----------------------------------------------------------------------------
# Specifies the base model to fine-tune and the training mode.
model:
  # Local path to the downloaded LTX-2 checkpoint (.safetensors file).
  model_path: "path/to/ltx-2-model.safetensors"

  # Directory of the text encoder model.
  # For LTX-2 this is typically the Gemma-based text encoder.
  text_encoder_path: "path/to/gemma-text-encoder"

  # "lora" = efficient adapter training, "full" = full fine-tuning.
  # LoRA is the recommended choice for most use cases
  # (faster, less memory, prevents overfitting).
  training_mode: "lora"

  # Optional checkpoint to resume training from: either a checkpoint file
  # (.safetensors) or a directory, in which case its latest checkpoint is used.
  load_checkpoint: null
# -----------------------------------------------------------------------------
# LoRA Configuration
# -----------------------------------------------------------------------------
# Controls the Low-Rank Adaptation parameters for efficient fine-tuning.
lora:
  # LoRA matrix rank: a higher rank gives more capacity at the cost of more
  # parameters. Typical values are 8, 16, 32, 64; 32 is a good starting point
  # for general fine-tuning.
  rank: 32

  # Alpha scaling factor, usually set equal to rank. The effective scaling is
  # alpha / rank, so alpha == rank yields a scaling of 1.0.
  alpha: 32

  # Dropout probability on the LoRA layers (0.0 = no dropout).
  # A non-zero value can help regularize if overfitting occurs.
  dropout: 0.0

  # Transformer modules that receive LoRA adapters.
  # The LTX-2 transformer has separate attention and FFN blocks for video and audio:
  #
  # VIDEO MODULES:
  #   - attn1.to_k, attn1.to_q, attn1.to_v, attn1.to_out.0  (video self-attention)
  #   - attn2.to_k, attn2.to_q, attn2.to_v, attn2.to_out.0  (video cross-attention to text)
  #   - ff.net.0.proj, ff.net.2                             (video feed-forward)
  #
  # AUDIO MODULES:
  #   - audio_attn1.to_k, audio_attn1.to_q, audio_attn1.to_v, audio_attn1.to_out.0  (audio self-attention)
  #   - audio_attn2.to_k, audio_attn2.to_q, audio_attn2.to_v, audio_attn2.to_out.0  (audio cross-attention to text)
  #   - audio_ff.net.0.proj, audio_ff.net.2                                         (audio feed-forward)
  #
  # AUDIO-VIDEO CROSS-ATTENTION MODULES (for cross-modal interaction):
  #   - audio_to_video_attn.to_k, audio_to_video_attn.to_q, audio_to_video_attn.to_v, audio_to_video_attn.to_out.0
  #     (Q from video, K/V from audio - allows video to attend to audio features)
  #   - video_to_audio_attn.to_k, video_to_audio_attn.to_q, video_to_audio_attn.to_v, video_to_audio_attn.to_out.0
  #     (Q from audio, K/V from video - allows audio to attend to video features)
  #
  # A short pattern such as "to_k" matches ALL attention modules (video, audio,
  # and cross-modal); this is the recommended approach for audio-video training.
  target_modules:
    # Attention layers (each pattern matches both the video and audio branches)
    - "to_k"
    - "to_q"
    - "to_v"
    - "to_out.0"
    # To also train the feed-forward layers (can increase the LoRA's capacity),
    # uncomment the entries below:
    # - "ff.net.0.proj"
    # - "ff.net.2"
    # - "audio_ff.net.0.proj"
    # - "audio_ff.net.2"
# -----------------------------------------------------------------------------
# Training Strategy Configuration
# -----------------------------------------------------------------------------
# Defines the text-to-video training approach.
training_strategy:
  # Strategy to run; "text_to_video" is the standard text-to-video training.
  name: "text_to_video"

  # Probability of conditioning on the first frame during training.
  # Raise it to make the model perform better in image-to-video (I2V) mode,
  # where a clean first frame is provided and the model generates the rest
  # of the video.
  first_frame_conditioning_p: 0.5

  # Joint audio-video training switch. Set to true when the dataset includes
  # audio and the audio branch should be trained.
  with_audio: true

  # Name of the directory (inside preprocessed_data_root) that holds the audio
  # latents. Only used when with_audio is true.
  audio_latents_dir: "audio_latents"
# -----------------------------------------------------------------------------
# Optimization Configuration
# -----------------------------------------------------------------------------
# Controls the training optimization parameters.
optimization:
  # Optimizer learning rate. Typical range for LoRA: 1e-5 to 1e-4.
  # Written with an explicit ".0" mantissa because YAML 1.1 loaders such as
  # PyYAML resolve a bare `1e-4` to the string "1e-4" rather than a float.
  learning_rate: 1.0e-4

  # Total number of training steps.
  steps: 2000

  # Batch size per GPU. Reduce it if you run out of memory.
  batch_size: 1

  # Number of gradient accumulation steps.
  # Effective batch size = batch_size * gradient_accumulation_steps * num_gpus
  gradient_accumulation_steps: 1

  # Maximum gradient norm for clipping (helps training stability).
  max_grad_norm: 1.0

  # Optimizer type: "adamw" (standard) or "adamw8bit" (memory-efficient).
  optimizer_type: "adamw"

  # Learning rate scheduler type.
  # Options: "constant", "linear", "cosine", "cosine_with_restarts", "polynomial"
  scheduler_type: "linear"

  # Extra scheduler parameters; which ones apply depends on scheduler_type.
  scheduler_params: {}

  # Gradient checkpointing reduces memory usage.
  # Recommended when training with limited GPU memory.
  enable_gradient_checkpointing: true
# -----------------------------------------------------------------------------
# Acceleration Configuration
# -----------------------------------------------------------------------------
# Hardware acceleration and memory optimization settings.
acceleration:
  # Mixed precision training mode.
  # Options: "no" (fp32), "fp16" (half precision), "bf16" (bfloat16, recommended)
  mixed_precision_mode: "bf16"

  # Model quantization for reduced memory usage.
  # Options: null (none), "int8-quanto", "int4-quanto", "int2-quanto",
  #          "fp8-quanto", "fp8uz-quanto"
  quantization: null

  # Load the text encoder in 8-bit precision to save memory;
  # useful when GPU memory is limited.
  load_text_encoder_in_8bit: false
# -----------------------------------------------------------------------------
# Data Configuration
# -----------------------------------------------------------------------------
# Specifies the training data location and loading parameters.
data:
  # Root directory of the preprocessed training data.
  # Expected contents: latents/, conditions/, and optionally audio_latents/
  preprocessed_data_root: "/path/to/preprocessed/data"

  # Number of worker processes that load data in parallel.
  num_dataloader_workers: 2
# -----------------------------------------------------------------------------
# Validation Configuration
# -----------------------------------------------------------------------------
# Controls validation video generation during training.
validation:
  # Text prompts for validation video generation. Provide prompts that are
  # representative of your training data. LTX-2 prefers longer, detailed
  # prompts that describe both visual content and audio.
  # Each prompt is a folded scalar (`>-`): the line breaks below are joined
  # with single spaces, so every prompt is loaded as one single-line string.
  prompts:
    - >-
      A woman with long brown hair sits at a wooden desk in a cozy home office,
      typing on a laptop while occasionally glancing at notes beside her. Soft
      natural light streams through a large window, casting warm shadows across
      the room. She pauses to take a sip from a ceramic mug, then continues
      working with focused concentration. The audio captures the gentle clicking
      of keyboard keys, the soft rustle of papers, and ambient room tone with
      occasional distant bird chirps from outside.
    - >-
      A chef in a white uniform stands in a professional kitchen, carefully
      plating a gourmet dish with precise movements. Steam rises from freshly
      cooked vegetables as he arranges them with tweezers. The stainless steel
      surfaces gleam under bright overhead lights, and various pots simmer on
      the stove behind him. The audio features the sizzling of pans, the
      clinking of utensils against plates, and the ambient hum of kitchen
      ventilation.

  # Negative prompt to avoid unwanted artifacts.
  negative_prompt: "worst quality, inconsistent motion, blurry, jittery, distorted"

  # Optional first-frame images for image-to-video validation.
  # If provided, there must be one image per prompt.
  images: null

  # Output video dimensions: [width, height, frames].
  # Width and height must be divisible by 32.
  # Frames must satisfy frames % 8 == 1 (e.g. 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, ...).
  video_dims: [576, 576, 89]

  # Frame rate for generated videos.
  frame_rate: 25.0

  # Random seed for reproducible validation outputs.
  seed: 42

  # Number of denoising steps for validation inference.
  # Higher values = better quality but slower generation.
  inference_steps: 30

  # Generate validation videos every N training steps.
  # Set to null to disable validation during training.
  interval: 100

  # Number of videos to generate per prompt.
  videos_per_prompt: 1

  # Classifier-free guidance scale.
  # Higher values = stronger adherence to the prompt, but may introduce artifacts.
  guidance_scale: 3.0

  # STG (Spatio-Temporal Guidance) parameters for improved video quality.
  # STG is combined with CFG for better temporal coherence.
  stg_scale: 1.0  # Recommended: 1.0 (0.0 disables STG)
  stg_blocks: [29]  # Recommended: single block 29
  stg_mode: "stg_av"  # "stg_av" perturbs both audio and video, "stg_v" video only

  # Whether to generate audio in validation samples.
  # Independent of training_strategy.with_audio - audio can be generated in
  # validation even when the audio branch is not being trained.
  generate_audio: true

  # Skip validation at the beginning of training (step 0).
  skip_initial_validation: false
# -----------------------------------------------------------------------------
# Checkpoint Configuration
# -----------------------------------------------------------------------------
# Controls model checkpoint saving during training.
checkpoints:
  # Save a checkpoint every N steps.
  # Set to null to disable intermediate checkpoints.
  interval: 250

  # How many of the most recent checkpoints to keep.
  # -1 keeps all checkpoints.
  keep_last_n: -1
# -----------------------------------------------------------------------------
# Flow Matching Configuration
# -----------------------------------------------------------------------------
# Parameters for the flow matching training objective.
flow_matching:
  # Timestep sampling mode.
  # "shifted_logit_normal" is the recommended mode for LTX-2 models.
  timestep_sampling_mode: "shifted_logit_normal"

  # Additional parameters for timestep sampling.
  timestep_sampling_params: {}
# -----------------------------------------------------------------------------
# Hugging Face Hub Configuration
# -----------------------------------------------------------------------------
# Settings for uploading trained models to the Hugging Face Hub.
hub:
  # Whether to push the trained model to the Hub.
  push_to_hub: false

  # Repository ID on the Hugging Face Hub (e.g. "username/my-lora-model").
  # Required when push_to_hub is true.
  hub_model_id: null
# -----------------------------------------------------------------------------
# Weights & Biases Configuration
# -----------------------------------------------------------------------------
# Settings for experiment tracking with W&B.
wandb:
  # Enable W&B logging.
  enabled: false

  # W&B project name.
  project: "ltx-2-trainer"

  # W&B username or team (null uses the default account).
  entity: null

  # Tags to help organize runs.
  tags: ["ltx2", "lora"]

  # Log validation videos to W&B.
  log_validation_videos: true
# -----------------------------------------------------------------------------
# General Configuration
# -----------------------------------------------------------------------------
# Global settings for the training run.
# Output directory for checkpoints, validation videos, and logs
output_dir: "outputs/ltx2_av_lora"

# Seed for the random number generators, for reproducibility
seed: 42