ltx-2-distilled

Paused

ltx-2-distilled / packages /ltx-trainer /configs /ltx2_av_lora.yaml

linoy

initial commit

ebfc6b3 5 months ago

13.2 kB

	# =============================================================================
	# LTX-2 Audio-Video LoRA Training Configuration
	# =============================================================================
	#
	# This configuration is for training LoRA adapters on the LTX-2 model for
	# text-to-video generation. It supports both video-only and joint audio-video
	# training modes.
	#
	# Use this configuration when you want to:
	# - Fine-tune LTX-2 on your own video dataset
	# - Train with or without audio generation
	# - Create custom video generation styles or audiovisual concepts
	#
	# Dataset structure for text-to-video training:
	# preprocessed_data_root/
	# ├── latents/ # Video latents (VAE-encoded videos)
	# ├── conditions/ # Text embeddings for each video
	# └── audio_latents/ # Audio latents (only if with_audio: true)
	#
	# =============================================================================

	# -----------------------------------------------------------------------------
	# Model Configuration
	# -----------------------------------------------------------------------------
	# Specifies the base model to fine-tune and the training mode.
	model:
	  # Path to the LTX-2 model checkpoint (.safetensors file).
	  # This should be a local path to your downloaded model.
	  model_path: "path/to/ltx-2-model.safetensors"

	  # Path to the text encoder model directory.
	  # For LTX-2, this is typically the Gemma-based text encoder.
	  text_encoder_path: "path/to/gemma-text-encoder"

	  # Training mode: "lora" for efficient adapter training, "full" for full fine-tuning.
	  # LoRA is recommended for most use cases (faster, less memory, less prone to overfitting).
	  training_mode: "lora"

	  # Optional: path to resume training from a checkpoint.
	  # Can be a checkpoint file (.safetensors) or a directory (uses the latest checkpoint).
	  load_checkpoint: null

	# -----------------------------------------------------------------------------
	# LoRA Configuration
	# -----------------------------------------------------------------------------
	# Controls the Low-Rank Adaptation parameters for efficient fine-tuning.
	lora:
	  # Rank of the LoRA matrices (higher = more capacity but more parameters).
	  # Typical values: 8, 16, 32, 64. Start with 32 for general fine-tuning.
	  rank: 32

	  # Alpha scaling factor (usually set equal to rank).
	  # The effective scaling is alpha/rank, so alpha=rank means a scaling of 1.0.
	  alpha: 32

	  # Dropout probability for LoRA layers (0.0 = no dropout).
	  # Can help with regularization if overfitting occurs.
	  dropout: 0.0

	  # Which transformer modules to apply LoRA to.
	  # The LTX-2 transformer has separate attention and FFN blocks for video and audio:
	  #
	  # VIDEO MODULES:
	  # - attn1.to_k, attn1.to_q, attn1.to_v, attn1.to_out.0 (video self-attention)
	  # - attn2.to_k, attn2.to_q, attn2.to_v, attn2.to_out.0 (video cross-attention to text)
	  # - ff.net.0.proj, ff.net.2 (video feed-forward)
	  #
	  # AUDIO MODULES:
	  # - audio_attn1.to_k, audio_attn1.to_q, audio_attn1.to_v, audio_attn1.to_out.0 (audio self-attention)
	  # - audio_attn2.to_k, audio_attn2.to_q, audio_attn2.to_v, audio_attn2.to_out.0 (audio cross-attention to text)
	  # - audio_ff.net.0.proj, audio_ff.net.2 (audio feed-forward)
	  #
	  # AUDIO-VIDEO CROSS-ATTENTION MODULES (for cross-modal interaction):
	  # - audio_to_video_attn.to_k, audio_to_video_attn.to_q, audio_to_video_attn.to_v, audio_to_video_attn.to_out.0
	  #   (Q from video, K/V from audio - allows video to attend to audio features)
	  # - video_to_audio_attn.to_k, video_to_audio_attn.to_q, video_to_audio_attn.to_v, video_to_audio_attn.to_out.0
	  #   (Q from audio, K/V from video - allows audio to attend to video features)
	  #
	  # Using short patterns like "to_k" matches ALL attention modules (video, audio, and cross-modal).
	  # For audio-video training, this is the recommended approach.
	  target_modules:
	    # Attention layers (matches both video and audio branches)
	    - "to_k"
	    - "to_q"
	    - "to_v"
	    - "to_out.0"
	    # Uncomment below to also train feed-forward layers (can increase the LoRA's capacity):
	    # - "ff.net.0.proj"
	    # - "ff.net.2"
	    # - "audio_ff.net.0.proj"
	    # - "audio_ff.net.2"

	# -----------------------------------------------------------------------------
	# Training Strategy Configuration
	# -----------------------------------------------------------------------------
	# Defines the text-to-video training approach.
	training_strategy:
	  # Strategy name: "text_to_video" for standard text-to-video training.
	  name: "text_to_video"

	  # Probability of conditioning on the first frame during training.
	  # Higher values train the model to perform better in image-to-video (I2V) mode,
	  # where a clean first frame is provided and the model generates the rest of the video.
	  first_frame_conditioning_p: 0.5

	  # Enable joint audio-video training.
	  # Set to true if your dataset includes audio and you want to train the audio branch.
	  with_audio: true

	  # Directory name (within preprocessed_data_root) containing audio latents.
	  # Only used when with_audio is true.
	  audio_latents_dir: "audio_latents"

	# -----------------------------------------------------------------------------
	# Optimization Configuration
	# -----------------------------------------------------------------------------
	# Controls the training optimization parameters.
	optimization:
	  # Learning rate for the optimizer.
	  # Typical range for LoRA: 1e-5 to 1e-4.
	  # Written with an explicit decimal point so that YAML 1.1 loaders (e.g. PyYAML)
	  # resolve it as a float rather than as the string "1e-4".
	  learning_rate: 1.0e-4

	  # Total number of training steps.
	  steps: 2000

	  # Batch size per GPU.
	  # Reduce if running out of memory.
	  batch_size: 1

	  # Number of gradient accumulation steps.
	  # Effective batch size = batch_size * gradient_accumulation_steps * num_gpus
	  gradient_accumulation_steps: 1

	  # Maximum gradient norm for clipping (helps training stability).
	  max_grad_norm: 1.0

	  # Optimizer type: "adamw" (standard) or "adamw8bit" (memory-efficient).
	  optimizer_type: "adamw"

	  # Learning rate scheduler type.
	  # Options: "constant", "linear", "cosine", "cosine_with_restarts", "polynomial"
	  scheduler_type: "linear"

	  # Additional scheduler parameters (depends on scheduler_type).
	  scheduler_params: {}

	  # Enable gradient checkpointing to reduce memory usage.
	  # Recommended for training with limited GPU memory.
	  enable_gradient_checkpointing: true

	# -----------------------------------------------------------------------------
	# Acceleration Configuration
	# -----------------------------------------------------------------------------
	# Hardware acceleration and memory optimization settings.
	acceleration:
	  # Mixed precision training mode.
	  # Options: "no" (fp32), "fp16" (half precision), "bf16" (bfloat16, recommended)
	  mixed_precision_mode: "bf16"

	  # Model quantization for reduced memory usage.
	  # Options: null (none), "int8-quanto", "int4-quanto", "int2-quanto", "fp8-quanto", "fp8uz-quanto"
	  quantization: null

	  # Load the text encoder in 8-bit precision to save memory.
	  # Useful when GPU memory is limited.
	  load_text_encoder_in_8bit: false

	# -----------------------------------------------------------------------------
	# Data Configuration
	# -----------------------------------------------------------------------------
	# Specifies the training data location and loading parameters.
	data:
	  # Root directory containing preprocessed training data.
	  # Should contain: latents/, conditions/, and optionally audio_latents/
	  preprocessed_data_root: "/path/to/preprocessed/data"

	  # Number of worker processes used to load data in parallel.
	  num_dataloader_workers: 2

	# -----------------------------------------------------------------------------
	# Validation Configuration
	# -----------------------------------------------------------------------------
	# Controls validation video generation during training.
	validation:
	  # Text prompts for validation video generation.
	  # Provide prompts representative of your training data.
	  # LTX-2 prefers longer, detailed prompts that describe both visual content and audio.
	  prompts:
	    - "A woman with long brown hair sits at a wooden desk in a cozy home office, typing on a laptop while occasionally glancing at notes beside her. Soft natural light streams through a large window, casting warm shadows across the room. She pauses to take a sip from a ceramic mug, then continues working with focused concentration. The audio captures the gentle clicking of keyboard keys, the soft rustle of papers, and ambient room tone with occasional distant bird chirps from outside."
	    - "A chef in a white uniform stands in a professional kitchen, carefully plating a gourmet dish with precise movements. Steam rises from freshly cooked vegetables as he arranges them with tweezers. The stainless steel surfaces gleam under bright overhead lights, and various pots simmer on the stove behind him. The audio features the sizzling of pans, the clinking of utensils against plates, and the ambient hum of kitchen ventilation."

	  # Negative prompt to avoid unwanted artifacts.
	  negative_prompt: "worst quality, inconsistent motion, blurry, jittery, distorted"

	  # Optional: first-frame images for image-to-video validation.
	  # If provided, there must be one image per prompt.
	  images: null

	  # Output video dimensions [width, height, frames].
	  # Width and height must be divisible by 32.
	  # Frames must satisfy: frames % 8 == 1 (e.g., 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, ...)
	  video_dims: [576, 576, 89]

	  # Frame rate for generated videos.
	  frame_rate: 25.0

	  # Random seed for reproducible validation outputs.
	  seed: 42

	  # Number of denoising steps for validation inference.
	  # Higher values = better quality but slower generation.
	  inference_steps: 30

	  # Generate validation videos every N training steps.
	  # Set to null to disable validation during training.
	  interval: 100

	  # Number of videos to generate per prompt.
	  videos_per_prompt: 1

	  # Classifier-free guidance scale.
	  # Higher values = stronger adherence to prompt but may introduce artifacts.
	  guidance_scale: 3.0

	  # STG (Spatio-Temporal Guidance) parameters for improved video quality.
	  # STG is combined with CFG for better temporal coherence.
	  stg_scale: 1.0  # Recommended: 1.0 (0.0 disables STG)
	  stg_blocks: [29]  # Recommended: single block 29
	  stg_mode: "stg_av"  # "stg_av" perturbs both audio and video, "stg_v" video only

	  # Whether to generate audio in validation samples.
	  # Independent of training_strategy.with_audio - you can generate audio
	  # in validation even when not training the audio branch.
	  generate_audio: true

	  # Skip validation at the beginning of training (step 0).
	  skip_initial_validation: false

	# -----------------------------------------------------------------------------
	# Checkpoint Configuration
	# -----------------------------------------------------------------------------
	# Controls model checkpoint saving during training.
	checkpoints:
	  # Save a checkpoint every N steps.
	  # Set to null to disable intermediate checkpoints.
	  interval: 250

	  # Number of most recent checkpoints to keep.
	  # Set to -1 to keep all checkpoints.
	  keep_last_n: -1

	# -----------------------------------------------------------------------------
	# Flow Matching Configuration
	# -----------------------------------------------------------------------------
	# Parameters for the flow matching training objective.
	flow_matching:
	  # Timestep sampling mode.
	  # "shifted_logit_normal" is recommended for LTX-2 models.
	  timestep_sampling_mode: "shifted_logit_normal"

	  # Additional parameters for timestep sampling.
	  timestep_sampling_params: {}

	# -----------------------------------------------------------------------------
	# Hugging Face Hub Configuration
	# -----------------------------------------------------------------------------
	# Settings for uploading trained models to the Hugging Face Hub.
	hub:
	  # Whether to push the trained model to the Hub.
	  push_to_hub: false

	  # Repository ID on the Hugging Face Hub (e.g., "username/my-lora-model").
	  # Required if push_to_hub is true.
	  hub_model_id: null

	# -----------------------------------------------------------------------------
	# Weights & Biases Configuration
	# -----------------------------------------------------------------------------
	# Settings for experiment tracking with W&B.
	wandb:
	  # Enable W&B logging.
	  enabled: false

	  # W&B project name.
	  project: "ltx-2-trainer"

	  # W&B username or team (null uses the default account).
	  entity: null

	  # Tags to help organize runs.
	  tags: ["ltx2", "lora"]

	  # Log validation videos to W&B.
	  log_validation_videos: true

	# -----------------------------------------------------------------------------
	# General Configuration
	# -----------------------------------------------------------------------------
	# Global settings for the training run.

	# Random seed for reproducibility of the training run.
	# NOTE(review): the validation section defines its own `seed` for validation
	# outputs; presumably the two are independent - confirm against the trainer.
	seed: 42

	# Directory to save outputs (checkpoints, validation videos, logs).
	output_dir: "outputs/ltx2_av_lora"