Fun-ASR-Nano / app.py
xiaoyunchong.xyc
fix: upgrade gradio 5.23.0, disable @spaces.GPU for CPU mode
b023d15
Raw
History Blame Contribute Delete
8.97 kB
import os
import spaces
# NOTE(review): REPO_TYPE is not referenced anywhere in the visible code
# (get_model passes the literal "hf" instead) — confirm whether it is still needed.
REPO_TYPE = "hf"
from huggingface_hub import snapshot_download
# Root directory (relative to the working directory) that holds all model snapshots.
MODEL_CACHE_DIR = "./models"
# One sub-directory per model; these paths are later handed to funasr.AutoModel.
FUN_ASR_NANO_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "Fun-ASR-Nano")
SENSE_VOICE_SMALL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "SenseVoiceSmall")
VAD_MODEL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "fsmn-vad")
# Create the cache root at import time; exist_ok makes repeated starts harmless.
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
def download_model_if_not_exists(repo_id, local_path, model_name):
    """Download a model snapshot unless ``local_path`` already exists.

    Args:
        repo_id: Repository identifier passed to ``snapshot_download``.
        local_path: Destination directory; its mere existence counts as
            "already downloaded" (the contents are not inspected).
        model_name: Human-readable name used only in the progress messages.

    Returns:
        None. Progress is reported on stdout.
    """
    # Guard clause: anything already on disk is taken as a finished download.
    if os.path.exists(local_path):
        print(f"{model_name} found locally, skipping download.")
        return

    print(f"Downloading {model_name} to {local_path} ...")
    # Files matching *.onnx are excluded from the snapshot.
    snapshot_download(
        repo_id=repo_id,
        local_dir=local_path,
        ignore_patterns=["*.onnx"],
    )
    print(f"{model_name} downloaded.")
# Fetch every model snapshot at import time, before the ML imports below run.
# Each call is skipped when its target directory already exists.
download_model_if_not_exists("FunAudioLLM/Fun-ASR-Nano-2512", FUN_ASR_NANO_LOCAL_PATH, "Fun-ASR-Nano")
download_model_if_not_exists("FunAudioLLM/SenseVoiceSmall", SENSE_VOICE_SMALL_LOCAL_PATH, "SenseVoiceSmall")
download_model_if_not_exists("funasr/fsmn-vad", VAD_MODEL_LOCAL_PATH, "VAD Model")
import gradio as gr
import time
import tempfile
import numpy as np
import torch
import torchaudio
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
# Use the first CUDA device when PyTorch reports one, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Cache of constructed models keyed by pipeline type; populated lazily by get_model().
loaded_models = {}
def get_model(pipeline_type):
    """Return the FunASR model for ``pipeline_type``, building it on first use.

    Constructed models are stored in the module-level ``loaded_models`` dict,
    so each pipeline is built at most once per process.

    Args:
        pipeline_type: ``"fun-asr-nano"`` or ``"sensevoice"``.

    Returns:
        The cached or freshly constructed ``AutoModel``; ``None`` when
        ``pipeline_type`` is neither of the two known pipelines.
    """
    # Fast path: hand back a model that was already constructed.
    try:
        return loaded_models[pipeline_type]
    except KeyError:
        pass

    # Pipeline-specific arguments come first so the final keyword order
    # matches what each pipeline was originally constructed with.
    if pipeline_type == "fun-asr-nano":
        model_kwargs = {
            "model": FUN_ASR_NANO_LOCAL_PATH,
            "trust_remote_code": True,
            # Model code is loaded from the local Fun-ASR checkout.
            "remote_code": "./Fun-ASR/model.py",
        }
    elif pipeline_type == "sensevoice":
        model_kwargs = {
            "model": SENSE_VOICE_SMALL_LOCAL_PATH,
            "trust_remote_code": False,
        }
    else:
        # Unknown pipeline: nothing is built and nothing is cached.
        return None

    # Arguments shared by both pipelines (same VAD front-end, device and hub).
    model_kwargs.update(
        vad_model=VAD_MODEL_LOCAL_PATH,
        vad_kwargs={"max_single_segment_time": 30000},
        device=device,
        disable_update=True,
        hub="hf",
    )

    instance = AutoModel(**model_kwargs)
    loaded_models[pipeline_type] = instance
    return instance
# Sample rate (Hz) that in-memory audio is resampled to and written at.
_TARGET_SAMPLE_RATE = 16000


def _normalize_samples(audio_data):
    """Return ``audio_data`` as float32, scaling integer PCM to about [-1, 1]."""
    if np.issubdtype(audio_data.dtype, np.integer):
        # Scale by the full-scale value of the *actual* sample width; for the
        # usual int16 input this is exactly the former hard-coded int16 divisor.
        return audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    # Non-integer (floating-point) audio is assumed to be normalized already,
    # so dividing it by the int16 maximum would all but silence it.
    return audio_data.astype(np.float32)


def _gradio_audio_to_wav(audio_input):
    """Write a ``(sample_rate, samples)`` tuple to a temporary mono WAV file.

    The samples are normalized, down-mixed to mono, resampled to
    ``_TARGET_SAMPLE_RATE`` when necessary and written with ``soundfile``.
    The caller owns the returned path and must delete it.
    """
    import soundfile as sf

    sr, audio_data = audio_input
    audio_data = _normalize_samples(audio_data)
    if audio_data.ndim > 1:
        # Average over the last axis to collapse channels into mono.
        audio_data = audio_data.mean(-1)
    if sr != _TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sr, _TARGET_SAMPLE_RATE)
        audio_data = resampler(torch.from_numpy(audio_data).float().unsqueeze(0))[0].numpy()

    # Only the unique name is needed: close the handle immediately so no file
    # descriptor leaks and soundfile can reopen the path on every platform.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    try:
        sf.write(wav_path, audio_data, _TARGET_SAMPLE_RATE)
    except Exception:
        # Do not leave an orphaned temp file behind when the write fails.
        os.unlink(wav_path)
        raise
    return wav_path


def _audio_duration_seconds(audio_path):
    """Return the duration of ``audio_path`` in seconds, or ``None`` if unknown."""
    try:
        import librosa

        try:
            return librosa.get_duration(path=audio_path)
        except TypeError:
            # Older librosa releases only accept the ``filename`` keyword.
            return librosa.get_duration(filename=audio_path)
    except Exception:
        # Duration is purely informational; never let it break transcription.
        return None


def _format_metrics(elapsed, audio_path):
    """Build the performance line; adds duration and RTF when they are available."""
    metrics = f"⏱️ {elapsed:.2f}s"
    if not os.path.exists(audio_path):
        return metrics
    duration = _audio_duration_seconds(audio_path)
    if duration is None:
        return metrics
    rtf = elapsed / duration if duration > 0 else 0
    return f"⏱️ {elapsed:.2f}s | Audio: {duration:.1f}s | RTF: {rtf:.4f}"


# @spaces.GPU(duration=60) # disabled for CPU-only space
def transcribe(audio_input, pipeline_type, language):
    """Transcribe one audio clip and report timing information.

    Args:
        audio_input: ``None``, a path to an audio file, or a
            ``(sample_rate, numpy_samples)`` tuple. Tuples are converted to a
            temporary 16 kHz mono WAV file that is removed before returning.
        pipeline_type: Pipeline name forwarded to ``get_model``.
        language: Language code; only passed to the model on the
            non-"fun-asr-nano" path, where a falsy value becomes ``"auto"``.

    Returns:
        A ``(text, metrics)`` tuple of strings. On failure ``text`` carries a
        user-facing message (``"Error: ..."`` for exceptions raised during
        inference) and ``metrics`` is empty.
    """
    if audio_input is None:
        return "Please upload an audio file or record via microphone.", ""

    model = get_model(pipeline_type)
    if model is None:
        return "Model loading failed.", ""

    # In-memory audio must be materialized on disk before inference.
    is_temp = isinstance(audio_input, tuple)
    audio_path = _gradio_audio_to_wav(audio_input) if is_temp else audio_input

    start_time = time.time()
    try:
        if pipeline_type == "fun-asr-nano":
            res = model.generate(input=[audio_path], use_itn=True, batch_size=1)
        else:
            res = model.generate(
                input=audio_path, cache={}, language=language or "auto",
                use_itn=True, batch_size_s=60, merge_vad=True,
            )
        elapsed = time.time() - start_time
        text = rich_transcription_postprocess(res[0]["text"])
        # Metrics are best-effort: a failed duration probe falls back to the
        # elapsed-time-only line instead of discarding the transcription.
        return text, _format_metrics(elapsed, audio_path)
    except Exception as e:
        return f"Error: {e}", ""
    finally:
        # Remove only files this function created, never the caller's upload.
        if is_temp and os.path.exists(audio_path):
            os.unlink(audio_path)
# Header banner (title, tagline and project links) rendered with gr.HTML in launch().
description_html = """
<div style="text-align: center; max-width: 850px; margin: 0 auto;">
<h1 style="font-size: 2.2em; margin-bottom: 0.1em;">🚀 Fun-ASR-Nano</h1>
<p style="font-size: 1.3em; color: #444; margin-bottom: 0.3em;">LLM-Powered Speech Recognition — 31 Languages, Dialects & Accents</p>
<p style="font-size: 1em; color: #666;">
End-to-end ASR trained on <strong>tens of millions of hours</strong> of data.
Supports Chinese (+ dialects), English, Japanese, Korean, French, German, Spanish, and 24 more languages.
</p>
<p style="font-size: 0.9em; margin-top: 0.8em;">
<a href="https://github.com/FunAudioLLM/Fun-ASR" target="_blank">⭐ GitHub (Fun-ASR)</a> ·
<a href="https://github.com/modelscope/FunASR" target="_blank">🛠️ FunASR Toolkit</a> ·
<a href="https://github.com/FunAudioLLM/SenseVoice" target="_blank">🎙️ SenseVoice</a> ·
<a href="https://huggingface.co/FunAudioLLM/Fun-ASR-Nano-2512" target="_blank">🤗 Model Card</a>
</p>
</div>
"""
# Static comparison table of the two selectable models, rendered with gr.HTML in launch().
comparison_html = """
<div style="background: linear-gradient(135deg, #f0f9ff 0%, #f5f3ff 100%); border-radius: 10px; padding: 16px; margin: 10px 0;">
<table style="width: 100%; border-collapse: collapse; font-size: 0.9em;">
<tr style="border-bottom: 2px solid #ddd;">
<th style="padding: 8px; text-align: left;">Model</th>
<th style="padding: 8px; text-align: center;">Languages</th>
<th style="padding: 8px; text-align: center;">Architecture</th>
<th style="padding: 8px; text-align: center;">Best For</th>
</tr>
<tr style="background: #e8f4fd;">
<td style="padding: 8px;"><strong>Fun-ASR-Nano</strong> ⭐</td>
<td style="padding: 8px; text-align: center;">31</td>
<td style="padding: 8px; text-align: center;">LLM-based</td>
<td style="padding: 8px; text-align: center;">Multi-language, dialects, highest accuracy</td>
</tr>
<tr>
<td style="padding: 8px;">SenseVoice</td>
<td style="padding: 8px; text-align: center;">5</td>
<td style="padding: 8px; text-align: center;">CTC (non-AR)</td>
<td style="padding: 8px; text-align: center;">Speed + Emotion + Audio events</td>
</tr>
</table>
</div>
"""
def launch():
    """Build the Gradio interface and start serving it.

    The page shows the two HTML banners, a two-column row (inputs on the
    left, results on the right) and a Markdown footer. Clicking the button
    calls ``transcribe`` with the audio, the selected model and the language.
    """
    # Footer text; kept flush-left so the string content is unchanged.
    usage_notes = """
### Supported Languages (Fun-ASR-Nano)
Chinese (Mandarin, Cantonese, Sichuan, Shanghai, Minnan, Wenzhou, Hakka, Gan, and more),
English, Japanese, Korean, French, German, Spanish, Italian, Portuguese, Russian, Arabic, Hindi,
Thai, Vietnamese, Indonesian, Malay, Turkish, Polish, Dutch, Swedish, Hebrew, Greek, Czech, Romanian, Hungarian, Finnish, Danish, Norwegian, Ukrainian.
### Tips
- **Fun-ASR-Nano**: Best for multi-language & Chinese dialects. Outputs punctuation natively.
- **SenseVoice**: Ultra-fast (7x faster than Whisper-small), also detects emotions & audio events.
- For long audio (>5min), consider using [FunASR](https://github.com/modelscope/FunASR) locally with GPU.
"""
    soft_theme = gr.themes.Soft()

    with gr.Blocks(theme=soft_theme, title="Fun-ASR-Nano - 31 Language ASR") as app:
        gr.HTML(description_html)
        gr.HTML(comparison_html)

        with gr.Row():
            # Left column: audio source, model/language pickers, action button.
            with gr.Column(scale=1):
                audio_widget = gr.Audio(
                    type="filepath",
                    sources=["upload", "microphone"],
                    label="Upload audio or record via microphone",
                )
                with gr.Row():
                    model_picker = gr.Dropdown(
                        label="Model",
                        choices=["fun-asr-nano", "sensevoice"],
                        value="fun-asr-nano",
                    )
                    language_picker = gr.Dropdown(
                        label="Language (SenseVoice only)",
                        choices=["auto", "zh", "en", "yue", "ja", "ko"],
                        value="auto",
                        interactive=True,
                    )
                run_button = gr.Button("🎯 Transcribe", variant="primary", size="lg")

            # Right column: transcription text and the timing summary.
            with gr.Column(scale=1):
                transcript_box = gr.Textbox(
                    label="Transcription Result",
                    lines=10,
                    show_copy_button=True,
                )
                performance_box = gr.Textbox(label="Performance", lines=1)

        run_button.click(
            transcribe,
            inputs=[audio_widget, model_picker, language_picker],
            outputs=[transcript_box, performance_box],
        )
        gr.Markdown(usage_notes)

    app.launch()
# Start the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    launch()