Spaces:

FunAudioLLM
/

Fun-ASR-Nano

Running

xiaoyunchong.xyc

fix: upgrade gradio 5.23.0, disable @spaces.GPU for CPU mode

b023d15 about 1 month ago

8.97 kB

	import os
	import spaces

	# Hub selector constant. It is not referenced anywhere else in the visible
	# code; the AutoModel calls in get_model pass hub="hf" as a literal instead.
	REPO_TYPE = "hf"

	from huggingface_hub import snapshot_download

	# Root directory for locally cached model snapshots (relative to the
	# process working directory).
	MODEL_CACHE_DIR = "./models"
	# One sub-directory per model underneath the cache root.
	FUN_ASR_NANO_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "Fun-ASR-Nano")
	SENSE_VOICE_SMALL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "SenseVoiceSmall")
	VAD_MODEL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "fsmn-vad")

	# Create the cache root at import time; exist_ok=True makes this a no-op
	# when the directory is already there.
	os.makedirs(MODEL_CACHE_DIR, exist_ok=True)


	def download_model_if_not_exists(repo_id, local_path, model_name):
	    """Download a model snapshot from the Hub unless it is already cached.

	    A model counts as cached only when ``local_path`` is a directory that
	    contains at least one entry. An empty directory (for example one left
	    behind by a download that failed before writing anything) is treated
	    as missing, so the download is retried instead of being skipped.

	    NOTE: a partially populated directory is still accepted as complete;
	    individual files are not verified here.

	    Args:
	        repo_id: Repository id passed to ``snapshot_download``.
	        local_path: Directory that holds (or will hold) the snapshot.
	        model_name: Human-readable name, used only in progress messages.

	    Returns:
	        None. Progress is reported with ``print``.
	    """
	    already_cached = os.path.isdir(local_path) and bool(os.listdir(local_path))
	    if already_cached:
	        print(f"{model_name} found locally, skipping download.")
	        return

	    print(f"Downloading {model_name} to {local_path} ...")
	    # Files matching *.onnx are excluded from the snapshot.
	    snapshot_download(repo_id=repo_id, local_dir=local_path, ignore_patterns=["*.onnx"])
	    print(f"{model_name} downloaded.")


	# Runs at import time: make sure each model's snapshot is present in the
	# local cache, in the same order as before (ASR, SenseVoice, VAD).
	for _repo_id, _local_path, _model_name in (
	    ("FunAudioLLM/Fun-ASR-Nano-2512", FUN_ASR_NANO_LOCAL_PATH, "Fun-ASR-Nano"),
	    ("FunAudioLLM/SenseVoiceSmall", SENSE_VOICE_SMALL_LOCAL_PATH, "SenseVoiceSmall"),
	    ("funasr/fsmn-vad", VAD_MODEL_LOCAL_PATH, "VAD Model"),
	):
	    download_model_if_not_exists(_repo_id, _local_path, _model_name)

	import gradio as gr
	import time
	import tempfile
	import numpy as np
	import torch
	import torchaudio
	from funasr import AutoModel
	from funasr.utils.postprocess_utils import rich_transcription_postprocess

	# Use the first CUDA device when torch reports one; otherwise run on CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"

	# Cache of already constructed models, keyed by pipeline type string.
	loaded_models = dict()


	def get_model(pipeline_type):
	    """Return the ``AutoModel`` for ``pipeline_type``, building it on first use.

	    Built models are stored in the module-level ``loaded_models`` dict so
	    that later calls with the same ``pipeline_type`` reuse the instance.

	    Args:
	        pipeline_type: ``"fun-asr-nano"`` or ``"sensevoice"``.

	    Returns:
	        The model instance, or ``None`` when ``pipeline_type`` is neither of
	        the two known values (nothing is cached in that case).
	    """
	    try:
	        return loaded_models[pipeline_type]
	    except KeyError:
	        pass

	    if pipeline_type not in ("fun-asr-nano", "sensevoice"):
	        return None

	    # Options that are identical for both pipelines.
	    common_options = dict(
	        vad_model=VAD_MODEL_LOCAL_PATH,
	        vad_kwargs={"max_single_segment_time": 30000},
	        device=device,
	        disable_update=True,
	        hub="hf",
	    )

	    if pipeline_type == "fun-asr-nano":
	        # Only this pipeline loads custom model code from the local checkout.
	        instance = AutoModel(
	            model=FUN_ASR_NANO_LOCAL_PATH,
	            trust_remote_code=True,
	            remote_code="./Fun-ASR/model.py",
	            **common_options,
	        )
	    else:
	        instance = AutoModel(
	            model=SENSE_VOICE_SMALL_LOCAL_PATH,
	            trust_remote_code=False,
	            **common_options,
	        )

	    loaded_models[pipeline_type] = instance
	    return instance


	def _pcm_to_mono_float32(audio_data):
	    """Return ``audio_data`` as a mono float32 array.

	    Integer PCM is scaled by the maximum of its own dtype (int16 input is
	    divided by 32767); floating-point input is only cast, not rescaled.
	    Arrays with more than one dimension are averaged over the last axis.
	    """
	    if np.issubdtype(audio_data.dtype, np.integer):
	        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
	    else:
	        audio_data = audio_data.astype(np.float32)
	    if audio_data.ndim > 1:
	        audio_data = audio_data.mean(-1)
	    return audio_data


	def _write_temp_wav(audio_input):
	    """Write a ``(sample_rate, samples)`` tuple to a 16 kHz temp WAV and return its path."""
	    import soundfile as sf

	    sr, audio_data = audio_input
	    audio_data = _pcm_to_mono_float32(audio_data)
	    if sr != 16000:
	        resampler = torchaudio.transforms.Resample(sr, 16000)
	        audio_data = resampler(torch.from_numpy(audio_data).float().unsqueeze(0))[0].numpy()

	    # Only the unique name is needed; close the handle right away so it is
	    # not left open while soundfile writes to the same path.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	        wav_path = tmp.name
	    try:
	        sf.write(wav_path, audio_data, 16000)
	    except Exception:
	        # Do not leave a stray temp file behind when the write fails.
	        os.unlink(wav_path)
	        raise
	    return wav_path


	def _audio_duration_seconds(audio_path):
	    """Return the duration of the audio file at ``audio_path`` in seconds."""
	    import librosa

	    try:
	        return librosa.get_duration(path=audio_path)
	    except TypeError:
	        # librosa releases that predate the ``path`` keyword only accept ``filename``.
	        return librosa.get_duration(filename=audio_path)


	# NOTE: @spaces.GPU(duration=60) is intentionally not applied here; it was
	# disabled because this Space runs CPU-only.
	def transcribe(audio_input, pipeline_type, language):
	    """Transcribe one audio input with the selected pipeline.

	    Args:
	        audio_input: Either a path to an audio file, or a
	            ``(sample_rate, samples)`` tuple. A tuple is converted to mono,
	            resampled to 16 kHz and written to a temporary WAV file that is
	            removed again before returning. ``None`` means no audio given.
	        pipeline_type: Passed to ``get_model``; ``"fun-asr-nano"`` selects
	            the Fun-ASR-Nano call, any other known value the SenseVoice call.
	        language: Language hint forwarded only on the SenseVoice branch;
	            a falsy value is replaced by ``"auto"``.

	    Returns:
	        A ``(text, metrics)`` tuple of strings. On failure ``text`` carries a
	        message (``"Error: ..."`` for exceptions raised during preprocessing
	        or inference) and ``metrics`` is empty. ``metrics`` always contains
	        the elapsed time; audio duration and real-time factor are appended
	        when the duration can be determined.
	    """
	    if audio_input is None:
	        return "Please upload an audio file or record via microphone.", ""

	    model = get_model(pipeline_type)
	    if model is None:
	        return "Model loading failed.", ""

	    temp_path = None
	    try:
	        # Preprocessing happens inside the try so that its failures are
	        # reported like inference failures and the temp file is cleaned up.
	        if isinstance(audio_input, tuple):
	            temp_path = _write_temp_wav(audio_input)
	            audio_path = temp_path
	        else:
	            audio_path = audio_input

	        start_time = time.perf_counter()
	        if pipeline_type == "fun-asr-nano":
	            res = model.generate(input=[audio_path], use_itn=True, batch_size=1)
	        else:
	            res = model.generate(
	                input=audio_path, cache={}, language=language or "auto",
	                use_itn=True, batch_size_s=60, merge_vad=True,
	            )
	        elapsed = time.perf_counter() - start_time

	        text = rich_transcription_postprocess(res[0]["text"])

	        metrics = f"⏱️ {elapsed:.2f}s"
	        if os.path.exists(audio_path):
	            try:
	                duration = _audio_duration_seconds(audio_path)
	            except Exception:
	                # The duration is only a nice-to-have metric; never let it
	                # discard a transcription that already succeeded.
	                duration = None
	            if duration is not None:
	                rtf = elapsed / duration if duration > 0 else 0
	                metrics = f"⏱️ {elapsed:.2f}s | Audio: {duration:.1f}s | RTF: {rtf:.4f}"

	        return text, metrics

	    except Exception as e:
	        return f"Error: {str(e)}", ""
	    finally:
	        if temp_path is not None and os.path.exists(temp_path):
	            os.unlink(temp_path)


	# Header banner (title, tagline and project links) that launch() renders
	# at the top of the page via gr.HTML(description_html).
	description_html = """
	<div style="text-align: center; max-width: 850px; margin: 0 auto;">
	<h1 style="font-size: 2.2em; margin-bottom: 0.1em;">🚀 Fun-ASR-Nano</h1>
	<p style="font-size: 1.3em; color: #444; margin-bottom: 0.3em;">LLM-Powered Speech Recognition — 31 Languages, Dialects & Accents</p>
	<p style="font-size: 1em; color: #666;">
	End-to-end ASR trained on <strong>tens of millions of hours</strong> of data.
	Supports Chinese (+ dialects), English, Japanese, Korean, French, German, Spanish, and 24 more languages.
	</p>
	<p style="font-size: 0.9em; margin-top: 0.8em;">
	<a href="https://github.com/FunAudioLLM/Fun-ASR" target="_blank">⭐ GitHub (Fun-ASR)</a> ·
	<a href="https://github.com/modelscope/FunASR" target="_blank">🛠️ FunASR Toolkit</a> ·
	<a href="https://github.com/FunAudioLLM/SenseVoice" target="_blank">🎙️ SenseVoice</a> ·
	<a href="https://huggingface.co/FunAudioLLM/Fun-ASR-Nano-2512" target="_blank">🤗 Model Card</a>
	</p>
	</div>
	"""

	# Two-row comparison table (Fun-ASR-Nano vs. SenseVoice) that launch()
	# renders directly after the header via gr.HTML(comparison_html).
	comparison_html = """
	<div style="background: linear-gradient(135deg, #f0f9ff 0%, #f5f3ff 100%); border-radius: 10px; padding: 16px; margin: 10px 0;">
	<table style="width: 100%; border-collapse: collapse; font-size: 0.9em;">
	<tr style="border-bottom: 2px solid #ddd;">
	<th style="padding: 8px; text-align: left;">Model</th>
	<th style="padding: 8px; text-align: center;">Languages</th>
	<th style="padding: 8px; text-align: center;">Architecture</th>
	<th style="padding: 8px; text-align: center;">Best For</th>
	</tr>
	<tr style="background: #e8f4fd;">
	<td style="padding: 8px;"><strong>Fun-ASR-Nano</strong> ⭐</td>
	<td style="padding: 8px; text-align: center;">31</td>
	<td style="padding: 8px; text-align: center;">LLM-based</td>
	<td style="padding: 8px; text-align: center;">Multi-language, dialects, highest accuracy</td>
	</tr>
	<tr>
	<td style="padding: 8px;">SenseVoice</td>
	<td style="padding: 8px; text-align: center;">5</td>
	<td style="padding: 8px; text-align: center;">CTC (non-AR)</td>
	<td style="padding: 8px; text-align: center;">Speed + Emotion + Audio events</td>
	</tr>
	</table>
	</div>
	"""


	def launch():
	"""Build the Gradio Blocks UI and start serving it.

	The page shows the two HTML banners, an audio input with model and
	language selectors plus a button, two output text boxes, and a Markdown
	section with supported languages and usage tips. Clicking the button
	calls ``transcribe`` with the audio, model and language values.
	"""
	# NOTE(review): indentation is not preserved in this view, so the exact
	# with-block nesting of the components below should be confirmed against
	# the original file before restructuring anything.
	with gr.Blocks(theme=gr.themes.Soft(), title="Fun-ASR-Nano - 31 Language ASR") as demo:
	gr.HTML(description_html)
	gr.HTML(comparison_html)

	with gr.Row():
	# Input side: audio source, model / language selectors, action button.
	with gr.Column(scale=1):
	# type="filepath": per Gradio's Audio docs the callback receives a path
	# string, so transcribe's tuple branch is presumably not exercised by
	# this UI -- TODO confirm.
	audio_input = gr.Audio(
	label="Upload audio or record via microphone",
	sources=["upload", "microphone"],
	type="filepath",
	)
	with gr.Row():
	# The choices match the pipeline_type strings that get_model recognises.
	pipeline_type = gr.Dropdown(
	choices=["fun-asr-nano", "sensevoice"],
	value="fun-asr-nano",
	label="Model",
	)
	# Always passed to transcribe, which forwards it to model.generate only
	# on the non-"fun-asr-nano" branch.
	language = gr.Dropdown(
	choices=["auto", "zh", "en", "yue", "ja", "ko"],
	value="auto",
	label="Language (SenseVoice only)",
	interactive=True,
	)
	btn = gr.Button("🎯 Transcribe", variant="primary", size="lg")

	# Output side: transcription text and the timing metrics string.
	with gr.Column(scale=1):
	output_text = gr.Textbox(
	label="Transcription Result",
	lines=10,
	show_copy_button=True,
	)
	metrics_text = gr.Textbox(label="Performance", lines=1)

	# transcribe returns a (text, metrics) pair, matching the two outputs.
	btn.click(
	transcribe,
	inputs=[audio_input, pipeline_type, language],
	outputs=[output_text, metrics_text],
	)

	gr.Markdown("""
	### Supported Languages (Fun-ASR-Nano)
	Chinese (Mandarin, Cantonese, Sichuan, Shanghai, Minnan, Wenzhou, Hakka, Gan, and more),
	English, Japanese, Korean, French, German, Spanish, Italian, Portuguese, Russian, Arabic, Hindi,
	Thai, Vietnamese, Indonesian, Malay, Turkish, Polish, Dutch, Swedish, Hebrew, Greek, Czech, Romanian, Hungarian, Finnish, Danish, Norwegian, Ukrainian.

	### Tips
	- Fun-ASR-Nano: Best for multi-language & Chinese dialects. Outputs punctuation natively.
	- SenseVoice: Ultra-fast (7x faster than Whisper-small), also detects emotions & audio events.
	- For long audio (>5min), consider using [FunASR](https://github.com/modelscope/FunASR) locally with GPU.
	""")

	# Start the server with Gradio's default launch settings.
	demo.launch()


	# Start the UI only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	launch()