Spaces:
Running
Running
| import os | |
| import spaces | |
# Same value as the hub="hf" argument passed to AutoModel further down; this
# constant itself is not referenced anywhere in the visible part of the file.
REPO_TYPE = "hf"
from huggingface_hub import snapshot_download
# Root directory for downloaded model snapshots (relative to the working directory).
MODEL_CACHE_DIR = "./models"
# One sub-directory per model underneath the cache root.
FUN_ASR_NANO_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "Fun-ASR-Nano")
SENSE_VOICE_SMALL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "SenseVoiceSmall")
VAD_MODEL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "fsmn-vad")
# Runs at import time; exist_ok=True makes repeated starts harmless.
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
def download_model_if_not_exists(repo_id, local_path, model_name):
    """Download a model snapshot unless a local copy is already present.

    Args:
        repo_id: Repository id handed to ``snapshot_download``.
        local_path: Directory the snapshot is written to.
        model_name: Human-readable label used only in the progress messages.

    Returns:
        None. Progress is reported on stdout.
    """
    # An existing but empty directory (for example one left behind by a
    # download that failed before writing anything) is not a usable copy.
    # A bare existence check would skip the download forever in that case.
    is_empty_dir = os.path.isdir(local_path) and not os.listdir(local_path)
    if os.path.exists(local_path) and not is_empty_dir:
        print(f"{model_name} found locally, skipping download.")
        return

    print(f"Downloading {model_name} to {local_path} ...")
    # *.onnx files are excluded from the snapshot.
    snapshot_download(repo_id=repo_id, local_dir=local_path, ignore_patterns=["*.onnx"])
    print(f"{model_name} downloaded.")
# Fetch every model the app can load before the UI is built (runs at import time).
for _repo_id, _target_dir, _label in (
    ("FunAudioLLM/Fun-ASR-Nano-2512", FUN_ASR_NANO_LOCAL_PATH, "Fun-ASR-Nano"),
    ("FunAudioLLM/SenseVoiceSmall", SENSE_VOICE_SMALL_LOCAL_PATH, "SenseVoiceSmall"),
    ("funasr/fsmn-vad", VAD_MODEL_LOCAL_PATH, "VAD Model"),
):
    download_model_if_not_exists(_repo_id, _target_dir, _label)
| import gradio as gr | |
| import time | |
| import tempfile | |
| import numpy as np | |
| import torch | |
| import torchaudio | |
| from funasr import AutoModel | |
| from funasr.utils.postprocess_utils import rich_transcription_postprocess | |
# Inference device: the first CUDA GPU when PyTorch reports one, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Models already constructed by get_model(), keyed by pipeline type.
loaded_models = {}
def get_model(pipeline_type):
    """Return the model for ``pipeline_type``, constructing it on first use.

    Args:
        pipeline_type: ``"fun-asr-nano"`` or ``"sensevoice"``.

    Returns:
        The ``AutoModel`` instance for that pipeline (stored in
        ``loaded_models`` after the first call), or ``None`` when
        ``pipeline_type`` is not one of the two known values.
    """
    # Fast path: reuse a model built by an earlier call.
    try:
        return loaded_models[pipeline_type]
    except KeyError:
        pass

    # Arguments that differ between the two pipelines.
    if pipeline_type == "fun-asr-nano":
        model_kwargs = {
            "model": FUN_ASR_NANO_LOCAL_PATH,
            "trust_remote_code": True,
            "remote_code": "./Fun-ASR/model.py",
        }
    elif pipeline_type == "sensevoice":
        model_kwargs = {
            "model": SENSE_VOICE_SMALL_LOCAL_PATH,
            "trust_remote_code": False,
        }
    else:
        return None

    # Arguments both pipelines share: the same VAD front end, device and hub.
    shared_kwargs = {
        "vad_model": VAD_MODEL_LOCAL_PATH,
        "vad_kwargs": {"max_single_segment_time": 30000},
        "device": device,
        "disable_update": True,
        "hub": "hf",
    }

    instance = AutoModel(**model_kwargs, **shared_kwargs)
    loaded_models[pipeline_type] = instance
    return instance
def _write_temp_wav(sample_rate, samples):
    """Convert raw samples to 16 kHz mono and write them to a temporary WAV file.

    Returns the path of the file; the caller is responsible for deleting it.
    """
    import soundfile as sf  # imported lazily, only needed for raw-array input

    if np.issubdtype(samples.dtype, np.integer):
        # Scale integer PCM by the range of its own dtype. For int16 this is
        # exactly the previous behaviour; wider integers no longer overflow
        # the [-1, 1] range.
        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
    else:
        # NOTE(review): float input is assumed to be normalised to [-1, 1]
        # already, so it is not divided by the int16 maximum.
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Down-mix by averaging over the last axis.
        samples = samples.mean(-1)

    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
        samples = resampler(torch.from_numpy(samples).float().unsqueeze(0))[0].numpy()

    # Only the name is needed; close the handle straight away so it is not
    # leaked and soundfile can reopen the path on every platform.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    try:
        sf.write(wav_path, samples, 16000)
    except Exception:
        # Do not leave an orphaned temp file behind when the write fails.
        os.unlink(wav_path)
        raise
    return wav_path


def _format_metrics(elapsed, audio_path):
    """Build the performance line; fall back to elapsed time only on any failure."""
    summary = f"⏱️ {elapsed:.2f}s"
    try:
        if not os.path.exists(audio_path):
            return summary
        import librosa

        try:
            duration = librosa.get_duration(path=audio_path)
        except TypeError:
            # Older librosa releases only know the ``filename`` keyword.
            duration = librosa.get_duration(filename=audio_path)
    except Exception:
        # Metrics are best-effort: a missing or incompatible librosa must not
        # throw away a transcription that already succeeded.
        return summary

    rtf = elapsed / duration if duration > 0 else 0
    return f"⏱️ {elapsed:.2f}s | Audio: {duration:.1f}s | RTF: {rtf:.4f}"


# @spaces.GPU(duration=60)  # disabled for CPU-only space
def transcribe(audio_input, pipeline_type, language):
    """Transcribe one audio clip with the selected pipeline.

    Args:
        audio_input: ``None``, a path to an audio file, or a
            ``(sample_rate, numpy_array)`` tuple of raw samples.
        pipeline_type: Model key passed to ``get_model``.
        language: Language code; only forwarded on the non-"fun-asr-nano"
            branch, where an empty value falls back to ``"auto"``.

    Returns:
        A ``(text, metrics)`` tuple of strings. On failure ``text`` holds a
        message and ``metrics`` is empty.
    """
    if audio_input is None:
        return "Please upload an audio file or record via microphone.", ""

    model = get_model(pipeline_type)
    if model is None:
        return "Model loading failed.", ""

    # Raw arrays are materialised as a temp WAV; file paths are used as-is.
    is_raw_audio = isinstance(audio_input, tuple)
    if is_raw_audio:
        sr, audio_data = audio_input
        audio_path = _write_temp_wav(sr, audio_data)
    else:
        audio_path = audio_input

    start_time = time.time()
    try:
        if pipeline_type == "fun-asr-nano":
            res = model.generate(input=[audio_path], use_itn=True, batch_size=1)
        else:
            res = model.generate(
                input=audio_path, cache={}, language=language or "auto",
                use_itn=True, batch_size_s=60, merge_vad=True,
            )
        elapsed = time.time() - start_time
        text = rich_transcription_postprocess(res[0]["text"])
        return text, _format_metrics(elapsed, audio_path)
    except Exception as e:
        return f"Error: {str(e)}", ""
    finally:
        # Only remove files this function created itself.
        if is_raw_audio and os.path.exists(audio_path):
            os.unlink(audio_path)
# Header banner rendered at the top of the page via gr.HTML in launch():
# title, tagline, a short blurb and a row of project links.
description_html = """
<div style="text-align: center; max-width: 850px; margin: 0 auto;">
<h1 style="font-size: 2.2em; margin-bottom: 0.1em;">🚀 Fun-ASR-Nano</h1>
<p style="font-size: 1.3em; color: #444; margin-bottom: 0.3em;">LLM-Powered Speech Recognition — 31 Languages, Dialects & Accents</p>
<p style="font-size: 1em; color: #666;">
End-to-end ASR trained on <strong>tens of millions of hours</strong> of data.
Supports Chinese (+ dialects), English, Japanese, Korean, French, German, Spanish, and 24 more languages.
</p>
<p style="font-size: 0.9em; margin-top: 0.8em;">
<a href="https://github.com/FunAudioLLM/Fun-ASR" target="_blank">⭐ GitHub (Fun-ASR)</a> ·
<a href="https://github.com/modelscope/FunASR" target="_blank">🛠️ FunASR Toolkit</a> ·
<a href="https://github.com/FunAudioLLM/SenseVoice" target="_blank">🎙️ SenseVoice</a> ·
<a href="https://huggingface.co/FunAudioLLM/Fun-ASR-Nano-2512" target="_blank">🤗 Model Card</a>
</p>
</div>
"""
# Two-row comparison table (Fun-ASR-Nano vs. SenseVoice) rendered via gr.HTML
# in launch(), directly below the header banner.
comparison_html = """
<div style="background: linear-gradient(135deg, #f0f9ff 0%, #f5f3ff 100%); border-radius: 10px; padding: 16px; margin: 10px 0;">
<table style="width: 100%; border-collapse: collapse; font-size: 0.9em;">
<tr style="border-bottom: 2px solid #ddd;">
<th style="padding: 8px; text-align: left;">Model</th>
<th style="padding: 8px; text-align: center;">Languages</th>
<th style="padding: 8px; text-align: center;">Architecture</th>
<th style="padding: 8px; text-align: center;">Best For</th>
</tr>
<tr style="background: #e8f4fd;">
<td style="padding: 8px;"><strong>Fun-ASR-Nano</strong> ⭐</td>
<td style="padding: 8px; text-align: center;">31</td>
<td style="padding: 8px; text-align: center;">LLM-based</td>
<td style="padding: 8px; text-align: center;">Multi-language, dialects, highest accuracy</td>
</tr>
<tr>
<td style="padding: 8px;">SenseVoice</td>
<td style="padding: 8px; text-align: center;">5</td>
<td style="padding: 8px; text-align: center;">CTC (non-AR)</td>
<td style="padding: 8px; text-align: center;">Speed + Emotion + Audio events</td>
</tr>
</table>
</div>
"""
def launch():
    """Assemble the Gradio Blocks interface and start serving it.

    Layout: header and comparison HTML on top, then a two-column row with the
    inputs (audio, model, language, button) on the left and the outputs
    (transcription, performance) on the right, followed by usage notes.
    """
    model_choices = ["fun-asr-nano", "sensevoice"]
    language_choices = ["auto", "zh", "en", "yue", "ja", "ko"]

    with gr.Blocks(theme=gr.themes.Soft(), title="Fun-ASR-Nano - 31 Language ASR") as demo:
        gr.HTML(description_html)
        gr.HTML(comparison_html)

        with gr.Row():
            # Left column: everything the user provides.
            with gr.Column(scale=1):
                audio_box = gr.Audio(
                    label="Upload audio or record via microphone",
                    sources=["upload", "microphone"],
                    type="filepath",
                )
                with gr.Row():
                    model_picker = gr.Dropdown(
                        choices=model_choices,
                        value="fun-asr-nano",
                        label="Model",
                    )
                    language_picker = gr.Dropdown(
                        choices=language_choices,
                        value="auto",
                        label="Language (SenseVoice only)",
                        interactive=True,
                    )
                run_button = gr.Button("🎯 Transcribe", variant="primary", size="lg")

            # Right column: results.
            with gr.Column(scale=1):
                transcript_box = gr.Textbox(
                    label="Transcription Result",
                    lines=10,
                    show_copy_button=True,
                )
                metrics_box = gr.Textbox(label="Performance", lines=1)

        run_button.click(
            fn=transcribe,
            inputs=[audio_box, model_picker, language_picker],
            outputs=[transcript_box, metrics_box],
        )

        # The Markdown text is kept flush-left so its rendering does not
        # depend on any indentation stripping.
        gr.Markdown("""
### Supported Languages (Fun-ASR-Nano)
Chinese (Mandarin, Cantonese, Sichuan, Shanghai, Minnan, Wenzhou, Hakka, Gan, and more),
English, Japanese, Korean, French, German, Spanish, Italian, Portuguese, Russian, Arabic, Hindi,
Thai, Vietnamese, Indonesian, Malay, Turkish, Polish, Dutch, Swedish, Hebrew, Greek, Czech, Romanian, Hungarian, Finnish, Danish, Norwegian, Ukrainian.
### Tips
- **Fun-ASR-Nano**: Best for multi-language & Chinese dialects. Outputs punctuation natively.
- **SenseVoice**: Ultra-fast (7x faster than Whisper-small), also detects emotions & audio events.
- For long audio (>5min), consider using [FunASR](https://github.com/modelscope/FunASR) locally with GPU.
""")

    demo.launch()
# Build and serve the UI only when this file is executed as a script.
if __name__ == "__main__":
    launch()