From 0203858a6d3cf3de534f7243f5c23032e4bb03a1 Mon Sep 17 00:00:00 2001
From: shibao
Date: Thu, 5 Mar 2026 07:19:38 +0000
Subject: [PATCH] initial commit

---
 .tool-versions   |   2 +
 README.md        | 103 +++++++++++++++++++++++++++++++++++++++++++++++
 narrate.py       |  99 +++++++++++++++++++++++++++++++++++++++++++++
 narrate.sh       |   2 +
 requirements.txt |   4 ++
 5 files changed, 210 insertions(+)
 create mode 100644 .tool-versions
 create mode 100644 README.md
 create mode 100644 narrate.py
 create mode 100755 narrate.sh
 create mode 100644 requirements.txt

diff --git a/.tool-versions b/.tool-versions
new file mode 100644
index 0000000..9817b3b
--- /dev/null
+++ b/.tool-versions
@@ -0,0 +1,2 @@
+python 3.14.3
+nodejs 24.4.1
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a699ab0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,103 @@
+# Narrate
+
+A high-performance text-to-speech (TTS) narration tool using the Kokoro v1.0 ONNX model. This project is optimized for AMD GPUs using the `MIGraphX` execution provider but falls back to CPU when necessary.
+
+## Features
+
+- **High Quality**: Leverages the Kokoro v1.0 TTS model for natural-sounding speech.
+- **Hardware Accelerated**: Optimized for AMD GPUs via ONNX Runtime and MIGraphX.
+- **Sentence-Level Buffering**: Splits text into logical units (sentences/lines) to provide smooth, continuous playback without long initial wait times.
+- **Asynchronous Playback**: Uses a dedicated background thread for audio playback to ensure generation and playback happen in parallel.
+
+## Prerequisites
+
+### Hardware
+- Optimized for systems with AMD GPUs (uses `MIGraphXExecutionProvider`).
+- Works on CPU (fallback).
+
+### Software
+- **Python**: 3.14.3 (as specified in `.tool-versions`, managed with the asdf version manager)
+- **Node.js**: 24.4.1 (as specified in `.tool-versions`, managed with the asdf version manager)
+- **Required Files**: The following files must be present in the same directory:
+  - `kokoro-v1.0.onnx`: The ONNX model file.
+  - `voices-v1.0.bin`: The voice weights file.
+  - `narrate.txt`: The text file you want to narrate.
+
+## Models
+
+This project requires the Kokoro v1.0 ONNX model and the corresponding voice binary. You can download them using the links below:
+
+- **Kokoro v1.0 ONNX (FP16)**: [kokoro-v1.0.fp16.onnx (169 MB)](https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.fp16.onnx)
+- **Voice Weights**: [voices-v1.0.bin (26.9 MB)](https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin)
+
+### Quick Download
+```bash
+wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.fp16.onnx
+wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin
+```
+
+*Note: The script `narrate.py` expects the model file to be named `kokoro-v1.0.onnx` in the same directory, so rename the downloaded FP16 file accordingly.*
+
+## Setup
+
+### 1. Create the Virtual Environment
+The project uses a virtual environment named `kokoro-venv` to manage its dependencies.
+
+```bash
+# Create the virtual environment using Python 3.14 (as per .tool-versions)
+python3.14 -m venv kokoro-venv
+
+# Activate the environment
+source kokoro-venv/bin/activate
+```
+
+### 2. Install Dependencies
+With the virtual environment activated, install the required Python packages:
+
+```bash
+pip install --upgrade pip
+pip install -r requirements.txt
+```
+
+*Note: For AMD GPU support (MIGraphX), ensure your environment has the necessary ROCm/MIGraphX libraries installed. The script will automatically fall back to the CPU if the GPU provider is unavailable.*
+
+### 3. Audio Requirements
+On Linux, you may need to install the PortAudio runtime library for `sounddevice` to work:
+
+```bash
+# For Ubuntu/Debian
+sudo apt-get install libportaudio2
+```
+
+*Note: Ensure you have downloaded the Kokoro ONNX model and voice binaries and placed them in the same folder (see the Models section above).*
+
+## Usage
+
+### Direct Python Execution
+You can run the narration script directly:
+```bash
+python narrate.py
+```
+
+### Using the Shell Script
+A convenience script is provided to run the narrator using the local virtual environment:
+```bash
+./narrate.sh
+```
+
+## Configuration
+
+The script `narrate.py` contains several adjustable settings:
+
+- **Voice**: Defaulted to `af_sky`.
+- **Speed**: Set to `1.3x` for faster narration.
+- **Environment Variables**:
+  - `HSA_OVERRIDE_GFX_VERSION`: Set to `10.3.0` for compatibility.
+  - `MIGRAPHX_ENABLE_CACHE`: Enabled to speed up subsequent loads.
+
+## File Structure
+
+- `narrate.py`: The core logic for TTS generation and audio playback.
+- `narrate.sh`: Entry point script.
+- `.tool-versions`: Version pinning for runtime environments.
+- `kokoro-venv/`: Local Python virtual environment containing dependencies.
diff --git a/narrate.py b/narrate.py
new file mode 100644
index 0000000..9927a6d
--- /dev/null
+++ b/narrate.py
@@ -0,0 +1,99 @@
+import os
+import sys
+import signal
+import re
+import queue
+import threading
+import time
+import numpy as np
+
+# SETTINGS
+os.environ["HSA_OVERRIDE_GFX_VERSION"] = "10.3.0"
+os.environ["ORT_LOGGING_LEVEL"] = "3"
+os.environ["MIGRAPHX_ENABLE_CACHE"] = "1"
+os.environ["MIGRAPHX_CACHE_PATH"] = os.path.expanduser("./migraphx_cache")
+
+def signal_handler(sig, frame):
+    print("\n[Ctrl+C] Stopping...")
+    os._exit(0)
+
+signal.signal(signal.SIGINT, signal_handler)
+
+import onnxruntime as ort
+import sounddevice as sd
+from kokoro_onnx import Kokoro
+
+# Paths
+base_dir = os.path.expanduser("./")
+model_path = os.path.join(base_dir, "kokoro-v1.0.onnx")
+voices_path = os.path.join(base_dir, "voices-v1.0.bin")
+input_text_path = os.path.join(base_dir, "narrate.txt")
+
+# Initialize Session
+try:
+    session = ort.InferenceSession(model_path, providers=[
+        ('MIGraphXExecutionProvider', {'device_id': 0, 'migraphx_fp16_enable': True}),
+        'CPUExecutionProvider'
+    ])
+    kokoro = Kokoro.from_session(session, voices_path)
+except Exception as e:
+    print(f"Init Error: {e}")
+    sys.exit(1)
+
+audio_queue = queue.Queue()
+
+def playback_worker():
+    samplerate = 24000
+    try:
+        with sd.OutputStream(samplerate=samplerate, channels=1, dtype='float32') as stream:
+            while True:
+                item = audio_queue.get()
+                if item is None:
+                    audio_queue.task_done()
+                    break
+
+                samples, text_preview = item
+                samples_reshaped = samples.reshape(-1, 1).astype('float32')
+
+                print(f"Playing: {text_preview}...")
+                stream.write(samples_reshaped)
+
+                # Physical silence (0.5s)
+                silence = np.zeros((int(samplerate * 0.5), 1), dtype='float32')
+                stream.write(silence)
+
+                audio_queue.task_done()
+    except Exception as e:
+        print(f"Playback Error: {e}")
+
+play_thread = threading.Thread(target=playback_worker, daemon=True)
+play_thread.start()
+
+# Read and Split Text
+if os.path.exists(input_text_path):
+    with open(input_text_path, "r") as f:
+        full_text = f.read()
+else:
+    full_text = "No input file found."
+
+# Split on punctuation (and any following whitespace/newlines) OR standalone newlines
+# This avoids word-level splitting and ensures no double pauses
+sentences = [s.strip() for s in re.split(r'(?<=[.!?])(?![.!?])\s*|\n+', full_text) if s.strip()]
+print(f"Detected {len(sentences)} units (sentences/lines).")
+
+try:
+    for i, sentence in enumerate(sentences):
+        samples, _ = kokoro.create(sentence, voice="af_sky", speed=1.3, lang="en-us")
+        audio_queue.put((samples, sentence[:50]))
+        print(f"[{i+1}/{len(sentences)}] Buffered.")
+
+    print("Generation complete. Waiting for playback to finish...")
+    audio_queue.put(None)
+    audio_queue.join()
+    # Final flush
+    time.sleep(0.1)
+
+except Exception as e:
+    print(f"Process Error: {e}")
+finally:
+    print("Finished.")
diff --git a/narrate.sh b/narrate.sh
new file mode 100755
index 0000000..387ff78
--- /dev/null
+++ b/narrate.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+kokoro-venv/bin/python ./narrate.py
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8fcae91
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+kokoro-onnx
+onnxruntime-gpu
+sounddevice
+numpy