initial commit
.tool-versions (Normal file, +2 lines)
@@ -0,0 +1,2 @@
python 3.14.3
nodejs 24.4.1
README.md (Normal file, +103 lines)
@@ -0,0 +1,103 @@
# Narrate

A high-performance text-to-speech (TTS) narration tool using the Kokoro v1.0 ONNX model. This project is optimized for AMD GPUs using the `MIGraphX` execution provider but falls back to the CPU when necessary.

## Features

- **High Quality**: Leverages the Kokoro v1.0 TTS model for natural-sounding speech.
- **Hardware Accelerated**: Optimized for AMD GPUs via ONNX Runtime and MIGraphX.
- **Sentence-Level Buffering**: Splits text into logical units (sentences/lines) to provide smooth, continuous playback without long initial wait times.
- **Asynchronous Playback**: Uses a dedicated background thread for audio playback so that generation and playback happen in parallel.
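The sentence-level splitting described above can be sketched as follows. `split_units` is a hypothetical helper name, but the regex is the one `narrate.py` itself uses:

```python
import re

# Split after terminal punctuation (., !, ?) or on newlines; the negative
# lookahead keeps runs like "..." together. Blank fragments are dropped.
def split_units(text):
    return [s.strip()
            for s in re.split(r'(?<=[.!?])(?![.!?])\s*|\n+', text)
            if s.strip()]
```

For example, `split_units("Hello there. How are you?\nNew line")` yields three units, one per sentence or line.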
## Prerequisites

### Hardware

- Optimized for systems with AMD GPUs (uses `MIGraphXExecutionProvider`).
- Works on the CPU as a fallback.

### Software

- **Python**: 3.14.3 (pinned in `.tool-versions` for the asdf version manager)
- **Node.js**: 24.4.1 (pinned in `.tool-versions` for the asdf version manager)
- **Required Files**: The following files must be present in the same directory:
  - `kokoro-v1.0.onnx`: The ONNX model file.
  - `voices-v1.0.bin`: The voice weights file.
  - `narrate.txt`: The text file you want to narrate.

## Models

This project requires the Kokoro v1.0 ONNX model and the corresponding voice binary. You can download them using the links below:

- **Kokoro v1.0 ONNX (FP16)**: [kokoro-v1.0.fp16.onnx (169 MB)](https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.fp16.onnx)
- **Voice Weights**: [voices-v1.0.bin (26.9 MB)](https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin)

### Quick Download

```bash
wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.fp16.onnx
wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin
```

*Note: The script `narrate.py` expects the model file to be named `kokoro-v1.0.onnx` or `kokoro-v1.0.fp16.onnx` in the same directory.*
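Since the script looks the model up by name, a small helper along these lines can pick whichever variant you downloaded. `find_model` is a hypothetical convenience, not part of `narrate.py` (which hardcodes `kokoro-v1.0.onnx`):

```python
import os

# Return the first model file that exists in the given directory, trying the
# names mentioned in the note above; returns None if neither is present.
def find_model(directory="."):
    for name in ("kokoro-v1.0.onnx", "kokoro-v1.0.fp16.onnx"):
        path = os.path.join(directory, name)
        if os.path.exists(path):
            return path
    return None
```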
## Setup

### 1. Create the Virtual Environment

The project uses a virtual environment named `kokoro-venv` to manage its dependencies.

```bash
# Create the virtual environment using Python 3.14 (as per .tool-versions)
python3.14 -m venv kokoro-venv

# Activate the environment
source kokoro-venv/bin/activate
```

### 2. Install Dependencies

With the virtual environment activated, install the required Python packages:

```bash
pip install --upgrade pip
pip install -r requirements.txt
```

*Note: For AMD GPU support (MIGraphX), ensure your environment has the necessary ROCm/MIGraphX libraries installed. The script automatically falls back to the CPU if the GPU provider is unavailable.*
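The fallback works because ONNX Runtime tries execution providers in priority order. This is the provider list `narrate.py` passes, shown standalone here (creating an actual session additionally requires `onnxruntime` and the model file):

```python
# Providers are tried left to right: MIGraphX first (with FP16 enabled),
# then the always-available CPU provider as the fallback.
providers = [
    ("MIGraphXExecutionProvider", {"device_id": 0, "migraphx_fp16_enable": True}),
    "CPUExecutionProvider",
]
```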
### 3. Audio Requirements

On Linux, you may need to install the PortAudio runtime library for `sounddevice` to work:

```bash
# For Ubuntu/Debian
sudo apt-get install libportaudio2
```

### 4. Model Files

Ensure you have downloaded the Kokoro ONNX model and voice binary and placed them in the project directory (see the Models section above).
## Usage

### Direct Python Execution

You can run the narration script directly:

```bash
python narrate.py
```

### Using the Shell Script

A convenience script is provided that runs the narrator using the local virtual environment:

```bash
./narrate.sh
```
## Configuration

The script `narrate.py` contains several adjustable settings:

- **Voice**: Defaults to `af_sky`.
- **Speed**: Set to `1.3x` for faster narration.
- **Environment Variables**:
  - `HSA_OVERRIDE_GFX_VERSION`: Set to `10.3.0` for ROCm compatibility (reports the GPU as gfx1030).
  - `MIGRAPHX_ENABLE_CACHE`: Enabled to speed up subsequent model loads.
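These variables must be set before `onnxruntime` is imported, which is why `narrate.py` assigns them at the very top of the file. A minimal sketch of that settings block:

```python
import os

# Mirror of the settings at the top of narrate.py: the GFX override makes
# ROCm treat the GPU as gfx1030; the cache flag speeds up subsequent loads.
os.environ["HSA_OVERRIDE_GFX_VERSION"] = "10.3.0"
os.environ["MIGRAPHX_ENABLE_CACHE"] = "1"
os.environ["MIGRAPHX_CACHE_PATH"] = "./migraphx_cache"
```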
## File Structure

- `narrate.py`: The core logic for TTS generation and audio playback.
- `narrate.sh`: Entry point script.
- `.tool-versions`: Version pinning for runtime environments.
- `kokoro-venv/`: Local Python virtual environment containing dependencies.
narrate.py (Normal file, +99 lines)
@@ -0,0 +1,99 @@
import os
import sys
import signal
import re
import queue
import threading
import time
import numpy as np

# SETTINGS
os.environ["HSA_OVERRIDE_GFX_VERSION"] = "10.3.0"
os.environ["ORT_LOGGING_LEVEL"] = "3"
os.environ["MIGRAPHX_ENABLE_CACHE"] = "1"
os.environ["MIGRAPHX_CACHE_PATH"] = os.path.expanduser("./migraphx_cache")

def signal_handler(sig, frame):
    print("\n[Ctrl+C] Stopping...")
    os._exit(0)

signal.signal(signal.SIGINT, signal_handler)

import onnxruntime as ort
import sounddevice as sd
from kokoro_onnx import Kokoro

# Paths
base_dir = os.path.expanduser("./")
model_path = os.path.join(base_dir, "kokoro-v1.0.onnx")
voices_path = os.path.join(base_dir, "voices-v1.0.bin")
input_text_path = os.path.join(base_dir, "narrate.txt")

# Initialize Session
try:
    session = ort.InferenceSession(model_path, providers=[
        ('MIGraphXExecutionProvider', {'device_id': 0, 'migraphx_fp16_enable': True}),
        'CPUExecutionProvider'
    ])
    kokoro = Kokoro.from_session(session, voices_path)
except Exception as e:
    print(f"Init Error: {e}")
    sys.exit(1)

audio_queue = queue.Queue()

def playback_worker():
    samplerate = 24000
    try:
        with sd.OutputStream(samplerate=samplerate, channels=1, dtype='float32') as stream:
            while True:
                item = audio_queue.get()
                if item is None:
                    audio_queue.task_done()
                    break

                samples, text_preview = item
                samples_reshaped = samples.reshape(-1, 1).astype('float32')

                print(f"Playing: {text_preview}...")
                stream.write(samples_reshaped)

                # Physical silence (0.5s) between units
                silence = np.zeros((int(samplerate * 0.5), 1), dtype='float32')
                stream.write(silence)

                audio_queue.task_done()
    except Exception as e:
        print(f"Playback Error: {e}")

play_thread = threading.Thread(target=playback_worker, daemon=True)
play_thread.start()

# Read and Split Text
if os.path.exists(input_text_path):
    with open(input_text_path, "r") as f:
        full_text = f.read()
else:
    full_text = "No input file found."

# Split on punctuation (and any following whitespace/newlines) OR standalone newlines.
# This avoids word-level splitting and ensures no double pauses.
sentences = [s.strip() for s in re.split(r'(?<=[.!?])(?![.!?])\s*|\n+', full_text) if s.strip()]
print(f"Detected {len(sentences)} units (sentences/lines).")

try:
    for i, sentence in enumerate(sentences):
        samples, _ = kokoro.create(sentence, voice="af_sky", speed=1.3, lang="en-us")
        audio_queue.put((samples, sentence[:50]))
        print(f"[{i+1}/{len(sentences)}] Buffered.")

    print("Generation complete. Waiting for playback to finish...")
    audio_queue.put(None)
    audio_queue.join()
    # Final flush
    time.sleep(0.1)

except Exception as e:
    print(f"Process Error: {e}")
finally:
    print("Finished.")
narrate.sh (Executable file, +2 lines)
@@ -0,0 +1,2 @@
#!/bin/bash
kokoro-venv/bin/python ./narrate.py
requirements.txt (Normal file, +4 lines)
@@ -0,0 +1,4 @@
kokoro-onnx
onnxruntime-gpu
sounddevice
numpy