
Text-to-Speech

Text-to-speech (TTS): generate speech audio from written input, optionally using custom voices.

Overview

Text-to-Speech uses ONNX Runtime as its inference engine. Load any supported model with modelType: "tts", then provide text as input (with inputType: "text") to generate speech audio.

textToSpeech() returns an object containing buffer (a promise that resolves to the complete audio samples) and, when streaming is enabled, a bufferStream for incremental audio output.
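With stream: true, bufferStream can be consumed incrementally. As a sketch, assuming bufferStream is an async iterable yielding Int16Array chunks (an assumption for illustration, not confirmed by this page), the chunks can be merged into one sample buffer like this:

```javascript
// Merge Int16Array audio chunks from any async iterable into a single
// Int16Array, e.g. from a streaming textToSpeech() result.
async function collectAudioChunks(stream) {
    const chunks = [];
    let total = 0;
    for await (const chunk of stream) {
        chunks.push(chunk);
        total += chunk.length;
    }
    const merged = new Int16Array(total);
    let offset = 0;
    for (const chunk of chunks) {
        merged.set(chunk, offset);
        offset += chunk.length;
    }
    return merged;
}
```

Under the assumption above, this could be used as `const samples = await collectAudioChunks(result.bufferStream);` before writing the WAV file.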

Functions

Use the following sequence of function calls:

  1. loadModel()
  2. textToSpeech()
  3. unloadModel()

For how to use each function, see SDK — API reference.

Models

You can load any Chatterbox or Supertonic model bundle compatible with ONNX Runtime. Required files: an onnx/ directory with one or more *.onnx graphs (and optional *.onnx_data) plus the model assets required by the bundle (tokenizer/config files and voice assets such as voices/*.bin or a reference *.wav).

For models available as constants, see SDK — Models.

Example

Chatterbox

The following script shows an example of Chatterbox TTS with voice cloning from a reference audio file. Use it with utils.js / utils.ts:

tts-chatterbox.js
import {
    loadModel,
    textToSpeech,
    unloadModel,
    TTS_TOKENIZER_EN_CHATTERBOX,
    TTS_SPEECH_ENCODER_EN_CHATTERBOX_FP32,
    TTS_EMBED_TOKENS_EN_CHATTERBOX_FP32,
    TTS_CONDITIONAL_DECODER_EN_CHATTERBOX_FP32,
    TTS_LANGUAGE_MODEL_EN_CHATTERBOX_FP32,
} from "@qvac/sdk";
import {
    createWav,
    playAudio,
    int16ArrayToBuffer,
    createWavHeader,
} from "./utils.js";
// Chatterbox TTS: voice cloning with reference audio.
// Uses registry model constants - downloads automatically from QVAC Registry.
// Only the reference audio WAV needs to be provided by the user.
// Usage: node tts-chatterbox.js <referenceAudioSrc>
const [referenceAudioSrc] = process.argv.slice(2);
if (!referenceAudioSrc) {
    console.error("Usage: node tts-chatterbox.js <referenceAudioSrc>");
    process.exit(1);
}
const CHATTERBOX_SAMPLE_RATE = 24000;
try {
    const modelId = await loadModel({
        modelSrc: TTS_TOKENIZER_EN_CHATTERBOX.src,
        modelType: "tts",
        modelConfig: {
            ttsEngine: "chatterbox",
            language: "en",
            ttsTokenizerSrc: TTS_TOKENIZER_EN_CHATTERBOX.src,
            ttsSpeechEncoderSrc: TTS_SPEECH_ENCODER_EN_CHATTERBOX_FP32.src,
            ttsEmbedTokensSrc: TTS_EMBED_TOKENS_EN_CHATTERBOX_FP32.src,
            ttsConditionalDecoderSrc: TTS_CONDITIONAL_DECODER_EN_CHATTERBOX_FP32.src,
            ttsLanguageModelSrc: TTS_LANGUAGE_MODEL_EN_CHATTERBOX_FP32.src,
            referenceAudioSrc,
        },
        onProgress: (progress) => {
            console.log(progress);
        },
    });
    console.log(`Model loaded: ${modelId}`);
    console.log("🎵 Testing Text-to-Speech...");
    const result = textToSpeech({
        modelId,
        text: `QVAC SDK is the canonical entry point to QVAC. Written in TypeScript, it provides all QVAC capabilities through a unified interface while also abstracting away the complexity of running your application in a JS environment other than Bare. Supported JS environments include Bare, Node.js, Expo and Bun.`,
        inputType: "text",
        stream: false,
    });
    const audioBuffer = await result.buffer;
    console.log(`TTS complete. Total samples: ${audioBuffer.length}`);
    console.log("💾 Saving audio to file...");
    createWav(audioBuffer, CHATTERBOX_SAMPLE_RATE, "tts-output.wav");
    console.log("✅ Audio saved to tts-output.wav");
    console.log("🔊 Playing audio...");
    const audioData = int16ArrayToBuffer(audioBuffer);
    const wavBuffer = Buffer.concat([
        createWavHeader(audioData.length, CHATTERBOX_SAMPLE_RATE),
        audioData,
    ]);
    playAudio(wavBuffer);
    console.log("✅ Audio playback complete");
    await unloadModel({ modelId });
    console.log("Model unloaded");
    process.exit(0);
}
catch (error) {
    console.error("❌ Error:", error);
    process.exit(1);
}

Supertonic

The following script shows an example of Supertonic TTS for general-purpose speech synthesis. Use it with utils.js / utils.ts:

tts-supertonic.js
import {
    loadModel,
    textToSpeech,
    unloadModel,
    TTS_TOKENIZER_SUPERTONIC,
    TTS_TEXT_ENCODER_SUPERTONIC_FP32,
    TTS_LATENT_DENOISER_SUPERTONIC_FP32,
    TTS_VOICE_DECODER_SUPERTONIC_FP32,
    TTS_VOICE_STYLE_SUPERTONIC,
} from "@qvac/sdk";
import {
    createWav,
    playAudio,
    int16ArrayToBuffer,
    createWavHeader,
} from "./utils.js";
// Supertonic TTS: general-purpose, no voice cloning.
// Uses registry model constants - downloads automatically from QVAC Registry.
const SUPERTONIC_SAMPLE_RATE = 44100;
try {
    const modelId = await loadModel({
        modelSrc: TTS_TOKENIZER_SUPERTONIC.src,
        modelType: "tts",
        modelConfig: {
            ttsEngine: "supertonic",
            language: "en",
            ttsTokenizerSrc: TTS_TOKENIZER_SUPERTONIC.src,
            ttsTextEncoderSrc: TTS_TEXT_ENCODER_SUPERTONIC_FP32.src,
            ttsLatentDenoiserSrc: TTS_LATENT_DENOISER_SUPERTONIC_FP32.src,
            ttsVoiceDecoderSrc: TTS_VOICE_DECODER_SUPERTONIC_FP32.src,
            ttsVoiceSrc: TTS_VOICE_STYLE_SUPERTONIC.src,
        },
        onProgress: (progress) => {
            console.log(progress);
        },
    });
    console.log(`Model loaded: ${modelId}`);
    console.log("🎵 Testing Text-to-Speech...");
    const result = textToSpeech({
        modelId,
        text: `QVAC SDK is the canonical entry point to QVAC. Written in TypeScript, it provides all QVAC capabilities through a unified interface while also abstracting away the complexity of running your application in a JS environment other than Bare. Supported JS environments include Bare, Node.js, Expo and Bun.`,
        inputType: "text",
        stream: false,
    });
    const audioBuffer = await result.buffer;
    console.log(`TTS complete. Total samples: ${audioBuffer.length}`);
    console.log("💾 Saving audio to file...");
    createWav(audioBuffer, SUPERTONIC_SAMPLE_RATE, "supertonic-output.wav");
    console.log("✅ Audio saved to supertonic-output.wav");
    console.log("🔊 Playing audio...");
    const audioData = int16ArrayToBuffer(audioBuffer);
    const wavBuffer = Buffer.concat([
        createWavHeader(audioData.length, SUPERTONIC_SAMPLE_RATE),
        audioData,
    ]);
    playAudio(wavBuffer);
    console.log("✅ Audio playback complete");
    await unloadModel({ modelId });
    console.log("Model unloaded");
    process.exit(0);
}
catch (error) {
    console.error("❌ Error:", error);
    process.exit(1);
}

Utils

The following helper script is used by both examples above to convert the raw PCM samples returned by textToSpeech() into a WAV file and play it back:

utils.js
import { writeFileSync, unlinkSync } from "fs";
import { spawnSync } from "child_process";
import { platform, tmpdir } from "os";
import { join } from "path";
/**
 * Create WAV header for 16-bit PCM audio
 */
export function createWavHeader(dataLength, sampleRate) {
    const header = Buffer.alloc(44);
    // RIFF header
    header.write("RIFF", 0);
    header.writeUInt32LE(36 + dataLength, 4);
    header.write("WAVE", 8);
    // fmt chunk
    header.write("fmt ", 12);
    header.writeUInt32LE(16, 16); // fmt chunk size
    header.writeUInt16LE(1, 20); // PCM format
    header.writeUInt16LE(1, 22); // mono
    header.writeUInt32LE(sampleRate, 24);
    header.writeUInt32LE(sampleRate * 2, 28); // byte rate
    header.writeUInt16LE(2, 32); // block align
    header.writeUInt16LE(16, 34); // bits per sample
    // data chunk
    header.write("data", 36);
    header.writeUInt32LE(dataLength, 40);
    return header;
}
/**
 * Convert Int16Array to Buffer
 */
export function int16ArrayToBuffer(samples) {
    const buffer = Buffer.alloc(samples.length * 2);
    for (let i = 0; i < samples.length; i++) {
        const value = Math.max(-32768, Math.min(32767, Math.round(samples[i] ?? 0)));
        buffer.writeInt16LE(value, i * 2);
    }
    return buffer;
}
/**
 * Create and save WAV file
 */
export function createWav(audioBuffer, sampleRate, filename) {
    const audioData = int16ArrayToBuffer(audioBuffer);
    const wavHeader = createWavHeader(audioData.length, sampleRate);
    const wavFile = Buffer.concat([wavHeader, audioData]);
    writeFileSync(filename, wavFile);
    console.log(`WAV file saved as: ${filename}`);
}
/**
 * Play audio using system audio players
 */
export function playAudio(audioBuffer) {
    const currentPlatform = platform();
    // Use the OS temp directory so the path is valid on all platforms
    const tempFile = join(tmpdir(), `audio-${Date.now()}.wav`);
    // Write audio buffer to temporary file
    writeFileSync(tempFile, audioBuffer);
    let audioPlayer;
    let args;
    switch (currentPlatform) {
        case "darwin":
            audioPlayer = "afplay";
            args = [tempFile];
            break;
        case "linux":
            audioPlayer = "aplay";
            args = [tempFile];
            break;
        case "win32":
            audioPlayer = "powershell";
            args = [
                "-Command",
                `Add-Type -AssemblyName presentationCore; (New-Object Media.SoundPlayer).LoadStream([System.IO.File]::ReadAllBytes('${tempFile}')).PlaySync()`,
            ];
            break;
        default:
            audioPlayer = "aplay";
            args = [tempFile];
    }
    const result = spawnSync(audioPlayer, args, {
        stdio: ["inherit", "inherit", "inherit"],
    });
    try {
        unlinkSync(tempFile);
    }
    catch {
        // Ignore cleanup errors
    }
    if (result.error) {
        throw new Error(`Audio player failed: ${result.error.message}`);
    }
    if (result.status !== 0) {
        throw new Error(`Audio player exited with code ${result.status}`);
    }
}
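As a sanity check on the header layout used above, the 44-byte header can be read back field by field. The snippet below re-creates the same mono, 16-bit PCM layout inline (so it is self-contained) and inspects a few fields; the sample rate of 24000 matches the Chatterbox example:

```javascript
// Build the same 44-byte mono 16-bit PCM WAV header as utils.js and
// read its fields back to confirm the byte offsets.
function buildHeader(dataLength, sampleRate) {
    const header = Buffer.alloc(44);
    header.write("RIFF", 0);
    header.writeUInt32LE(36 + dataLength, 4); // RIFF chunk size
    header.write("WAVE", 8);
    header.write("fmt ", 12);
    header.writeUInt32LE(16, 16); // fmt chunk size
    header.writeUInt16LE(1, 20);  // PCM format
    header.writeUInt16LE(1, 22);  // mono
    header.writeUInt32LE(sampleRate, 24);
    header.writeUInt32LE(sampleRate * 2, 28); // byte rate = rate * 1 ch * 2 bytes
    header.writeUInt16LE(2, 32);  // block align
    header.writeUInt16LE(16, 34); // bits per sample
    header.write("data", 36);
    header.writeUInt32LE(dataLength, 40);
    return header;
}

// One second of 24 kHz mono 16-bit audio: 24000 samples * 2 bytes each.
const header = buildHeader(24000 * 2, 24000);
console.log(header.toString("ascii", 0, 4)); // "RIFF"
console.log(header.readUInt32LE(24));        // 24000 (sample rate)
console.log(header.readUInt32LE(28));        // 48000 (byte rate)
console.log(header.readUInt32LE(40));        // 48000 (data chunk bytes)
```

This is why the byte rate written at offset 28 is sampleRate * 2: one channel at two bytes per sample.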

Tip: all examples throughout this documentation are self-contained and runnable. For instructions on how to run them, see SDK quickstart.
