Python Tools API

Developer reference for the standalone helper functions in `voxkitchen.tools`. The supported user workflow for data processing remains YAML pipelines run with `vkit docker run`.

from voxkitchen.tools import (
    audio_info,
    estimate_bandwidth,
    transcribe,
    detect_speech,
    classify_gender,
    estimate_snr,
    resample_audio,
    normalize_loudness,
    extract_speaker_embedding,
    compute_speaker_similarity,
    enhance_speech,
    align_words,
    tokenize_audio,
    synthesize,
)

Audio Info

info = audio_info("recording.wav")
# AudioInfo(path='recording.wav', sample_rate=16000, num_channels=1,
#           num_samples=160000, duration=10.0, format='WAV')

# Detect upsampled audio
info = audio_info("suspicious.wav", estimate_real_sr=True)
print(f"Header: {info.sample_rate} Hz, real: {info.real_sample_rate} Hz")

ASR Transcription

segments = transcribe("speech.wav", model="tiny")
# [SpeechSegment(start=0.0, end=3.2, text="Hello world")]

# Different engines
segments = transcribe("speech.wav", engine="sensevoice")     # Chinese/multilingual
segments = transcribe("speech.wav", engine="paraformer")      # Chinese, fast
segments = transcribe("speech.wav", engine="wenet", model="chinese")

Speech Detection (VAD)

segments = detect_speech("recording.wav", method="silero")
# [SpeechSegment(start=0.5, end=2.8), SpeechSegment(start=4.1, end=6.3)]

# Lightweight alternative
segments = detect_speech("recording.wav", method="webrtc")

Quality Estimation

snr = estimate_snr("noisy.wav")
# 18.3 (dB)

bandwidth = estimate_bandwidth("maybe_upsampled.wav")
# 8000.0 (Hz — real content is 8kHz despite 16kHz header)

Gender Classification

result = classify_gender("speaker.wav")
# {"gender": "f", "median_f0": 210.5, "method": "f0"}

result = classify_gender("speaker.wav", method="speechbrain")
# {"gender": "m", "method": "speechbrain", ...}

Audio Processing

resample_audio("input.wav", "output_16k.wav", target_sr=16000)
normalize_loudness("loud.wav", "normalized.wav", target_lufs=-23.0)

Speaker Embedding

emb = extract_speaker_embedding("speaker.wav")
# [0.12, -0.34, 0.56, ...]

# Explicit SpeechBrain model
emb = extract_speaker_embedding("speaker.wav", method="speechbrain",
                                 model="speechbrain/spkrec-ecapa-voxceleb")

Runtime image for equivalent pipeline operators: slim.

Speaker Similarity

Compare a sample against a saved reference embedding (cosine similarity score, 0–1):

# First, save a reference embedding (e.g. enrollment audio):
import numpy as np
ref = extract_speaker_embedding("enroll.wav")
np.save("reference.npy", np.asarray(ref, dtype=np.float32))

# Later, score new samples against it:
sim = compute_speaker_similarity("test.wav", "reference.npy")
# > 0.65 → likely same speaker; < 0.40 → likely different.

Runtime image for equivalent pipeline operators: slim.

Speech Enhancement

enhance_speech("noisy.wav", "clean.wav", aggressiveness=0.5)

Runtime image for equivalent pipeline operators: slim.

Forced Alignment

words = align_words("speech.wav", "hello world", language="English")
# [{"text": "hello", "start": 0.12, "end": 0.58},
#  {"text": "world", "start": 0.62, "end": 1.15}]

# Chinese
words = align_words("speech.wav", "你好世界", language="Chinese")

Runtime image for equivalent pipeline operators: asr.

Audio Tokenization

Encode audio into discrete codec tokens for neural codec / audio LM use:

tokens = tokenize_audio("speech.wav", backend="encodec", bandwidth=6.0)
# tokens is a list of token sequences, one per codebook layer.

tokens = tokenize_audio("speech.wav", backend="dac")

Runtime image for equivalent pipeline operators: slim.

TTS Synthesis

# Kokoro — lightweight (82M), CPU-capable, 8 languages
synthesize("Hello world!", "hello.wav", engine="kokoro")
synthesize("你好，世界", "zh.wav", engine="kokoro", language="z")
synthesize("Hello", "slow.wav", engine="kokoro", speed=0.8)

# ChatTTS — conversational, Chinese / English, GPU
synthesize("你好世界", "chat.wav", engine="chattts", seed=42)

# CosyVoice2 — zero-shot voice cloning, GPU
synthesize("你好", "clone.wav", engine="cosyvoice",
           reference_audio="ref.wav", reference_text="参考文本")

# Fish-Speech — zero-shot voice cloning, GPU
synthesize("Hello", "clone.wav", engine="fish_speech",
           reference_audio="ref.wav")

| `engine` | Docker tag | Device | Notes |
|---|---|---|---|
| `"kokoro"` | `tts` | CPU/GPU | 82M params, 8 langs (`a`/`b`/`j`/`z`/...), `speed` supported. |
| `"chattts"` | `tts` | GPU | Conversational ZH/EN, `seed` for speaker sampling. |
| `"cosyvoice"` | `tts` | GPU | Voice cloning via `reference_audio` + `reference_text`. |
| `"fish_speech"` | `fish-speech` | GPU | Fish-Speech S2 voice cloning via `reference_audio`. |