Python Tools API
Developer reference for standalone helper functions in `voxkitchen.tools`.
The supported user workflow for data processing remains YAML plus
`vkit docker run`.
from voxkitchen.tools import (
audio_info,
estimate_bandwidth,
transcribe,
detect_speech,
classify_gender,
estimate_snr,
resample_audio,
normalize_loudness,
extract_speaker_embedding,
compute_speaker_similarity,
enhance_speech,
align_words,
tokenize_audio,
synthesize,
)
Audio Info
info = audio_info("recording.wav")
# AudioInfo(path='recording.wav', sample_rate=16000, num_channels=1,
# num_samples=160000, duration=10.0, format='WAV')
# Detect upsampled audio
info = audio_info("suspicious.wav", estimate_real_sr=True)
print(f"Header: {info.sample_rate} Hz, real: {info.real_sample_rate} Hz")
ASR Transcription
segments = transcribe("speech.wav", model="tiny")
# [SpeechSegment(start=0.0, end=3.2, text="Hello world")]
# Different engines
segments = transcribe("speech.wav", engine="sensevoice") # Chinese/multilingual
segments = transcribe("speech.wav", engine="paraformer") # Chinese, fast
segments = transcribe("speech.wav", engine="wenet", model="chinese")
Speech Detection (VAD)
segments = detect_speech("recording.wav", method="silero")
# [SpeechSegment(start=0.5, end=2.8), SpeechSegment(start=4.1, end=6.3)]
# Lightweight alternative
segments = detect_speech("recording.wav", method="webrtc")
Quality Estimation
snr = estimate_snr("noisy.wav")
# 18.3 (dB)
bandwidth = estimate_bandwidth("maybe_upsampled.wav")
# 8000.0 (Hz — real content is 8kHz despite 16kHz header)
Gender Classification
result = classify_gender("speaker.wav")
# {"gender": "f", "median_f0": 210.5, "method": "f0"}
result = classify_gender("speaker.wav", method="speechbrain")
# {"gender": "m", "method": "speechbrain", ...}
Audio Processing
resample_audio("input.wav", "output_16k.wav", target_sr=16000)
normalize_loudness("loud.wav", "normalized.wav", target_lufs=-23.0)
Speaker Embedding
emb = extract_speaker_embedding("speaker.wav")
# [0.12, -0.34, 0.56, ...]
# Explicit SpeechBrain model
emb = extract_speaker_embedding("speaker.wav", method="speechbrain",
model="speechbrain/spkrec-ecapa-voxceleb")
Runtime image for equivalent pipeline operators: slim.
Speaker Similarity
Compare a sample against a saved reference embedding (cosine, 0–1):
# First, save a reference embedding (e.g. enrollment audio):
import numpy as np
ref = extract_speaker_embedding("enroll.wav")
np.save("reference.npy", np.asarray(ref, dtype=np.float32))
# Later, score new samples against it:
sim = compute_speaker_similarity("test.wav", "reference.npy")
# > 0.65 → likely same speaker; < 0.40 → likely different.
Runtime image for equivalent pipeline operators: slim.
Speech Enhancement
enhance_speech("noisy.wav", "clean.wav", aggressiveness=0.5)
Runtime image for equivalent pipeline operators: slim.
Forced Alignment
words = align_words("speech.wav", "hello world", language="English")
# [{"text": "hello", "start": 0.12, "end": 0.58},
# {"text": "world", "start": 0.62, "end": 1.15}]
# Chinese
words = align_words("speech.wav", "你好世界", language="Chinese")
Runtime image for equivalent pipeline operators: asr.
Audio Tokenization
Encode audio into discrete codec tokens for neural codec / audio LM use:
tokens = tokenize_audio("speech.wav", backend="encodec", bandwidth=6.0)
# tokens is a list of token sequences, one per codebook layer.
tokens = tokenize_audio("speech.wav", backend="dac")
Runtime image for equivalent pipeline operators: slim.
TTS Synthesis
# Kokoro — lightweight (82M), CPU-capable, 8 languages
synthesize("Hello world!", "hello.wav", engine="kokoro")
synthesize("你好,世界", "zh.wav", engine="kokoro", language="z")
synthesize("Hello", "slow.wav", engine="kokoro", speed=0.8)
# ChatTTS — conversational, Chinese / English, GPU
synthesize("你好世界", "chat.wav", engine="chattts", seed=42)
# CosyVoice2 — zero-shot voice cloning, GPU
synthesize("你好", "clone.wav", engine="cosyvoice",
reference_audio="ref.wav", reference_text="参考文本")
# Fish-Speech — zero-shot voice cloning, GPU
synthesize("Hello", "clone.wav", engine="fish_speech",
reference_audio="ref.wav")
| engine | Docker tag | Device | Notes |
|---|---|---|---|
| `"kokoro"` | `tts` | CPU/GPU | 82M params, 8 langs (a/b/j/z/...), speed supported. |
| `"chattts"` | `tts` | GPU | Conversational ZH/EN, seed for speaker sampling. |
| `"cosyvoice"` | `tts` | GPU | Voice cloning via reference_audio + reference_text. |
| `"fish_speech"` | `fish-speech` | GPU | Fish-Speech S2 voice cloning via reference_audio. |