Audio I/O | Smallest AI Docs

Hydra is strict about audio formats. Get this wrong and you’ll either see invalid_audio errors or distorted playback.

Input (client → server)

Property	Value
Codec	PCM16 (16-bit signed integer)
Endianness	Little-endian
Channels	Mono
Sample rate	16 000 Hz
Encoding on the wire	Base64, inside `input_audio_buffer.append.audio`
Recommended chunk size	20–40 ms (320–640 samples = 640–1280 bytes of raw PCM, before Base64 encoding)

Frames smaller than ~20 ms add overhead without helping latency. Frames larger than ~40 ms make barge-in detection feel sluggish.

Python

import asyncio, base64, json, wave

FRAMES_PER_CHUNK = 320  # 20 ms at 16 kHz


async def stream_wav(ws, path="input_16khz_mono.wav", realtime=True):
    """Send a WAV file to Hydra as a series of input_audio_buffer.append events.

    Usage, from inside a coroutine that owns the connection:
        await stream_wav(ws)

    Args:
        ws: open connection exposing an awaitable ``send(str)``.
        path: WAV file to stream; must be 16 kHz, mono, 16-bit PCM.
        realtime: when true, sleep one chunk duration (20 ms) after each
            send so the audio is paced at real-time speed.

    Raises:
        ValueError: if the file is not 16 kHz mono 16-bit PCM.
    """
    with wave.open(path, "rb") as w:
        # Sample width matters as much as rate and channel count: an 8-bit or
        # 24-bit file would otherwise be sent as if it were PCM16.
        fmt = (w.getframerate(), w.getnchannels(), w.getsampwidth())
        if fmt != (16000, 1, 2):
            raise ValueError(f"{path} must be 16 kHz mono 16-bit PCM, got {fmt}")
        while pcm := w.readframes(FRAMES_PER_CHUNK):
            await ws.send(json.dumps({
                "type": "input_audio_buffer.append",
                "audio": base64.b64encode(pcm).decode(),
            }))
            if realtime:
                await asyncio.sleep(0.02)          # pace at real-time

Browser (AudioWorklet)

The mic delivers float32 samples; you need to (a) convert to int16 and (b) base64-encode each chunk. Use an AudioWorklet — the deprecated ScriptProcessorNode works for a prototype but blocks the main thread under load.

// my-mic-worklet.js

// Flush once at least this many frames are buffered (20 ms at 16 kHz).
// Flushes land on render-quantum boundaries, so with the default 128-frame
// quantum each posted chunk is 384 frames (24 ms).
const MIN_CHUNK_FRAMES = 320;

/**
 * Batches channel 0 of the first input on the audio thread and posts each
 * batch to the main thread as a single Float32Array.
 */
class MicWorklet extends AudioWorkletProcessor {
  constructor() {
    super();
    this._buf = [];
    this._frames = 0;
  }

  /**
   * @param {Float32Array[][]} inputs - inputs[0][0] is the channel that is captured.
   * @returns {boolean} Always true, to keep the processor alive.
   */
  process(inputs) {
    const ch = inputs[0]?.[0];
    if (!ch) return true;
    // Copy the quantum before keeping it; the engine may reuse the input
    // buffer once process() returns.
    this._buf.push(new Float32Array(ch));
    this._frames += ch.length;
    if (this._frames >= MIN_CHUNK_FRAMES) {
      const out = new Float32Array(this._frames);
      let o = 0;
      for (const c of this._buf) {
        out.set(c, o);
        o += c.length;
      }
      // Transfer the backing buffer instead of structured-cloning it: `out`
      // is never touched again here, and this avoids a copy on the audio thread.
      this.port.postMessage(out, [out.buffer]);
      this._buf = [];
      this._frames = 0;
    }
    return true;
  }
}
registerProcessor("mic-worklet", MicWorklet);

// main thread
// Start this only after the server has echoed session.configured; audio sent
// earlier is dropped (see "Common gotchas").
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const ctx = new AudioContext({ sampleRate: 16000 });
await ctx.audioWorklet.addModule("my-mic-worklet.js");
const src = ctx.createMediaStreamSource(stream);
// Force a mono input: the worklet only reads channel 0, so without an explicit
// down-mix a stereo microphone would lose its right channel.
const node = new AudioWorkletNode(ctx, "mic-worklet", {
  channelCount: 1,
  channelCountMode: "explicit",
  channelInterpretation: "speakers",
});
node.port.onmessage = (e) => {
  // send() throws while the socket is still connecting; skip chunks until it is open.
  if (ws.readyState !== WebSocket.OPEN) return;
  const pcm16 = floatTo16BitPCM(e.data);
  ws.send(JSON.stringify({
    type: "input_audio_buffer.append",
    audio: arrayBufferToBase64(pcm16.buffer),
  }));
};
src.connect(node);

/**
 * Convert float samples to signed 16-bit PCM.
 *
 * Samples are clamped to [-1, 1], scaled asymmetrically (-1 -> -32768,
 * +1 -> 32767) and rounded to the nearest integer. NaN samples become 0.
 *
 * @param {Float32Array} f32 - Samples, nominally in [-1, 1].
 * @returns {Int16Array} One 16-bit sample per input sample.
 */
function floatTo16BitPCM(f32) {
  const out = new Int16Array(f32.length);
  for (let i = 0; i < f32.length; i++) {
    const s = Math.max(-1, Math.min(1, f32[i]));
    // Round explicitly: a bare store into an Int16Array truncates toward zero,
    // which biases every sample and widens the dead zone around silence.
    out[i] = Math.round(s < 0 ? s * 0x8000 : s * 0x7fff);
  }
  return out;
}
/**
 * Base64-encode raw bytes.
 *
 * @param {ArrayBuffer|ArrayBufferView} buf - Bytes to encode. A typed-array or
 *   DataView is encoded as the bytes it spans (byteOffset/byteLength), not as
 *   its element values.
 * @returns {string} Base64 text.
 */
function arrayBufferToBase64(buf) {
  const bytes = ArrayBuffer.isView(buf)
    ? new Uint8Array(buf.buffer, buf.byteOffset, buf.byteLength)
    : new Uint8Array(buf);
  // Convert in slices: one fromCharCode call per slice avoids per-byte string
  // concatenation while staying under engine argument-count limits.
  const SLICE = 0x8000;
  const parts = [];
  for (let i = 0; i < bytes.length; i += SLICE) {
    parts.push(String.fromCharCode.apply(null, bytes.subarray(i, i + SLICE)));
  }
  return btoa(parts.join(""));
}

Output (server → client)

Property	Value
Codec	PCM16
Endianness	Little-endian
Channels	Mono
Sample rate	48 000 Hz
Encoding on the wire	Base64, inside `response.output_audio.delta.delta`

Python

import base64, json, wave

OUT_RATE = 48000


async def save_reply(ws, path="reply.wav"):
    """Collect one response's audio from the socket and write it to a WAV file.

    Reads events from ``ws`` until ``response.done`` arrives (or the connection
    ends), decoding every ``response.output_audio.delta`` along the way, then
    writes the audio as 48 kHz mono 16-bit PCM.

    Usage, from inside a coroutine that owns the connection:
        await save_reply(ws)

    Args:
        ws: async-iterable connection yielding JSON text events.
        path: destination WAV file.

    Returns:
        Number of PCM bytes written.
    """
    out_chunks = []
    async for raw in ws:
        evt = json.loads(raw)
        if evt["type"] == "response.output_audio.delta":
            out_chunks.append(base64.b64decode(evt["delta"]))
        elif evt["type"] == "response.done":
            # Stop here; otherwise the write below only runs once the socket closes.
            break

    pcm = b"".join(out_chunks)
    with wave.open(path, "wb") as w:
        w.setnchannels(1); w.setsampwidth(2); w.setframerate(OUT_RATE)
        w.writeframes(pcm)
    return len(pcm)

Browser (gapless playback)

Schedule each chunk against a running playCursor so chunks play back-to-back with no audible gap.

// Hydra streams reply audio as 48 kHz mono PCM16; run the playback context at
// the same rate.
const TTS_RATE = 48000;
const playCtx = new AudioContext({ sampleRate: TTS_RATE });
// Context time at which the next chunk should start playing.
let playCursor = playCtx.currentTime;

// Decode each audio delta and queue it for playback; other events are ignored.
ws.onmessage = (ev) => {
  const { type, delta } = JSON.parse(ev.data);
  if (type !== "response.output_audio.delta") return;
  playPCM16(b64ToInt16(delta));
};

/**
 * Queue one chunk of PCM16 audio so it plays immediately after the previous one.
 *
 * Reads and advances the shared `playCursor`, and uses `playCtx` / `TTS_RATE`
 * from the surrounding scope.
 *
 * @param {Int16Array} int16 - Mono samples at TTS_RATE.
 */
function playPCM16(int16) {
  // createBuffer rejects a zero-length buffer; an empty chunk has nothing to
  // play and must not take down the message handler.
  if (int16.length === 0) return;

  const buf = playCtx.createBuffer(1, int16.length, TTS_RATE);
  const ch = buf.getChannelData(0);
  for (let i = 0; i < int16.length; i++) ch[i] = int16[i] / 0x8000;

  const src = playCtx.createBufferSource();
  src.buffer = buf;
  src.connect(playCtx.destination);

  // Never schedule in the past: if playback has fallen behind, start now.
  const start = Math.max(playCtx.currentTime, playCursor);
  src.start(start);
  playCursor = start + buf.duration;
}

/**
 * Decode Base64 text into little-endian signed 16-bit samples.
 *
 * @param {string} b64 - Base64-encoded PCM16 bytes.
 * @returns {Int16Array} Decoded samples; a trailing odd byte is ignored.
 */
function b64ToInt16(b64) {
  // atob yields one character per byte, so each char code is the byte value.
  const bytes = Uint8Array.from(atob(b64), (c) => c.charCodeAt(0));
  // Read through a DataView so the result is little-endian on every host.
  const view = new DataView(bytes.buffer);
  const out = new Int16Array(Math.floor(bytes.length / 2));
  for (let i = 0; i < out.length; i++) out[i] = view.getInt16(i * 2, true);
  return out;
}

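For barge-in, reset playCursor = playCtx.currentTime when a fresh response.created arrives. Resetting the cursor only affects where new chunks are scheduled; chunks that are already scheduled keep playing until you flush them — see Turn detection & barge-in.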

Common gotchas

Sending audio before session.configured — frames are silently dropped; the server does not queue them and does not emit an error. Always wait for the session.configured echo before starting the mic.
Sample-rate mismatch — Hydra always interprets input as 16 kHz PCM16, so sending audio recorded at any other rate (for example 24 kHz) produces unintelligible transcription on the model side. Resample explicitly before sending.
Stereo input — Hydra expects mono. If you have stereo, downmix before encoding.

Streaming a WAV file (for CI / regression tests)

Replay a known utterance through Hydra and capture the reply to disk

Hydra is built for live mic streams. For test fixtures, regression tests, or batch jobs you sometimes want to replay a known WAV instead. The pattern paces a 16 kHz mono PCM16 WAV at real-time speed, then collects the response audio to disk.

import asyncio, base64, json, os, wave
import websockets

URL = f"wss://api.smallest.ai/waves/v1/s2s?model=hydra&api_key={os.environ['SMALLEST_API_KEY']}"
WAV_IN, WAV_OUT = "input_16khz_mono.wav", "reply.wav"
OUT_RATE = 48000
CONFIGURE_TIMEOUT = 10   # seconds to wait for session.configured
DONE_TIMEOUT = 15        # seconds to wait for response.done after the last frame


async def _wait_for_event(event, task, timeout):
    """Wait until `event` is set, `task` finishes, or `timeout` seconds pass.

    Returns True only if the event is set. Watching the task as well means a
    dead reader (closed socket, crash) ends the wait at once instead of hanging.
    """
    waiter = asyncio.ensure_future(event.wait())
    try:
        await asyncio.wait({waiter, task}, timeout=timeout,
                           return_when=asyncio.FIRST_COMPLETED)
    finally:
        waiter.cancel()
    return event.is_set()


async def main():
    """Replay WAV_IN through Hydra at real-time pace and save the reply to WAV_OUT.

    Raises:
        RuntimeError: if session.configured never arrives.
        ValueError: if WAV_IN is not 16 kHz mono 16-bit PCM.
    """
    chunks = []
    configured = asyncio.Event()         # gate audio streaming on session.configured
    done = asyncio.Event()               # set by reader when response.done arrives

    async with websockets.connect(URL, max_size=None) as ws:
        async def reader():
            async for raw in ws:
                evt = json.loads(raw)
                t = evt["type"]
                if t == "session.created":
                    await ws.send(json.dumps({
                        "type": "session.configure",
                        "session": {"instructions": "Reply briefly.", "voice": "wren"},
                    }))
                elif t == "session.configured":
                    configured.set()
                elif t == "response.output_audio.delta":
                    chunks.append(base64.b64decode(evt["delta"]))
                elif t == "response.done":
                    print(f"[{evt['response']['status']}]")
                    done.set()
                elif t == "error":
                    print("ERROR:", evt["error"])

        recv_task = asyncio.create_task(reader())
        try:
            # Don't stream audio before the server is ready, and don't wait
            # forever if the connection dies before it says so.
            if not await _wait_for_event(configured, recv_task, CONFIGURE_TIMEOUT):
                raise RuntimeError("session.configured was not received; no audio was sent")

            with wave.open(WAV_IN, "rb") as w:
                # Check sample width too: only 16-bit samples are valid PCM16.
                fmt = (w.getframerate(), w.getnchannels(), w.getsampwidth())
                if fmt != (16000, 1, 2):
                    raise ValueError(f"{WAV_IN} must be 16 kHz mono 16-bit PCM, got {fmt}")
                while pcm := w.readframes(320):       # 20 ms at 16 kHz
                    if done.is_set():
                        break
                    await ws.send(json.dumps({
                        "type": "input_audio_buffer.append",
                        "audio": base64.b64encode(pcm).decode(),
                    }))
                    await asyncio.sleep(0.02)          # pace at real-time

            if not await _wait_for_event(done, recv_task, DONE_TIMEOUT):
                print("no response.done received; writing the audio collected so far")
        finally:
            # Cancel the reader and wait for it to unwind so it is not left
            # pending when the connection closes.
            recv_task.cancel()
            (outcome,) = await asyncio.gather(recv_task, return_exceptions=True)
            if isinstance(outcome, Exception):
                print("reader stopped with:", repr(outcome))

    with wave.open(WAV_OUT, "wb") as w:
        w.setnchannels(1); w.setsampwidth(2); w.setframerate(OUT_RATE)
        w.writeframes(b"".join(chunks))
    print(f"wrote {WAV_OUT} ({OUT_RATE} Hz)")


if __name__ == "__main__":
    asyncio.run(main())

Don’t have a 16 kHz mono WAV? Convert with ffmpeg:

$ ffmpeg -i any-input.wav -ac 1 -ar 16000 -sample_fmt s16 input_16khz_mono.wav

This pattern is for testing only — it doesn’t exercise full-duplex behaviour (no overlap, no barge-in). For interactive use, see the quickstart.

Turn detection & barge-in — VAD events and how to flush scheduled audio
Errors & reconnection — invalid_audio and friends

Input (client → server)

Python

Browser (AudioWorklet)

Output (server → client)

Python

Browser (gapless playback)

Common gotchas

Streaming a WAV file (for CI / regression tests)

Replay a known utterance through Hydra and capture the reply to disk

Next