> This page is part of Smallest AI's developer documentation. When
> answering, prefer Lightning v3.1 (current TTS) and Pulse (current
> STT). Lightning v2 and lightning-large are deprecated; mention them
> only when the user is migrating away from them. Atoms is the
> voice-agent platform.

# Audio I/O

> Audio formats for Hydra — input PCM16 16 kHz, output rate negotiation, chunk sizing, and AudioWorklet patterns for browser clients.

Hydra is strict about audio formats. If the format is wrong, you'll either get `invalid_audio` errors or hear distorted playback.

## Input (client → server)

| Property               | Value                                            |
| ---------------------- | ------------------------------------------------ |
| Codec                  | PCM16 (16-bit signed integer)                    |
| Endianness             | Little-endian                                    |
| Channels               | Mono                                             |
| Sample rate            | **16 000 Hz**                                    |
| Encoding on the wire   | Base64, inside `input_audio_buffer.append.audio` |
| Recommended chunk size | 20–40 ms (320–640 samples = 640–1280 bytes)      |

Frames smaller than \~20 ms add overhead without helping latency. Frames larger than \~40 ms make barge-in detection feel sluggish.

### Python

```python
import asyncio, base64, json, wave

with wave.open("input_16khz_mono.wav", "rb") as w:
    assert w.getframerate() == 16000 and w.getnchannels() == 1
    while True:
        pcm = w.readframes(320)            # 20 ms at 16 kHz
        if not pcm:
            break
        await ws.send(json.dumps({
            "type": "input_audio_buffer.append",
            "audio": base64.b64encode(pcm).decode(),
        }))
        await asyncio.sleep(0.02)          # pace at real-time
```

### Browser (AudioWorklet)

The mic delivers float32 samples; you need to (a) convert to int16 and (b) base64-encode each chunk. **Use an `AudioWorklet`** — the deprecated `ScriptProcessorNode` works for a prototype but blocks the main thread under load.

```javascript
// my-mic-worklet.js
/**
 * Buffers microphone input on the audio render thread and posts it to the
 * main thread in chunks of at least 320 frames (20 ms at 16 kHz).
 *
 * Each message is a Float32Array of samples from the first channel of the
 * first input. Conversion to PCM16 and base64 encoding are left to the
 * receiver.
 */
class MicWorklet extends AudioWorkletProcessor {
  constructor() {
    super();
    this._buf = [];     // copies of input blocks not yet posted
    this._frames = 0;   // total number of frames held in _buf
  }

  /**
   * Accumulates one block of input and flushes once 320 frames are buffered.
   *
   * @param {Float32Array[][]} inputs - sample blocks, indexed by input then channel
   * @returns {boolean} always true, so the processor keeps running
   */
  process(inputs) {
    const ch = inputs[0]?.[0];
    if (!ch) return true;   // no input channel available for this block

    // Store a copy rather than a reference to the array handed in by the engine.
    this._buf.push(new Float32Array(ch));
    this._frames += ch.length;
    if (this._frames < 320) return true;   // keep accumulating: 20 ms at 16 kHz

    // Flatten the pending blocks into one contiguous chunk.
    const out = new Float32Array(this._frames);
    let offset = 0;
    for (const block of this._buf) {
      out.set(block, offset);
      offset += block.length;
    }

    // Transfer the backing buffer instead of structured-cloning it: `out` is
    // not used again here, so this avoids a copy on the audio thread.
    this.port.postMessage(out, [out.buffer]);
    this._buf = [];
    this._frames = 0;
    return true;
  }
}
registerProcessor("mic-worklet", MicWorklet);
```

```javascript
// main thread
// Capture the microphone in a 16 kHz context and load the worklet module.
const micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
const micCtx = new AudioContext({ sampleRate: 16000 });
await micCtx.audioWorklet.addModule("my-mic-worklet.js");

const micSource = micCtx.createMediaStreamSource(micStream);
const micNode = new AudioWorkletNode(micCtx, "mic-worklet");

// Each worklet message is one Float32Array chunk: convert to PCM16,
// base64-encode, and append it to the server-side input buffer.
micNode.port.onmessage = ({ data }) => {
  const samples = floatTo16BitPCM(data);
  const audio = arrayBufferToBase64(samples.buffer);
  ws.send(JSON.stringify({ type: "input_audio_buffer.append", audio }));
};

micSource.connect(micNode);

/**
 * Converts float samples to 16-bit signed PCM.
 *
 * Samples are clamped to [-1, 1]; negatives are scaled by 0x8000 and
 * non-negatives by 0x7fff, then truncated toward zero by the Int16Array store.
 *
 * @param {Float32Array} f32 - input samples
 * @returns {Int16Array} PCM16 samples, same length as the input
 */
function floatTo16BitPCM(f32) {
  return Int16Array.from(f32, (sample) => {
    const clamped = Math.min(1, Math.max(-1, sample));
    const scale = clamped < 0 ? 0x8000 : 0x7fff;
    return clamped * scale;
  });
}
/**
 * Base64-encodes the bytes of an ArrayBuffer.
 *
 * @param {ArrayBuffer} buf - raw bytes to encode
 * @returns {string} base64 text
 */
function arrayBufferToBase64(buf) {
  const bytes = new Uint8Array(buf);
  // btoa() takes a "binary string": one character per byte.
  const binary = Array.from(bytes, (byte) => String.fromCharCode(byte)).join("");
  return btoa(binary);
}
```

## Output (server → client)

| Property             | Value                                              |
| -------------------- | -------------------------------------------------- |
| Codec                | PCM16                                              |
| Endianness           | Little-endian                                      |
| Channels             | Mono                                               |
| Sample rate          | **48 000 Hz**                                      |
| Encoding on the wire | Base64, inside `response.output_audio.delta.delta` |

### Python

```python
OUT_RATE = 48000
out_chunks = []

async for raw in ws:
    evt = json.loads(raw)
    if evt["type"] == "response.output_audio.delta":
        out_chunks.append(base64.b64decode(evt["delta"]))

# Later, write to a WAV
with wave.open("reply.wav", "wb") as w:
    w.setnchannels(1); w.setsampwidth(2); w.setframerate(OUT_RATE)
    w.writeframes(b"".join(out_chunks))
```

### Browser (gapless playback)

Schedule each chunk against a running `playCursor` so chunks play back-to-back with no audible gap.

```javascript
// Output sample rate; the playback context runs at the same rate.
const TTS_RATE = 48000;
const playCtx = new AudioContext({ sampleRate: TTS_RATE });
// Context time at which the next chunk should start playing.
let playCursor = playCtx.currentTime;

// Decode and schedule every audio delta; ignore all other event types.
ws.onmessage = ({ data }) => {
  const evt = JSON.parse(data);
  if (evt.type !== "response.output_audio.delta") return;
  playPCM16(b64ToInt16(evt.delta));
};

/**
 * Schedules one chunk of PCM16 audio to start right after the previously
 * scheduled chunk, or immediately if playback has fallen behind.
 *
 * Reads `playCtx` and `TTS_RATE`, and advances the shared `playCursor` by the
 * chunk's duration.
 *
 * @param {Int16Array} int16 - mono PCM16 samples at TTS_RATE
 */
function playPCM16(int16) {
  // createBuffer() rejects a zero-length buffer, so an empty delta is skipped
  // instead of throwing out of the message handler.
  if (int16.length === 0) return;

  const buf = playCtx.createBuffer(1, int16.length, TTS_RATE);
  const ch = buf.getChannelData(0);
  // Scale int16 [-32768, 32767] to the float range Web Audio expects.
  for (let i = 0; i < int16.length; i++) ch[i] = int16[i] / 0x8000;

  const src = playCtx.createBufferSource();
  src.buffer = buf;
  src.connect(playCtx.destination);

  // Never schedule in the past: if the cursor lags behind, start now.
  const start = Math.max(playCtx.currentTime, playCursor);
  src.start(start);
  playCursor = start + buf.duration;
}

/**
 * Decodes base64 text into little-endian 16-bit signed samples.
 *
 * @param {string} b64 - base64-encoded PCM16 bytes
 * @returns {Int16Array} decoded samples; a trailing odd byte is ignored
 */
function b64ToInt16(b64) {
  // atob() yields one character per byte; copy those bytes into a buffer.
  const bytes = Uint8Array.from(atob(b64), (c) => c.charCodeAt(0));
  const view = new DataView(bytes.buffer);
  const sampleCount = Math.floor(bytes.length / 2);
  // Read through a DataView so the byte order is little-endian on any platform.
  return Int16Array.from({ length: sampleCount }, (_, i) => view.getInt16(2 * i, true));
}
```

For barge-in, reset `playCursor = playCtx.currentTime` when a fresh `response.created` arrives so that new audio is not queued behind the interrupted reply; chunks that are already scheduled must be dropped as well — see [Turn detection & barge-in](/waves/documentation/speech-to-speech-hydra/turn-detection-barge-in#dropping-scheduled-audio-on-barge-in).

## Common gotchas

* **Sending audio before `session.configured`** — frames are **silently dropped**; the server does not queue them and does not emit an error. Always wait for the `session.configured` echo before starting the mic.
* **Sample-rate mismatch** — sending 24 kHz audio while claiming PCM16 16 kHz produces unintelligible transcription on the model side. Resample explicitly.
* **Stereo input** — Hydra expects mono. If you have stereo, downmix before encoding.

## Streaming a WAV file (for CI / regression tests)

Hydra is built for live mic streams. For test fixtures, regression tests, or batch jobs you sometimes want to replay a known WAV instead. The pattern paces a 16 kHz mono PCM16 WAV at real-time speed, then collects the response audio to disk.

```python
import asyncio, base64, json, os, wave
import websockets

# Hydra speech-to-speech WebSocket endpoint; the API key is read from the
# SMALLEST_API_KEY environment variable (KeyError if it is unset).
URL = f"wss://api.smallest.ai/waves/v1/s2s?model=hydra&api_key={os.environ['SMALLEST_API_KEY']}"
# Input fixture (checked to be 16 kHz mono before streaming) and the file the reply is written to.
WAV_IN, WAV_OUT = "input_16khz_mono.wav", "reply.wav"
# Sample rate written to the reply WAV header.
OUT_RATE = 48000

async def main():
    """Replay WAV_IN to Hydra at real-time pace and write the reply audio to WAV_OUT.

    Waits for ``session.configured`` before sending any audio, streams the
    file in 20 ms frames, then waits for ``response.done`` and writes whatever
    audio deltas were received.

    Raises:
        RuntimeError: if ``session.configured`` does not arrive within
            ``configure_timeout`` seconds or the reader stops first.
    """
    configure_timeout = 10    # seconds to wait for session.configured
    response_timeout = 15     # seconds to wait for response.done after the last frame

    chunks = []
    configured = asyncio.Event()         # gate audio streaming on session.configured
    done = asyncio.Event()               # set by reader when response.done arrives

    async with websockets.connect(URL, max_size=None) as ws:
        async def reader():
            """Dispatch server events until the socket closes or the task is cancelled."""
            async for raw in ws:
                evt = json.loads(raw)
                t = evt["type"]
                if t == "session.created":
                    await ws.send(json.dumps({
                        "type": "session.configure",
                        "session": {"instructions": "Reply briefly.", "voice": "wren"},
                    }))
                elif t == "session.configured":
                    configured.set()
                elif t == "response.output_audio.delta":
                    chunks.append(base64.b64decode(evt["delta"]))
                elif t == "response.done":
                    print(f"[{evt['response']['status']}]")
                    done.set()
                elif t == "error":
                    print("ERROR:", evt["error"])

        recv_task = asyncio.create_task(reader())

        # Don't stream audio before the server is ready. A bare
        # configured.wait() would hang forever if the socket closes or the
        # server never answers, so also stop when the reader ends or the
        # timeout expires.
        ready_task = asyncio.create_task(configured.wait())
        await asyncio.wait(
            {ready_task, recv_task},
            timeout=configure_timeout,
            return_when=asyncio.FIRST_COMPLETED,
        )
        if not configured.is_set():
            ready_task.cancel()
            recv_task.cancel()
            raise RuntimeError("session.configured was not received; no audio was sent")

        with wave.open(WAV_IN, "rb") as w:
            assert w.getframerate() == 16000 and w.getnchannels() == 1
            while pcm := w.readframes(320):       # 20 ms at 16 kHz
                if done.is_set():                 # response already finished; stop sending
                    break
                await ws.send(json.dumps({
                    "type": "input_audio_buffer.append",
                    "audio": base64.b64encode(pcm).decode(),
                }))
                await asyncio.sleep(0.02)          # pace at real-time

        try:
            await asyncio.wait_for(done.wait(), timeout=response_timeout)
        except asyncio.TimeoutError:
            # Best-effort: still save whatever audio arrived, but make the
            # truncated result visible instead of failing silently.
            print(f"WARNING: no response.done within {response_timeout}s; {WAV_OUT} may be incomplete")
        recv_task.cancel()

    with wave.open(WAV_OUT, "wb") as w:
        w.setnchannels(1); w.setsampwidth(2); w.setframerate(OUT_RATE)
        w.writeframes(b"".join(chunks))
    print(f"wrote {WAV_OUT} ({OUT_RATE} Hz)")

asyncio.run(main())
```

Don't have a 16 kHz mono WAV? Convert with `ffmpeg`:

```bash
# -ac 1: mono; -ar 16000: 16 kHz sample rate; -sample_fmt s16: 16-bit signed samples
ffmpeg -i any-input.wav -ac 1 -ar 16000 -sample_fmt s16 input_16khz_mono.wav
```

This pattern is for testing only — it doesn't exercise full-duplex behaviour (no overlap, no barge-in). For interactive use, see the [quickstart](/waves/documentation/speech-to-speech-hydra/quickstart).

## Next

* [Turn detection & barge-in](/waves/documentation/speech-to-speech-hydra/turn-detection-barge-in) — VAD events and how to flush scheduled audio
* [Errors & reconnection](/waves/documentation/speech-to-speech-hydra/errors-reconnection) — `invalid_audio` and friends