Code Examples | Smallest AI Docs

Two end-to-end Python examples: a simple Pulse Pro transcription, and an advanced Pulse transcription that adds gender detection, emotion detection, speaker diarization, and sentence-level timestamps. Both call the unified /waves/v1/stt/ endpoint with the requests library.

For plain English transcription where leaderboard accuracy matters most, use Pulse Pro (?model=pulse-pro). For multilingual audio or advanced features (gender, emotion, diarization with per-utterance speaker labels), use Pulse (?model=pulse). The endpoint and request shape are identical; only the model query param changes.

Pulse Pro: basic transcription

The simplest end-to-end flow. Reads a local audio file (`input_audio.mp3`), preprocesses it to 16 kHz mono WAV, and transcribes it with word timestamps.

1 import os
2 import requests
3 from pydub import AudioSegment
4 
# Unified speech-to-text endpoint; the model is selected per request through
# the ``model`` query parameter.
ENDPOINT = "https://api.smallest.ai/waves/v1/stt/"
# Read the key from the environment rather than hard-coding it; a missing
# variable raises KeyError as soon as the script starts.
API_KEY = os.environ["SMALLEST_API_KEY"]
7 
8 
def preprocess_audio(input_path: str, output_path: str) -> str:
    """Convert to 16 kHz mono WAV, normalize levels, strip silence.

    Args:
        input_path: Source audio file, in any format pydub can decode.
        output_path: Where the processed WAV file is written.

    Returns:
        ``output_path``, unchanged, so the call can be chained.
    """
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1).normalize()
    # NOTE(review): strip_silence may also drop silent stretches inside the
    # clip, in which case returned timestamps refer to the trimmed audio --
    # confirm against pydub's documentation.
    audio = audio.strip_silence(silence_len=100, silence_thresh=-40)
    # export() hands back the open output file. Close it explicitly so the WAV
    # is fully flushed before the caller re-opens it, rather than relying on
    # garbage collection to do so.
    audio.export(output_path, format="wav").close()
    return output_path
16 
17 
def transcribe_pulse_pro(audio_path: str) -> dict:
    """Send an English audio file to Pulse Pro and return the decoded reply.

    The file's raw bytes are POSTed as ``application/octet-stream`` with
    word timestamps requested.

    Args:
        audio_path: Path of the audio file to upload.

    Returns:
        The JSON response body decoded into a dict.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    query = {"model": "pulse-pro", "language": "en", "word_timestamps": "true"}
    request_headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/octet-stream",
    }

    with open(audio_path, "rb") as audio_file:
        payload = audio_file.read()

    reply = requests.post(
        ENDPOINT,
        params=query,
        headers=request_headers,
        data=payload,
        timeout=120,
    )
    reply.raise_for_status()
    return reply.json()
34 
35 
if __name__ == "__main__":
    # Script entry point: preprocess a local MP3, transcribe it with Pulse Pro,
    # print a short summary, and always clean up the intermediate WAV.
    raw_path = "input_audio.mp3"
    wav_path = "preprocessed.wav"

    try:
        preprocess_audio(raw_path, wav_path)
        result = transcribe_pulse_pro(wav_path)

        print(f"Transcription: {result['transcription']}")
        print(f"Duration:      {result['metadata']['duration']:.2f}s")
        print(f"RTFx:          {result['metadata']['rtfx']:.1f}x")
        print(f"Words:         {len(result.get('words', []))}")
    finally:
        # Remove the temporary WAV even when preprocessing, the request, or
        # printing fails; it may not exist if preprocessing never got that far.
        if os.path.exists(wav_path):
            os.remove(wav_path)

Pulse: advanced features (gender, emotion, diarization, utterances)

Pulse supports gender detection, emotion detection, and per-utterance speaker labels. The example below enables all of them.

1 import os
2 import requests
3 from pydub import AudioSegment
4 
# Same endpoint as the Pulse Pro example; only the ``model`` query parameter
# sent with each request differs.
ENDPOINT = "https://api.smallest.ai/waves/v1/stt/"
# Taken from the environment; raises KeyError at startup when the variable is
# not set.
API_KEY = os.environ["SMALLEST_API_KEY"]
7 
8 
def preprocess_audio(input_path: str, output_path: str) -> str:
    """Convert to 16 kHz mono WAV, normalize levels, strip silence.

    Args:
        input_path: Source audio file, in any format pydub can decode.
        output_path: Where the processed WAV file is written.

    Returns:
        ``output_path``, unchanged, so the call can be chained.
    """
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1).normalize()
    audio = audio.strip_silence(silence_len=100, silence_thresh=-40)
    # export() returns the open output file. Closing it here guarantees the
    # data is on disk before the file is read back for upload, instead of
    # leaving the handle for the garbage collector.
    audio.export(output_path, format="wav").close()
    return output_path
15 
16 
def transcribe_with_features(audio_path: str) -> dict:
    """Transcribe with Pulse and request every optional analysis feature.

    Word timestamps, gender detection, emotion detection and diarization are
    all switched on through query parameters; the audio is uploaded as raw
    bytes.

    Args:
        audio_path: Path of the audio file to upload.

    Returns:
        The JSON response body decoded into a dict.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Every optional feature is enabled with the literal string "true".
    enabled_features = (
        "word_timestamps",
        "gender_detection",
        "emotion_detection",
        "diarize",
    )
    query = {"model": "pulse", "language": "en"}
    for feature in enabled_features:
        query[feature] = "true"

    with open(audio_path, "rb") as audio_file:
        payload = audio_file.read()

    reply = requests.post(
        ENDPOINT,
        params=query,
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/octet-stream",
        },
        data=payload,
        timeout=120,
    )
    reply.raise_for_status()
    return reply.json()
40 
41 
def report(result: dict) -> None:
    """Print a human-readable summary of a transcription response.

    The gender, emotion, utterance and word sections are printed only when
    the corresponding key is present and non-empty. Keys that are present but
    ``None`` (JSON ``null``) are treated the same as missing keys.

    Args:
        result: Decoded JSON response, e.g. the dict returned by
            ``response.json()``.
    """
    banner = "=" * 60
    print(banner)
    print("TRANSCRIPTION RESULTS")
    print(banner)

    transcription = result.get("transcription")
    if transcription is None:
        transcription = "N/A"
    print(f"\nTranscription: {transcription}")

    if result.get("gender") is not None:
        print(f"\nGender: {result['gender']}")

    emotions = result.get("emotions")
    if emotions:
        print("\nEmotion scores:")
        for emotion, score in emotions.items():
            print(f"  {emotion.capitalize()}: {score:.4f}")

    utterances = result.get("utterances") or []
    if utterances:
        print(f"\nUtterances ({len(utterances)}):")
        for i, u in enumerate(utterances, 1):
            # A null timestamp would crash the ``:.2f`` format; fall back to 0
            # exactly as a missing key does.
            start = u.get("start") or 0
            end = u.get("end") or 0
            # Compare against None explicitly: a speaker ID of 0 is falsy but
            # still a real label and must not be replaced.
            speaker = u.get("speaker")
            if speaker is None:
                speaker = "unknown"
            text = u.get("text") or ""
            print(
                f"  [{i}] {start:.2f}s – {end:.2f}s "
                f"(speaker: {speaker})"
            )
            print(f"      {text}")

    words = result.get("words") or []
    if words:
        print(f"\nWord-level timestamps: {len(words)} words")
70 
71 
if __name__ == "__main__":
    # Script entry point: convert the local MP3, transcribe it with all
    # features enabled, and print the report.
    source_file, converted_file = "input_audio.mp3", "preprocessed.wav"
    try:
        preprocess_audio(source_file, converted_file)
        report(transcribe_with_features(converted_file))
    finally:
        # The converted WAV is scratch data; delete it whether or not the run
        # succeeded, if it was created at all.
        if os.path.exists(converted_file):
            os.remove(converted_file)

Prerequisites

$ pip install requests pydub

pydub requires ffmpeg on PATH for non-WAV input formats, including the MP3 input (`input_audio.mp3`) used in both examples.

What each example demonstrates

Step	Pulse Pro example	Pulse advanced example
Audio preprocessing	16 kHz mono WAV, normalized, silence-trimmed	Same
Transcription	Word-timestamps on	Word-timestamps on
Gender detection	Not available on Pulse Pro	`gender` field on response
Emotion detection	Not available on Pulse Pro	`emotions` object with 5 scores
Speaker diarization	Not available on Pulse Pro	`diarize=true` plus per-utterance speaker labels
Sentence-level utterances	Not available on Pulse Pro	`utterances[]` with start/end/speaker

Expected output

The Pulse advanced example prints:

Full transcription text
Detected gender (male / female)
Emotion scores: anger, disgust, fear, sadness, happiness
Sentence-level utterances with timestamps and speaker IDs
A count of word-level timestamps