Code Examples

View as Markdown

Two end-to-end Python examples: a simple Pulse Pro transcription, and an advanced Pulse transcription that adds gender detection, emotion detection, speaker diarization, and sentence-level timestamps. Both call the unified /waves/v1/stt/ endpoint with the requests library.

For plain English transcription where leaderboard accuracy matters most, use Pulse Pro (?model=pulse-pro). For multilingual audio or advanced features (gender, emotion, diarization with per-utterance speaker labels), use Pulse (?model=pulse). The endpoint and request shape are identical; only the model query param changes.

Pulse Pro: basic transcription

The simplest end-to-end flow. Loads a local audio file (input_audio.mp3), preprocesses it to 16 kHz mono WAV, and transcribes it with word timestamps.

1import os
2import requests
3from pydub import AudioSegment
4
# Read the key from the environment so it never has to be written into the script;
# a missing SMALLEST_API_KEY raises KeyError as soon as the module is loaded.
API_KEY = os.environ["SMALLEST_API_KEY"]
# Speech-to-text endpoint; the model is chosen with the `model` query parameter.
ENDPOINT = "https://api.smallest.ai/waves/v1/stt/"
7
8
def preprocess_audio(input_path: str, output_path: str) -> str:
    """Re-encode an audio file as a normalized, silence-stripped 16 kHz mono WAV.

    Args:
        input_path: Audio file to read; handed to ``AudioSegment.from_file``.
        output_path: Destination path for the exported WAV.

    Returns:
        ``output_path``, unchanged, so the result can be passed straight on.
    """
    clip = AudioSegment.from_file(input_path)
    clip = clip.set_frame_rate(16000)  # 16 kHz sample rate
    clip = clip.set_channels(1)  # mono
    clip = clip.normalize()
    # NOTE(review): pydub documents silence_len in milliseconds and
    # silence_thresh in dBFS - confirm these thresholds suit your recordings.
    clip = clip.strip_silence(silence_len=100, silence_thresh=-40)
    clip.export(output_path, format="wav")
    return output_path
16
17
def transcribe_pulse_pro(audio_path: str) -> dict:
    """Send an audio file to the Pulse Pro model and return the parsed reply.

    The request asks for English transcription with word timestamps enabled.

    Args:
        audio_path: Path of the audio file whose raw bytes form the request body.

    Returns:
        The JSON body of the response, decoded into a dict.

    Raises:
        requests.HTTPError: If the service answers with an error status
            (raised by ``raise_for_status``).
    """
    query = {"model": "pulse-pro", "language": "en", "word_timestamps": "true"}
    request_headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/octet-stream",
    }

    with open(audio_path, "rb") as audio_file:
        payload = audio_file.read()

    reply = requests.post(
        ENDPOINT,
        params=query,
        headers=request_headers,
        data=payload,
        timeout=120,
    )
    reply.raise_for_status()
    return reply.json()
34
35
if __name__ == "__main__":
    raw_path = "input_audio.mp3"
    wav_path = "preprocessed.wav"

    # The intermediate WAV is a throwaway artifact: remove it even when
    # preprocessing, the API call, or the printing below raises.
    try:
        preprocess_audio(raw_path, wav_path)
        result = transcribe_pulse_pro(wav_path)

        print(f"Transcription: {result['transcription']}")
        print(f"Duration: {result['metadata']['duration']:.2f}s")
        print(f"RTFx: {result['metadata']['rtfx']:.1f}x")
        print(f"Words: {len(result.get('words', []))}")
    finally:
        # The file may not exist if preprocessing failed before export.
        if os.path.exists(wav_path):
            os.remove(wav_path)

Pulse: advanced features (gender, emotion, diarization, utterances)

Pulse supports gender detection, emotion detection, and per-utterance speaker labels. The example below enables all of them.

1import os
2import requests
3from pydub import AudioSegment
4
# The API key is taken from the SMALLEST_API_KEY environment variable;
# the lookup raises KeyError at load time if the variable is not set.
API_KEY = os.environ["SMALLEST_API_KEY"]
# Speech-to-text endpoint used by the request below.
ENDPOINT = "https://api.smallest.ai/waves/v1/stt/"
7
8
def preprocess_audio(input_path: str, output_path: str) -> str:
    """Convert an audio file to a normalized, silence-stripped 16 kHz mono WAV.

    Args:
        input_path: Audio file to load with ``AudioSegment.from_file``.
        output_path: Path the WAV export is written to.

    Returns:
        The same ``output_path`` that was passed in.
    """
    prepared = (
        AudioSegment.from_file(input_path)
        .set_frame_rate(16000)
        .set_channels(1)
        .normalize()
        .strip_silence(silence_len=100, silence_thresh=-40)
    )
    prepared.export(output_path, format="wav")
    return output_path
15
16
def transcribe_with_features(audio_path: str) -> dict:
    """Transcribe a file with Pulse, enabling the optional analysis features.

    The request turns on word timestamps, gender detection, emotion
    detection, and diarization for English audio.

    Args:
        audio_path: Path of the audio file whose raw bytes are uploaded.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the service answers with an error status
            (raised by ``raise_for_status``).
    """
    # Feature switches are sent as the literal string "true".
    enabled = "true"
    query = {
        "model": "pulse",
        "language": "en",
        "word_timestamps": enabled,
        "gender_detection": enabled,
        "emotion_detection": enabled,
        "diarize": enabled,
    }
    request_headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/octet-stream",
    }

    with open(audio_path, "rb") as audio_file:
        payload = audio_file.read()

    reply = requests.post(
        ENDPOINT,
        params=query,
        headers=request_headers,
        data=payload,
        timeout=120,
    )
    reply.raise_for_status()
    return reply.json()
40
41
def report(result: dict) -> None:
    """Print a human-readable summary of a transcription response.

    Sections are printed only when the matching key is present and non-empty:
    ``gender``, ``emotions``, ``utterances``, and ``words``. The transcription
    line is always printed and falls back to ``N/A``.

    Args:
        result: Response dict as returned by the transcription call.
    """
    banner = "=" * 60
    print(banner)
    print("TRANSCRIPTION RESULTS")
    print(banner)

    transcript = result.get("transcription", "N/A")
    print(f"\nTranscription: {transcript}")

    gender = result.get("gender")
    if gender is not None:
        print(f"\nGender: {gender}")

    emotion_scores = result.get("emotions")
    if emotion_scores:
        print("\nEmotion scores:")
        for label, value in emotion_scores.items():
            print(f" {label.capitalize()}: {value:.4f}")

    # `or []` also covers an explicit null in the response.
    segments = result.get("utterances") or []
    if segments:
        print(f"\nUtterances ({len(segments)}):")
        for index, segment in enumerate(segments, 1):
            begin = segment.get("start", 0)
            finish = segment.get("end", 0)
            who = segment.get("speaker", "unknown")
            print(f" [{index}] {begin:.2f}s – {finish:.2f}s (speaker: {who})")
            print(f" {segment.get('text', '')}")

    timed_words = result.get("words") or []
    if timed_words:
        print(f"\nWord-level timestamps: {len(timed_words)} words")
70
71
if __name__ == "__main__":
    source_audio = "input_audio.mp3"
    prepared_wav = "preprocessed.wav"
    try:
        # Preprocess, transcribe, and print the summary in one pass.
        report(transcribe_with_features(preprocess_audio(source_audio, prepared_wav)))
    finally:
        # Remove the intermediate WAV whether or not the run succeeded.
        if os.path.exists(prepared_wav):
            os.remove(prepared_wav)

Prerequisites

$ pip install requests pydub

pydub requires ffmpeg on PATH for non-WAV input formats.

What each example demonstrates

| Step | Pulse Pro example | Pulse advanced example |
| --- | --- | --- |
| Audio preprocessing | 16 kHz mono WAV, normalized, silence-trimmed | Same |
| Transcription | Word-timestamps on | Word-timestamps on |
| Gender detection | Not available on Pulse Pro | gender field on response |
| Emotion detection | Not available on Pulse Pro | emotions object with 5 scores |
| Speaker diarization | Not available on Pulse Pro | diarize=true plus per-utterance speaker labels |
| Sentence-level utterances | Not available on Pulse Pro | utterances[] with start/end/speaker |

Expected output

The Pulse advanced example prints:

  • Full transcription text
  • Detected gender (male / female)
  • Emotion scores: anger, disgust, fear, sadness, happiness
  • Sentence-level utterances with timestamps and speaker IDs
  • A count of word-level timestamps