Code Examples | Smallest AI Docs

Below is a complete Python example demonstrating audio preprocessing followed by transcription with gender detection, emotion detection, speaker diarization, and sentence-level timestamps (utterances).

The smallestai Python SDK is being updated. If the SDK example below doesn’t work, call the underlying Pulse pre-recorded REST endpoint directly (see the API reference under Waves → API Reference). Streaming synthesis via WavesStreamingTTS is unaffected.

1 import os
2 from pydub import AudioSegment
3 from smallestai.waves import WavesClient
4 
# Shared SDK client. The API key is read from the SMALLEST_API_KEY environment
# variable; if the variable is unset, None is passed through to WavesClient.
client = WavesClient(api_key=os.environ.get("SMALLEST_API_KEY"))
6 
def preprocess_audio(input_path, output_path):
    """
    Preprocess an audio file into the format recommended for Pulse STT.

    Steps, in order:
    - Convert to 16 kHz mono
    - Normalize audio levels
    - Trim leading/trailing silence (pauses inside the recording are kept,
      so the relative timing of the speech is not altered)
    - Export as WAV

    Args:
        input_path: Path of the source audio file to load.
        output_path: Path the preprocessed WAV file is written to.

    Returns:
        output_path, unchanged, for convenient chaining.
    """
    # Local import: only this function needs the silence helper.
    from pydub.silence import detect_leading_silence

    silence_thresh = -40  # dBFS; anything quieter counts as silence

    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1)
    audio = audio.normalize()

    # Trim only the two ends. The strip_silence effect would also collapse
    # every pause inside the recording, which contradicts the contract above
    # and shifts the timestamps returned by the transcription.
    lead_ms = detect_leading_silence(audio, silence_threshold=silence_thresh)
    if lead_ms >= len(audio):
        # Nothing but silence: keep an empty segment rather than slicing
        # with overlapping bounds.
        audio = audio[:0]
    else:
        trail_ms = detect_leading_silence(
            audio.reverse(), silence_threshold=silence_thresh
        )
        audio = audio[lead_ms:len(audio) - trail_ms]

    # export() hands back the file object it wrote to; close it so the
    # handle is not left open (an open handle can block deleting the file).
    audio.export(output_path, format="wav").close()
    print(f"Preprocessed audio saved to: {output_path}")
    return output_path
21 
def transcribe_with_features(audio_path):
    """
    Send an audio file to the Pulse model with the optional analysis
    features switched on.

    The request asks for English transcription with word timestamps,
    gender detection, emotion detection, and diarization enabled.

    Args:
        audio_path: Path of the audio file passed to the client as file_path.

    Returns:
        Whatever client.transcribe returns, passed through untouched.
    """
    request_options = {
        "model": "pulse",
        "language": "en",
        "word_timestamps": True,
        "gender_detection": True,
        "emotion_detection": True,
        "diarize": True,
    }
    return client.transcribe(file_path=audio_path, **request_options)
37 
def process_results(response):
    """
    Print a human-readable report of a transcription response.

    Always prints a banner and the transcription text ('N/A' when the
    'transcription' key is absent). The 'gender', 'emotions', 'utterances'
    and 'words' sections are printed only when the matching key is present.

    Args:
        response: Mapping that supports .get(), membership tests, and
            indexing by the keys listed above.
    """
    banner = "=" * 60
    print(banner)
    print("TRANSCRIPTION RESULTS")
    print(banner)

    transcript = response.get('transcription', 'N/A')
    print(f"\nTranscription: {transcript}")

    if 'gender' in response:
        print(f"\nGender: {response['gender']}")

    if 'emotions' in response:
        print("\nEmotion Scores:")
        for label, value in response['emotions'].items():
            print(f"  {label.capitalize()}: {value:.2f}")

    if 'utterances' in response:
        print("\nUtterances (Sentence-level timestamps):")
        position = 0
        for segment in response['utterances']:
            position += 1  # report entries are numbered from 1
            who = segment.get('speaker', 'unknown')
            began = segment.get('start', 0)
            ended = segment.get('end', 0)
            spoken = segment.get('text', '')
            print(f"\n  [{position}] Speaker: {who}")
            print(f"      Time: {began:.2f}s - {ended:.2f}s")
            print(f"      Text: {spoken}")

    if 'words' in response:
        word_count = len(response['words'])
        print(f"\nWord-level timestamps: {word_count} words")
70 
# Script entry point: preprocess the input file, transcribe it, print the
# report, and always remove the temporary WAV afterwards.
if __name__ == "__main__":
    input_audio = "input_audio.mp3"
    preprocessed_audio = "preprocessed_audio.wav"

    try:
        print("Preprocessing audio...")
        preprocess_audio(input_audio, preprocessed_audio)

        print("\nTranscribing audio with gender, emotion, and utterance detection...")
        result = transcribe_with_features(preprocessed_audio)

        process_results(result)
    except FileNotFoundError as e:
        if os.path.exists(input_audio):
            # The input is present, so something else was missing; report
            # the underlying error instead of blaming the input file.
            print(f"Error: {str(e)}")
        else:
            print(f"Error: Audio file '{input_audio}' not found.")
    except Exception as e:
        # Top-level boundary of an example script: report and exit quietly.
        print(f"Error: {str(e)}")
    finally:
        # Runs on failure too, so a failed transcription does not leave the
        # temporary file behind.
        if os.path.exists(preprocessed_audio):
            os.remove(preprocessed_audio)
            print("\nCleaned up temporary preprocessed file.")

Prerequisites

Install required dependencies:

$ pip install smallestai pydub

Key Features Demonstrated

Audio Preprocessing: Converts audio to 16 kHz mono WAV, normalizes levels, and trims leading and trailing silence
Gender Detection: Predicts speaker gender
Emotion Detection: Captures emotional tone with confidence scores
Utterances: Retrieves sentence-level timestamps with speaker labels
Diarization: Separates speakers for multi-speaker audio

Expected Output

The script will output:

Full transcription text
Gender predictions
Emotion scores (happiness, sadness, disgust, fear, anger)
Sentence-level utterances with timestamps and speaker IDs