Code Examples

View as Markdown · Open in Claude

Below is a complete Python example demonstrating audio preprocessing, transcription with age/gender detection, emotion detection, and sentence-level timestamps (utterances).

import os

from pydub import AudioSegment
from smallestai.waves import WavesClient

# Waves client authenticated via the SMALLEST_API_KEY environment variable.
client = WavesClient(api_key=os.getenv("SMALLEST_API_KEY"))
def preprocess_audio(input_path, output_path):
    """
    Convert an audio file into the format Pulse STT works best with.

    Steps: resample to 16 kHz mono, normalize levels, and strip silent
    stretches, then export the result as WAV to *output_path*.

    Returns the output path so callers can chain on it.
    """
    track = AudioSegment.from_file(input_path)
    track = track.set_frame_rate(16000).set_channels(1).normalize()
    # NOTE(review): strip_silence drops *all* silent chunks, not only
    # leading/trailing silence — confirm that is the intended behavior.
    track = track.strip_silence(silence_len=100, silence_thresh=-40)
    track.export(output_path, format="wav")
    print(f"Preprocessed audio saved to: {output_path}")
    return output_path
def transcribe_with_features(audio_path):
    """
    Run Pulse STT on *audio_path* with the optional analyses enabled:
    word timestamps, age/gender detection, emotion detection, and
    speaker diarization.

    Returns the raw response from the Waves client.
    """
    return client.transcribe(
        file_path=audio_path,
        model="pulse",
        language="en",
        word_timestamps=True,
        age_detection=True,
        gender_detection=True,
        emotion_detection=True,
        diarize=True
    )
def process_results(response):
    """
    Pretty-print a Pulse STT response dict: transcription text, optional
    age/gender predictions, emotion scores, sentence-level utterances,
    and the word-timestamp count. Missing keys are skipped.
    """
    banner = "=" * 60
    print(banner)
    print("TRANSCRIPTION RESULTS")
    print(banner)

    print(f"\nTranscription: {response.get('transcription', 'N/A')}")

    # Demographic fields are optional; only show what the API returned.
    if 'age' in response:
        print(f"\nAge: {response['age']}")
    if 'gender' in response:
        print(f"Gender: {response['gender']}")

    if 'emotions' in response:
        print("\nEmotion Scores:")
        for label, confidence in response['emotions'].items():
            print(f" {label.capitalize()}: {confidence:.2f}")

    if 'utterances' in response:
        print("\nUtterances (Sentence-level timestamps):")
        for idx, segment in enumerate(response['utterances'], 1):
            who = segment.get('speaker', 'unknown')
            begin = segment.get('start', 0)
            finish = segment.get('end', 0)
            spoken = segment.get('text', '')
            print(f"\n [{idx}] Speaker: {who}")
            print(f" Time: {begin:.2f}s - {finish:.2f}s")
            print(f" Text: {spoken}")

    if 'words' in response:
        print(f"\nWord-level timestamps: {len(response['words'])} words")
if __name__ == "__main__":
    source_path = "input_audio.mp3"
    temp_wav = "preprocessed_audio.wav"

    try:
        print("Preprocessing audio...")
        preprocess_audio(source_path, temp_wav)

        print("\nTranscribing audio with age, emotion, and utterance detection...")
        process_results(transcribe_with_features(temp_wav))

        # Best-effort cleanup of the intermediate WAV file.
        if os.path.exists(temp_wav):
            os.remove(temp_wav)
            print("\nCleaned up temporary preprocessed file.")

    except FileNotFoundError:
        print(f"Error: Audio file '{source_path}' not found.")
    except Exception as exc:
        print(f"Error: {str(exc)}")

Prerequisites

Install required dependencies:

$ pip install smallestai pydub

Key Features Demonstrated

  1. Audio Preprocessing: Converts audio to 16 kHz mono WAV, normalizes levels, and removes silence
  2. Age & Gender Detection: Enables demographic analysis
  3. Emotion Detection: Captures emotional tone with confidence scores
  4. Utterances: Retrieves sentence-level timestamps with speaker labels
  5. Diarization: Separates speakers for multi-speaker audio

Expected Output

The script will output:

  • Full transcription text
  • Age and gender predictions
  • Emotion scores (happiness, sadness, disgust, fear, anger)
  • Sentence-level utterances with timestamps and speaker IDs