Code Examples

View as Markdown · Open in Claude

Below is a complete Python example demonstrating audio preprocessing, transcription with age/gender detection, emotion detection, and sentence-level timestamps (utterances).

import os

from pydub import AudioSegment
from smallestai.waves import WavesClient

# Waves client authenticated via the SMALLEST_API_KEY environment variable.
client = WavesClient(api_key=os.getenv("SMALLEST_API_KEY"))
def preprocess_audio(input_path, output_path):
    """
    Convert an audio file into the format Pulse STT works best with.

    Steps: resample to 16 kHz mono, normalize levels, and strip silent
    stretches, then export the result as WAV to *output_path*.

    Returns the output path so callers can chain on it.
    """
    track = AudioSegment.from_file(input_path)
    track = track.set_frame_rate(16000).set_channels(1).normalize()
    # NOTE(review): strip_silence drops *all* silent chunks, not only
    # leading/trailing silence — confirm that is the intended behavior.
    track = track.strip_silence(silence_len=100, silence_thresh=-40)
    track.export(output_path, format="wav")
    print(f"Preprocessed audio saved to: {output_path}")
    return output_path
def transcribe_with_features(audio_path):
    """
    Run Pulse STT on *audio_path* with the optional analyses enabled:
    word timestamps, age/gender detection, emotion detection, and
    speaker diarization.

    Returns the raw response from the Waves client.
    """
    return client.transcribe(
        file_path=audio_path,
        model="pulse",
        language="en",
        word_timestamps=True,
        age_detection=True,
        gender_detection=True,
        emotion_detection=True,
        diarize=True
    )
def process_results(response):
    """
    Pretty-print a Pulse STT response dict: transcription text, optional
    age/gender predictions, emotion scores, sentence-level utterances,
    and the word-timestamp count. Missing keys are skipped.
    """
    banner = "=" * 60
    print(banner)
    print("TRANSCRIPTION RESULTS")
    print(banner)

    print(f"\nTranscription: {response.get('transcription', 'N/A')}")

    # Demographic fields are optional; only show what the API returned.
    if 'age' in response:
        print(f"\nAge: {response['age']}")
    if 'gender' in response:
        print(f"Gender: {response['gender']}")

    if 'emotions' in response:
        print("\nEmotion Scores:")
        for label, confidence in response['emotions'].items():
            print(f" {label.capitalize()}: {confidence:.2f}")

    if 'utterances' in response:
        print("\nUtterances (Sentence-level timestamps):")
        for idx, segment in enumerate(response['utterances'], 1):
            who = segment.get('speaker', 'unknown')
            begin = segment.get('start', 0)
            finish = segment.get('end', 0)
            spoken = segment.get('text', '')
            print(f"\n [{idx}] Speaker: {who}")
            print(f" Time: {begin:.2f}s - {finish:.2f}s")
            print(f" Text: {spoken}")

    if 'words' in response:
        print(f"\nWord-level timestamps: {len(response['words'])} words")
if __name__ == "__main__":
    source_path = "input_audio.mp3"
    temp_wav = "preprocessed_audio.wav"

    try:
        print("Preprocessing audio...")
        preprocess_audio(source_path, temp_wav)

        print("\nTranscribing audio with age, emotion, and utterance detection...")
        process_results(transcribe_with_features(temp_wav))

        # Best-effort cleanup of the intermediate WAV file.
        if os.path.exists(temp_wav):
            os.remove(temp_wav)
            print("\nCleaned up temporary preprocessed file.")

    except FileNotFoundError:
        print(f"Error: Audio file '{source_path}' not found.")
    except Exception as exc:
        print(f"Error: {str(exc)}")

Prerequisites

Install required dependencies:

$ pip install smallestai pydub

Key Features Demonstrated

  1. Audio Preprocessing: Converts audio to 16 kHz mono WAV, normalizes levels, and removes silence
  2. Age & Gender Detection: Enables demographic analysis
  3. Emotion Detection: Captures emotional tone with confidence scores
  4. Utterances: Retrieves sentence-level timestamps with speaker labels
  5. Diarization: Separates speakers for multi-speaker audio

Expected Output

The script will output:

  • Full transcription text
  • Age and gender predictions
  • Emotion scores (happiness, sadness, disgust, fear, anger)
  • Sentence-level utterances with timestamps and speaker IDs