import streamlit as st
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import signal
import io
import base64
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
import torch
import warnings

warnings.filterwarnings('ignore')

# Set page config
# st.set_page_config(
#     page_title="🐾 Animal Sound Translator",
#     page_icon="🐾",
#     layout="wide",
#     initial_sidebar_state="expanded"
# )

# Custom CSS for better styling
st.markdown(""" """, unsafe_allow_html=True)

# Initialize session state
if 'audio_data' not in st.session_state:
    st.session_state.audio_data = None
if 'sample_rate' not in st.session_state:
    st.session_state.sample_rate = None


@st.cache_resource
def load_models():
    """Load pre-trained models for audio classification"""
    try:
        # Load a general audio classification model
        feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
        model = AutoModelForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
        classifier = pipeline("audio-classification", model=model, feature_extractor=feature_extractor)
        return classifier
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return None


def analyze_audio_features(audio_data, sr):
    """Extract comprehensive audio features"""
    features = {}

    # Basic features
    features['duration'] = len(audio_data) / sr
    features['sample_rate'] = sr
    features['rms_energy'] = np.sqrt(np.mean(audio_data**2))

    # Spectral features
    features['spectral_centroid'] = np.mean(librosa.feature.spectral_centroid(y=audio_data, sr=sr))
    features['spectral_bandwidth'] = np.mean(librosa.feature.spectral_bandwidth(y=audio_data, sr=sr))
    features['spectral_rolloff'] = np.mean(librosa.feature.spectral_rolloff(y=audio_data, sr=sr))
    features['zero_crossing_rate'] = np.mean(librosa.feature.zero_crossing_rate(audio_data))

    # MFCC features
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=13)
    for i in range(13):
        features[f'mfcc_{i+1}'] = np.mean(mfccs[i])

    # Pitch and tempo
    try:
        pitches, magnitudes = librosa.piptrack(y=audio_data, sr=sr)
        pitch_values = []
        for t in range(pitches.shape[1]):
            index = magnitudes[:, t].argmax()
            pitch = pitches[index, t]
            if pitch > 0:
                pitch_values.append(pitch)
        features['avg_pitch'] = np.mean(pitch_values) if pitch_values else 0
        features['pitch_std'] = np.std(pitch_values) if pitch_values else 0
    except Exception:
        features['avg_pitch'] = 0
        features['pitch_std'] = 0

    return features


def classify_animal_sound(audio_data, sr, classifier):
    """Classify the animal sound using pre-trained model"""
    try:
        # Resample to 16kHz if needed (common requirement for audio models)
        if sr != 16000:
            audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=16000)
            sr = 16000

        # Ensure audio is not too long (limit to 30 seconds)
        max_length = 30 * sr
        if len(audio_data) > max_length:
            audio_data = audio_data[:max_length]

        # Get predictions
        predictions = classifier(audio_data, sampling_rate=sr)

        # Filter for animal-related sounds
        animal_keywords = ['dog', 'cat', 'bird', 'cow', 'horse', 'pig', 'sheep', 'goat',
                           'chicken', 'duck', 'rooster', 'bark', 'meow', 'chirp', 'moo',
                           'neigh', 'oink', 'baa', 'cluck', 'quack', 'crow', 'howl', 'purr']

        animal_predictions = []
        for pred in predictions:
            label_lower = pred['label'].lower()
            if any(keyword in label_lower for keyword in animal_keywords):
                animal_predictions.append(pred)

        # If no animal sounds found, return top predictions anyway
        if not animal_predictions:
            animal_predictions = predictions[:3]

        return animal_predictions[:5]  # Return top 5
    except Exception as e:
        st.error(f"Error in classification: {e}")
        return []


def generate_translation(animal_type, confidence, audio_features):
    """Generate human-readable translation based on animal type and audio features"""

    # Animal behavior patterns and translations
    translations = {
        'dog': {
            'high_pitch': "I'm excited! Let's play!",
            'low_pitch': "I'm being protective or warning you.",
            'rapid': "I'm very excited or anxious!",
            'slow': "I'm calm but want your attention.",
            'loud': "I need something urgently!",
            'soft': "I'm content and happy.",
            'default': "Woof! I'm trying to communicate with you!"
        },
        'cat': {
            'high_pitch': "I want something! Feed me or pet me!",
            'low_pitch': "I'm content and relaxed.",
            'rapid': "I'm frustrated or demanding attention!",
            'slow': "I'm greeting you or feeling social.",
            'loud': "I'm upset or in distress!",
            'soft': "I'm happy and comfortable.",
            'default': "Meow! I'm talking to you, human!"
        },
        'bird': {
            'high_pitch': "I'm alerting others or expressing joy!",
            'low_pitch': "I'm establishing territory or calling for a mate.",
            'rapid': "I'm excited or warning of danger!",
            'slow': "I'm content and peaceful.",
            'loud': "I'm calling to my flock or defending my space!",
            'soft': "I'm content and comfortable.",
            'default': "Tweet! I'm singing my song!"
        },
        'cow': {
            'high_pitch': "I'm looking for my calf or feeling distressed!",
            'low_pitch': "I'm calm and content.",
            'loud': "I need attention or I'm calling to the herd!",
            'soft': "I'm peaceful and relaxed.",
            'default': "Moo! I'm communicating with my herd!"
        },
        'default': {
            'high_pitch': "I'm expressing excitement or alertness!",
            'low_pitch': "I'm calm or showing dominance.",
            'rapid': "I'm excited, anxious, or trying to get attention!",
            'slow': "I'm relaxed and content.",
            'loud': "I need attention or I'm expressing strong emotion!",
            'soft': "I'm comfortable and peaceful.",
            'default': "I'm trying to communicate something important!"
        }
    }

    # Determine animal category
    animal_key = 'default'
    for key in translations.keys():
        if key in animal_type.lower():
            animal_key = key
            break

    # Analyze audio characteristics
    pitch = audio_features.get('avg_pitch', 0)
    energy = audio_features.get('rms_energy', 0)
    zcr = audio_features.get('zero_crossing_rate', 0)

    # Determine characteristics
    characteristics = []
    if pitch > 300:
        characteristics.append('high_pitch')
    elif pitch > 0 and pitch < 200:
        characteristics.append('low_pitch')

    if energy > 0.1:
        characteristics.append('loud')
    elif energy < 0.05:
        characteristics.append('soft')

    if zcr > 0.1:
        characteristics.append('rapid')
    elif zcr < 0.05:
        characteristics.append('slow')

    # Get translation
    translation_dict = translations[animal_key]
    translation = translation_dict.get('default', "I'm trying to communicate!")

    # Use most specific characteristic available
    for char in characteristics:
        if char in translation_dict:
            translation = translation_dict[char]
            break

    # Add confidence-based modifier
    if confidence < 0.3:
        translation = f"[Uncertain] {translation}"
    elif confidence > 0.8:
        translation = f"[Very Confident] {translation}"

    return translation


def create_spectrogram(audio_data, sr):
    """Create and return spectrogram plot"""
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))

    # Waveform
    time = np.linspace(0, len(audio_data) / sr, len(audio_data))
    ax1.plot(time, audio_data, color='#4ECDC4', linewidth=1)
    ax1.set_title('Audio Waveform', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Time (seconds)')
    ax1.set_ylabel('Amplitude')
    ax1.grid(True, alpha=0.3)

    # Spectrogram
    D = librosa.amplitude_to_db(np.abs(librosa.stft(audio_data)), ref=np.max)
    img = librosa.display.specshow(D, y_axis='hz', x_axis='time', sr=sr, ax=ax2, cmap='viridis')
    ax2.set_title('Spectrogram', fontsize=14, fontweight='bold')
    plt.colorbar(img, ax=ax2, format='%+2.0f dB')

    plt.tight_layout()
    return fig


def main():
    # Header
    st.markdown('<h1>🐾 Animal Sound Translator</h1>', unsafe_allow_html=True)
    st.write("Upload a recording of an animal sound and get a playful interpretation of what it might mean.")

    # Minimal upload-and-analyze flow
    uploaded_file = st.file_uploader("Upload an animal sound", type=['wav', 'mp3', 'ogg', 'flac'])

    if uploaded_file is not None:
        st.audio(uploaded_file)
        uploaded_file.seek(0)

        # Load the audio and keep it in session state
        audio_data, sr = librosa.load(uploaded_file, sr=None)
        st.session_state.audio_data = audio_data
        st.session_state.sample_rate = sr

        with st.spinner("Analyzing sound..."):
            audio_features = analyze_audio_features(audio_data, sr)
            classifier = load_models()
            predictions = classify_animal_sound(audio_data, sr, classifier) if classifier else []

        if predictions:
            top_prediction = predictions[0]
            animal_type = top_prediction['label']
            confidence = top_prediction['score']
            translation = generate_translation(animal_type, confidence, audio_features)

            st.subheader(f"Detected: {animal_type}")
            st.markdown(f"Confidence: {confidence:.1%}")
            st.markdown(f'"{translation}"')
        else:
            st.warning("Could not classify this sound. Try a clearer or longer recording.")

        # Waveform and spectrogram visualization
        st.pyplot(create_spectrogram(audio_data, sr))
" # return "No file uploaded." # # â Programmatic interface for reuse in other apps # def analyze_dolphin_audio(audio_file_path: str) -> str: # processor, model = load_model() # processed_path = convert_audio(audio_file_path) # text = predict_text(model, processor, processed_path) # return text # if __name__ == "__main__": # app.run(debug=True)