Spaces:

haoheliu
/

semanticodec_ultra_low_bitrate_audio_codec

Running

App Files Community

haoheliu committed on Nov 29, 2024

Commit

c8989e8

verified ·

1 Parent(s): 1f34ab8

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -11

app.py CHANGED Viewed

@@ -13,12 +13,13 @@ import os
 DEFAULT_TOKEN_RATE = 100
 DEFAULT_SEMANTIC_VOCAB_SIZE = 16384
 DEFAULT_SAMPLE_RATE = 16000
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Title and Description
 st.title("SemantiCodec: Ultra-Low Bitrate Neural Audio Codec")
 st.write("""
-Upload your audio file, adjust the codec parameters, and compare the original and reconstructed audio.
 SemantiCodec achieves high-quality audio reconstruction with ultra-low bitrates!
 """)
@@ -34,7 +35,7 @@ ddim_steps = st.sidebar.slider("DDIM Sampling Steps", 10, 100, 50, step=5)
 guidance_scale = st.sidebar.slider("CFG Guidance Scale", 0.5, 5.0, 2.0, step=0.1)
 # Upload Audio File
-uploaded_file = st.file_uploader("Upload an audio file (WAV format)", type=["wav"])
 # Helper function: Plot spectrogram
 def plot_spectrogram(waveform, sample_rate, title):
@@ -57,7 +58,7 @@ if uploaded_file and st.button("Run SemantiCodec"):
         # Load audio
         waveform, sample_rate = torchaudio.load(input_path)
         # Check if resampling is needed
         if sample_rate != DEFAULT_SAMPLE_RATE:
             st.write(f"Resampling audio from {sample_rate} Hz to {DEFAULT_SAMPLE_RATE} Hz...")
@@ -65,12 +66,23 @@ if uploaded_file and st.button("Run SemantiCodec"):
             waveform = resampler(waveform)
             sample_rate = DEFAULT_SAMPLE_RATE  # Update sample rate to 16kHz
         # Convert to numpy for librosa compatibility
-        waveform = waveform[0].numpy()
-        # Plot Original Spectrogram (16kHz resampled)
-        st.write("Original Audio Spectrogram (Resampled to 16kHz):")
-        plot_spectrogram(waveform, sample_rate, "Original Audio Spectrogram (Resampled to 16kHz)")
         # Initialize SemantiCodec
         st.write("Initializing SemantiCodec...")
@@ -86,7 +98,7 @@ if uploaded_file and st.button("Run SemantiCodec"):
         # Encode and Decode
         st.write("Encoding and Decoding Audio...")
-        tokens = semanticodec.encode(input_path)
         reconstructed_waveform = semanticodec.decode(tokens)[0, 0]
         # Save reconstructed audio
@@ -101,8 +113,8 @@ if uploaded_file and st.button("Run SemantiCodec"):
         st.write(f"Shape of Latent Code: {tokens.shape}")
         # Audio Players
-        st.audio(input_path, format="audio/wav")
-        st.write("Original Audio")
         st.audio(reconstructed_path, format="audio/wav")
         st.write("Reconstructed Audio")
@@ -113,6 +125,5 @@ if uploaded_file and st.button("Run SemantiCodec"):
             file_name="reconstructed_audio.wav",
         )
 # Footer
 st.write("Built with [Streamlit](https://streamlit.io) and SemantiCodec")

 DEFAULT_TOKEN_RATE = 100
 DEFAULT_SEMANTIC_VOCAB_SIZE = 16384
 DEFAULT_SAMPLE_RATE = 16000
+MAX_DURATION_SECONDS = 30  # Maximum allowed duration
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Title and Description
 st.title("SemantiCodec: Ultra-Low Bitrate Neural Audio Codec")
 st.write("""
+Upload your audio file (up to 30 seconds), adjust the codec parameters, and compare the original and reconstructed audio.
 SemantiCodec achieves high-quality audio reconstruction with ultra-low bitrates!
 """)
 guidance_scale = st.sidebar.slider("CFG Guidance Scale", 0.5, 5.0, 2.0, step=0.1)
 # Upload Audio File
+uploaded_file = st.file_uploader("Upload an audio file (WAV format, up to 30 seconds)", type=["wav"])
 # Helper function: Plot spectrogram
 def plot_spectrogram(waveform, sample_rate, title):
         # Load audio
         waveform, sample_rate = torchaudio.load(input_path)
         # Check if resampling is needed
         if sample_rate != DEFAULT_SAMPLE_RATE:
             st.write(f"Resampling audio from {sample_rate} Hz to {DEFAULT_SAMPLE_RATE} Hz...")
             waveform = resampler(waveform)
             sample_rate = DEFAULT_SAMPLE_RATE  # Update sample rate to 16kHz
+        # Check and limit duration
+        num_samples = waveform.size(1)
+        max_samples = MAX_DURATION_SECONDS * sample_rate  # 30 seconds limit
+        if num_samples > max_samples:
+            st.write(f"Truncating audio to the first {MAX_DURATION_SECONDS} seconds...")
+            waveform = waveform[:, :max_samples]
         # Convert to numpy for librosa compatibility
+        waveform_np = waveform[0].numpy()
+        # Plot Original Spectrogram (16kHz resampled and truncated)
+        st.write(f"Original Audio Spectrogram (Resampled and limited to {MAX_DURATION_SECONDS} seconds):")
+        plot_spectrogram(waveform_np, sample_rate, f"Original Audio Spectrogram (Resampled to {DEFAULT_SAMPLE_RATE} Hz)")
+        # Save truncated audio for processing
+        truncated_path = os.path.join(temp_dir, "truncated_input.wav")
+        torchaudio.save(truncated_path, waveform, sample_rate)
         # Initialize SemantiCodec
         st.write("Initializing SemantiCodec...")
         # Encode and Decode
         st.write("Encoding and Decoding Audio...")
+        tokens = semanticodec.encode(truncated_path)
         reconstructed_waveform = semanticodec.decode(tokens)[0, 0]
         # Save reconstructed audio
         st.write(f"Shape of Latent Code: {tokens.shape}")
         # Audio Players
+        st.audio(truncated_path, format="audio/wav")
+        st.write("Original Audio (Truncated)")
         st.audio(reconstructed_path, format="audio/wav")
         st.write("Reconstructed Audio")
             file_name="reconstructed_audio.wav",
         )
 # Footer
 st.write("Built with [Streamlit](https://streamlit.io) and SemantiCodec")