import os
import re
import librosa
import gradio as gr
from copy import deepcopy
# --- Helper Functions ---
def seconds_to_cue_time(t):
"""Converts a time in seconds to the CUE sheet format (MM:SS:FF)."""
t = max(0, t)
minutes = int(t // 60)
seconds = int(t % 60)
frames = int((t - minutes * 60 - seconds) * 75)
return f'{minutes:02d}:{seconds:02d}:{frames:02d}'
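# A minimal worked example (frames run at 75 per second, the CDDA frame rate):
#   seconds_to_cue_time(125.5) -> '02:05:37'
#   (125.5 s = 2 min + 5 s + 0.5 s; 0.5 s * 75 = 37.5, truncated to frame 37)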
def parse_cue_time_to_seconds(time_str):
"""Parses MM:SS:FF into seconds. Returns None on failure."""
if not time_str:
return None
match = re.match(r'(\d+):(\d{1,2}):(\d{1,2})', time_str)
if match:
m, s, f = map(int, match.groups())
return m * 60 + s + f / 75.0
return None
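# Example round trip: parse_cue_time_to_seconds('02:05:37') -> 2*60 + 5 + 37/75 ≈ 125.4933.
# Because seconds_to_cue_time truncates to whole frames, a round trip can lose up to
# 1/75 s, so exact equality with the original float is not guaranteed.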
def format_cue_text(track_data, cue_globals):
    """
    Generates the final CUE sheet string from track data and global info.
    Tracks may carry their own titles; tracks without one get a numbered default.
    """
if not track_data:
return ""
# Sort tracks by time before formatting
sorted_tracks = sorted(track_data, key=lambda x: x['time'])
cue_text = f'PERFORMER "{cue_globals.get("performer", "Unknown Artist")}"\n'
cue_text += f'TITLE "{cue_globals.get("title", os.path.splitext(cue_globals["filename"])[0])}"\n'
# Use parsed file type or default to WAVE
file_type = cue_globals.get("filetype", "WAVE")
cue_text += f'FILE "{cue_globals["filename"]}" {file_type}\n'
for idx, track in enumerate(sorted_tracks):
cue_time_str = seconds_to_cue_time(track['time'])
        # Use the track's existing title, or fall back to a numbered default.
title = track.get('title') or f"Track {idx+1:02d}"
cue_text += f' TRACK {idx+1:02d} AUDIO\n'
cue_text += f' TITLE "{title}"\n'
cue_text += f' INDEX 01 {cue_time_str}\n'
return cue_text
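# For illustration (hypothetical values), track_data of
# [{'time': 0.0, 'title': 'Intro'}, {'time': 125.5, 'title': None}]
# with cue_globals {'filename': 'album.wav'} produces:
#   PERFORMER "Unknown Artist"
#   TITLE "album"
#   FILE "album.wav" WAVE
#     TRACK 01 AUDIO
#       TITLE "Intro"
#       INDEX 01 00:00:00
#     TRACK 02 AUDIO
#       TITLE "Track 02"
#       INDEX 01 02:05:37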
def generate_track_choices(track_data, audio_duration):
"""Creates choices for the CheckboxGroup as (label, index) tuples."""
if not track_data:
return []
# Data is already sorted, but we re-sort just in case.
sorted_tracks = sorted(track_data, key=lambda x: x['time'])
track_choices = []
for i, track in enumerate(sorted_tracks):
start_time = track['time']
end_time = sorted_tracks[i+1]['time'] if i < len(sorted_tracks) - 1 else audio_duration
track_length = end_time - start_time
        title = track.get('title') or f"Track {i+1:02d}"  # `or` also covers an explicit None title
label = f'"{title}" (Starts: {seconds_to_cue_time(start_time)}) [Length: {seconds_to_cue_time(track_length)}]'
track_choices.append((label, i))
return track_choices
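# A single generated label looks like (hypothetical values):
#   '"Intro" (Starts: 00:00:00) [Length: 02:05:37]'
# The paired index lets the event handlers map a checkbox selection back to track_data.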
# --- Core Gradio Functions ---
def analyze_audio_to_cue(audio_file, top_db, min_segment_len, merge_threshold, merge_protection_len):
"""Workflow 1: Analyzes an uploaded audio file to generate the initial CUE text."""
if not audio_file:
raise gr.Error("Please upload an audio file first.")
# --- 1. Load Audio File ---
try:
y, sr = librosa.load(audio_file, sr=None)
audio_duration = librosa.get_duration(y=y, sr=sr)
except Exception as e:
raise gr.Error(f"Could not load audio file: {e}")
# --- 2. Detect Segments using Silence Detection ---
intervals = librosa.effects.split(y, top_db=top_db)
    # intervals is an (n, 2) NumPy array of [start, end] sample indices;
    # keep only segments at least min_segment_len seconds long, as start times in seconds.
    if intervals.size > 0:
        times = [start / sr for start, end in intervals if (end - start) / sr >= min_segment_len]
    else:
        times = []
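    # For example, with sr=44100 and two non-silent spans, intervals might look like
    # array([[0, 5512500], [5953500, 11025000]]), i.e. spans of 0-125 s and 135-250 s
    # expressed in samples (hypothetical values for illustration).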
# --- 3. Post-process Tracks (Add Start, Auto-Merge) ---
if not times or times[0] > 0.5:
times.insert(0, 0.0)
# Auto-merging logic
if len(times) > 1:
final_times = [times[0]]
i = 0
while i < len(times) - 1:
track_length = times[i+1] - times[i]
# Merge if track is shorter than threshold AND not longer than protection length
if (track_length < merge_threshold) and (track_length <= merge_protection_len):
# Condition to MERGE is met. Skip adding the next timestamp.
pass
else:
# Condition to KEEP is met.
final_times.append(times[i+1])
i += 1
if len(final_times) > 1 and (audio_duration - final_times[-1]) < merge_threshold:
final_times.pop()
times = final_times
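    # Worked example: times=[0, 10, 11, 40], duration 60 s, threshold 15 s, protection 2 s.
    # Segment 0-10 (10 s) exceeds the protection length, so it is kept; segment 10-11 (1 s)
    # is merged away; segment 11-40 is kept, giving final_times=[0, 10, 40]. The trailing
    # segment 40-60 (20 s) is not shorter than the threshold, so it also survives.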
# --- 4. Prepare Outputs for Gradio ---
times = sorted(list(set(times)))
    # Convert the detected times into the track_data structure used throughout the app.
track_data = [{'time': t, 'title': None} for t in times]
audio_filename = os.path.basename(audio_file)
cue_globals = {"filename": audio_filename}
initial_cue_text = format_cue_text(track_data, cue_globals)
track_choices = generate_track_choices(track_data, audio_duration)
    # Return everything needed to refresh the UI; visibility of the editing
    # groups is handled by the chained show_editing_groups call.
    return (
        initial_cue_text, cue_globals, track_data, audio_duration,
        gr.update(choices=track_choices, value=[])
    )
def parse_cue_and_update_ui(cue_text):
"""Workflow 2: Parses pasted CUE text, preserving titles."""
    # Bail out early if the text contains no INDEX lines at all (case-insensitive).
    if not cue_text or not re.search(r'INDEX\s+\d+', cue_text, re.IGNORECASE):
        return cue_text, {}, [], 0, gr.update(choices=[], value=[])
cue_globals = {}
track_data = []
current_track = None
lines = cue_text.split('\n')
for line in lines:
line = line.strip()
if not line:
continue
if re.search(r'TRACK\s+\d+\s+AUDIO', line, re.IGNORECASE):
if current_track is not None:
track_data.append(current_track)
current_track = {}
continue
if current_track is None:
            # Capture the file type (WAVE, MP3, etc.) so it can be preserved on output.
if match := re.search(r'FILE\s+"([^"]+)"\s+([A-Z0-9]+)', line, re.IGNORECASE):
cue_globals['filename'] = match.group(1)
cue_globals['filetype'] = match.group(2)
elif match := re.search(r'PERFORMER\s+"([^"]+)"', line, re.IGNORECASE):
cue_globals['performer'] = match.group(1)
elif match := re.search(r'^TITLE\s+"([^"]+)"', line, re.IGNORECASE):
cue_globals['title'] = match.group(1)
else:
if match := re.search(r'TITLE\s+"([^"]+)"', line, re.IGNORECASE):
current_track['title'] = match.group(1)
elif match := re.search(r'INDEX\s+\d+\s+([\d:]{7,8})', line, re.IGNORECASE):
                # Compare against None rather than truthiness so a 0.0 start time is kept.
time_sec = parse_cue_time_to_seconds(match.group(1))
if time_sec is not None:
current_track['time'] = time_sec
if current_track:
track_data.append(current_track)
    if not track_data or not cue_globals.get('filename'):
        return cue_text, {}, [], 0, gr.update(choices=[], value=[])
    # Drop tracks that never received an INDEX time, then sort by start time.
    track_data = sorted([t for t in track_data if 'time' in t], key=lambda x: x['time'])
    if not track_data:  # All tracks might have been invalid
        return cue_text, {}, [], 0, gr.update(choices=[], value=[])
    # The true audio duration is unknown from CUE text alone, so the last track's
    # start time is used; that track's displayed length will read 00:00:00.
    audio_duration = track_data[-1]['time']
track_choices = generate_track_choices(track_data, audio_duration)
# Re-generate the CUE text to ensure consistent formatting
formatted_text = format_cue_text(track_data, cue_globals)
    return formatted_text, cue_globals, track_data, audio_duration, gr.update(choices=track_choices, value=[])
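# For reference, a successful parse of a two-track sheet yields track_data like
# [{'title': 'Intro', 'time': 0.0}, {'title': 'Track 02', 'time': 125.493}]
# (hypothetical values); titles missing from the source remain absent and are
# defaulted when the sheet is re-formatted.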
def update_editing_tools(selected_indices, track_data, audio_duration):
"""Dynamically shows/hides editing tools based on selection count."""
    num_selected = len(selected_indices or [])
merge_update = gr.update(visible=False)
single_update = gr.update(visible=False)
slider_update = gr.update()
slider_label_update = gr.update()
edit_box_update = gr.update()
if num_selected == 1:
track_idx = selected_indices[0]
        single_update = gr.update(visible=True)
start_time = track_data[track_idx]['time']
end_time = audio_duration if (track_idx + 1) >= len(track_data) else track_data[track_idx + 1]['time']
# --- 2. Add padding to prevent splitting at the exact edges ---
# A CUE sheet frame is 1/75s (~0.013s). We use a slightly larger padding.
padding = 0.02
split_possible = (start_time + padding) < (end_time - padding)
if split_possible:
mid_point = start_time + (end_time - start_time) / 2
slider_update = gr.update(minimum=start_time + padding, maximum=end_time - padding, value=mid_point)
slider_label_update = gr.update(value=f"Split at: {seconds_to_cue_time(mid_point)}")
else:
slider_label_update = gr.update(value="Track is too short to be split")
edit_box_update = gr.update(value=seconds_to_cue_time(start_time))
elif num_selected > 1:
        merge_update = gr.update(visible=True)
return merge_update, single_update, slider_update, slider_label_update, edit_box_update
def perform_manual_merge(indices_to_merge, original_track_data, audio_duration, cue_globals):
"""Merges selected tracks based on their indices."""
indices_set = set(indices_to_merge)
    # --- Build the new track list; the rule below handles contiguous and ---
    # --- non-contiguous selections alike. ---
new_track_data = []
for i, track in enumerate(original_track_data):
# Condition to KEEP a track's start time:
# 1. It was NOT selected.
# OR
# 2. It WAS selected, BUT it's the start of a merge block.
# (This means it's the very first track, OR the track before it was NOT selected).
if i not in indices_set or (i == 0) or ((i - 1) not in indices_set):
new_track_data.append(track)
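    # Worked example: with 4 tracks and indices_to_merge=[1, 2], track 0 is kept
    # (not selected), track 1 is kept (start of the merge block), track 2 is dropped
    # (its predecessor was also selected), and track 3 is kept. Tracks 1 and 2 thus
    # become one track starting at track 1's time.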
# --- Prepare all the outputs to update the UI ---
# The new CUE text for the textbox
final_cue_text = format_cue_text(new_track_data, cue_globals)
new_track_choices = generate_track_choices(new_track_data, audio_duration)
# Return a tuple that will update the textbox, the state, and the checklist
return final_cue_text, new_track_data, gr.update(choices=new_track_choices, value=[])
def perform_manual_split(split_time_sec, original_track_data, audio_duration, cue_globals):
"""Splits a track at the time specified by the slider."""
if any(abs(t['time'] - split_time_sec) < 1e-3 for t in original_track_data):
raise gr.Error("This exact timestamp already exists.")
new_track = {'time': split_time_sec, 'title': None}
new_track_data = sorted(original_track_data + [new_track], key=lambda x: x['time'])
final_cue_text = format_cue_text(new_track_data, cue_globals)
new_track_choices = generate_track_choices(new_track_data, audio_duration)
return final_cue_text, new_track_data, gr.update(choices=new_track_choices, value=[])
# --- Timeline Shift ---
def shift_timeline(shift_amount_sec, original_track_data, audio_duration, cue_globals):
"""Shifts all track start times by a specified amount."""
if not original_track_data:
raise gr.Error("No track times to shift.")
# Use deepcopy to avoid modifying the original state directly
new_track_data = deepcopy(original_track_data)
    # Apply the shift with no upper bound, so the last track can also move forward;
    # clamp at zero so no track starts before the audio does.
for track in new_track_data:
track['time'] = max(0, track['time'] + shift_amount_sec)
# Remove duplicates that might be created if multiple tracks are clamped to 0
unique_tracks = []
seen_times = set()
for track in sorted(new_track_data, key=lambda x: x['time']):
if track['time'] not in seen_times:
unique_tracks.append(track)
seen_times.add(track['time'])
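    # Example: shifting tracks at 0.0 s and 3.0 s by -5.0 s clamps both to 0.0;
    # the de-duplication above then keeps only one of them.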
final_cue_text = format_cue_text(unique_tracks, cue_globals)
new_track_choices = generate_track_choices(unique_tracks, audio_duration)
return final_cue_text, unique_tracks, gr.update(choices=new_track_choices, value=[])
# --- Edit Track Start Time ---
def edit_track_start_time(selected_indices, new_time_str, original_track_data, audio_duration, cue_globals):
"""Edits the start time of a single selected track using its index."""
if not selected_indices:
raise gr.Error("No track selected for editing.")
new_time_sec = parse_cue_time_to_seconds(new_time_str)
if new_time_sec is None:
raise gr.Error("Invalid time format. Please use MM:SS:FF.")
track_idx = selected_indices[0]
# Boundary checks
prev_time = original_track_data[track_idx - 1]['time'] if track_idx > 0 else -1
next_time = original_track_data[track_idx + 1]['time'] if track_idx < len(original_track_data) - 1 else float('inf')
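    # Example: with starts [0.0, 120.0, 240.0] and track_idx 1, the new time must
    # fall strictly between 0.0 and 240.0 (exclusive on both ends).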
    if new_time_sec <= prev_time:
        raise gr.Error("New time must be later than the previous track's start time.")
    if new_time_sec >= next_time:
        raise gr.Error("New time must be earlier than the next track's start time.")
new_track_data = deepcopy(original_track_data)
new_track_data[track_idx]['time'] = new_time_sec
final_cue_text = format_cue_text(new_track_data, cue_globals)
new_track_choices = generate_track_choices(new_track_data, audio_duration)
return final_cue_text, new_track_data, gr.update(choices=new_track_choices, value=[])
# --- Gradio User Interface Definition ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown("# 🎵 Advanced CUE Sheet Generator")
# --- Hidden State Variables ---
cue_globals_state = gr.State({})
track_data_state = gr.State([])
audio_duration_state = gr.State(0)
with gr.Tabs():
with gr.TabItem("Start with Audio File"):
gr.Markdown("Upload an audio file to automatically detect track points.")
audio_input = gr.Audio(type="filepath", label="Upload Audio File")
with gr.Accordion("Analysis Parameters", open=False):
threshold_slider = gr.Slider(10, 80, 40, step=1, label="Silence Threshold (dB)")
min_length_slider = gr.Slider(0.5, 30, 1, step=0.1, label="Min. Segment Length (s)")
                    merge_threshold_slider = gr.Slider(1, 60, 15, step=1, label="Auto-Merge Threshold (s)")
                    merge_protection_slider = gr.Slider(0.5, 60, 2, step=0.1, label="Merge Protection Length (s)")
generate_button = gr.Button("Analyze Audio", variant="primary")
with gr.TabItem("Start with CUE Text"):
gr.Markdown("Or paste CUE text below and click outside the box. The editing tools will appear automatically.")
cue_text_input_for_paste = gr.Textbox(label="Paste CUE Text Here", lines=8, placeholder="Paste your CUE sheet content here and click outside the box. The editing tools will appear automatically.")
# The main output textbox is now outside the tabs, serving as a central display.
output_text = gr.Textbox(label="CUE Sheet Output", lines=15, show_copy_button=True, interactive=True)
with gr.Group(visible=False) as manual_editing_group:
gr.Markdown("### Manual Editing Tools")
track_checkboxes = gr.CheckboxGroup(label="Select Tracks to Edit")
with gr.Row(visible=False) as merge_tools:
merge_button = gr.Button("Merge Selected Tracks", variant="secondary", size="lg")
# This group contains both Split and Edit tools, shown when one track is selected
with gr.Group(visible=False) as single_track_tools:
with gr.Accordion("Split Track", open=False):
split_slider_label = gr.Textbox(label="Current Split Time", interactive=False)
split_slider = gr.Slider(label="Drag to select split point")
split_button = gr.Button("Split Track at Selected Time", variant="secondary")
# --- Edit Start Time ---
with gr.Accordion("Edit Start Time", open=True):
edit_time_input = gr.Textbox(label="New Start Time (MM:SS:FF)", placeholder="e.g., 01:23:45")
edit_time_button = gr.Button("Update Start Time", variant="secondary")
# --- Global Timeline Shift ---
with gr.Accordion("Global Edits", open=False, visible=False) as global_editing_group:
shift_amount_input = gr.Number(label="Timeline Shift Amount (seconds, +/-)", value=0)
shift_button = gr.Button("Apply Timeline Shift", variant="secondary")
# --- Event Wiring ---
# Combined update for enabling editing groups
def show_editing_groups(track_data):
is_visible = bool(track_data)
return gr.update(visible=is_visible), gr.update(visible=is_visible)
    # Workflow 1: the audio-analysis button refreshes the main outputs, then the chained call toggles the editing groups.
generate_button.click(
fn=analyze_audio_to_cue,
        inputs=[audio_input, threshold_slider, min_length_slider, merge_threshold_slider, merge_protection_slider],
outputs=[output_text, cue_globals_state, track_data_state, audio_duration_state, track_checkboxes]
).then(
fn=show_editing_groups,
inputs=[track_data_state],
outputs=[manual_editing_group, global_editing_group]
)
    # Workflow 2: pasting text in the dedicated input box populates the main output and enables the tools.
    # The `.change` event updates all necessary outputs in a single step.
cue_text_input_for_paste.change(
fn=parse_cue_and_update_ui,
inputs=[cue_text_input_for_paste],
outputs=[output_text, cue_globals_state, track_data_state, audio_duration_state, track_checkboxes]
).then(
fn=show_editing_groups,
inputs=[track_data_state],
outputs=[manual_editing_group, global_editing_group]
)
# Dynamic UI controller for showing/hiding Merge/Split tools
track_checkboxes.change(
fn=update_editing_tools,
inputs=[track_checkboxes, track_data_state, audio_duration_state],
outputs=[merge_tools, single_track_tools, split_slider, split_slider_label, edit_time_input]
)
# Live update for the split slider's time display
split_slider.input(
fn=lambda t: f"Split at: {seconds_to_cue_time(t)}",
inputs=[split_slider],
outputs=[split_slider_label]
)
# Action buttons
merge_button.click(
fn=perform_manual_merge,
inputs=[track_checkboxes, track_data_state, audio_duration_state, cue_globals_state],
outputs=[output_text, track_data_state, track_checkboxes]
)
split_button.click(
fn=perform_manual_split,
inputs=[split_slider, track_data_state, audio_duration_state, cue_globals_state],
outputs=[output_text, track_data_state, track_checkboxes]
)
# --- Action Buttons for New Features ---
shift_button.click(
fn=shift_timeline,
inputs=[shift_amount_input, track_data_state, audio_duration_state, cue_globals_state],
outputs=[output_text, track_data_state, track_checkboxes]
)
edit_time_button.click(
fn=edit_track_start_time,
inputs=[track_checkboxes, edit_time_input, track_data_state, audio_duration_state, cue_globals_state],
outputs=[output_text, track_data_state, track_checkboxes]
)
if __name__ == "__main__":
demo.launch(inbrowser=True)