Spaces:

daniel-wojahn
/

ttm-webapp-hf

Sleeping

App Files Files Community

ttm-webapp-hf / pipeline /visualize.py

daniel-wojahn

length ratio removal

66ee8c0 6 days ago

raw

history blame contribute delete

10.5 kB

	import plotly.graph_objects as go
	import pandas as pd
	import plotly.express as px # For color palettes
	import numpy as np # Ensure numpy is imported, in case pivot_table uses it for aggfunc


	def _select_metric_columns(metrics_df: pd.DataFrame) -> list:
	    """Return the column names that should each be rendered as a heatmap.

	    Every non-object column except the 'Text Pair' / 'Chapter' key columns is
	    a metric. Columns whose name contains "Pattern Similarity" are included
	    even when pandas stored them with object dtype.
	    """
	    key_columns = ("Text Pair", "Chapter")
	    metric_cols = [
	        col
	        for col in metrics_df.columns
	        if col not in key_columns and metrics_df[col].dtype != object
	    ]
	    for col in metrics_df.columns:
	        if "Pattern Similarity" in col and col not in metric_cols:
	            metric_cols.append(col)
	    return metric_cols


	def _build_heatmap(pivot: pd.DataFrame, metric, plot_title):
	    """Build a Plotly heatmap figure from a Chapter x Text Pair pivot table.

	    Args:
	        pivot: numeric table indexed by chapter (ascending), one column per
	            text pair; must contain at least one non-NaN value.
	        metric: metric name, used as the colorbar title.
	        plot_title: title shown above the plot.
	    """
	    values = pivot.to_numpy(dtype=float)
	    column_labels = [str(col).replace(".txt", "") for col in pivot.columns]

	    # Cell annotations: two decimals, blank for missing values.
	    cell_labels = [
	        [f"{val:.2f}" if pd.notnull(val) else "" for val in row]
	        for row in values
	    ]

	    # The same 'Reds' scale is used for every metric so that a darker cell
	    # always means a higher value (more similarity).
	    fig = go.Figure(
	        data=go.Heatmap(
	            z=values,
	            x=column_labels,
	            y=pivot.index,
	            colorscale="Reds",
	            reversescale=False,
	            zmin=float(np.nanmin(values)),
	            zmax=float(np.nanmax(values)),
	            text=cell_labels,
	            texttemplate="%{text}",
	            hovertemplate="Chapter %{y}<br>Text Pair: %{x}<br>Value: %{z:.2f}<extra></extra>",
	            colorbar=dict(title=metric, thickness=20, tickfont=dict(size=14)),
	        )
	    )

	    # Height scales with the number of chapters: at least 500px, at most 900px.
	    num_rows = len(pivot.index)
	    dynamic_height = min(900, max(500, 80 + num_rows * 35))

	    fig.update_layout(
	        title=dict(text=plot_title, font=dict(size=18)),
	        xaxis_title="Text Pair",
	        yaxis_title="Chapter",
	        autosize=True,
	        height=dynamic_height,
	        font=dict(size=16),
	        margin=dict(l=80, b=120, t=60, r=30),
	        xaxis=dict(automargin=True, side="bottom"),
	        yaxis=dict(automargin=True),
	    )
	    fig.update_xaxes(tickangle=45, tickfont=dict(size=14))
	    # Reversing the y-axis puts the first chapter at the top. The pivot is
	    # sorted ascending, so this holds for numeric and categorical axes alike.
	    fig.update_yaxes(tickfont=dict(size=14), autorange="reversed")
	    if pd.api.types.is_numeric_dtype(pivot.index):
	        # Show a tick for every chapter instead of Plotly's automatic ticks.
	        fig.update_yaxes(
	            tickmode="array",
	            tickvals=pivot.index,
	            ticktext=[str(i) for i in pivot.index],
	        )
	    return fig


	def generate_visualizations(metrics_df: pd.DataFrame, descriptive_titles: dict = None):
	    """
	    Generate one heatmap per similarity metric.

	    Args:
	        metrics_df: segment-level DataFrame with 'Chapter' and 'Text Pair'
	            columns plus one column per metric. None yields an empty dict.
	        descriptive_titles: optional {metric_name: plot title} mapping;
	            metrics without an entry use their column name as the title.
	    Returns:
	        heatmaps: dict of {metric_name: plotly Figure}. The value is None for
	        a metric that has no numeric data to plot.
	    """
	    if metrics_df is None:
	        return {}

	    heatmaps = {}
	    for metric in _select_metric_columns(metrics_df):
	        # Object-dtype metric columns may hold non-numeric entries; coerce
	        # them to NaN so formatting and min/max below cannot raise.
	        numeric_values = pd.to_numeric(metrics_df[metric], errors="coerce")
	        if numeric_values.isnull().all():
	            heatmaps[metric] = None
	            continue

	        frame = metrics_df[["Chapter", "Text Pair"]].copy()
	        frame["_value"] = numeric_values.astype(float)
	        pivot = frame.pivot(
	            index="Chapter", columns="Text Pair", values="_value"
	        ).sort_index()

	        # Nothing to draw if pivoting left no usable cells.
	        if pivot.empty or pivot.isnull().all().all():
	            heatmaps[metric] = None
	            continue

	        plot_title = (
	            descriptive_titles.get(metric, metric) if descriptive_titles else metric
	        )
	        heatmaps[metric] = _build_heatmap(pivot, metric, plot_title)

	    return heatmaps


	def generate_word_count_chart(word_counts_df: pd.DataFrame):
	    """
	    Generate a grouped bar chart of word counts per chapter, one trace per file.

	    Args:
	        word_counts_df: DataFrame from which the 'Filename', 'ChapterNumber'
	            and 'WordCount' columns are read.
	    Returns:
	        plotly Figure for the bar chart, or None if the input is None or empty.
	    """
	    # Accept None as well as an empty frame, like the vocabulary chart does.
	    if word_counts_df is None or word_counts_df.empty:
	        return None

	    fig = go.Figure()

	    # One colour per file, cycling through Plotly's default qualitative palette.
	    unique_files = sorted(word_counts_df["Filename"].unique())
	    colors = px.colors.qualitative.Plotly

	    for i, filename in enumerate(unique_files):
	        file_df = word_counts_df[word_counts_df["Filename"] == filename].sort_values(
	            "ChapterNumber"
	        )
	        fig.add_trace(
	            go.Bar(
	                x=file_df["ChapterNumber"],
	                y=file_df["WordCount"],
	                name=filename,
	                marker_color=colors[i % len(colors)],
	                text=file_df["WordCount"],
	                textposition="auto",
	                # Filename is passed along so the hover text can show it.
	                customdata=file_df[["Filename"]],
	                hovertemplate=(
	                    "<b>File</b>: %{customdata[0]}<br>"
	                    "<b>Chapter</b>: %{x}<br>"
	                    "<b>Word Count</b>: %{y}<extra></extra>"
	                ),
	            )
	        )

	    fig.update_layout(
	        title_text="Word Counts per Chapter (Grouped by File)",
	        xaxis_title="Chapter Number",
	        yaxis_title="Word Count",
	        barmode="group",
	        font=dict(size=14),
	        legend_title_text="Filename",
	        # Chapter numbers are treated as categories, not a continuous range.
	        xaxis=dict(type="category", automargin=True),
	        yaxis=dict(rangemode="tozero", automargin=True, autorange=True),
	        autosize=True,
	        margin=dict(l=80, r=50, b=100, t=50, pad=4),
	        height=500,
	        width=800,
	    )

	    # Show a tick for every chapter number present in the data.
	    all_chapter_numbers = sorted(word_counts_df["ChapterNumber"].unique())
	    fig.update_xaxes(
	        tickmode="array",
	        tickvals=all_chapter_numbers,
	        ticktext=[str(ch) for ch in all_chapter_numbers],
	    )

	    return fig


	def generate_vocab_containment_chart(vocab_containment_df: pd.DataFrame):
	    """
	    Generate a grouped bar chart of vocabulary containment per chapter.

	    Each trace is one direction "Source → Target", i.e. the percentage of the
	    source text's unique vocabulary that also appears in the target text.

	    Args:
	        vocab_containment_df: DataFrame with 'ChapterNumber', 'SourceText',
	            'TargetText', 'Containment', 'SourceVocabSize', 'SharedVocabSize'.
	    Returns:
	        plotly Figure for the chart, or None if the input is None or empty.
	    """
	    if vocab_containment_df is None or vocab_containment_df.empty:
	        return None

	    fig = go.Figure()

	    # Work on a copy so the caller's frame does not gain a 'Direction' column.
	    vocab_containment_df = vocab_containment_df.copy()
	    vocab_containment_df["Direction"] = (
	        vocab_containment_df["SourceText"] + " → " + vocab_containment_df["TargetText"]
	    )

	    # One colour per direction, cycling through Plotly's default palette.
	    unique_directions = sorted(vocab_containment_df["Direction"].unique())
	    colors = px.colors.qualitative.Plotly

	    for i, direction in enumerate(unique_directions):
	        dir_df = vocab_containment_df[
	            vocab_containment_df["Direction"] == direction
	        ].sort_values("ChapterNumber")
	        # Leave the label blank for missing values instead of showing "nan%".
	        bar_labels = [
	            f"{v:.1f}%" if pd.notnull(v) else "" for v in dir_df["Containment"]
	        ]
	        fig.add_trace(
	            go.Bar(
	                x=dir_df["ChapterNumber"],
	                y=dir_df["Containment"],
	                name=direction,
	                marker_color=colors[i % len(colors)],
	                text=bar_labels,
	                textposition="auto",
	                customdata=dir_df[
	                    ["SourceVocabSize", "SharedVocabSize", "SourceText", "TargetText"]
	                ].values,
	                hovertemplate=(
	                    "<b>Chapter %{x}</b><br>"
	                    "<b>%{customdata[2]}</b> vocabulary in <b>%{customdata[3]}</b>: %{y:.1f}%<br>"
	                    "Unique words in source: %{customdata[0]}<br>"
	                    "Shared words: %{customdata[1]}<extra></extra>"
	                ),
	            )
	        )

	    fig.update_layout(
	        title_text="Vocabulary Containment per Chapter",
	        xaxis_title="Chapter Number",
	        yaxis_title="Vocabulary Containment (%)",
	        barmode="group",
	        font=dict(size=14),
	        legend_title_text="Direction (Source → Target)",
	        xaxis=dict(type="category", automargin=True),
	        yaxis=dict(
	            rangemode="tozero",
	            automargin=True,
	            range=[0, 105],  # Slightly above 100% for visual clarity
	        ),
	        autosize=True,
	        margin=dict(l=80, r=50, b=100, t=60, pad=4),
	        height=450,
	    )

	    # Dashed reference line marking full containment.
	    fig.add_hline(
	        y=100,
	        line_dash="dash",
	        line_color="gray",
	        annotation_text="100%",
	        annotation_position="right",
	    )

	    # Show a tick for every chapter number present in the data.
	    chapters = sorted(vocab_containment_df["ChapterNumber"].unique())
	    fig.update_xaxes(
	        tickmode="array",
	        tickvals=chapters,
	        ticktext=[str(ch) for ch in chapters],
	    )

	    return fig