import plotly.graph_objects as go
import pandas as pd
import plotly.express as px # For color palettes
import numpy as np # Ensure numpy is imported, in case pivot_table uses it for aggfunc
def generate_visualizations(metrics_df: pd.DataFrame, descriptive_titles: dict = None) -> dict:
    """
    Generate one heatmap per similarity metric.

    Args:
        metrics_df: Segment-level DataFrame with "Text Pair" and "Chapter"
            columns plus one column per metric. Every non-object column other
            than those two is treated as a metric; columns whose name contains
            "Pattern Similarity" are included regardless of dtype.
        descriptive_titles: Optional mapping of {metric column name: plot
            title}. Metrics without an entry (or all metrics, when the mapping
            is None/empty) are titled with the column name.

    Returns:
        dict of {metric_name: plotly Figure or None}. The value is None when
        the metric column holds no non-null values or its pivot is empty.
        An empty dict is returned when ``metrics_df`` is None.

    Raises:
        ValueError: Propagated from ``DataFrame.pivot`` when ``metrics_df``
            holds more than one row for the same (Chapter, Text Pair) pair.
    """
    if metrics_df is None:
        return {}

    # Numeric metric columns first, in column order ...
    metric_cols = [
        col
        for col in metrics_df.columns
        if col not in ("Text Pair", "Chapter") and metrics_df[col].dtype != object
    ]
    # ... then any "Pattern Similarity" column that the dtype filter skipped.
    for col in metrics_df.columns:
        if isinstance(col, str) and "Pattern Similarity" in col and col not in metric_cols:
            metric_cols.append(col)

    heatmaps = {}
    for metric in metric_cols:
        # Nothing to draw when the metric was never computed.
        if metrics_df[metric].isnull().all():
            heatmaps[metric] = None
            continue

        pivot = metrics_df.pivot(index="Chapter", columns="Text Pair", values=metric)
        pivot = pivot.sort_index(ascending=False)

        # The pivot can still be empty or all-NaN (e.g. single-chapter comparisons).
        if pivot.empty or pivot.isnull().all().all():
            heatmaps[metric] = None
            continue

        values = pivot.values
        # Strip the file extension from the pair labels shown on the x-axis.
        pair_labels = [str(col).replace(".txt", "") for col in pivot.columns]
        # Cell annotations: two decimals, blank for missing values.
        cell_text = [
            [f"{val:.2f}" if pd.notnull(val) else "" for val in row]
            for row in values
        ]

        fig = go.Figure(
            data=go.Heatmap(
                z=values,
                x=pair_labels,
                y=pivot.index,
                # Same scale and direction for every metric:
                # darker red = higher value (more similar).
                colorscale="Reds",
                reversescale=False,
                zmin=float(np.nanmin(values)),
                zmax=float(np.nanmax(values)),
                text=cell_text,
                texttemplate="%{text}",
                # Plotly hover text uses <br> for line breaks.
                hovertemplate="Chapter %{y}<br>Text Pair: %{x}<br>Value: %{z:.2f}",
                colorbar=dict(title=metric, thickness=20, tickfont=dict(size=14)),
            )
        )

        plot_title = (
            descriptive_titles.get(metric, metric) if descriptive_titles else metric
        )

        # Height scales with the number of chapters, clamped to 500-900px.
        dynamic_height = min(900, max(500, 80 + len(pivot.index) * 35))

        fig.update_layout(
            title=dict(text=plot_title, font=dict(size=18)),
            xaxis_title="Text Pair",
            yaxis_title="Chapter",
            autosize=True,
            height=dynamic_height,
            font=dict(size=16),
            margin=dict(l=80, b=120, t=60, r=30),
            xaxis=dict(automargin=True, side="bottom"),
            yaxis=dict(automargin=True),
        )
        fig.update_xaxes(tickangle=45, tickfont=dict(size=14))
        # NOTE(review): the index is already sorted descending above and the
        # axis is reversed here as well; confirm the combined effect is the
        # intended chapter order for non-numeric chapter labels.
        fig.update_yaxes(tickfont=dict(size=14), autorange="reversed")

        # Show a tick for every chapter when the chapter axis is numeric.
        if pd.api.types.is_numeric_dtype(pivot.index):
            fig.update_yaxes(
                tickmode="array",
                tickvals=pivot.index,
                ticktext=[str(i) for i in pivot.index],
            )

        heatmaps[metric] = fig

    return heatmaps
def generate_word_count_chart(word_counts_df: pd.DataFrame):
    """
    Generate a grouped bar chart of word counts per chapter, one trace per file.

    Args:
        word_counts_df: DataFrame with 'Filename', 'ChapterNumber', 'SegmentID', 'WordCount'.
            Only 'Filename', 'ChapterNumber' and 'WordCount' are read here.

    Returns:
        plotly Figure for the bar chart, or None if the input is None or empty.
    """
    if word_counts_df is None or word_counts_df.empty:
        return None

    fig = go.Figure()

    # One colour per file, cycling through Plotly's default qualitative palette.
    palette = px.colors.qualitative.Plotly
    for i, filename in enumerate(sorted(word_counts_df["Filename"].unique())):
        file_df = word_counts_df[word_counts_df["Filename"] == filename].sort_values(
            "ChapterNumber"
        )
        fig.add_trace(
            go.Bar(
                x=file_df["ChapterNumber"],
                y=file_df["WordCount"],
                name=filename,
                marker_color=palette[i % len(palette)],
                text=file_df["WordCount"],
                textposition="auto",
                # Filename is exposed to the hover template as customdata[0].
                customdata=file_df[["Filename"]].values,
                # Plotly hover text uses <br> for line breaks.
                hovertemplate=(
                    "File: %{customdata[0]}<br>"
                    "Chapter: %{x}<br>"
                    "Word Count: %{y}"
                ),
            )
        )

    fig.update_layout(
        title_text="Word Counts per Chapter (Grouped by File)",
        xaxis_title="Chapter Number",
        yaxis_title="Word Count",
        barmode="group",
        font=dict(size=14),
        legend_title_text="Filename",
        xaxis=dict(
            type="category",  # Treat chapter numbers as categories
            automargin=True,
        ),
        yaxis=dict(
            rangemode="tozero",  # Start the y-axis at 0
            automargin=True,
            autorange=True,
        ),
        autosize=True,  # Keep for responsiveness in Gradio
        margin=dict(l=80, r=50, b=100, t=50, pad=4),
        height=500,
        width=800,
    )

    # Show a tick for every chapter number present in the data.
    all_chapter_numbers = sorted(word_counts_df["ChapterNumber"].unique())
    fig.update_xaxes(
        tickmode="array",
        tickvals=all_chapter_numbers,
        ticktext=[str(ch) for ch in all_chapter_numbers],
    )
    return fig
def generate_vocab_containment_chart(vocab_containment_df: pd.DataFrame):
    """
    Generate a grouped bar chart of vocabulary containment per chapter.

    Each trace is one direction "Source → Target", i.e. the percentage of the
    source text's unique vocabulary that also appears in the target text.

    Args:
        vocab_containment_df: DataFrame with 'ChapterNumber', 'SourceText', 'TargetText',
            'Containment', 'SourceVocabSize', 'SharedVocabSize'. The frame is
            copied before the helper 'Direction' column is added, so the
            caller's object is not modified.

    Returns:
        plotly Figure for the vocabulary containment chart, or None if the
        input is None or empty.
    """
    if vocab_containment_df is None or vocab_containment_df.empty:
        return None

    fig = go.Figure()

    # Label each direction: "TextA → TextB" means "% of TextA's vocab in TextB".
    data = vocab_containment_df.copy()
    data["Direction"] = data["SourceText"] + " → " + data["TargetText"]

    # One colour per direction, cycling through Plotly's default qualitative palette.
    palette = px.colors.qualitative.Plotly
    for i, direction in enumerate(sorted(data["Direction"].unique())):
        dir_df = data[data["Direction"] == direction].sort_values("ChapterNumber")
        fig.add_trace(
            go.Bar(
                x=dir_df["ChapterNumber"],
                y=dir_df["Containment"],
                name=direction,
                marker_color=palette[i % len(palette)],
                text=[f"{v:.1f}%" for v in dir_df["Containment"]],
                textposition="auto",
                # customdata order: [0] source vocab size, [1] shared vocab
                # size, [2] source text, [3] target text.
                customdata=dir_df[
                    ["SourceVocabSize", "SharedVocabSize", "SourceText", "TargetText"]
                ].values,
                # Plotly hover text uses <br> for line breaks.
                hovertemplate=(
                    "Chapter %{x}<br>"
                    "%{customdata[2]} vocabulary in %{customdata[3]}: %{y:.1f}%<br>"
                    "Unique words in source: %{customdata[0]}<br>"
                    "Shared words: %{customdata[1]}"
                ),
            )
        )

    fig.update_layout(
        title_text="Vocabulary Containment per Chapter",
        xaxis_title="Chapter Number",
        yaxis_title="Vocabulary Containment (%)",
        barmode="group",
        font=dict(size=14),
        legend_title_text="Direction (Source → Target)",
        xaxis=dict(
            type="category",
            automargin=True,
        ),
        yaxis=dict(
            rangemode="tozero",
            automargin=True,
            range=[0, 105],  # Slightly above 100% for visual clarity
        ),
        autosize=True,
        margin=dict(l=80, r=50, b=100, t=60, pad=4),
        height=450,
    )

    # Reference line at 100%.
    fig.add_hline(
        y=100,
        line_dash="dash",
        line_color="gray",
        annotation_text="100%",
        annotation_position="right",
    )

    # Show a tick for every chapter number present in the data.
    chapters = sorted(data["ChapterNumber"].unique())
    fig.update_xaxes(
        tickmode="array",
        tickvals=chapters,
        ticktext=[str(ch) for ch in chapters],
    )
    return fig