from typing import Optional

import numpy as np  # NaN-aware min/max for the heatmap colour range
import pandas as pd
import plotly.express as px  # For color palettes
import plotly.graph_objects as go


def _build_metric_heatmap(pivot: pd.DataFrame, metric: str, title: str) -> go.Figure:
    """Build one annotated heatmap from a Chapter x Text Pair pivot table.

    Args:
        pivot: Pivoted metric values (rows = chapters, columns = text pairs).
            Must contain at least one non-NaN value, since the colour range
            is taken from the NaN-aware min/max of its values.
        metric: Metric name, used as the colorbar title.
        title: Title displayed above the plot.

    Returns:
        The configured plotly Figure.
    """
    # Strip the ".txt" extension from the file names embedded in pair labels.
    pair_labels = [str(col).replace(".txt", "") for col in pivot.columns]

    z_values = pivot.values
    # Per-cell annotation: two decimals, blank for missing values.
    cell_text = [
        [f"{val:.2f}" if pd.notnull(val) else "" for val in row]
        for row in z_values
    ]

    fig = go.Figure(
        data=go.Heatmap(
            z=z_values,
            x=pair_labels,
            y=pivot.index,
            # Same colormap and direction for every metric so that darker
            # red consistently means a higher value.
            colorscale="Reds",
            reversescale=False,
            zmin=float(np.nanmin(z_values)),
            zmax=float(np.nanmax(z_values)),
            text=cell_text,
            texttemplate="%{text}",
            hovertemplate="Chapter %{y}<br>Text Pair: %{x}<br>Value: %{z:.2f}",
            colorbar=dict(title=metric, thickness=20, tickfont=dict(size=14)),
        )
    )

    # Height scales with the number of chapter rows, clamped to 500-900 px.
    dynamic_height = min(900, max(500, 80 + len(pivot.index) * 35))

    fig.update_layout(
        title=dict(text=title, font=dict(size=18)),
        xaxis_title="Text Pair",
        yaxis_title="Chapter",
        autosize=True,
        height=dynamic_height,
        font=dict(size=16),
        margin=dict(l=80, b=120, t=60, r=30),
        xaxis=dict(automargin=True, side="bottom"),
        yaxis=dict(automargin=True),
    )
    fig.update_xaxes(tickangle=45, tickfont=dict(size=14))
    # Reversed autorange inverts the y-axis (intended: Chapter 1 at the top).
    fig.update_yaxes(tickfont=dict(size=14), autorange="reversed")

    # On a numeric axis, force one tick per chapter so none are skipped.
    if pd.api.types.is_numeric_dtype(pivot.index):
        fig.update_yaxes(
            tickmode="array",
            tickvals=pivot.index,
            ticktext=[str(i) for i in pivot.index],
        )

    return fig


def generate_visualizations(
    metrics_df: pd.DataFrame, descriptive_titles: Optional[dict] = None
):
    """
    Generate heatmap visualizations for all metrics.

    Args:
        metrics_df: DataFrame with similarity metrics (segment-level). Must
            contain 'Chapter' and 'Text Pair' columns; every other numeric
            column, plus any column whose name contains "Pattern Similarity",
            is treated as a metric.
        descriptive_titles: Optional mapping of metric column name to the
            title shown on its plot. Metrics without an entry fall back to
            the column name.

    Returns:
        heatmaps: dict of {metric_name: plotly Figure} for each metric. The
            value is None for a metric that has no non-NaN data. An empty
            dict is returned when metrics_df is None.
    """
    if metrics_df is None:
        return {}

    # Identify all numeric metric columns (exclude 'Text Pair' and 'Chapter').
    # is_numeric_dtype is used rather than "dtype != object" so that string,
    # categorical or datetime columns are never mistaken for metrics.
    non_metric_cols = ("Text Pair", "Chapter")
    metric_cols = [
        col
        for col in metrics_df.columns
        if col not in non_metric_cols
        and pd.api.types.is_numeric_dtype(metrics_df[col])
    ]
    # Pattern-similarity columns are always included, even when their dtype
    # is not numeric (appended after the numeric metrics).
    for col in metrics_df.columns:
        if (
            isinstance(col, str)
            and "Pattern Similarity" in col
            and col not in metric_cols
        ):
            metric_cols.append(col)

    # --- Heatmaps for each metric ---
    heatmaps = {}
    for metric in metric_cols:
        # Nothing to plot if every value for this metric is NaN.
        if metrics_df[metric].isnull().all():
            heatmaps[metric] = None
            continue

        pivot = metrics_df.pivot(index="Chapter", columns="Text Pair", values=metric)
        pivot = pivot.sort_index(ascending=False)

        # The pivot can still be empty or all-NaN (e.g. single-chapter
        # comparisons); skip those as well.
        if pivot.empty or pivot.isnull().all().all():
            heatmaps[metric] = None
            continue

        plot_title = (
            descriptive_titles.get(metric, metric) if descriptive_titles else metric
        )
        heatmaps[metric] = _build_metric_heatmap(pivot, metric, plot_title)

    return heatmaps


def generate_word_count_chart(word_counts_df: pd.DataFrame):
    """
    Generates a bar chart for word counts per segment (file/chapter).

    Args:
        word_counts_df: DataFrame with 'Filename', 'ChapterNumber', 'SegmentID', 'WordCount'.

    Returns:
        plotly Figure for the bar chart, or None if input is None or empty.
    """
    # Guard None as well as empty, matching generate_vocab_containment_chart.
    if word_counts_df is None or word_counts_df.empty:
        return None

    fig = go.Figure()

    # One bar trace per file, coloured by cycling through the default
    # Plotly qualitative palette.
    unique_files = sorted(word_counts_df["Filename"].unique())
    colors = px.colors.qualitative.Plotly

    for i, filename in enumerate(unique_files):
        file_df = word_counts_df[word_counts_df["Filename"] == filename].sort_values(
            "ChapterNumber"
        )
        fig.add_trace(
            go.Bar(
                x=file_df["ChapterNumber"],
                y=file_df["WordCount"],
                name=filename,
                marker_color=colors[i % len(colors)],
                text=file_df["WordCount"],
                textposition="auto",
                customdata=file_df[["Filename"]],  # Filename for the hovertemplate
                hovertemplate="File: %{customdata[0]}<br>"
                + "Chapter: %{x}<br>"
                + "Word Count: %{y}",
            )
        )

    fig.update_layout(
        title_text="Word Counts per Chapter (Grouped by File)",
        xaxis_title="Chapter Number",
        yaxis_title="Word Count",
        barmode="group",
        font=dict(size=14),
        legend_title_text="Filename",
        xaxis=dict(
            type="category",  # Treat chapter numbers as categories
            automargin=True,  # Automatically adjust margin for x-axis labels/title
        ),
        yaxis=dict(
            rangemode="tozero",  # Ensure y-axis starts at 0
            automargin=True,  # Automatically adjust margin for y-axis labels/title
            autorange=True,  # Ensure automatic range calculation
        ),
        autosize=True,  # Keep for responsiveness in Gradio
        margin=dict(l=80, r=50, b=100, t=50, pad=4),
        height=500,
        width=800,
    )

    # Ensure x-axis ticks are shown for all chapter numbers present.
    all_chapter_numbers = sorted(word_counts_df["ChapterNumber"].unique())
    fig.update_xaxes(
        tickmode="array",
        tickvals=all_chapter_numbers,
        ticktext=[str(ch) for ch in all_chapter_numbers],
    )

    return fig


def generate_vocab_containment_chart(vocab_containment_df: pd.DataFrame):
    """
    Generates a bar chart showing vocabulary containment per chapter.

    Shows what percentage of each text's unique vocabulary appears in the other text.

    Args:
        vocab_containment_df: DataFrame with 'ChapterNumber', 'SourceText', 'TargetText',
            'Containment', 'SourceVocabSize', 'SharedVocabSize'.

    Returns:
        plotly Figure for the vocabulary containment chart, or None if input is None or empty.
    """
    if vocab_containment_df is None or vocab_containment_df.empty:
        return None

    fig = go.Figure()

    # Work on a copy so the caller's DataFrame does not gain a 'Direction' column.
    # "TextA → TextB" means "% of TextA's vocab in TextB".
    vocab_containment_df = vocab_containment_df.copy()
    vocab_containment_df["Direction"] = (
        vocab_containment_df["SourceText"] + " → " + vocab_containment_df["TargetText"]
    )

    # One bar trace per direction, coloured from the default Plotly palette.
    unique_directions = sorted(vocab_containment_df["Direction"].unique())
    colors = px.colors.qualitative.Plotly

    for i, direction in enumerate(unique_directions):
        dir_df = vocab_containment_df[
            vocab_containment_df["Direction"] == direction
        ].sort_values("ChapterNumber")

        fig.add_trace(
            go.Bar(
                x=dir_df["ChapterNumber"],
                y=dir_df["Containment"],
                name=direction,
                marker_color=colors[i % len(colors)],
                text=[f"{v:.1f}%" for v in dir_df["Containment"]],
                textposition="auto",
                customdata=dir_df[
                    ["SourceVocabSize", "SharedVocabSize", "SourceText", "TargetText"]
                ].values,
                hovertemplate=(
                    "Chapter %{x}<br>"
                    + "%{customdata[2]} vocabulary in %{customdata[3]}: %{y:.1f}%<br>"
                    + "Unique words in source: %{customdata[0]}<br>"
                    + "Shared words: %{customdata[1]}"
                ),
            )
        )

    fig.update_layout(
        title_text="Vocabulary Containment per Chapter",
        xaxis_title="Chapter Number",
        yaxis_title="Vocabulary Containment (%)",
        barmode="group",
        font=dict(size=14),
        legend_title_text="Direction (Source → Target)",
        xaxis=dict(type="category", automargin=True),
        yaxis=dict(
            rangemode="tozero",
            automargin=True,
            range=[0, 105],  # Slightly above 100% for visual clarity
        ),
        autosize=True,
        margin=dict(l=80, r=50, b=100, t=60, pad=4),
        height=450,
    )

    # Add a reference line at 100%.
    fig.add_hline(
        y=100,
        line_dash="dash",
        line_color="gray",
        annotation_text="100%",
        annotation_position="right",
    )

    # Ensure x-axis ticks are shown for all chapter numbers.
    chapters = sorted(vocab_containment_df["ChapterNumber"].unique())
    fig.update_xaxes(
        tickmode="array",
        tickvals=chapters,
        ticktext=[str(ch) for ch in chapters],
    )

    return fig