# Source: Hugging Face Space "daniel-wojahn/ttm-webapp-hf"
# (Space status at capture: Sleeping; file size: 10,483 bytes)

import plotly.graph_objects as go
import pandas as pd
import plotly.express as px  # For color palettes
import numpy as np  # Ensure numpy is imported, in case pivot_table uses it for aggfunc


def generate_visualizations(metrics_df: pd.DataFrame, descriptive_titles: dict = None):
    """
    Generate one heatmap per metric column of ``metrics_df``.

    Each metric is pivoted into a Chapter x Text Pair grid and rendered as a
    Plotly heatmap with the "Reds" colorscale; the value of every cell is
    printed inside it with two decimals.

    Args:
        metrics_df: DataFrame with 'Chapter' and 'Text Pair' columns plus one
            column per similarity metric (segment-level). Every other column
            with a numeric dtype is treated as a metric. Columns whose name
            contains "Pattern Similarity" are always treated as metrics; if
            such a column is not numeric, its values are coerced to numbers
            and anything unparseable becomes NaN.
        descriptive_titles: Optional mapping {metric_name: plot title}. The
            metric name itself is used as the title when the mapping is
            missing or has no entry for a metric.
    Returns:
        heatmaps: dict of {metric_name: plotly Figure} for each metric. The
            value is None for a metric that has no non-NaN data to plot.
    Raises:
        ValueError: propagated from ``DataFrame.pivot`` when the same
            (Chapter, Text Pair) combination occurs more than once.
    """
    is_numeric = pd.api.types.is_numeric_dtype

    # Identify all numeric metric columns (exclude 'Text Pair' and 'Chapter').
    metric_cols = [
        col
        for col in metrics_df.columns
        if col not in ("Text Pair", "Chapter") and is_numeric(metrics_df[col])
    ]
    # Pattern-similarity columns are always plotted, even if they arrived with
    # a non-numeric dtype (e.g. object because of None placeholders).
    for col in metrics_df.columns:
        if "Pattern Similarity" in str(col) and col not in metric_cols:
            metric_cols.append(col)

    # --- Heatmaps for each metric ---
    heatmaps = {}
    for metric in metric_cols:
        values = metrics_df[metric]
        if not is_numeric(values):
            # Without this, string/None cells would break the ".2f" formatting
            # and the nanmin/nanmax calls below.
            values = pd.to_numeric(values, errors="coerce")

        # Nothing to draw if every value of this metric is NaN.
        if values.isnull().all():
            heatmaps[metric] = None
            continue

        # Pivot a small frame holding the (possibly coerced) values so the
        # caller's DataFrame is never modified. to_numpy() keeps the
        # assignment positional, independent of the index.
        frame = metrics_df[["Chapter", "Text Pair"]].copy()
        frame[metric] = values.to_numpy()
        pivot = frame.pivot(index="Chapter", columns="Text Pair", values=metric)
        # Rows are sorted by chapter in descending order; the y-axis itself is
        # additionally drawn reversed further down (autorange="reversed").
        pivot = pivot.sort_index(ascending=False)
        # Additional check: the pivot can still be empty or all-NaN
        # (e.g. due to single chapter comparisons).
        if pivot.empty or pivot.isnull().all().all():
            heatmaps[metric] = None
            continue

        # Drop the ".txt" extension from the text-pair labels on the x-axis.
        cleaned_columns = [str(col).replace(".txt", "") for col in pivot.columns]

        # Same colormap and direction for all metrics, so colors are
        # interpreted consistently: higher value (more similarity) = darker.
        cmap = "Reds"

        # Cell labels: two decimals, blank for missing combinations.
        text = [
            [f"{val:.2f}" if pd.notnull(val) else "" for val in row]
            for row in pivot.values
        ]

        viz_values = pivot.values.copy()

        fig = go.Figure(
            data=go.Heatmap(
                z=viz_values,
                x=cleaned_columns,
                y=pivot.index,
                colorscale=cmap,
                reversescale=False,  # Same scale direction for all metrics
                # Color range spans the observed data, ignoring NaN cells.
                zmin=float(np.nanmin(viz_values)),
                zmax=float(np.nanmax(viz_values)),
                text=text,
                texttemplate="%{text}",
                hovertemplate="Chapter %{y}<br>Text Pair: %{x}<br>Value: %{z:.2f}<extra></extra>",
                colorbar=dict(title=metric, thickness=20, tickfont=dict(size=14)),
            )
        )
        plot_title = (
            descriptive_titles.get(metric, metric) if descriptive_titles else metric
        )
        # Dynamic height based on number of rows:
        # minimum 500px, scale with data size, max 900px.
        num_rows = len(pivot.index)
        dynamic_height = min(900, max(500, 80 + num_rows * 35))

        fig.update_layout(
            title=dict(text=plot_title, font=dict(size=18)),
            xaxis_title="Text Pair",
            yaxis_title="Chapter",
            autosize=True,
            height=dynamic_height,
            font=dict(size=16),
            margin=dict(l=80, b=120, t=60, r=30),
            xaxis=dict(
                automargin=True,
                side="bottom"
            ),
            yaxis=dict(
                automargin=True,
            )
        )
        fig.update_xaxes(tickangle=45, tickfont=dict(size=14))
        fig.update_yaxes(tickfont=dict(size=14), autorange="reversed")
        # Ensure every chapter number gets its own tick when the axis is numeric.
        if is_numeric(pivot.index):
            fig.update_yaxes(
                tickmode="array",
                tickvals=pivot.index,
                ticktext=[str(i) for i in pivot.index],
            )
        heatmaps[metric] = fig

    return heatmaps


def generate_word_count_chart(word_counts_df: pd.DataFrame):
    """
    Generates a grouped bar chart for word counts per segment (file/chapter).

    One bar trace is added per distinct 'Filename'; within a trace the bars
    are ordered by 'ChapterNumber' and labelled with their word count.

    Args:
        word_counts_df: DataFrame with 'Filename', 'ChapterNumber', 'SegmentID', 'WordCount'.
            Only 'Filename', 'ChapterNumber' and 'WordCount' are read here.
    Returns:
        plotly Figure for the bar chart, or None if input is None or empty.
    """
    # Mirror generate_vocab_containment_chart: a missing frame means "no chart"
    # rather than an AttributeError.
    if word_counts_df is None or word_counts_df.empty:
        return None

    fig = go.Figure()

    # Assign colors based on Filename; the palette is cycled if there are
    # more files than colors.
    unique_files = sorted(word_counts_df["Filename"].unique())
    colors = px.colors.qualitative.Plotly  # Default Plotly color sequence

    for i, filename in enumerate(unique_files):
        file_df = word_counts_df[word_counts_df["Filename"] == filename].sort_values(
            "ChapterNumber"
        )
        fig.add_trace(
            go.Bar(
                x=file_df["ChapterNumber"],
                y=file_df["WordCount"],
                name=filename,
                marker_color=colors[i % len(colors)],
                text=file_df["WordCount"],
                textposition="auto",
                customdata=file_df[["Filename"]],  # Pass Filename for hovertemplate
                hovertemplate="<b>File</b>: %{customdata[0]}<br>"
                + "<b>Chapter</b>: %{x}<br>"
                + "<b>Word Count</b>: %{y}<extra></extra>",
            )
        )

    fig.update_layout(
        title_text="Word Counts per Chapter (Grouped by File)",
        xaxis_title="Chapter Number",
        yaxis_title="Word Count",
        barmode="group",
        font=dict(size=14),
        legend_title_text="Filename",
        xaxis=dict(
            type="category",  # Treat chapter numbers as categories
            automargin=True,  # Automatically adjust margin for x-axis labels/title
        ),
        yaxis=dict(
            rangemode="tozero",  # Ensure y-axis starts at 0 and includes max value
            automargin=True,  # Automatically adjust margin for y-axis labels/title
            autorange=True,  # Ensure automatic range calculation
        ),
        autosize=True,
        margin=dict(l=80, r=50, b=100, t=50, pad=4),  # Base margins
        height=500,  # Fixed height for better visibility
        width=800,  # Fixed width
    )
    # Ensure x-axis ticks are shown for all chapter numbers present
    all_chapter_numbers = sorted(word_counts_df["ChapterNumber"].unique())
    fig.update_xaxes(
        tickmode="array",
        tickvals=all_chapter_numbers,
        ticktext=[str(ch) for ch in all_chapter_numbers],
    )

    return fig


def generate_vocab_containment_chart(vocab_containment_df: pd.DataFrame):
    """
    Build a grouped bar chart of vocabulary containment per chapter.

    Every (SourceText, TargetText) combination becomes its own bar series,
    labelled "Source → Target"; the bar height is the 'Containment' value,
    displayed as a percentage with one decimal.

    Args:
        vocab_containment_df: DataFrame with 'ChapterNumber', 'SourceText', 'TargetText',
                              'Containment', 'SourceVocabSize', 'SharedVocabSize'.
    Returns:
        plotly Figure for the vocabulary containment chart, or None if the
        input is None or empty.
    """
    if vocab_containment_df is None:
        return None
    if vocab_containment_df.empty:
        return None

    chart = go.Figure()

    # Work on a copy so the added helper column never leaks to the caller.
    # "TextA → TextB" reads as "% of TextA's vocab found in TextB".
    data = vocab_containment_df.copy()
    data["Direction"] = data["SourceText"] + " → " + data["TargetText"]

    palette = px.colors.qualitative.Plotly
    palette_size = len(palette)
    hover_columns = ["SourceVocabSize", "SharedVocabSize", "SourceText", "TargetText"]
    # customdata indices follow the order of hover_columns.
    hover_text = (
        "<b>Chapter %{x}</b><br>"
        "<b>%{customdata[2]}</b> vocabulary in <b>%{customdata[3]}</b>: %{y:.1f}%<br>"
        "Unique words in source: %{customdata[0]}<br>"
        "Shared words: %{customdata[1]}<extra></extra>"
    )

    # One trace per direction, in alphabetical order, colors cycling.
    for slot, label in enumerate(sorted(data["Direction"].unique())):
        subset = data[data["Direction"] == label].sort_values("ChapterNumber")
        bar_labels = [format(pct, ".1f") + "%" for pct in subset["Containment"]]
        bars = go.Bar(
            x=subset["ChapterNumber"],
            y=subset["Containment"],
            name=label,
            marker_color=palette[slot % palette_size],
            text=bar_labels,
            textposition="auto",
            customdata=subset[hover_columns].values,
            hovertemplate=hover_text,
        )
        chart.add_trace(bars)

    x_axis = dict(type="category", automargin=True)
    y_axis = dict(
        rangemode="tozero",
        automargin=True,
        range=[0, 105],  # A little headroom above 100% for visual clarity
    )
    chart.update_layout(
        title_text="Vocabulary Containment per Chapter",
        xaxis_title="Chapter Number",
        yaxis_title="Vocabulary Containment (%)",
        barmode="group",
        font=dict(size=14),
        legend_title_text="Direction (Source → Target)",
        xaxis=x_axis,
        yaxis=y_axis,
        autosize=True,
        margin=dict(l=80, r=50, b=100, t=60, pad=4),
        height=450,
    )

    # Dashed reference line marking 100%.
    chart.add_hline(
        y=100,
        line_dash="dash",
        line_color="gray",
        annotation_text="100%",
        annotation_position="right",
    )

    # Show a tick for every chapter number that occurs in the data.
    chapter_ticks = sorted(data["ChapterNumber"].unique())
    chart.update_xaxes(
        tickmode="array",
        tickvals=chapter_ticks,
        ticktext=list(map(str, chapter_ticks)),
    )

    return chart