File size: 10,483 Bytes
4bf5701
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75e8f38
b4c92f5
 
75e8f38
 
b4c92f5
4bf5701
 
 
 
75e8f38
b4c92f5
 
 
 
75e8f38
b4c92f5
 
 
75e8f38
b4c92f5
 
75e8f38
4bf5701
 
b4c92f5
4bf5701
 
 
b4c92f5
 
 
4bf5701
 
 
 
 
 
 
 
 
2c93726
 
 
 
 
 
4bf5701
2c93726
4bf5701
 
bda2b5b
2c93726
 
 
e30d4c0
2c93726
 
e30d4c0
 
2c93726
e30d4c0
4bf5701
2c93726
 
4bf5701
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3011301
 
 
 
 
e30d4c0
 
3011301
 
e30d4c0
 
 
4bf5701
 
 
 
 
 
 
 
 
 
ee7fa4f
 
b8b2303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
import plotly.graph_objects as go
import pandas as pd
import plotly.express as px  # For color palettes
import numpy as np  # Ensure numpy is imported, in case pivot_table uses it for aggfunc


def generate_visualizations(metrics_df: pd.DataFrame, descriptive_titles: dict = None):
    """
    Generate one heatmap per similarity metric (chapters x text pairs).

    Args:
        metrics_df: DataFrame with similarity metrics (segment-level). It must
            contain the columns 'Text Pair' and 'Chapter'. Every other numeric
            column is treated as a metric, as is any column whose name contains
            'Pattern Similarity' (whatever its dtype).
        descriptive_titles: Optional mapping of {metric_name: plot title}.
            Metrics missing from the mapping (or every metric, when the mapping
            is None or empty) are titled with the metric column name.
    Returns:
        heatmaps: dict of {metric_name: plotly Figure} for each metric. The
            value is None for a metric that has no plottable (non-NaN) values.
    Raises:
        ValueError: propagated from DataFrame.pivot when the same
            (Chapter, Text Pair) combination occurs more than once.
    """

    # Identify all numeric metric columns (exclude 'Text Pair' and 'Chapter').
    # is_numeric_dtype is used instead of `dtype != object` so that string,
    # datetime or categorical columns are never mistaken for metrics.
    metric_cols = [
        col
        for col in metrics_df.columns
        if col not in ("Text Pair", "Chapter")
        and pd.api.types.is_numeric_dtype(metrics_df[col])
    ]
    # Pattern-similarity columns are always plotted, even when they arrive with
    # a non-numeric dtype (their values are coerced to numbers below).
    for col in metrics_df.columns:
        if "Pattern Similarity" in col and col not in metric_cols:
            metric_cols.append(col)

    # --- Heatmaps for each metric ---
    heatmaps = {}
    for metric in metric_cols:
        # Coerce to numbers so that object-typed columns (e.g. floats mixed
        # with None or stray text) cannot break number formatting or the
        # colour-range computation; unparseable entries become NaN.
        metric_values = pd.to_numeric(metrics_df[metric], errors="coerce")

        # Nothing to plot when every value for this metric is NaN
        if metric_values.isnull().all():
            heatmaps[metric] = None
            continue  # Move to the next metric

        # Work on a narrow copy so the caller's DataFrame is never modified.
        frame = metrics_df[["Chapter", "Text Pair"]].copy()
        frame[metric] = metric_values.to_numpy()
        pivot = frame.pivot(index="Chapter", columns="Text Pair", values=metric)
        # Rows are ordered by descending chapter; the y-axis is additionally
        # drawn reversed further down (autorange="reversed").
        # NOTE(review): for a numeric Chapter axis the reversed autorange is
        # what places chapter 1 at the top - confirm the resulting orientation
        # for non-numeric chapter labels.
        pivot = pivot.sort_index(ascending=False)
        # Additional check: pivot may be empty or all-NaN after reshaping
        if pivot.empty or pivot.isnull().all().all():
            heatmaps[metric] = None
            continue

        # Drop the ".txt" extension from the text-pair labels on the x-axis
        cleaned_columns = [str(col).replace(".txt", "") for col in pivot.columns]

        # For consistent interpretation: higher values (more similarity) = darker colors
        # Using 'Reds' colormap for all metrics (dark red = high similarity)
        cmap = "Reds"

        viz_values = pivot.values

        # Cell annotations: two decimals, blank for missing cells
        text = [
            [f"{val:.2f}" if pd.notnull(val) else "" for val in row]
            for row in viz_values
        ]

        fig = go.Figure(
            data=go.Heatmap(
                z=viz_values,
                x=cleaned_columns,
                y=pivot.index,
                colorscale=cmap,
                reversescale=False,  # Same scale direction for all metrics
                # Colour range spans the observed (non-NaN) values of this metric
                zmin=float(np.nanmin(viz_values)),
                zmax=float(np.nanmax(viz_values)),
                text=text,
                texttemplate="%{text}",
                hovertemplate="Chapter %{y}<br>Text Pair: %{x}<br>Value: %{z:.2f}<extra></extra>",
                colorbar=dict(title=metric, thickness=20, tickfont=dict(size=14)),
            )
        )
        plot_title = (
            descriptive_titles.get(metric, metric) if descriptive_titles else metric
        )
        # Calculate dynamic height based on number of rows:
        # minimum 500px, scale with data size, max 900px
        num_rows = len(pivot.index)
        dynamic_height = min(900, max(500, 80 + num_rows * 35))

        fig.update_layout(
            title=dict(text=plot_title, font=dict(size=18)),
            xaxis_title="Text Pair",
            yaxis_title="Chapter",
            autosize=True,
            height=dynamic_height,
            font=dict(size=16),
            margin=dict(l=80, b=120, t=60, r=30),
            xaxis=dict(
                automargin=True,
                side="bottom"
            ),
            yaxis=dict(
                automargin=True,
            )
        )
        fig.update_xaxes(tickangle=45, tickfont=dict(size=14))
        fig.update_yaxes(tickfont=dict(size=14), autorange="reversed")
        # Ensure every chapter number gets its own tick when the axis is numeric
        if pd.api.types.is_numeric_dtype(pivot.index):
            fig.update_yaxes(
                tickmode="array",
                tickvals=pivot.index,
                ticktext=[str(i) for i in pivot.index],
            )
        heatmaps[metric] = fig

    return heatmaps


def generate_word_count_chart(word_counts_df: pd.DataFrame):
    """
    Generates a grouped bar chart of word counts per segment (file/chapter).

    One bar trace is added per distinct 'Filename'; within a trace the bars are
    ordered by 'ChapterNumber'.

    Args:
        word_counts_df: DataFrame with 'Filename', 'ChapterNumber', 'SegmentID', 'WordCount'.
            Only 'Filename', 'ChapterNumber' and 'WordCount' are read here.
    Returns:
        plotly Figure for the bar chart, or None if input is None or empty.
    """
    # Treat a missing frame like an empty one, matching the guard used by
    # generate_vocab_containment_chart.
    if word_counts_df is None or word_counts_df.empty:
        return None

    fig = go.Figure()

    # Assign colors based on Filename; the palette is cycled when there are
    # more files than palette entries.
    unique_files = sorted(word_counts_df["Filename"].unique())
    colors = px.colors.qualitative.Plotly  # Get a default Plotly color sequence

    for i, filename in enumerate(unique_files):
        file_df = word_counts_df[word_counts_df["Filename"] == filename].sort_values(
            "ChapterNumber"
        )
        fig.add_trace(
            go.Bar(
                x=file_df["ChapterNumber"],
                y=file_df["WordCount"],
                name=filename,
                marker_color=colors[i % len(colors)],
                text=file_df["WordCount"],
                textposition="auto",
                customdata=file_df[["Filename"]],  # Pass Filename for hovertemplate
                hovertemplate="<b>File</b>: %{customdata[0]}<br>"
                + "<b>Chapter</b>: %{x}<br>"
                + "<b>Word Count</b>: %{y}<extra></extra>",
            )
        )

    fig.update_layout(
        title_text="Word Counts per Chapter (Grouped by File)",
        xaxis_title="Chapter Number",
        yaxis_title="Word Count",
        barmode="group",
        font=dict(size=14),
        legend_title_text="Filename",
        xaxis=dict(
            type="category",  # Treat chapter numbers as categories
            automargin=True   # Automatically adjust margin for x-axis labels/title
        ),
        yaxis=dict(
            rangemode='tozero', # Ensure y-axis starts at 0 and includes max value
            automargin=True,   # Automatically adjust margin for y-axis labels/title
            autorange=True     # Ensure automatic range calculation
        ),
        autosize=True,        # Keep for responsiveness in Gradio
        margin=dict(l=80, r=50, b=100, t=50, pad=4), # Keep existing base margins
        height=500,           # Set a fixed height for better visibility
        width=800             # Set a reasonable width
    )
    # Ensure x-axis ticks are shown for all chapter numbers present
    all_chapter_numbers = sorted(word_counts_df["ChapterNumber"].unique())
    fig.update_xaxes(
        tickmode="array",
        tickvals=all_chapter_numbers,
        ticktext=[str(ch) for ch in all_chapter_numbers],
    )

    return fig


def generate_vocab_containment_chart(vocab_containment_df: pd.DataFrame):
    """
    Generates a grouped bar chart showing vocabulary containment per chapter.
    Shows what percentage of each text's unique vocabulary appears in the other text.

    One bar trace is added per direction ("SourceText → TargetText"); within a
    trace the bars are ordered by 'ChapterNumber'.

    Args:
        vocab_containment_df: DataFrame with 'ChapterNumber', 'SourceText', 'TargetText',
                              'Containment', 'SourceVocabSize', 'SharedVocabSize'.
                              'Containment' is displayed as a percentage.
                              The input frame is not modified.
    Returns:
        plotly Figure for the vocabulary containment chart, or None if input is
        None or empty.
    """
    if vocab_containment_df is None or vocab_containment_df.empty:
        return None

    fig = go.Figure()

    # Create a label for each direction: "TextA → TextB" means "% of TextA's vocab in TextB".
    # Work on a copy so the helper column never leaks into the caller's frame.
    vocab_containment_df = vocab_containment_df.copy()
    vocab_containment_df["Direction"] = (
        vocab_containment_df["SourceText"] + " → " + vocab_containment_df["TargetText"]
    )

    # Get unique directions and assign colors (palette is cycled if needed)
    unique_directions = sorted(vocab_containment_df["Direction"].unique())
    colors = px.colors.qualitative.Plotly

    for i, direction in enumerate(unique_directions):
        dir_df = vocab_containment_df[vocab_containment_df["Direction"] == direction].sort_values(
            "ChapterNumber"
        )
        fig.add_trace(
            go.Bar(
                x=dir_df["ChapterNumber"],
                y=dir_df["Containment"],
                name=direction,
                marker_color=colors[i % len(colors)],
                text=[f"{v:.1f}%" for v in dir_df["Containment"]],
                textposition="auto",
                # Column order here fixes the customdata[...] indices used in the hovertemplate
                customdata=dir_df[["SourceVocabSize", "SharedVocabSize", "SourceText", "TargetText"]].values,
                hovertemplate=(
                    "<b>Chapter %{x}</b><br>"
                    + "<b>%{customdata[2]}</b> vocabulary in <b>%{customdata[3]}</b>: %{y:.1f}%<br>"
                    + "Unique words in source: %{customdata[0]}<br>"
                    + "Shared words: %{customdata[1]}<extra></extra>"
                ),
            )
        )

    fig.update_layout(
        title_text="Vocabulary Containment per Chapter",
        xaxis_title="Chapter Number",
        yaxis_title="Vocabulary Containment (%)",
        barmode="group",
        font=dict(size=14),
        legend_title_text="Direction (Source → Target)",
        xaxis=dict(
            type="category",
            automargin=True
        ),
        yaxis=dict(
            rangemode='tozero',
            automargin=True,
            range=[0, 105],  # Slightly above 100% for visual clarity
        ),
        autosize=True,
        margin=dict(l=80, r=50, b=100, t=60, pad=4),
        height=450,
    )

    # Add a reference line at 100%
    fig.add_hline(
        y=100,
        line_dash="dash",
        line_color="gray",
        annotation_text="100%",
        annotation_position="right"
    )

    # Ensure x-axis ticks are shown for all chapter numbers
    chapters = sorted(vocab_containment_df["ChapterNumber"].unique())
    fig.update_xaxes(
        tickmode="array",
        tickvals=chapters,
        ticktext=[str(ch) for ch in chapters],
    )

    return fig