Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from pathlib import Path | |
| from pipeline.process import process_texts | |
| from pipeline.visualize import generate_visualizations, generate_word_count_chart, generate_vocab_containment_chart | |
| from pipeline.llm_service import LLMService | |
| from pipeline.progressive_ui import ProgressiveUI, create_progressive_callback | |
| import logging | |
| import pandas as pd | |
| from datetime import datetime | |
| from dotenv import load_dotenv | |
| # Load environment variables from .env file | |
| from theme import tibetan_theme | |
| load_dotenv() | |
| logger = logging.getLogger(__name__) | |
| def main_interface(): | |
| # Theme and CSS applied here for Gradio 5.x compatibility | |
| # For Gradio 6.x, these will move to launch() - see migration guide | |
| with gr.Blocks( | |
| theme=tibetan_theme, | |
| css=tibetan_theme.get_css_string(), | |
| title="Tibetan Text Metrics Web App" | |
| ) as demo: | |
| gr.Markdown( | |
| """# Tibetan Text Metrics | |
| <span style='font-size:18px;'>Compare Tibetan texts to discover how similar they are. This tool helps scholars identify shared passages, textual variations, and relationships between different versions of Tibetan manuscripts. Part of the <a href="https://github.com/daniel-wojahn/tibetan-text-metrics" target="_blank">TTM project</a>.</span> | |
| """, | |
| elem_classes="gr-markdown", | |
| ) | |
| with gr.Row(elem_id="steps-row"): | |
| with gr.Column(scale=1, elem_classes="step-column"): | |
| with gr.Group(elem_classes="step-box"): | |
| gr.Markdown( | |
| """ | |
| ## Step 1: Upload Your Texts | |
| <span style='font-size:16px;'>Upload two or more Tibetan text files (.txt format). If your texts have chapters, separate them with the ༈ marker so the tool can compare chapter-by-chapter.</span> | |
| """, | |
| elem_classes="gr-markdown", | |
| ) | |
| file_input = gr.File( | |
| label="Choose your Tibetan text files", | |
| file_types=[".txt"], | |
| file_count="multiple", | |
| ) | |
| gr.Markdown( | |
| "<small>Tip: Files should be under 1MB for best performance. Use UTF-8 encoded .txt files.</small>", | |
| elem_classes="gr-markdown" | |
| ) | |
| with gr.Column(scale=1, elem_classes="step-column"): | |
| with gr.Group(elem_classes="step-box"): | |
| gr.Markdown( | |
| """## Step 2: Choose Analysis Type | |
| <span style='font-size:16px;'>Pick a preset for quick results, or use Custom for full control.</span> | |
| """, | |
| elem_classes="gr-markdown", | |
| ) | |
| with gr.Tabs(): | |
| # ===== QUICK START TAB ===== | |
| with gr.Tab("Quick Start", id="quick_tab"): | |
| analysis_preset = gr.Radio( | |
| label="What kind of analysis do you need?", | |
| choices=[ | |
| "Standard — Vocabulary + Sequences + Fuzzy matching", | |
| "Deep — All metrics including AI meaning analysis", | |
| "Quick — Vocabulary overlap only (fastest)" | |
| ], | |
| value="Standard — Vocabulary + Sequences + Fuzzy matching", | |
| info="Standard is recommended for most users. Deep analysis takes longer but finds texts with similar meaning even when words differ." | |
| ) | |
| gr.Markdown(""" | |
| **What each preset includes:** | |
| | Preset | Jaccard | LCS | Fuzzy | Semantic AI | | |
| |--------|---------|-----|-------|-------------| | |
| | Standard | ✓ | ✓ | ✓ | — | | |
| | Deep | ✓ | ✓ | ✓ | ✓ | | |
| | Quick | ✓ | — | — | — | | |
| """, elem_classes="preset-table") | |
| process_btn_quick = gr.Button( | |
| "Compare My Texts", elem_id="run-btn-quick", variant="primary" | |
| ) | |
| # ===== CUSTOM TAB ===== | |
| with gr.Tab("Custom", id="custom_tab"): | |
| gr.Markdown("**Fine-tune each metric and option:**", elem_classes="custom-header") | |
| with gr.Accordion("Lexical Metrics", open=True): | |
| gr.Markdown("*Compare the actual words used in texts*") | |
| tokenization_mode_dropdown = gr.Dropdown( | |
| label="How to split text?", | |
| choices=[ | |
| "word - Whole words (recommended)", | |
| "syllable - Individual syllables (finer detail)" | |
| ], | |
| value="word - Whole words (recommended)", | |
| info="'Word' keeps multi-syllable words together — recommended for Jaccard." | |
| ) | |
| stopwords_dropdown = gr.Dropdown( | |
| label="Filter common words?", | |
| choices=[ | |
| "None (No filtering)", | |
| "Standard (Common particles only)", | |
| "Aggressive (All function words)" | |
| ], | |
| value="Standard (Common particles only)", | |
| info="Remove common particles (གི, ལ, ནི) before comparing." | |
| ) | |
| particle_normalization_checkbox = gr.Checkbox( | |
| label="Normalize grammatical particles?", | |
| value=False, | |
| info="Treat variants as equivalent (གི/ཀྱི/གྱི → གི). Useful for different scribal conventions." | |
| ) | |
| with gr.Accordion("Sequence Matching (LCS)", open=True): | |
| gr.Markdown("*Find shared passages in the same order*") | |
| gr.Checkbox( | |
| label="Enable sequence matching", | |
| value=True, | |
| info="Finds the longest sequence of words appearing in both texts." | |
| ) # LCS is always computed as a core metric | |
| lcs_normalization_dropdown = gr.Dropdown( | |
| label="How to handle different text lengths?", | |
| choices=[ | |
| "avg - Balanced comparison (default)", | |
| "min - Detect if one text contains the other", | |
| "max - Stricter, penalizes length differences" | |
| ], | |
| value="avg - Balanced comparison (default)", | |
| info="'min' is useful for finding quotes or excerpts." | |
| ) | |
| with gr.Accordion("Fuzzy Matching", open=True): | |
| gr.Markdown("*Detect similar but not identical text*") | |
| fuzzy_toggle_radio = gr.Radio( | |
| label="Find approximate matches?", | |
| choices=["Yes", "No"], | |
| value="Yes", | |
| info="Useful for spelling variations and scribal differences." | |
| ) | |
| fuzzy_method_dropdown = gr.Dropdown( | |
| label="Matching method", | |
| choices=[ | |
| "ngram - Syllable pairs (recommended)", | |
| "syllable_edit - Count syllable changes", | |
| "weighted_jaccard - Word frequency comparison" | |
| ], | |
| value="ngram - Syllable pairs (recommended)", | |
| info="All options work at the Tibetan syllable level." | |
| ) | |
| with gr.Accordion("Semantic Analysis", open=False): | |
| gr.Markdown("*Compare meaning using AI (slower)*") | |
| semantic_toggle_radio = gr.Radio( | |
| label="Analyze meaning similarity?", | |
| choices=["Yes", "No"], | |
| value="No", | |
| info="Finds texts that say similar things in different words." | |
| ) | |
| model_dropdown = gr.Dropdown( | |
| choices=[ | |
| "buddhist-nlp/buddhist-sentence-similarity", | |
| "buddhist-nlp/bod-eng-similarity", | |
| "sentence-transformers/LaBSE", | |
| "BAAI/bge-m3" | |
| ], | |
| label="AI Model", | |
| value="buddhist-nlp/buddhist-sentence-similarity", | |
| info="'buddhist-sentence-similarity' works best for Buddhist texts." | |
| ) | |
| batch_size_slider = gr.Slider( | |
| minimum=1, | |
| maximum=64, | |
| value=8, | |
| step=1, | |
| label="Processing batch size", | |
| info="Higher = faster but uses more memory." | |
| ) | |
| progress_bar_checkbox = gr.Checkbox( | |
| label="Show detailed progress", | |
| value=False, | |
| info="See step-by-step progress during analysis." | |
| ) | |
| process_btn_custom = gr.Button( | |
| "Compare My Texts (Custom)", elem_id="run-btn-custom", variant="primary" | |
| ) | |
| # Note: Both process_btn_quick and process_btn_custom are wired below | |
| gr.Markdown( | |
| """## Results | |
| """, | |
| elem_classes="gr-markdown", | |
| ) | |
| # NOTE: heatmap_titles and metric_tooltips are defined further down, just before the "Visual Comparison" tabs are built. | |
| csv_output = gr.File(label="📥 Download Full Results (CSV spreadsheet)") | |
| metrics_preview = gr.Dataframe( | |
| label="Results Summary — Compare chapters across your texts", interactive=False, visible=True | |
| ) | |
| # States for data persistence | |
| state_text_data = gr.State() | |
| state_df_results = gr.State() | |
| # LLM Interpretation components | |
| with gr.Row(): | |
| with gr.Column(): | |
| gr.Markdown( | |
| "## Get Expert Insights\n*Let AI help you understand what the numbers mean and what patterns they reveal about your texts.*", | |
| elem_classes="gr-markdown" | |
| ) | |
| # Add the interpret button | |
| with gr.Row(): | |
| interpret_btn = gr.Button( | |
| "📊 Explain My Results", | |
| variant="primary", | |
| elem_id="interpret-btn" | |
| ) | |
| # Create a placeholder message with proper formatting and structure | |
| initial_message = """ | |
| ## Understanding Your Results | |
| <small>*After running the analysis, click "Explain My Results" to get a plain-language interpretation of what the similarity scores mean for your texts.*</small> | |
| """ | |
| interpretation_output = gr.Markdown( | |
| value=initial_message, | |
| elem_id="llm-analysis" | |
| ) | |
| # Heatmap tabs for each metric | |
| heatmap_titles = { | |
| "Jaccard Similarity (%)": "Shows how much vocabulary the texts share. Higher = more words in common.", | |
| "Normalized LCS": "Shows shared phrases in the same order. Higher = more passages appear in both texts.", | |
| "Fuzzy Similarity": "Finds similar text even with spelling differences. Higher = more alike.", | |
| "Semantic Similarity": "Compares actual meaning using AI. Higher = texts say similar things.", | |
| "Word Counts": "How long is each section? Helps you understand text structure.", | |
| "Vocabulary Containment": "What % of one text's vocabulary appears in the other?", | |
| } | |
| metric_tooltips = { | |
| "Jaccard Similarity (%)": """ | |
| ### Vocabulary Overlap (Jaccard Similarity) | |
| **What it measures:** How many unique words appear in both texts. | |
| **How to read it:** A score of 70% means 70% of all unique words found in either text appear in both. Higher scores = more shared vocabulary. | |
| **What it tells you:** | |
| - High scores (>70%): Texts use very similar vocabulary — possibly the same source or direct copying | |
| - Medium scores (40-70%): Texts share significant vocabulary — likely related topics or traditions | |
| - Low scores (<40%): Texts use different words — different sources or heavily edited versions | |
| **Good to know:** This metric ignores word order and how often words repeat. It only asks "does this word appear in both texts?" | |
| **Tips:** | |
| - Use the "Filter common words" option to focus on meaningful content words rather than grammatical particles. | |
| - **Word mode is recommended** for Jaccard. Syllable mode may inflate scores because common syllables (like ས, ར, ན) appear in many different words. | |
| """, | |
| "Fuzzy Similarity": """ | |
| ### Approximate Matching (Fuzzy Similarity) | |
| **What it measures:** How similar texts are, even when they're not exactly the same. | |
| **How to read it:** Scores from 0 to 1. Higher = more similar. A score of 0.85 means the texts are 85% alike. | |
| **What it tells you:** | |
| - High scores (>0.8): Very similar texts with minor differences (spelling, small edits) | |
| - Medium scores (0.5-0.8): Noticeably different but clearly related | |
| - Low scores (<0.5): Substantially different texts | |
| **Why it matters for Tibetan texts:** | |
| - Catches spelling variations between manuscripts | |
| - Finds scribal differences and regional conventions | |
| - Identifies passages that were slightly modified | |
| **Recommended methods:** | |
| - **Syllable pairs (ngram)**: Best for Tibetan — compares pairs of syllables | |
| - **Count syllable changes**: Good for finding minor edits | |
| - **Word frequency**: Useful when certain words repeat often | |
| """, | |
| "Normalized LCS": """ | |
| ### Shared Sequences (Longest Common Subsequence) | |
| **What it measures:** The longest chain of words that appears in both texts *in the same order*. | |
| **How to read it:** Higher scores mean longer shared passages. A score of 0.6 means 60% of the text follows the same word sequence. | |
| **Example:** If Text A says "the quick brown fox" and Text B says "the lazy brown dog", the shared sequence is "the brown" — words that appear in both, in the same order. | |
| **What it tells you:** | |
| - High scores (>0.6): Texts share substantial passages — likely direct copying or common source | |
| - Medium scores (0.3-0.6): Some shared phrasing — possibly related traditions | |
| - Low scores (<0.3): Different word ordering — independent compositions or heavy editing | |
| **Why this is different from vocabulary overlap:** | |
| - Vocabulary overlap asks: "Do they use the same words?" | |
| - Sequence matching asks: "Do they say things in the same order?" | |
| Two texts might share many words (high Jaccard) but arrange them differently (low LCS), suggesting they discuss similar topics but were composed independently. | |
| """, | |
| "Semantic Similarity": """ | |
| ### Meaning Similarity (Semantic Analysis) | |
| **What it measures:** Whether texts convey similar *meaning*, even if they use different words. | |
| **How to read it:** Scores from 0 to 1. Higher = more similar meaning. A score of 0.8 means the texts express very similar ideas. | |
| **What it tells you:** | |
| - High scores (>0.75): Texts say similar things, even if worded differently | |
| - Medium scores (0.5-0.75): Related topics or themes | |
| - Low scores (<0.5): Different subject matter | |
| **How it works:** An AI model (trained on Buddhist texts) reads both passages and judges how similar their meaning is. This catches similarities that word-matching would miss. | |
| **When to use it:** | |
| - Finding paraphrased passages | |
| - Identifying texts that discuss the same concepts differently | |
| - Comparing translations or commentaries | |
| **Note:** This takes longer to compute but provides insights the other metrics can't. | |
| """, | |
| "Word Counts": """ | |
| ### Text Length by Section | |
| **What it shows:** How many words are in each chapter or section of your texts. | |
| **How to read it:** Taller bars = longer sections. Compare bars to see which parts of your texts are longer or shorter. | |
| **What it tells you:** | |
| - Similar bar heights across texts suggest similar structure | |
| - Very different lengths might explain why similarity scores vary | |
| - Helps identify which sections to examine more closely | |
| **Tip:** If one text has much longer chapters, it might contain additional material not in the other version. | |
| """, | |
| "Vocabulary Containment": """ | |
| ### Vocabulary Containment (Directional) | |
| **What it shows:** What percentage of one text's unique vocabulary appears in the other text. | |
| **How to read it:** | |
| - "Text A → Text B" means: "What % of Text A's vocabulary is found in Text B?" | |
| - 90% means 90% of the unique words in the source text also appear in the target text | |
| **What it tells you:** | |
| - If Text A → Text B is 95% but Text B → Text A is 60%, then Text B contains almost all of Text A's vocabulary plus additional words | |
| - This suggests Text B might be an expansion or commentary on Text A | |
| - Asymmetric containment often indicates a base text + commentary relationship | |
| **Useful for:** | |
| - Identifying which text is the "base" (shorter vocabulary fully contained in longer text) | |
| - Understanding directionality of textual relationships | |
| - Distinguishing between shared sources vs. one text derived from another | |
| **Tip:** Unlike Jaccard (which is symmetric), containment is directional — it tells you which text's vocabulary is "inside" the other. | |
| """, | |
| "Structural Analysis": """ | |
| ### How Texts Relate to Each Other | |
| **What it shows:** An overview of how your text sections connect and relate across documents. | |
| **What it tells you:** | |
| - Which sections are most similar to each other | |
| - Possible patterns of copying or shared sources | |
| - How texts might have evolved or been edited over time | |
| **Useful for:** | |
| - Understanding textual transmission history | |
| - Identifying which version might be older or more original | |
| - Finding sections that were added, removed, or modified | |
| **Note:** This analysis combines all the other metrics to give you the big picture. | |
| """ | |
| } | |
# Plot components for the four heatmap metrics, keyed by metric name.
heatmap_tabs = {}
gr.Markdown("## Visual Comparison", elem_classes="gr-markdown")
# Metric-specific CSS suffix that is appended to the shared accordion class.
info_css_suffix = {
    "Jaccard Similarity (%)": "jaccard-info",
    "Normalized LCS": "lcs-info",
    "Fuzzy Similarity": "fuzzy-info",
    "Semantic Similarity": "semantic-info",
    "Word Counts": "wordcount-info",
    "Vocabulary Containment": "vocabcontain-info",
}
with gr.Tabs(elem_id="heatmap-tab-group"):
    # One tab per heatmap_titles entry, in dictionary order.
    for metric_key in heatmap_titles:
        with gr.Tab(metric_key):
            suffix = info_css_suffix.get(metric_key)
            if suffix is None:
                # Fallback for a metric without a dedicated CSS class.
                css_class = "metric-info-accordion"
                accordion_title = f"ℹ️ About {metric_key}"
            else:
                css_class = f"metric-info-accordion {suffix}"
                accordion_title = "ℹ️ What does this mean?"
            is_word_counts = metric_key == "Word Counts"
            # Collapsed help text shown above the plot.
            with gr.Accordion(accordion_title, open=False, elem_classes=css_class):
                if is_word_counts:
                    # Word Counts uses this short text rather than its metric_tooltips entry.
                    gr.Markdown(
                        "\n"
                        "### Text Length by Section\n"
                        "This chart shows how many words are in each chapter or section. Taller bars = longer sections.\n"
                        "**Why it matters:** If sections have very different lengths, it might explain differences in similarity scores.\n"
                    )
                elif metric_key in metric_tooltips:
                    gr.Markdown(value=metric_tooltips[metric_key], elem_classes="metric-description")
                else:
                    gr.Markdown(value=f"### {metric_key}\nDescription not found.")
            # The two chart tabs get dedicated plot variables; every other tab is a heatmap.
            if is_word_counts:
                word_count_plot = gr.Plot(label="Word Counts per Segment", show_label=False, scale=1, elem_classes="metric-description")
            elif metric_key == "Vocabulary Containment":
                vocab_containment_plot = gr.Plot(label="Vocabulary Containment per Chapter", show_label=False, scale=1, elem_classes="metric-description")
            else:
                heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False, elem_classes="metric-heatmap")
| # Structural Analysis tab: removed from this app - see the dedicated collation app. | |
| # (Earlier notes here about creating an extra plot object and exposing it as an output no longer apply.) | |
| warning_box = gr.Markdown(visible=False) | |
| # Create a container for metric progress indicators | |
| with gr.Row(visible=False) as progress_container: | |
| # Progress indicators will be created dynamically by ProgressiveUI | |
| gr.Markdown("Metric progress will appear here during analysis") | |
def run_pipeline(files, enable_semantic, enable_fuzzy, fuzzy_method, lcs_normalization, model_name, tokenization_mode, stopwords_option, normalize_particles, batch_size, show_progress, progress=gr.Progress()):
    """Process uploaded files, compute metrics, build charts and prepare all UI outputs.

    Args:
        files: Uploaded files from the file input. Each entry is either a
            path string or an object exposing its path as ``.name``.
        enable_semantic: "Yes" to compute semantic similarity; any other value disables it.
        enable_fuzzy: "Yes" to compute fuzzy similarity; any other value disables it.
        fuzzy_method: Dropdown label; the text before " - " is passed on
            (falls back to "ngram" when empty).
        lcs_normalization: Dropdown label; the text before " - " is passed on
            (falls back to "avg" when empty).
        model_name: Embedding model ID, passed through unchanged.
        tokenization_mode: Dropdown label; the text before " - " is passed on
            (falls back to "syllable" when empty).
        stopwords_option: Stopword filtering label ("None (No filtering)",
            "Standard (Common particles only)" or another value for full filtering).
        normalize_particles: Whether to normalize grammatical particles.
        batch_size: Batch size forwarded to the text processing step.
        show_progress: Forwarded as ``show_progress_bar``.
        progress: Gradio progress indicator; may be None.

    Returns:
        tuple: 11 values in the order CSV path, preview DataFrame, word count
        figure, vocabulary containment figure, Jaccard / LCS / fuzzy / semantic
        heatmaps, warning box update, text data state, results DataFrame state.
    """
    # Function-scope import keeps this change self-contained.
    import tempfile

    def _report(fraction, desc):
        """Best-effort progress update; a failing progress bar must not abort the run."""
        if progress is None:
            return
        try:
            progress(fraction, desc=desc)
        except Exception as e:
            logger.warning(f"Progress update error (non-critical): {e}")

    def _abort(preview_df, warning_update):
        """Build the 11-value result for an early exit: only preview and warning are set."""
        return (
            None,  # csv_output
            preview_df,
            None,  # word_count_plot
            None,  # vocab_containment_plot
            None,  # jaccard_heatmap
            None,  # lcs_heatmap
            None,  # fuzzy_heatmap
            None,  # semantic_heatmap
            warning_update,
            None,  # state_text_data
            None,  # state_df_results
        )

    def _path_of(upload):
        """Return the upload's location as a Path, for both path strings and file objects."""
        return Path(getattr(upload, "name", upload))

    # Initialize return values with defaults
    csv_path_res = None
    metrics_preview_df_res = pd.DataFrame()
    word_count_fig_res = None
    vocab_containment_fig_res = None
    jaccard_heatmap_res = None
    lcs_heatmap_res = None
    fuzzy_heatmap_res = None
    semantic_heatmap_res = None
    warning_update_res = gr.update(visible=False)
    state_text_data_res = None
    state_df_results_res = None

    # Create a ProgressiveUI instance for handling progressive updates
    progressive_ui = ProgressiveUI(
        metrics_preview=metrics_preview,
        word_count_plot=word_count_plot,
        jaccard_heatmap=heatmap_tabs["Jaccard Similarity (%)"],
        lcs_heatmap=heatmap_tabs["Normalized LCS"],
        fuzzy_heatmap=heatmap_tabs["Fuzzy Similarity"],
        semantic_heatmap=heatmap_tabs["Semantic Similarity"],
        warning_box=warning_box,
        progress_container=progress_container,
        heatmap_titles=heatmap_titles
    )
    # NOTE: the former `progress_container.update(visible=True)` call was dropped.
    # Its return value was discarded, so it never changed the UI, and component
    # instances no longer have an `.update()` method in Gradio 4+.
    progressive_callback = create_progressive_callback(progressive_ui)

    # Check if files are provided
    if not files:
        return _abort(
            pd.DataFrame({"Message": ["Please upload files to analyze."]}),
            # Hide the box explicitly so a warning from an earlier run does not linger.
            gr.update(visible=False),
        )

    # Check file size limits (10MB per file)
    for file in files:
        upload_path = _path_of(file)
        file_size_mb = upload_path.stat().st_size / (1024 * 1024)
        if file_size_mb > 10:
            return _abort(
                pd.DataFrame({"Error": [f"File '{upload_path.name}' exceeds the 10MB size limit (size: {file_size_mb:.2f}MB)."]}),
                gr.update(value=f"Error: File '{upload_path.name}' exceeds the 10MB size limit.", visible=True),
            )

    try:
        _report(0.1, "Preparing files...")

        # Only the base names are used as text identifiers.
        filenames = [_path_of(file).name for file in files]
        text_data = {}

        # Read files with progress updates
        for i, file in enumerate(files):
            file_path = _path_of(file)
            filename = file_path.name
            _report(0.1 + (0.1 * (i / len(files))), f"Reading file: {filename}")
            try:
                text_data[filename] = file_path.read_text(encoding="utf-8-sig")
            except UnicodeError:
                # Try UTF-16 if UTF-8 fails. UnicodeError (the parent of
                # UnicodeDecodeError) is caught because the UTF-16 reader raises
                # the plain base class when the data has no byte-order mark.
                try:
                    text_data[filename] = file_path.read_text(encoding="utf-16")
                except UnicodeError:
                    return _abort(
                        pd.DataFrame({"Error": [f"Could not decode file '{filename}'. Please ensure it contains valid Tibetan text in UTF-8 or UTF-16 encoding."]}),
                        gr.update(value=f"Error: Could not decode file '{filename}'.", visible=True),
                    )

        # Configure semantic similarity and fuzzy matching
        enable_semantic_bool = enable_semantic == "Yes"
        enable_fuzzy_bool = enable_fuzzy == "Yes"

        # Dropdown labels look like "<value> - <description>"; keep only the value.
        fuzzy_method_value = fuzzy_method.split(' - ')[0] if fuzzy_method else 'ngram'
        lcs_normalization_value = lcs_normalization.split(' - ')[0] if lcs_normalization else 'avg'
        tokenization_mode_value = tokenization_mode.split(' - ')[0] if tokenization_mode else 'syllable'

        _report(0.2, "Loading model..." if enable_semantic_bool else "Processing text...")

        # Convert stopword option to appropriate parameters
        use_stopwords = stopwords_option != "None (No filtering)"
        use_lite_stopwords = stopwords_option == "Standard (Common particles only)"

        # For Hugging Face models, the UI value is the correct model ID
        internal_model_id = model_name

        df_results, word_counts_df_data, vocab_containment_df_data, warning_raw = process_texts(
            text_data=text_data,
            filenames=filenames,
            enable_semantic=enable_semantic_bool,
            enable_fuzzy=enable_fuzzy_bool,
            fuzzy_method=fuzzy_method_value,
            lcs_normalization=lcs_normalization_value,
            model_name=internal_model_id,
            use_stopwords=use_stopwords,
            use_lite_stopwords=use_lite_stopwords,
            normalize_particles=normalize_particles,
            tokenization_mode=tokenization_mode_value,
            progress_callback=progress,
            progressive_callback=progressive_callback,
            batch_size=batch_size,
            show_progress_bar=show_progress
        )

        if df_results.empty:
            warning_md = f"**⚠️ Warning:** {warning_raw}" if warning_raw else ""
            warning_message = "No common chapters found or results are empty. " + (warning_raw or "")
            metrics_preview_df_res = pd.DataFrame({"Message": [warning_message]})
            warning_update_res = gr.update(value=warning_md or warning_message, visible=True)
        else:
            # Generate visualizations
            _report(0.8, "Generating visualizations...")
            heatmaps_data = generate_visualizations(
                df_results, descriptive_titles=heatmap_titles
            )

            # Generate word count chart
            _report(0.9, "Creating word count chart...")
            word_count_fig_res = generate_word_count_chart(word_counts_df_data)

            # Generate vocabulary containment chart
            vocab_containment_fig_res = generate_vocab_containment_chart(vocab_containment_df_data)

            # Store state data for potential future use
            state_text_data_res = text_data
            state_df_results_res = df_results
            logger.info("Analysis complete, storing state data")

            # Save results to CSV in a per-run temp directory, so concurrent
            # sessions cannot overwrite each other's file. The download name
            # stays "results.csv".
            _report(0.95, "Saving results...")
            csv_file = Path(tempfile.mkdtemp(prefix="ttm_results_")) / "results.csv"
            df_results.to_csv(csv_file, index=False)
            # Publish the path only after the write succeeded.
            csv_path_res = str(csv_file)

            # Prepare final output
            warning_md = f"**⚠️ Warning:** {warning_raw}" if warning_raw else ""
            metrics_preview_df_res = df_results.head(10)
            jaccard_heatmap_res = heatmaps_data.get("Jaccard Similarity (%)")
            lcs_heatmap_res = heatmaps_data.get("Normalized LCS")
            fuzzy_heatmap_res = heatmaps_data.get("Fuzzy Similarity")
            semantic_heatmap_res = heatmaps_data.get("Semantic Similarity")
            warning_update_res = gr.update(
                visible=bool(warning_raw), value=warning_md
            )
    except Exception as e:
        # Top-level boundary for the UI handler: log with traceback and show the message.
        logger.error(f"Error in run_pipeline: {e}", exc_info=True)
        # Ensure DataFrame for metrics preview on error
        metrics_preview_df_res = pd.DataFrame({"Error": [str(e)]})
        warning_update_res = gr.update(value=f"Error: {str(e)}", visible=True)

    return (
        csv_path_res,
        metrics_preview_df_res,
        word_count_fig_res,
        vocab_containment_fig_res,
        jaccard_heatmap_res,
        lcs_heatmap_res,
        fuzzy_heatmap_res,
        semantic_heatmap_res,
        warning_update_res,
        state_text_data_res,
        state_df_results_res,
    )
| # Function to interpret results using LLM | |
def interpret_results(csv_path, progress=gr.Progress()):
    """Return an LLM-written interpretation of the results CSV as Markdown text.

    Args:
        csv_path: Path of the results CSV produced by the analysis; may be empty.
        progress: Gradio progress indicator used for status messages.

    Returns:
        str: The interpretation followed by a timestamp line, a prompt to run
        the analysis first when no CSV exists, or an error message.
    """
    try:
        has_results = bool(csv_path) and Path(csv_path).exists()
        if not has_results:
            return "Please run the analysis first to generate results."

        results_frame = pd.read_csv(csv_path)

        # Status messages shown before the LLM call.
        steps_before_call = (
            (0, "Preparing data for analysis..."),
            (0.1, "Analyzing similarity patterns..."),
            (0.2, "Connecting to Mistral 7B via OpenRouter..."),
            (0.3, "Generating scholarly interpretation (this may take 20-40 seconds)..."),
        )
        for fraction, message in steps_before_call:
            progress(fraction, desc=message)

        service = LLMService()
        llm_text = service.analyze_similarity(results_frame)

        # Status messages shown once the LLM call has returned.
        steps_after_call = (
            (0.9, "Formatting results..."),
            (0.95, "Applying scholarly formatting..."),
            (1.0, "Analysis complete!"),
        )
        for fraction, message in steps_after_call:
            progress(fraction, desc=message)

        # Append a generation timestamp.
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M")
        return f"{llm_text}\n\n<small>Analysis generated on {stamp}</small>"
    except Exception as e:
        logger.error(f"Error in interpret_results: {e}", exc_info=True)
        return f"Error interpreting results: {str(e)}"
def run_pipeline_preset(files, preset, progress=gr.Progress()):
    """Translate a Quick Start preset label into run_pipeline arguments and run it.

    Args:
        files: Uploaded files, forwarded unchanged.
        preset: Label of the selected preset. A label containing "Quick"
            disables fuzzy and semantic analysis; one containing "Deep" enables
            both; anything else (including None) is treated as Standard, with
            fuzzy enabled and semantic disabled.
        progress: Gradio progress indicator, forwarded unchanged.

    Returns:
        tuple: Whatever run_pipeline returns.
    """
    # Guard against None (no selection): `"Quick" in None` would raise TypeError.
    preset_label = preset or ""

    if "Quick" in preset_label:
        enable_semantic, enable_fuzzy = "No", "No"
    elif "Deep" in preset_label:
        enable_semantic, enable_fuzzy = "Yes", "Yes"
    else:
        # Standard
        enable_semantic, enable_fuzzy = "No", "Yes"

    # Fixed defaults for preset mode; the strings match the Custom tab's default labels.
    fuzzy_method = "ngram - Syllable pairs (recommended)"
    lcs_normalization = "avg - Balanced comparison (default)"
    model_name = "buddhist-nlp/buddhist-sentence-similarity"
    tokenization_mode = "word - Whole words (recommended)"
    stopwords_option = "Standard (Common particles only)"
    normalize_particles = False
    batch_size = 8
    show_progress = False

    return run_pipeline(
        files, enable_semantic, enable_fuzzy, fuzzy_method,
        lcs_normalization, model_name, tokenization_mode,
        stopwords_option, normalize_particles, batch_size,
        show_progress, progress
    )
# Both buttons write to the same components, in the order run_pipeline returns its values.
heatmap_output_order = (
    "Jaccard Similarity (%)",
    "Normalized LCS",
    "Fuzzy Similarity",
    "Semantic Similarity",
)
pipeline_outputs = [csv_output, metrics_preview, word_count_plot, vocab_containment_plot]
pipeline_outputs += [heatmap_tabs[key] for key in heatmap_output_order]
pipeline_outputs += [warning_box, state_text_data, state_df_results]

# Quick Start: only the files and the chosen preset are needed.
process_btn_quick.click(
    fn=run_pipeline_preset,
    inputs=[file_input, analysis_preset],
    outputs=pipeline_outputs,
)

# Custom: inputs are listed in the order of run_pipeline's parameters.
custom_inputs = [
    file_input,
    semantic_toggle_radio,
    fuzzy_toggle_radio,
    fuzzy_method_dropdown,
    lcs_normalization_dropdown,
    model_dropdown,
    tokenization_mode_dropdown,
    stopwords_dropdown,
    particle_normalization_checkbox,
    batch_size_slider,
    progress_bar_checkbox,
]
process_btn_custom.click(
    fn=run_pipeline,
    inputs=custom_inputs,
    outputs=pipeline_outputs,
)

# Structural analysis functionality removed - see dedicated collation app

# The interpret button reads the CSV component and fills the Markdown panel.
interpret_btn.click(
    fn=interpret_results,
    inputs=[csv_output],
    outputs=interpretation_output,
)
| return demo | |
if __name__ == "__main__":
    # Build the Blocks app and start the Gradio server only when run as a script.
    demo = main_interface()
    demo.launch()