Update app.py
app.py CHANGED
@@ -15,7 +15,7 @@ def text_vector(text):
 def centroid(docs):
     C = Counter()
     for d in docs:
-        C.update(text_vector(d
+        C.update(text_vector(d.get("text", "")))
     return C

 def cosine(a, b):
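The fixed line builds each cluster centroid as a bag-of-words Counter, which `cosine(a, b)` then compares. A minimal sketch of that pattern, assuming `text_vector` does simple lowercase tokenization and `cosine` is the usual sparse-vector formula (both bodies are assumptions; only `centroid` appears in this diff):

    from collections import Counter
    import math

    def text_vector(text):
        # assumed: lowercase whitespace tokenization into term counts
        return Counter(text.lower().split())

    def centroid(docs):
        C = Counter()
        for d in docs:
            C.update(text_vector(d.get("text", "")))   # tolerate records with no "text" key
        return C

    def cosine(a, b):
        # cosine similarity between two sparse count vectors
        dot = sum(a[t] * b[t] for t in set(a) & set(b))
        na = math.sqrt(sum(v * v for v in a.values()))
        nb = math.sqrt(sum(v * v for v in b.values()))
        return dot / (na * nb) if na and nb else 0.0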
@@ -60,12 +60,11 @@ def initialize_state(records):
         "Ndocs": Ndocs,
         "avg_len": avg_len,
         "centroids": centroids
-    }
-
+    }

 def load_jsonl(user_file):
     if user_file is None:
-        return None,
+        return None, "⚠ No file uploaded."

     records = []
     with open(user_file.name, "r", encoding="utf8") as f:
@@ -74,13 +73,12 @@ def load_jsonl(user_file):
                 records.append(json.loads(line))
             except:
                 pass
-    return initialize_state(records)
-
+    return initialize_state(records), f"Loaded {len(records)} records."

 def load_default():
     path = "epstein_semantic.jsonl"
     if not os.path.exists(path):
-        return None,
+        return None, "⚠ No default dataset found."

     records = []
     with open(path, "r", encoding="utf8") as f:
@@ -89,7 +87,7 @@ def load_default():
                 records.append(json.loads(line))
             except:
                 pass
-    return initialize_state(records)
+    return initialize_state(records), f"Loaded {len(records)} records."

 # =====================================================================
 # BM25
@@ -108,7 +106,6 @@ def bm25_score(query, doc_toks, doc_freq, Ndocs, avg_len):
         idf = math.log((Ndocs - df + 0.5) / (df + 0.5) + 1)
         tf = doc_toks.count(q)
         denom = tf + k * (1 - b + b * (len(doc_toks) / avg_len))
-
         score += idf * (tf * (k + 1)) / denom

     return score
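The lines above are the core of Okapi BM25: a non-negative idf (the `+ 1` inside the log keeps it from going negative for very common terms) times a saturating term-frequency weight normalized by document length. A self-contained sketch of the whole scorer, assuming `k` and `b` are the usual constants defined elsewhere in the file (made parameters here so the snippet runs on its own) and `doc_freq` maps term to document frequency:

    import math

    def bm25_score(query, doc_toks, doc_freq, Ndocs, avg_len, k=1.5, b=0.75):
        score = 0.0
        for q in query:
            df = doc_freq.get(q, 0)
            idf = math.log((Ndocs - df + 0.5) / (df + 0.5) + 1)
            tf = doc_toks.count(q)
            # length normalization: longer documents need more matches for the same score
            denom = tf + k * (1 - b + b * (len(doc_toks) / avg_len))
            score += idf * (tf * (k + 1)) / denom
        return score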
@@ -124,7 +121,7 @@ def do_view_cluster(state, cid):
     try:
         cid = int(cid)
     except:
-        return "Enter a valid number."
+        return "Enter a valid cluster number."

     cluster_map = state["cluster_map"]
@@ -133,7 +130,6 @@ def do_view_cluster(state, cid):

     out = [f"=== Cluster {cid} ({len(cluster_map[cid])} docs) ===\n"]

-    # show all docs, untruncated
     for d in cluster_map[cid]:
         rid = d.get("id", "unknown")
         out.append(f"\n--- id={rid} ---\n{d.get('text','')}\n")
@@ -152,7 +148,6 @@ def do_search(state, query):
         if score > 0:
             results.append((score, r))

-    # FIX: sort by score, not dict
     results.sort(key=lambda x: x[0], reverse=True)

     out = [f"=== Results for '{query}' ==="]
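A note on that sort: `results` holds `(score, record)` tuples, and without an explicit key Python falls back to comparing the record dicts whenever two scores tie, which raises `TypeError`. The explicit key orders by score only, and since `list.sort` is stable, tied results keep their original order:

    results = [(2.0, {"id": "a"}), (2.0, {"id": "b"})]
    # results.sort(reverse=True)                    # TypeError: '<' not supported between dicts
    results.sort(key=lambda x: x[0], reverse=True)  # compares scores only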
@@ -177,11 +172,12 @@ subject re fw message thereof all may any doc email said
     out = ["=== Cluster Topics ==="]

     for cid, cent in state["centroids"].items():
-        filtered = {
-
+        filtered = {
+            w: c for w, c in cent.items()
+            if w not in STOPWORDS and len(w) > 2 and c > 1
+        }

         top = [w for w, _ in Counter(filtered).most_common(12)]
-
         out.append(f"Cluster {cid:<4} | {' '.join(top)}")

     return "\n".join(out)
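The new comprehension keeps only centroid terms that are not stopwords, are longer than two characters, and occur more than once; `most_common(12)` then picks each cluster's label terms. A toy example (the STOPWORDS set here is an assumed subset of the one defined above):

    from collections import Counter

    STOPWORDS = {"subject", "re", "the", "of"}
    cent = Counter({"the": 40, "of": 30, "re": 22, "flight": 9, "island": 7, "jet": 1})

    filtered = {
        w: c for w, c in cent.items()
        if w not in STOPWORDS and len(w) > 2 and c > 1
    }
    top = [w for w, _ in Counter(filtered).most_common(12)]
    print(top)   # ['flight', 'island']  ("jet" fails c > 1, the rest are stopwords)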
@@ -207,16 +203,22 @@ def do_entity_search(state, name):
     return "\n".join(out)

 # =====================================================================
-#
+# Startup
 # =====================================================================

-
+startup_state_raw, startup_msg = load_default()
+startup_state = gr.State(startup_state_raw)

-
+# =====================================================================
+# UI
+# =====================================================================
+
+with gr.Blocks(title="Epstein Semantic Explorer") as demo:

     gr.Markdown("# **Epstein Semantic Explorer**")
     gr.Markdown(startup_msg)

+    # Tabs
     with gr.Tab("View Cluster"):
         cluster_num = gr.Number(label="Cluster #", value=96)
         out_cluster = gr.Textbox(label="Cluster Output", lines=40)
@@ -236,11 +238,17 @@ with gr.Blocks(title="Epstein Semantic Explorer", css="#output {white-space: pre
         out_topics = gr.Textbox(label="Topics", lines=40)
         gr.Button("Show Topics").click(do_show_topics, [startup_state], out_topics)

-    #
+    # Upload override
     with gr.Tab("Upload Different Dataset"):
-
-        load_btn = gr.Button("Load
-
-
+        file_up = gr.File(label="Upload JSONL")
+        load_btn = gr.Button("Load")
+        load_msg = gr.Textbox(label="Status", lines=2)
+
+        def apply_upload(file):
+            new_state, msg = load_jsonl(file)
+            startup_state.value = new_state
+            return msg
+
+        load_btn.click(apply_upload, [file_up], load_msg)

 demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
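One caution on `apply_upload` as committed: assigning to `startup_state.value` changes the component's default value, which is shared across sessions, rather than the per-session state that the other tabs read as an input, so those tabs may keep serving the original dataset. The pattern the Gradio docs use for updating session state is to list the `gr.State` component among the outputs and return the new value; a minimal sketch of that variant (component names reused from this file, `load_jsonl` as defined above):

    import gradio as gr

    with gr.Blocks() as demo:
        startup_state = gr.State(None)               # per-session dataset state
        file_up = gr.File(label="Upload JSONL")
        load_msg = gr.Textbox(label="Status", lines=2)
        load_btn = gr.Button("Load")

        def apply_upload(file, current):
            new_state, msg = load_jsonl(file)
            # keep the previous state if the upload could not be parsed
            return (new_state if new_state is not None else current), msg

        # listing startup_state as an output is what updates the session copy
        load_btn.click(apply_upload, [file_up, startup_state], [startup_state, load_msg])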