cjc0013 committed
Commit dff3605 · verified · 1 Parent(s): f2ffec2

Update app.py

Files changed (1):
  1. app.py +100 -102
app.py CHANGED
@@ -2,9 +2,9 @@ import gradio as gr
 import json, re, math, os
 from collections import Counter, defaultdict
 
-# ===============================================================
-# UTILITIES
-# ===============================================================
+# =====================================================================
+# Utility Functions
+# =====================================================================
 
 def tokenize(text):
     return re.findall(r"[A-Za-z0-9']+", text.lower())
@@ -19,62 +19,27 @@ def centroid(docs):
     return C
 
 def cosine(a, b):
-    num = 0
-    da = 0
-    db = 0
+    num = 0; da = 0; db = 0
     for k in set(a.keys()) | set(b.keys()):
-        va = a.get(k,0)
-        vb = b.get(k,0)
-        num += va*vb
-        da += va*va
-        db += vb*vb
+        va = a.get(k, 0)
+        vb = b.get(k, 0)
+        num += va * vb
+        da += va * va
+        db += vb * vb
     if da == 0 or db == 0:
         return 0
-    return num / math.sqrt(da*db)
-
-# ===============================================================
-# LOAD JSONL FROM FILE
-# ===============================================================
-
-def load_records_from_path(path):
-    """Loads a dataset from an existing file, used at startup."""
-    if not os.path.exists(path):
-        return None, None, "⚠ JSONL file not found."
-
-    records = []
-    with open(path, "r", encoding="utf8") as f:
-        for line in f:
-            try:
-                records.append(json.loads(line))
-            except:
-                pass
-
-    return initialize_state(records)
-
-
-def load_jsonl(user_file):
-    """Loads a dataset from user upload."""
-    if user_file is None:
-        return gr.update(), None, "⚠ No file uploaded."
-
-    records = []
-    with open(user_file.name, "r", encoding="utf8") as f:
-        for line in f:
-            try:
-                records.append(json.loads(line))
-            except:
-                pass
-
-    return initialize_state(records)
+    return num / math.sqrt(da * db)
 
+# =====================================================================
+# Load Data
+# =====================================================================
 
 def initialize_state(records):
-    """Builds all indexes for search, clustering, etc."""
     cluster_map = defaultdict(list)
     for r in records:
         cluster_map[r.get("cluster", -1)].append(r)
 
-    docs_text = [r["text"] for r in records]
+    docs_text = [r.get("text", "") for r in records]
     tokenized_docs = [tokenize(t) for t in docs_text]
 
     doc_freq = Counter()
@@ -94,16 +59,44 @@ def initialize_state(records):
         "doc_freq": doc_freq,
         "Ndocs": Ndocs,
         "avg_len": avg_len,
-        "centroids": centroids,
+        "centroids": centroids
     }, sorted(cluster_map.keys()), f"Loaded {len(records)} records."
 
 
-# ===============================================================
-# BM25 SEARCH
-# ===============================================================
+def load_jsonl(user_file):
+    if user_file is None:
+        return None, None, "⚠ No file uploaded."
+
+    records = []
+    with open(user_file.name, "r", encoding="utf8") as f:
+        for line in f:
+            try:
+                records.append(json.loads(line))
+            except json.JSONDecodeError:  # skip malformed lines
+                pass
+    return initialize_state(records)
+
+
+def load_default():
+    path = "epstein_semantic.jsonl"
+    if not os.path.exists(path):
+        return None, None, "⚠ Upload a dataset to begin."
+
+    records = []
+    with open(path, "r", encoding="utf8") as f:
+        for line in f:
+            try:
+                records.append(json.loads(line))
+            except json.JSONDecodeError:  # skip malformed lines
+                pass
+    return initialize_state(records)
+
+# =====================================================================
+# BM25
+# =====================================================================
 
 def bm25_score(query, doc_toks, doc_freq, Ndocs, avg_len):
-    k=1.5; b=0.75
+    k = 1.5; b = 0.75
     score = 0
     q_toks = tokenize(query)
 
@@ -111,17 +104,18 @@ def bm25_score(query, doc_toks, doc_freq, Ndocs, avg_len):
         df = doc_freq.get(q, 0)
         if df == 0:
            continue
+
        idf = math.log((Ndocs - df + 0.5) / (df + 0.5) + 1)
        tf = doc_toks.count(q)
        denom = tf + k * (1 - b + b * (len(doc_toks) / avg_len))
+
        score += idf * (tf * (k + 1)) / denom
 
    return score
 
-
-# ===============================================================
-# GRADIO FEATURE FUNCTIONS
-# ===============================================================
+# =====================================================================
+# UI Functions
+# =====================================================================
 
 def do_view_cluster(state, cid):
    if state is None:
@@ -135,12 +129,14 @@ def do_view_cluster(state, cid):
    cluster_map = state["cluster_map"]
 
    if cid not in cluster_map:
-        return "❌ Cluster not found."
+        return f"❌ Cluster {cid} not found."
 
-    # FULL TEXT (NO MORE TRUNCATION)
    out = [f"=== Cluster {cid} ({len(cluster_map[cid])} docs) ===\n"]
+
+    # show all docs, untruncated
    for d in cluster_map[cid]:
-        out.append(f"\n--- id={d.get('id')} ---\n{d['text']}\n")
+        rid = d.get("id", "unknown")
+        out.append(f"\n--- id={rid} ---\n{d.get('text','')}\n")
 
    return "\n".join(out)
 
@@ -150,16 +146,20 @@ def do_search(state, query):
        return "⚠ No dataset loaded."
 
    results = []
+
    for r, toks in zip(state["records"], state["tokenized_docs"]):
        score = bm25_score(query, toks, state["doc_freq"], state["Ndocs"], state["avg_len"])
        if score > 0:
            results.append((score, r))
 
-    results.sort(reverse=True)
+    # FIX: sort by score only; tied scores otherwise compare the record dicts and raise TypeError
+    results.sort(key=lambda x: x[0], reverse=True)
 
    out = [f"=== Results for '{query}' ==="]
-    for score, r in results[:30]:
-        out.append(f"\nScore {score:.2f} — Cluster {r['cluster']} id={r['id']}\n{r['text']}\n")
+
+    for score, r in results[:40]:
+        rid = r.get("id", "unknown")
+        out.append(f"\nScore {score:.2f} — Cluster {r.get('cluster')} — id={rid}\n{r.get('text','')}\n")
 
    return "\n".join(out)
 
@@ -171,7 +171,7 @@ def do_show_topics(state):
    STOPWORDS = set("""
 the and to of a in is this that for on with as be or by from at
 an it are was you your if but have we they his her she their our
-subject re fw message thereof all may any doc email
+subject re fw message thereof all may any doc email said
 """.split())
 
    out = ["=== Cluster Topics ==="]
@@ -179,7 +179,9 @@ subject re fw message thereof all may any doc email
    for cid, cent in state["centroids"].items():
        filtered = {w: c for w, c in cent.items()
                    if w not in STOPWORDS and len(w) > 2 and c > 1}
-        top = [w for w, _ in Counter(filtered).most_common(10)]
+
+        top = [w for w, _ in Counter(filtered).most_common(12)]
+
        out.append(f"Cluster {cid:<4} | {' '.join(top)}")
 
    return "\n".join(out)
@@ -190,59 +192,59 @@ def do_entity_search(state, name):
        return "⚠ No dataset loaded."
 
    hits = []
+
    for cid, docs in state["cluster_map"].items():
-        count = sum(name.lower() in d["text"].lower() for d in docs)
-        if count:
+        count = sum(name.lower() in d.get("text", "").lower() for d in docs)
+        if count > 0:
            hits.append((count, cid))
 
    hits.sort(reverse=True)
 
    out = [f"=== Clusters mentioning '{name}' ==="]
-    for count, cid in hits[:30]:
+    for count, cid in hits[:40]:
        out.append(f"Cluster {cid}: {count} hits")
 
    return "\n".join(out)
 
+# =====================================================================
+# UI Layout
+# =====================================================================
 
-# ===============================================================
-# AUTO-LOAD DATASET IF PRESENT
-# ===============================================================
-
-DEFAULT_PATH = "epstein_semantic.jsonl"
-
-startup_state = None
-startup_clusters = None
-startup_msg = "⚠ No default dataset found."
-
-if os.path.exists(DEFAULT_PATH):
-    startup_state, startup_clusters, startup_msg = load_records_from_path(DEFAULT_PATH)
+startup_state, startup_clusters, startup_msg = load_default()
 
+with gr.Blocks(title="Epstein Semantic Explorer", css="#output {white-space: pre-wrap;}") as demo:
 
-# ===============================================================
-# GRADIO UI
-# ===============================================================
-
-with gr.Blocks(title="Epstein Semantic Explorer") as demo:
-
-    gr.Markdown("# Epstein Semantic Explorer")
+    gr.Markdown("# **Epstein Semantic Explorer**")
    gr.Markdown(startup_msg)
 
+    # FIX: the dataset must live in a gr.State component; event handlers
+    # cannot take a raw dict/tuple as an input or output
+    state = gr.State(startup_state)
+
-    with gr.Row():
-        jsonl_file = gr.File(label="Upload different JSONL dataset")
-        load_btn = gr.Button("Load Dataset")
+    with gr.Tab("View Cluster"):
+        cluster_num = gr.Number(label="Cluster #", value=96)
+        out_cluster = gr.Textbox(label="Cluster Output", lines=40)
+        cluster_num.change(do_view_cluster, [state, cluster_num], out_cluster)
 
-    state = gr.State(startup_state)
-    clusters_box = gr.Number(label="Cluster #", value=96)
-    query_box = gr.Textbox(label="Keyword Search")
-    entity_box = gr.Textbox(label="Entity Search (name)")
+    with gr.Tab("Keyword Search"):
+        query_box = gr.Textbox(label="Keyword")
+        out_search = gr.Textbox(label="Search Output", lines=40)
+        query_box.submit(do_search, [state, query_box], out_search)
 
-    output = gr.Textbox(label="Output", lines=40)
+    with gr.Tab("Entity Search"):
+        entity_box = gr.Textbox(label="Person / Name")
+        out_entity = gr.Textbox(label="Entity Output", lines=40)
+        entity_box.submit(do_entity_search, [state, entity_box], out_entity)
 
-    load_btn.click(load_jsonl, inputs=[jsonl_file], outputs=[state, clusters_box, output])
-    clusters_box.change(do_view_cluster, inputs=[state, clusters_box], outputs=output)
-    query_box.submit(do_search, inputs=[state, query_box], outputs=output)
-    entity_box.submit(do_entity_search, inputs=[state, entity_box], outputs=output)
+    with gr.Tab("Topics"):
+        out_topics = gr.Textbox(label="Topics", lines=40)
+        gr.Button("Show Topics").click(do_show_topics, [state], out_topics)
 
-    gr.Button("Show Topics").click(do_show_topics, inputs=[state], outputs=output)
+    # File Upload (override default)
+    with gr.Tab("Upload Different Dataset"):
+        jsonl_file = gr.File(label="Upload JSONL")
+        load_btn = gr.Button("Load Dataset")
+        load_out = gr.Textbox(label="Status", lines=2)
+        load_btn.click(load_jsonl, [jsonl_file], [state, cluster_num, load_out])
 
-demo.launch()
+demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
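Note on the sort fix in do_search: list.sort(reverse=True) on (score, record) tuples falls through to comparing the record dicts whenever two scores tie, and dicts do not support "<", so the old code crashed with TypeError on any tied pair. A minimal sketch of the failure and the fix, using toy records (not from the dataset):

    # Two hypothetical results with tied scores.
    results = [(1.0, {"id": "a"}), (1.0, {"id": "b"})]

    try:
        results.sort(reverse=True)   # old code: the 1.0 tie falls through to the dicts
    except TypeError as e:
        print("old sort fails:", e)  # '<' not supported between instances of 'dict' and 'dict'

    results.sort(key=lambda x: x[0], reverse=True)  # new code: compare scores only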
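For reference, the JSONL record schema the app assumes, inferred from the accessors in initialize_state, do_view_cluster, and do_search (the field values here are made up):

    import json

    # Hypothetical record; "cluster" defaults to -1 and a missing "text" is treated as "".
    record = {"id": "doc_0001", "cluster": 96, "text": "Example document text."}
    print(json.dumps(record))  # one such JSON object per line of the .jsonl file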