cjc0013 committed
Commit a113d71 · verified · 1 Parent(s): 5e57bf1

Update app.py
Files changed (1): app.py  +116 -120
app.py CHANGED
@@ -2,9 +2,9 @@ import gradio as gr
 import json, re, math, os
 from collections import Counter, defaultdict
 
-# =====================================================================
-# Utility Functions
-# =====================================================================
+# ===============================================================
+# UTILITIES
+# ===============================================================
 
 def tokenize(text):
     return re.findall(r"[A-Za-z0-9']+", text.lower())
@@ -15,31 +15,69 @@ def text_vector(text):
 def centroid(docs):
     C = Counter()
     for d in docs:
-        C.update(text_vector(d.get("text", "")))
+        C.update(text_vector(d["text"]))
     return C
 
 def cosine(a, b):
-    num = 0; da = 0; db = 0
+    num = 0
+    da = 0
+    db = 0
     for k in set(a.keys()) | set(b.keys()):
-        va = a.get(k, 0)
-        vb = b.get(k, 0)
-        num += va * vb
-        da += va * va
-        db += vb * vb
+        va = a.get(k,0)
+        vb = b.get(k,0)
+        num += va*vb
+        da += va*va
+        db += vb*vb
     if da == 0 or db == 0:
         return 0
-    return num / math.sqrt(da * db)
+    return num / math.sqrt(da*db)
+
+# ===============================================================
+# LOAD JSONL FROM FILE
+# ===============================================================
+
+def load_records_from_path(path):
+    if not os.path.exists(path):
+        return None, "⚠ JSONL file not found."
+
+    records = []
+    with open(path, "r", encoding="utf8") as f:
+        for line in f:
+            try:
+                records.append(json.loads(line))
+            except:
+                pass
+
+    return initialize_state(records)
+
+
+def load_jsonl(user_file):
+    if user_file is None:
+        return None, "⚠ No file uploaded."
+
+    records = []
+    with open(user_file.name, "r", encoding="utf8") as f:
+        for line in f:
+            try:
+                records.append(json.loads(line))
+            except:
+                pass
+
+    state, msg = initialize_state(records)
+    return state, msg
 
-# =====================================================================
-# Load Data
-# =====================================================================
 
 def initialize_state(records):
+    # Ensure IDs exist
+    for i, r in enumerate(records):
+        if "id" not in r:
+            r["id"] = i
+
     cluster_map = defaultdict(list)
     for r in records:
         cluster_map[r.get("cluster", -1)].append(r)
 
-    docs_text = [r.get("text", "") for r in records]
+    docs_text = [r["text"] for r in records]
     tokenized_docs = [tokenize(t) for t in docs_text]
 
     doc_freq = Counter()
@@ -52,49 +90,25 @@ def initialize_state(records):
 
     centroids = {cid: centroid(docs) for cid, docs in cluster_map.items()}
 
-    return {
+    state = {
         "records": records,
         "cluster_map": cluster_map,
         "tokenized_docs": tokenized_docs,
         "doc_freq": doc_freq,
         "Ndocs": Ndocs,
         "avg_len": avg_len,
-        "centroids": centroids
+        "centroids": centroids,
     }
 
-def load_jsonl(user_file):
-    if user_file is None:
-        return None, "⚠ No file uploaded."
-
-    records = []
-    with open(user_file.name, "r", encoding="utf8") as f:
-        for line in f:
-            try:
-                records.append(json.loads(line))
-            except:
-                pass
-    return initialize_state(records), f"Loaded {len(records)} records."
+    return state, f"Loaded {len(records)} records."
 
-def load_default():
-    path = "epstein_semantic.jsonl"
-    if not os.path.exists(path):
-        return None, "⚠ No default dataset found."
-
-    records = []
-    with open(path, "r", encoding="utf8") as f:
-        for line in f:
-            try:
-                records.append(json.loads(line))
-            except:
-                pass
-    return initialize_state(records), f"Loaded {len(records)} records."
 
-# =====================================================================
-# BM25
-# =====================================================================
+# ===============================================================
+# BM25 SEARCH
+# ===============================================================
 
 def bm25_score(query, doc_toks, doc_freq, Ndocs, avg_len):
-    k = 1.5; b = 0.75
+    k=1.5; b=0.75
     score = 0
     q_toks = tokenize(query)
 
@@ -102,7 +116,6 @@ def bm25_score(query, doc_toks, doc_freq, Ndocs, avg_len):
         df = doc_freq.get(q, 0)
         if df == 0:
             continue
-
         idf = math.log((Ndocs - df + 0.5) / (df + 0.5) + 1)
         tf = doc_toks.count(q)
         denom = tf + k * (1 - b + b * (len(doc_toks) / avg_len))
@@ -110,9 +123,10 @@ def bm25_score(query, doc_toks, doc_freq, Ndocs, avg_len):
 
     return score
 
-# =====================================================================
-# UI Functions
-# =====================================================================
+
+# ===============================================================
+# FEATURE FUNCTIONS
+# ===============================================================
 
 def do_view_cluster(state, cid):
     if state is None:
@@ -126,13 +140,11 @@ def do_view_cluster(state, cid):
     cluster_map = state["cluster_map"]
 
     if cid not in cluster_map:
-        return f"❌ Cluster {cid} not found."
+        return "❌ Cluster not found."
 
     out = [f"=== Cluster {cid} ({len(cluster_map[cid])} docs) ===\n"]
-
     for d in cluster_map[cid]:
-        rid = d.get("id", "unknown")
-        out.append(f"\n--- id={rid} ---\n{d.get('text','')}\n")
+        out.append(f"\n--- id={d['id']} ---\n{d['text']}\n")
 
     return "\n".join(out)
 
@@ -142,7 +154,6 @@ def do_search(state, query):
         return "⚠ No dataset loaded."
 
     results = []
-
    for r, toks in zip(state["records"], state["tokenized_docs"]):
         score = bm25_score(query, toks, state["doc_freq"], state["Ndocs"], state["avg_len"])
         if score > 0:
@@ -151,10 +162,27 @@ def do_search(state, query):
     results.sort(key=lambda x: x[0], reverse=True)
 
     out = [f"=== Results for '{query}' ==="]
-
-    for score, r in results[:40]:
-        rid = r.get("id", "unknown")
-        out.append(f"\nScore {score:.2f} — Cluster {r.get('cluster')} — id={rid}\n{r.get('text','')}\n")
+    for score, r in results[:30]:
+        out.append(f"\nScore {score:.2f} — Cluster {r['cluster']} — id={r['id']}\n{r['text']}\n")
+
+    return "\n".join(out)
+
+
+def do_entity_search(state, name):
+    if state is None:
+        return "⚠ No dataset loaded."
+
+    hits = []
+    for cid, docs in state["cluster_map"].items():
+        count = sum(name.lower() in d["text"].lower() for d in docs)
+        if count:
+            hits.append((count, cid))
+
+    hits.sort(reverse=True)
+
+    out = [f"=== Clusters mentioning '{name}' ==="]
+    for count, cid in hits[:30]:
+        out.append(f"Cluster {cid}: {count} hits")
 
     return "\n".join(out)
 
@@ -163,92 +191,60 @@ def do_show_topics(state):
     if state is None:
         return "⚠ No dataset loaded."
 
-    STOPWORDS = set("""
+    STOP = set("""
     the and to of a in is this that for on with as be or by from at
     an it are was you your if but have we they his her she their our
-    subject re fw message thereof all may any doc email said
+    subject re fw message thereof all may any doc email
     """.split())
 
     out = ["=== Cluster Topics ==="]
 
     for cid, cent in state["centroids"].items():
-        filtered = {
-            w: c for w, c in cent.items()
-            if w not in STOPWORDS and len(w) > 2 and c > 1
-        }
-
-        top = [w for w, _ in Counter(filtered).most_common(12)]
+        filtered = {w: c for w, c in cent.items()
+                    if w not in STOP and len(w) > 2 and c > 1}
+        top = [w for w, _ in Counter(filtered).most_common(10)]
         out.append(f"Cluster {cid:<4} | {' '.join(top)}")
 
     return "\n".join(out)
 
 
-def do_entity_search(state, name):
-    if state is None:
-        return "⚠ No dataset loaded."
-
-    hits = []
-
-    for cid, docs in state["cluster_map"].items():
-        count = sum(name.lower() in d.get("text", "").lower() for d in docs)
-        if count > 0:
-            hits.append((count, cid))
-
-    hits.sort(reverse=True)
+# ===============================================================
+# AUTO LOAD IF FILE EXISTS
+# ===============================================================
 
-    out = [f"=== Clusters mentioning '{name}' ==="]
-    for count, cid in hits[:40]:
-        out.append(f"Cluster {cid}: {count} hits")
+DEFAULT_PATH = "epstein_semantic.jsonl"
 
-    return "\n".join(out)
+startup_state = None
+startup_msg = "⚠ No default dataset found."
 
-# =====================================================================
-# Startup
-# =====================================================================
+if os.path.exists(DEFAULT_PATH):
+    startup_state, startup_msg = load_records_from_path(DEFAULT_PATH)
 
-startup_state_raw, startup_msg = load_default()
-startup_state = gr.State(startup_state_raw)
 
-# =====================================================================
-# UI
-# =====================================================================
+# ===============================================================
+# GRADIO UI (SINGLE PAGE)
+# ===============================================================
 
 with gr.Blocks(title="Epstein Semantic Explorer") as demo:
 
-    gr.Markdown("# **Epstein Semantic Explorer**")
+    gr.Markdown("# Epstein Semantic Explorer")
     gr.Markdown(startup_msg)
 
-    # Tabs
-    with gr.Tab("View Cluster"):
-        cluster_num = gr.Number(label="Cluster #", value=96)
-        out_cluster = gr.Textbox(label="Cluster Output", lines=40)
-        cluster_num.change(do_view_cluster, [startup_state, cluster_num], out_cluster)
-
-    with gr.Tab("Keyword Search"):
-        query_box = gr.Textbox(label="Keyword")
-        out_search = gr.Textbox(label="Search Output", lines=40)
-        query_box.submit(do_search, [startup_state, query_box], out_search)
-
-    with gr.Tab("Entity Search"):
-        entity_box = gr.Textbox(label="Person / Name")
-        out_entity = gr.Textbox(label="Entity Output", lines=40)
-        entity_box.submit(do_entity_search, [startup_state, entity_box], out_entity)
+    state_box = gr.State(startup_state)
 
-    with gr.Tab("Topics"):
-        out_topics = gr.Textbox(label="Topics", lines=40)
-        gr.Button("Show Topics").click(do_show_topics, [startup_state], out_topics)
+    cluster_input = gr.Number(label="Cluster #", value=0)
+    keyword_input = gr.Textbox(label="Keyword Search")
+    entity_input = gr.Textbox(label="Entity Search (name)")
+    jsonl_file = gr.File(label="Upload different JSONL dataset")
 
-    # Upload override
-    with gr.Tab("Upload Different Dataset"):
-        file_up = gr.File(label="Upload JSONL")
-        load_btn = gr.Button("Load")
-        load_msg = gr.Textbox(label="Status", lines=2)
+    out_box = gr.Textbox(label="Output", lines=40)
 
-        def apply_upload(file):
-            new_state, msg = load_jsonl(file)
-            startup_state.value = new_state
-            return msg
+    # Bindings
+    cluster_input.change(do_view_cluster, [state_box, cluster_input], out_box)
+    keyword_input.submit(do_search, [state_box, keyword_input], out_box)
+    entity_input.submit(do_entity_search, [state_box, entity_input], out_box)
 
-    load_btn.click(apply_upload, [file_up], load_msg)
+    gr.Button("Show Topics").click(do_show_topics, state_box, out_box)
+    gr.Button("Load Dataset").click(load_jsonl, jsonl_file, [state_box, out_box])
 
-demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
+demo.launch()
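
A few notes on the new code, with small self-contained sketches. First, the id backfill and cluster grouping that initialize_state now performs; the records below are made-up illustration data, not part of the dataset:

from collections import defaultdict

# Toy records (illustration only): the new initialize_state backfills
# "id" from the list index when absent and groups by "cluster",
# defaulting to cluster -1.
records = [
    {"text": "flight logs", "cluster": 3},
    {"id": 9, "text": "court filing"},            # no cluster -> -1
    {"text": "island staff schedule", "cluster": 3},
]

for i, r in enumerate(records):
    if "id" not in r:
        r["id"] = i

cluster_map = defaultdict(list)
for r in records:
    cluster_map[r.get("cluster", -1)].append(r)

print({cid: [d["id"] for d in docs] for cid, docs in cluster_map.items()})
# {3: [0, 2], -1: [9]}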
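Second, the BM25 scorer run end to end on a toy corpus. The accumulation line (score += ...) falls between hunks and is not displayed above; the version here assumes the standard Okapi form, consistent with the idf and denom lines that are shown:

import math, re
from collections import Counter

def tokenize(text):
    return re.findall(r"[A-Za-z0-9']+", text.lower())

# Toy corpus (illustration only)
docs = ["flight logs from the island",
        "court filing about the estate",
        "flight manifest and flight logs"]
toks = [tokenize(d) for d in docs]
doc_freq = Counter(w for t in toks for w in set(t))  # document frequency
Ndocs = len(docs)
avg_len = sum(len(t) for t in toks) / Ndocs

def bm25_score(query, doc_toks, doc_freq, Ndocs, avg_len):
    k = 1.5; b = 0.75
    score = 0.0
    for q in tokenize(query):
        df = doc_freq.get(q, 0)
        if df == 0:
            continue
        idf = math.log((Ndocs - df + 0.5) / (df + 0.5) + 1)
        tf = doc_toks.count(q)
        denom = tf + k * (1 - b + b * (len(doc_toks) / avg_len))
        score += idf * tf * (k + 1) / denom  # assumed Okapi accumulation
    return score

for d, t in zip(docs, toks):
    print(f"{bm25_score('flight logs', t, doc_freq, Ndocs, avg_len):.3f}  {d}")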
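Finally, the state wiring. The old apply_upload assigned to startup_state.value inside a callback, which does not propagate to a running app; the new version returns the state through outputs=[state_box, out_box]. A minimal sketch of that round-trip pattern, with component names invented for illustration:

import gradio as gr

def bump(n, state):
    # Return the new state through an output; assigning to a
    # gr.State's .value after launch does not update live sessions.
    state = (state or 0) + int(n)
    return state, f"total = {state}"

with gr.Blocks() as demo:
    total = gr.State(None)
    step = gr.Number(value=1, label="Step")
    status = gr.Textbox(label="Status")
    gr.Button("Add").click(bump, [step, total], [total, status])

demo.launch()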