童石渊 commited on
Commit
9c6d163
·
1 Parent(s): 31f6fc8

增加仅字符分割参数,如果开启,仅采用字符分割,不开启,在分割完以后如果chunk过大,会继续根据token size分割,更新测试文件

Browse files
test.ipynb → examples/test_split_by_character.ipynb RENAMED
@@ -6,8 +6,8 @@
6
  "id": "4b5690db12e34685",
7
  "metadata": {
8
  "ExecuteTime": {
9
- "end_time": "2025-01-07T05:38:34.174205Z",
10
- "start_time": "2025-01-07T05:38:29.978194Z"
11
  }
12
  },
13
  "outputs": [],
@@ -21,34 +21,40 @@
21
  "import nest_asyncio"
22
  ]
23
  },
 
 
 
 
 
 
24
  {
25
  "cell_type": "code",
26
- "execution_count": 2,
27
  "id": "8c8ee7c061bf9159",
28
  "metadata": {
29
  "ExecuteTime": {
30
- "end_time": "2025-01-07T05:38:37.440083Z",
31
- "start_time": "2025-01-07T05:38:37.437666Z"
32
  }
33
  },
34
  "outputs": [],
35
  "source": [
36
  "nest_asyncio.apply()\n",
37
- "WORKING_DIR = \"../llm_rag/paper_db/R000088_test2\"\n",
38
  "logging.basicConfig(format=\"%(levelname)s:%(message)s\", level=logging.INFO)\n",
39
  "if not os.path.exists(WORKING_DIR):\n",
40
  " os.mkdir(WORKING_DIR)\n",
41
- "os.environ[\"doubao_api\"] = \"6b890250-0cf6-4eb1-aa82-9c9d711398a7\""
42
  ]
43
  },
44
  {
45
  "cell_type": "code",
46
- "execution_count": 3,
47
  "id": "a5009d16e0851dca",
48
  "metadata": {
49
  "ExecuteTime": {
50
- "end_time": "2025-01-07T05:38:42.594315Z",
51
- "start_time": "2025-01-07T05:38:42.590800Z"
52
  }
53
  },
54
  "outputs": [],
@@ -61,7 +67,7 @@
61
  " prompt,\n",
62
  " system_prompt=system_prompt,\n",
63
  " history_messages=history_messages,\n",
64
- " api_key=os.getenv(\"doubao_api\"),\n",
65
  " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n",
66
  " **kwargs,\n",
67
  " )\n",
@@ -71,19 +77,19 @@
71
  " return await openai_embedding(\n",
72
  " texts,\n",
73
  " model=\"ep-20241231173413-pgjmk\",\n",
74
- " api_key=os.getenv(\"doubao_api\"),\n",
75
  " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n",
76
  " )"
77
  ]
78
  },
79
  {
80
  "cell_type": "code",
81
- "execution_count": 4,
82
  "id": "397fcad24ce4d0ed",
83
  "metadata": {
84
  "ExecuteTime": {
85
- "end_time": "2025-01-07T05:38:44.016901Z",
86
- "start_time": "2025-01-07T05:38:44.006291Z"
87
  }
88
  },
89
  "outputs": [
@@ -91,13 +97,13 @@
91
  "name": "stderr",
92
  "output_type": "stream",
93
  "text": [
94
- "INFO:lightrag:Logger initialized for working directory: ../llm_rag/paper_db/R000088_test2\n",
95
  "INFO:lightrag:Load KV llm_response_cache with 0 data\n",
96
  "INFO:lightrag:Load KV full_docs with 0 data\n",
97
  "INFO:lightrag:Load KV text_chunks with 0 data\n",
98
- "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_entities.json'} 0 data\n",
99
- "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_relationships.json'} 0 data\n",
100
- "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_chunks.json'} 0 data\n",
101
  "INFO:lightrag:Loaded document status storage with 0 records\n"
102
  ]
103
  }
@@ -109,23 +115,24 @@
109
  " embedding_func=EmbeddingFunc(\n",
110
  " embedding_dim=4096, max_token_size=8192, func=embedding_func\n",
111
  " ),\n",
 
112
  ")"
113
  ]
114
  },
115
  {
116
  "cell_type": "code",
117
- "execution_count": 5,
118
  "id": "1dc3603677f7484d",
119
  "metadata": {
120
  "ExecuteTime": {
121
- "end_time": "2025-01-07T05:38:47.509111Z",
122
- "start_time": "2025-01-07T05:38:47.501997Z"
123
  }
124
  },
125
  "outputs": [],
126
  "source": [
127
  "with open(\n",
128
- " \"../llm_rag/example/R000088/auto/R000088_full_txt.md\", \"r\", encoding=\"utf-8\"\n",
129
  ") as f:\n",
130
  " content = f.read()\n",
131
  "\n",
@@ -134,7 +141,7 @@
134
  " return await openai_embedding(\n",
135
  " texts,\n",
136
  " model=\"ep-20241231173413-pgjmk\",\n",
137
- " api_key=os.getenv(\"doubao_api\"),\n",
138
  " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n",
139
  " )\n",
140
  "\n",
@@ -148,12 +155,12 @@
148
  },
149
  {
150
  "cell_type": "code",
151
- "execution_count": 6,
152
  "id": "6844202606acfbe5",
153
  "metadata": {
154
  "ExecuteTime": {
155
- "end_time": "2025-01-07T05:38:50.666764Z",
156
- "start_time": "2025-01-07T05:38:50.247712Z"
157
  }
158
  },
159
  "outputs": [
@@ -171,12 +178,12 @@
171
  },
172
  {
173
  "cell_type": "code",
174
- "execution_count": 7,
175
  "id": "d6273839d9681403",
176
  "metadata": {
177
  "ExecuteTime": {
178
- "end_time": "2025-01-07T05:42:33.085507Z",
179
- "start_time": "2025-01-07T05:38:56.789348Z"
180
  }
181
  },
182
  "outputs": [
@@ -185,13 +192,48 @@
185
  "output_type": "stream",
186
  "text": [
187
  "INFO:lightrag:Processing 1 new unique documents\n",
188
- "Processing batch 1: 0%| | 0/1 [00:00<?, ?it/s]INFO:lightrag:Inserting 22 vectors to chunks\n",
189
  "\n",
190
- "Generating embeddings: 0%| | 0/1 [00:00<?, ?batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
191
  "\n",
192
- "Generating embeddings: 100%|██████████| 1/1 [00:03<00:00, 3.85s/batch]\u001b[A\n",
193
  "\n",
194
- "Extracting entities from chunks: 0%| | 0/22 [00:00<?, ?chunk/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
196
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
197
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
@@ -206,7 +248,7 @@
206
  "name": "stdout",
207
  "output_type": "stream",
208
  "text": [
209
- " Processed 1 chunks, 7 entities(duplicated), 6 relations(duplicated)\r"
210
  ]
211
  },
212
  {
@@ -214,7 +256,22 @@
214
  "output_type": "stream",
215
  "text": [
216
  "\n",
217
- "Extracting entities from chunks: 5%|▍ | 1/22 [00:23<08:21, 23.90s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
219
  ]
220
  },
@@ -222,7 +279,7 @@
222
  "name": "stdout",
223
  "output_type": "stream",
224
  "text": [
225
- " Processed 2 chunks, 12 entities(duplicated), 15 relations(duplicated)\r"
226
  ]
227
  },
228
  {
@@ -230,7 +287,7 @@
230
  "output_type": "stream",
231
  "text": [
232
  "\n",
233
- "Extracting entities from chunks: 9%|▉ | 2/22 [00:26<03:50, 11.51s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
234
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
235
  ]
236
  },
@@ -238,7 +295,7 @@
238
  "name": "stdout",
239
  "output_type": "stream",
240
  "text": [
241
- " Processed 3 chunks, 20 entities(duplicated), 22 relations(duplicated)\r"
242
  ]
243
  },
244
  {
@@ -246,7 +303,7 @@
246
  "output_type": "stream",
247
  "text": [
248
  "\n",
249
- "Extracting entities from chunks: 14%|█▎ | 3/22 [00:34<03:08, 9.93s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
250
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
251
  ]
252
  },
@@ -254,7 +311,7 @@
254
  "name": "stdout",
255
  "output_type": "stream",
256
  "text": [
257
- " Processed 4 chunks, 30 entities(duplicated), 30 relations(duplicated)\r"
258
  ]
259
  },
260
  {
@@ -262,14 +319,14 @@
262
  "output_type": "stream",
263
  "text": [
264
  "\n",
265
- "Extracting entities from chunks: 18%|█▊ | 4/22 [00:37<02:09, 7.21s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
266
  ]
267
  },
268
  {
269
  "name": "stdout",
270
  "output_type": "stream",
271
  "text": [
272
- " Processed 5 chunks, 39 entities(duplicated), 39 relations(duplicated)\r"
273
  ]
274
  },
275
  {
@@ -277,14 +334,29 @@
277
  "output_type": "stream",
278
  "text": [
279
  "\n",
280
- "Extracting entities from chunks: 23%|██▎ | 5/22 [00:38<01:19, 4.70s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
281
  ]
282
  },
283
  {
284
  "name": "stdout",
285
  "output_type": "stream",
286
  "text": [
287
- " Processed 6 chunks, 39 entities(duplicated), 39 relations(duplicated)\r"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  ]
289
  },
290
  {
@@ -292,14 +364,16 @@
292
  "output_type": "stream",
293
  "text": [
294
  "\n",
295
- "Extracting entities from chunks: 27%|██▋ | 6/22 [00:38<00:53, 3.32s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
 
 
296
  ]
297
  },
298
  {
299
  "name": "stdout",
300
  "output_type": "stream",
301
  "text": [
302
- " Processed 7 chunks, 47 entities(duplicated), 50 relations(duplicated)\r"
303
  ]
304
  },
305
  {
@@ -307,7 +381,7 @@
307
  "output_type": "stream",
308
  "text": [
309
  "\n",
310
- "Extracting entities from chunks: 32%|███▏ | 7/22 [00:39<00:39, 2.65s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
311
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
312
  ]
313
  },
@@ -315,7 +389,7 @@
315
  "name": "stdout",
316
  "output_type": "stream",
317
  "text": [
318
- " Processed 8 chunks, 56 entities(duplicated), 58 relations(duplicated)\r"
319
  ]
320
  },
321
  {
@@ -323,7 +397,7 @@
323
  "output_type": "stream",
324
  "text": [
325
  "\n",
326
- "Extracting entities from chunks: 36%|███▋ | 8/22 [00:40<00:29, 2.13s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
327
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
328
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
329
  ]
@@ -332,7 +406,38 @@
332
  "name": "stdout",
333
  "output_type": "stream",
334
  "text": [
335
- " Processed 9 chunks, 63 entities(duplicated), 69 relations(duplicated)\r"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
  ]
337
  },
338
  {
@@ -340,14 +445,14 @@
340
  "output_type": "stream",
341
  "text": [
342
  "\n",
343
- "Extracting entities from chunks: 41%|████ | 9/22 [00:47<00:43, 3.38s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
344
  ]
345
  },
346
  {
347
  "name": "stdout",
348
  "output_type": "stream",
349
  "text": [
350
- " Processed 10 chunks, 81 entities(duplicated), 81 relations(duplicated)\r"
351
  ]
352
  },
353
  {
@@ -355,7 +460,39 @@
355
  "output_type": "stream",
356
  "text": [
357
  "\n",
358
- "Extracting entities from chunks: 45%|████▌ | 10/22 [00:48<00:32, 2.73s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
360
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
361
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
@@ -366,7 +503,7 @@
366
  "name": "stdout",
367
  "output_type": "stream",
368
  "text": [
369
- " Processed 11 chunks, 92 entities(duplicated), 89 relations(duplicated)\r"
370
  ]
371
  },
372
  {
@@ -374,14 +511,15 @@
374
  "output_type": "stream",
375
  "text": [
376
  "\n",
377
- "Extracting entities from chunks: 50%|█████ | 11/22 [01:01<01:05, 5.99s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
 
378
  ]
379
  },
380
  {
381
  "name": "stdout",
382
  "output_type": "stream",
383
  "text": [
384
- " Processed 12 chunks, 107 entities(duplicated), 107 relations(duplicated)\r"
385
  ]
386
  },
387
  {
@@ -389,7 +527,7 @@
389
  "output_type": "stream",
390
  "text": [
391
  "\n",
392
- "Extracting entities from chunks: 55%|█████▍ | 12/22 [01:10<01:09, 6.94s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
393
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
394
  ]
395
  },
@@ -397,7 +535,7 @@
397
  "name": "stdout",
398
  "output_type": "stream",
399
  "text": [
400
- " Processed 13 chunks, 127 entities(duplicated), 126 relations(duplicated)\r"
401
  ]
402
  },
403
  {
@@ -405,14 +543,18 @@
405
  "output_type": "stream",
406
  "text": [
407
  "\n",
408
- "Extracting entities from chunks: 59%|█████▉ | 13/22 [01:16<00:59, 6.59s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
 
 
 
 
409
  ]
410
  },
411
  {
412
  "name": "stdout",
413
  "output_type": "stream",
414
  "text": [
415
- " Processed 14 chunks, 151 entities(duplicated), 137 relations(duplicated)\r"
416
  ]
417
  },
418
  {
@@ -420,14 +562,14 @@
420
  "output_type": "stream",
421
  "text": [
422
  "\n",
423
- "Extracting entities from chunks: 64%|██████▎ | 14/22 [01:16<00:37, 4.68s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
424
  ]
425
  },
426
  {
427
  "name": "stdout",
428
  "output_type": "stream",
429
  "text": [
430
- " Processed 15 chunks, 161 entities(duplicated), 144 relations(duplicated)\r"
431
  ]
432
  },
433
  {
@@ -435,14 +577,14 @@
435
  "output_type": "stream",
436
  "text": [
437
  "\n",
438
- "Extracting entities from chunks: 68%|██████▊ | 15/22 [01:17<00:23, 3.31s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
439
  ]
440
  },
441
  {
442
  "name": "stdout",
443
  "output_type": "stream",
444
  "text": [
445
- " Processed 16 chunks, 176 entities(duplicated), 154 relations(duplicated)\r"
446
  ]
447
  },
448
  {
@@ -450,14 +592,14 @@
450
  "output_type": "stream",
451
  "text": [
452
  "\n",
453
- "Extracting entities from chunks: 73%|███████▎ | 16/22 [01:19<00:18, 3.04s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
454
  ]
455
  },
456
  {
457
  "name": "stdout",
458
  "output_type": "stream",
459
  "text": [
460
- " Processed 17 chunks, 189 entities(duplicated), 162 relations(duplicated)\r"
461
  ]
462
  },
463
  {
@@ -465,7 +607,7 @@
465
  "output_type": "stream",
466
  "text": [
467
  "\n",
468
- "Extracting entities from chunks: 77%|███████▋ | 17/22 [01:21<00:13, 2.80s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
469
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
470
  ]
471
  },
@@ -473,7 +615,7 @@
473
  "name": "stdout",
474
  "output_type": "stream",
475
  "text": [
476
- " Processed 18 chunks, 207 entities(duplicated), 186 relations(duplicated)\r"
477
  ]
478
  },
479
  {
@@ -481,14 +623,14 @@
481
  "output_type": "stream",
482
  "text": [
483
  "\n",
484
- "Extracting entities from chunks: 82%|████████▏ | 18/22 [01:38<00:28, 7.06s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
485
  ]
486
  },
487
  {
488
  "name": "stdout",
489
  "output_type": "stream",
490
  "text": [
491
- " Processed 19 chunks, 222 entities(duplicated), 200 relations(duplicated)\r"
492
  ]
493
  },
494
  {
@@ -496,15 +638,29 @@
496
  "output_type": "stream",
497
  "text": [
498
  "\n",
499
- "Extracting entities from chunks: 86%|████████▋ | 19/22 [01:44<00:19, 6.61s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
500
- "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
  ]
502
  },
503
  {
504
  "name": "stdout",
505
  "output_type": "stream",
506
  "text": [
507
- " Processed 20 chunks, 310 entities(duplicated), 219 relations(duplicated)\r"
508
  ]
509
  },
510
  {
@@ -512,7 +668,7 @@
512
  "output_type": "stream",
513
  "text": [
514
  "\n",
515
- "Extracting entities from chunks: 91%|█████████ | 20/22 [02:12<00:26, 13.19s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
516
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
517
  ]
518
  },
@@ -520,7 +676,52 @@
520
  "name": "stdout",
521
  "output_type": "stream",
522
  "text": [
523
- " Processed 21 chunks, 345 entities(duplicated), 263 relations(duplicated)\r"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
524
  ]
525
  },
526
  {
@@ -528,14 +729,14 @@
528
  "output_type": "stream",
529
  "text": [
530
  "\n",
531
- "Extracting entities from chunks: 95%|█████████▌| 21/22 [02:32<00:15, 15.15s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
532
  ]
533
  },
534
  {
535
  "name": "stdout",
536
  "output_type": "stream",
537
  "text": [
538
- " Processed 22 chunks, 417 entities(duplicated), 285 relations(duplicated)\r"
539
  ]
540
  },
541
  {
@@ -543,20 +744,33 @@
543
  "output_type": "stream",
544
  "text": [
545
  "\n",
546
- "Extracting entities from chunks: 100%|██████████| 22/22 [03:21<00:00, 9.18s/chunk]\u001b[A\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
547
  "INFO:lightrag:Inserting entities into storage...\n",
548
  "\n",
549
- "Inserting entities: 100%|██████████| 327/327 [00:00<00:00, 13446.31entity/s]\n",
550
  "INFO:lightrag:Inserting relationships into storage...\n",
551
  "\n",
552
- "Inserting relationships: 100%|██████████| 272/272 [00:00<00:00, 16740.29relationship/s]\n",
553
- "INFO:lightrag:Inserting 327 vectors to entities\n",
554
  "\n",
555
  "Generating embeddings: 0%| | 0/11 [00:00<?, ?batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
556
  "\n",
557
- "Generating embeddings: 9%|▉ | 1/11 [00:00<00:09, 1.02batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
558
- "\n",
559
- "Generating embeddings: 18%|█▊ | 2/11 [00:02<00:09, 1.07s/batch]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
560
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
561
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
562
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
@@ -564,40 +778,52 @@
564
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
565
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
566
  "\n",
567
- "Generating embeddings: 27%|██▋ | 3/11 [00:02<00:06, 1.33batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
 
568
  "\n",
569
- "Generating embeddings: 36%|███▋ | 4/11 [00:02<00:04, 1.67batch/s]\u001b[A\n",
570
- "Generating embeddings: 45%|████▌ | 5/11 [00:03<00:03, 1.93batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
571
  "\n",
572
- "Generating embeddings: 55%|█████▍ | 6/11 [00:03<00:02, 2.15batch/s]\u001b[A\n",
573
- "Generating embeddings: 64%|██████▎ | 7/11 [00:03<00:01, 2.33batch/s]\u001b[A\n",
574
- "Generating embeddings: 73%|███████▎ | 8/11 [00:04<00:01, 2.46batch/s]\u001b[A\n",
575
- "Generating embeddings: 82%|████████▏ | 9/11 [00:04<00:00, 2.55batch/s]\u001b[A\n",
576
- "Generating embeddings: 91%|█████████ | 10/11 [00:05<00:00, 2.64batch/s]\u001b[A\n",
577
- "Generating embeddings: 100%|██████████| 11/11 [00:05<00:00, 2.04batch/s]\u001b[A\n",
578
- "INFO:lightrag:Inserting 272 vectors to relationships\n",
 
579
  "\n",
580
- "Generating embeddings: 0%| | 0/9 [00:00<?, ?batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
581
  "\n",
582
- "Generating embeddings: 11%|█ | 1/9 [00:01<00:11, 1.39s/batch]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
 
 
 
 
 
583
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
584
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
585
  "\n",
586
- "Generating embeddings: 22%|██▏ | 2/9 [00:02<00:07, 1.01s/batch]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
587
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
588
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
589
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
590
  "\n",
591
- "Generating embeddings: 33%|███▎ | 3/9 [00:02<00:04, 1.40batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
592
  "\n",
593
- "Generating embeddings: 44%|████▍ | 4/9 [00:02<00:02, 1.74batch/s]\u001b[A\n",
594
- "Generating embeddings: 56%|█████▌ | 5/9 [00:03<00:01, 2.01batch/s]\u001b[A\n",
595
- "Generating embeddings: 67%|██████▋ | 6/9 [00:03<00:01, 2.23batch/s]\u001b[A\n",
596
- "Generating embeddings: 78%|███████▊ | 7/9 [00:03<00:00, 2.39batch/s]\u001b[A\n",
597
- "Generating embeddings: 89%|████████▉ | 8/9 [00:04<00:00, 2.52batch/s]\u001b[A\n",
598
- "Generating embeddings: 100%|██████████| 9/9 [00:04<00:00, 1.93batch/s]\u001b[A\n",
599
- "INFO:lightrag:Writing graph with 331 nodes, 272 edges\n",
600
- "Processing batch 1: 100%|██████████| 1/1 [03:36<00:00, 216.27s/it]\n"
 
 
 
 
 
601
  ]
602
  }
603
  ],
@@ -608,40 +834,107 @@
608
  },
609
  {
610
  "cell_type": "code",
611
- "execution_count": 8,
612
  "id": "c4f9ae517151a01d",
613
  "metadata": {
614
  "ExecuteTime": {
615
- "end_time": "2025-01-07T05:42:50.044809Z",
616
- "start_time": "2025-01-07T05:42:50.041256Z"
617
  }
618
  },
619
  "outputs": [],
620
  "source": [
621
- "prompt1 = \"\"\"\n",
622
- "你是一名经验丰富的论文分析科学家,你的任务是对一篇英文学术研究论文进行关键信息提取并深入分析。\n",
623
- "\n",
624
  "请按照以下步骤进行分析:\n",
625
- "1. 对于论文的分析对象相关问题:\n",
626
- " - 仔细查找论文中的研究队列相关信息,确定分析对象来自哪些研究队列。\n",
627
- " - 查看如果来自多个队列,文中是单独分析还是联合分析。\n",
628
- " - 找出这些队列的名称。\n",
629
- " - 确定这些队列开展的国家有哪些(注意:“澳门”记为“中国澳门”,“香港”记为“中国香港”,“台湾”记为“中国台湾”,其余采用国家回答)。\n",
630
- " - 明确队列研究对象的性别分布(“男性”、“女性”或“全体”)。\n",
631
- " - 查找队列收集结束时,研究对象年龄分布(平均值/中位值、标准差或范围),若信息缺失则根据年龄推理规则进行推理:当论文只提供了队列开展时对象的年龄,应根据队列结束时间推算最终年龄范围。例如:1989建立队列时年龄为25 - 42岁,随访至2011年结束,则推算年龄范围为47 - 64岁。\n",
632
- " - 确定队列研究时间线,即哪一年开始收集信息/建立队列,哪一年结束,若信息缺失则根据队列时间线推理规则进行推理:如论文只提供了建立队列时间为1995,进行了10年的随访,则推算队列结束时间为2005年。\n",
633
- " - 找出队列结束时实际参与研究人数是多少。\n",
634
  "首先在<分析>标签中,针对每个问题详细分析你的思考过程。然后在<回答>标签中给出所有问题的最终答案。\"\"\""
635
  ]
636
  },
637
  {
638
  "cell_type": "code",
639
- "execution_count": 9,
640
  "id": "7a6491385b050095",
641
  "metadata": {
642
  "ExecuteTime": {
643
- "end_time": "2025-01-07T05:43:24.751628Z",
644
- "start_time": "2025-01-07T05:42:50.865679Z"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
645
  }
646
  },
647
  "outputs": [
@@ -649,28 +942,292 @@
649
  "name": "stderr",
650
  "output_type": "stream",
651
  "text": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
653
- "INFO:lightrag:kw_prompt result:\n"
 
 
 
 
654
  ]
655
  },
656
  {
657
  "name": "stdout",
658
  "output_type": "stream",
659
  "text": [
660
- "{\n",
661
- " \"high_level_keywords\": [\"英文学术研究论文分析\", \"关键信息提取\", \"深入分析\"],\n",
662
- " \"low_level_keywords\": [\"研究队列\", \"队列名称\", \"队列开展国家\", \"性别分布\", \"年龄分布\", \"队列研究时间线\", \"实际参与研究人数\"]\n",
663
- "}\n"
664
  ]
665
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
  {
667
  "name": "stderr",
668
  "output_type": "stream",
669
  "text": [
670
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
671
- "INFO:lightrag:Local query uses 60 entites, 38 relations, 6 text units\n",
 
 
672
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
673
- "INFO:lightrag:Global query uses 72 entites, 60 relations, 4 text units\n",
674
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
675
  ]
676
  },
@@ -679,38 +1236,37 @@
679
  "output_type": "stream",
680
  "text": [
681
  "<分析>\n",
682
- "- **分析对象来自哪些研究队列及是单独分析还是联合分析**:\n",
683
- " 通过查找论文内容,发现文中提到“This is a combined analysis of data from 2 randomized, double-blind, placebo-controlled clinical trials (Norwegian Vitamin [NORVIT] trial15 and Western Norway B Vitamin Intervention Trial [WENBIT]16)”,明确是对两个队列的数据进行联合分析,队列名称分别为“Norwegian Vitamin (NORVIT) trial”和“Western Norway B Vitamin Intervention Trial (WENBIT)”。\n",
684
- "- **队列开展的国家**:\n",
685
- " 文中多次提及研究在挪威进行,如“combined analyses and extended follow-up of 2 vitamin B intervention trials among patients with ischemic heart disease in Norway”,所以确定研究开展的国家是挪威。\n",
686
- "- **队列研究对象的性别分布**:\n",
687
- " 从“Mean (SD) age was 62.3 (11.0) years and 23.5% of participants were women”可知,研究对象包含男性和女性,即全体。\n",
688
- "- **队列收集结束时研究对象年龄分布**:\n",
689
- " 已知“Mean (SD) age was 62.3 (11.0) years”是基线时年龄信息,“Median (interquartile range) duration of extended follow-up through December 31, 2007, was 78 (61 - 90) months”,由于随访的中位时间是78个月(约6.5年),所以可推算队列收集结束时研究对象年龄均值约为62.3 + 6.5 = 68.8岁(标准差仍为11.0年)。\n",
690
- "- **队列研究时间线**:\n",
691
- " 根据“2 randomized, double-blind, placebo-controlled clinical trials (Norwegian Vitamin [NORVIT] trial15 and Western Norway B Vitamin Intervention Trial [WENBIT]16) conducted between 1998 and 2005, and an observational posttrial follow-up through December 31, 2007”可知,队列开始收集信息时间为1998年,结束时间为2007年12月31日。\n",
692
- "- **队列结束时实际参与研究人数**:\n",
693
- " 由“A total of 6837 individuals were included in the combined analyses, of whom 6261 (91.6%) participated in posttrial follow-up”可知,队列结束时实际参与研究人数为6261人。\n",
694
  "</分析>\n",
695
  "\n",
696
  "<回答>\n",
697
- "- 分析对象来自“Norwegian Vitamin (NORVIT) trial”和“Western Norway B Vitamin Intervention Trial (WENBIT)”两个研究队列,文中是对这两个队列的数据进行联合分析。\n",
698
- "- 队列开展的国家是挪威。\n",
699
- "- 队列研究对象的性别分布为全体。\n",
700
- "- 队列收集结束时,研究对象年龄分布均值约为68.8岁,标准差为11.0年。\n",
701
- "- 队列研究时间线为1998年开始收集信息/建立队列,2007年12月31日结束。\n",
702
- "- 队列结束时实际参与研究人数是6261人。\n"
 
 
 
 
703
  ]
704
  }
705
  ],
706
  "source": [
707
- "print(rag.query(prompt1, param=QueryParam(mode=\"hybrid\")))"
 
708
  ]
709
  },
710
  {
711
  "cell_type": "code",
712
  "execution_count": null,
713
- "id": "fef9d06983da47af",
714
  "metadata": {},
715
  "outputs": [],
716
  "source": []
 
6
  "id": "4b5690db12e34685",
7
  "metadata": {
8
  "ExecuteTime": {
9
+ "end_time": "2025-01-09T03:40:58.307102Z",
10
+ "start_time": "2025-01-09T03:40:51.935233Z"
11
  }
12
  },
13
  "outputs": [],
 
21
  "import nest_asyncio"
22
  ]
23
  },
24
+ {
25
+ "cell_type": "markdown",
26
+ "id": "dd17956ec322b361",
27
+ "metadata": {},
28
+ "source": "#### split by character"
29
+ },
30
  {
31
  "cell_type": "code",
32
+ "execution_count": 3,
33
  "id": "8c8ee7c061bf9159",
34
  "metadata": {
35
  "ExecuteTime": {
36
+ "end_time": "2025-01-09T03:41:13.961167Z",
37
+ "start_time": "2025-01-09T03:41:13.958357Z"
38
  }
39
  },
40
  "outputs": [],
41
  "source": [
42
  "nest_asyncio.apply()\n",
43
+ "WORKING_DIR = \"../../llm_rag/paper_db/R000088_test1\"\n",
44
  "logging.basicConfig(format=\"%(levelname)s:%(message)s\", level=logging.INFO)\n",
45
  "if not os.path.exists(WORKING_DIR):\n",
46
  " os.mkdir(WORKING_DIR)\n",
47
+ "API = os.environ.get(\"DOUBAO_API_KEY\")"
48
  ]
49
  },
50
  {
51
  "cell_type": "code",
52
+ "execution_count": 4,
53
  "id": "a5009d16e0851dca",
54
  "metadata": {
55
  "ExecuteTime": {
56
+ "end_time": "2025-01-09T03:41:16.862036Z",
57
+ "start_time": "2025-01-09T03:41:16.859306Z"
58
  }
59
  },
60
  "outputs": [],
 
67
  " prompt,\n",
68
  " system_prompt=system_prompt,\n",
69
  " history_messages=history_messages,\n",
70
+ " api_key=API,\n",
71
  " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n",
72
  " **kwargs,\n",
73
  " )\n",
 
77
  " return await openai_embedding(\n",
78
  " texts,\n",
79
  " model=\"ep-20241231173413-pgjmk\",\n",
80
+ " api_key=API,\n",
81
  " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n",
82
  " )"
83
  ]
84
  },
85
  {
86
  "cell_type": "code",
87
+ "execution_count": 5,
88
  "id": "397fcad24ce4d0ed",
89
  "metadata": {
90
  "ExecuteTime": {
91
+ "end_time": "2025-01-09T03:41:24.950307Z",
92
+ "start_time": "2025-01-09T03:41:24.940353Z"
93
  }
94
  },
95
  "outputs": [
 
97
  "name": "stderr",
98
  "output_type": "stream",
99
  "text": [
100
+ "INFO:lightrag:Logger initialized for working directory: ../../llm_rag/paper_db/R000088_test1\n",
101
  "INFO:lightrag:Load KV llm_response_cache with 0 data\n",
102
  "INFO:lightrag:Load KV full_docs with 0 data\n",
103
  "INFO:lightrag:Load KV text_chunks with 0 data\n",
104
+ "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test1/vdb_entities.json'} 0 data\n",
105
+ "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test1/vdb_relationships.json'} 0 data\n",
106
+ "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test1/vdb_chunks.json'} 0 data\n",
107
  "INFO:lightrag:Loaded document status storage with 0 records\n"
108
  ]
109
  }
 
115
  " embedding_func=EmbeddingFunc(\n",
116
  " embedding_dim=4096, max_token_size=8192, func=embedding_func\n",
117
  " ),\n",
118
+ " chunk_token_size=512,\n",
119
  ")"
120
  ]
121
  },
122
  {
123
  "cell_type": "code",
124
+ "execution_count": 6,
125
  "id": "1dc3603677f7484d",
126
  "metadata": {
127
  "ExecuteTime": {
128
+ "end_time": "2025-01-09T03:41:37.947456Z",
129
+ "start_time": "2025-01-09T03:41:37.941901Z"
130
  }
131
  },
132
  "outputs": [],
133
  "source": [
134
  "with open(\n",
135
+ " \"../../llm_rag/example/R000088/auto/R000088_full_txt.md\", \"r\", encoding=\"utf-8\"\n",
136
  ") as f:\n",
137
  " content = f.read()\n",
138
  "\n",
 
141
  " return await openai_embedding(\n",
142
  " texts,\n",
143
  " model=\"ep-20241231173413-pgjmk\",\n",
144
+ " api_key=API,\n",
145
  " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n",
146
  " )\n",
147
  "\n",
 
155
  },
156
  {
157
  "cell_type": "code",
158
+ "execution_count": 7,
159
  "id": "6844202606acfbe5",
160
  "metadata": {
161
  "ExecuteTime": {
162
+ "end_time": "2025-01-09T03:41:39.608541Z",
163
+ "start_time": "2025-01-09T03:41:39.165057Z"
164
  }
165
  },
166
  "outputs": [
 
178
  },
179
  {
180
  "cell_type": "code",
181
+ "execution_count": 8,
182
  "id": "d6273839d9681403",
183
  "metadata": {
184
  "ExecuteTime": {
185
+ "end_time": "2025-01-09T03:44:34.295345Z",
186
+ "start_time": "2025-01-09T03:41:48.324171Z"
187
  }
188
  },
189
  "outputs": [
 
192
  "output_type": "stream",
193
  "text": [
194
  "INFO:lightrag:Processing 1 new unique documents\n",
195
+ "Processing batch 1: 0%| | 0/1 [00:00<?, ?it/s]INFO:lightrag:Inserting 35 vectors to chunks\n",
196
  "\n",
197
+ "Generating embeddings: 0%| | 0/2 [00:00<?, ?batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
198
  "\n",
199
+ "Generating embeddings: 50%|█████ | 1/2 [00:00<00:00, 1.36batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
200
  "\n",
201
+ "Generating embeddings: 100%|██████████| 2/2 [00:04<00:00, 2.25s/batch]\u001b[A\n",
202
+ "\n",
203
+ "Extracting entities from chunks: 0%| | 0/35 [00:00<?, ?chunk/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
204
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
205
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
206
+ ]
207
+ },
208
+ {
209
+ "name": "stdout",
210
+ "output_type": "stream",
211
+ "text": [
212
+ "⠙ Processed 1 chunks, 1 entities(duplicated), 0 relations(duplicated)\r"
213
+ ]
214
+ },
215
+ {
216
+ "name": "stderr",
217
+ "output_type": "stream",
218
+ "text": [
219
+ "\n",
220
+ "Extracting entities from chunks: 3%|▎ | 1/35 [00:04<02:47, 4.93s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
221
+ ]
222
+ },
223
+ {
224
+ "name": "stdout",
225
+ "output_type": "stream",
226
+ "text": [
227
+ "⠹ Processed 2 chunks, 2 entities(duplicated), 0 relations(duplicated)\r"
228
+ ]
229
+ },
230
+ {
231
+ "name": "stderr",
232
+ "output_type": "stream",
233
+ "text": [
234
+ "\n",
235
+ "Extracting entities from chunks: 6%|▌ | 2/35 [00:05<01:18, 2.37s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
236
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
237
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
238
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
239
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
 
248
  "name": "stdout",
249
  "output_type": "stream",
250
  "text": [
251
+ " Processed 3 chunks, 9 entities(duplicated), 5 relations(duplicated)\r"
252
  ]
253
  },
254
  {
 
256
  "output_type": "stream",
257
  "text": [
258
  "\n",
259
+ "Extracting entities from chunks: 9%|▊ | 3/35 [00:26<05:43, 10.73s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
260
+ ]
261
+ },
262
+ {
263
+ "name": "stdout",
264
+ "output_type": "stream",
265
+ "text": [
266
+ "⠼ Processed 4 chunks, 16 entities(duplicated), 11 relations(duplicated)\r"
267
+ ]
268
+ },
269
+ {
270
+ "name": "stderr",
271
+ "output_type": "stream",
272
+ "text": [
273
+ "\n",
274
+ "Extracting entities from chunks: 11%|█▏ | 4/35 [00:26<03:24, 6.60s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
275
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
276
  ]
277
  },
 
279
  "name": "stdout",
280
  "output_type": "stream",
281
  "text": [
282
+ " Processed 5 chunks, 24 entities(duplicated), 18 relations(duplicated)\r"
283
  ]
284
  },
285
  {
 
287
  "output_type": "stream",
288
  "text": [
289
  "\n",
290
+ "Extracting entities from chunks: 14%|█▍ | 5/35 [00:33<03:24, 6.82s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
291
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
292
  ]
293
  },
 
295
  "name": "stdout",
296
  "output_type": "stream",
297
  "text": [
298
+ " Processed 6 chunks, 35 entities(duplicated), 28 relations(duplicated)\r"
299
  ]
300
  },
301
  {
 
303
  "output_type": "stream",
304
  "text": [
305
  "\n",
306
+ "Extracting entities from chunks: 17%|█▋ | 6/35 [00:42<03:38, 7.53s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
307
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
308
  ]
309
  },
 
311
  "name": "stdout",
312
  "output_type": "stream",
313
  "text": [
314
+ " Processed 7 chunks, 47 entities(duplicated), 36 relations(duplicated)\r"
315
  ]
316
  },
317
  {
 
319
  "output_type": "stream",
320
  "text": [
321
  "\n",
322
+ "Extracting entities from chunks: 20%|██ | 7/35 [00:43<02:28, 5.31s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
323
  ]
324
  },
325
  {
326
  "name": "stdout",
327
  "output_type": "stream",
328
  "text": [
329
+ " Processed 8 chunks, 61 entities(duplicated), 49 relations(duplicated)\r"
330
  ]
331
  },
332
  {
 
334
  "output_type": "stream",
335
  "text": [
336
  "\n",
337
+ "Extracting entities from chunks: 23%|██▎ | 8/35 [00:45<01:52, 4.16s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
338
  ]
339
  },
340
  {
341
  "name": "stdout",
342
  "output_type": "stream",
343
  "text": [
344
+ " Processed 9 chunks, 81 entities(duplicated), 49 relations(duplicated)\r"
345
+ ]
346
+ },
347
+ {
348
+ "name": "stderr",
349
+ "output_type": "stream",
350
+ "text": [
351
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
352
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
353
+ ]
354
+ },
355
+ {
356
+ "name": "stdout",
357
+ "output_type": "stream",
358
+ "text": [
359
+ "⠋ Processed 10 chunks, 90 entities(duplicated), 62 relations(duplicated)\r"
360
  ]
361
  },
362
  {
 
364
  "output_type": "stream",
365
  "text": [
366
  "\n",
367
+ "Extracting entities from chunks: 29%|██▊ | 10/35 [00:46<01:06, 2.64s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
368
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
369
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
370
  ]
371
  },
372
  {
373
  "name": "stdout",
374
  "output_type": "stream",
375
  "text": [
376
+ " Processed 11 chunks, 101 entities(duplicated), 80 relations(duplicated)\r"
377
  ]
378
  },
379
  {
 
381
  "output_type": "stream",
382
  "text": [
383
  "\n",
384
+ "Extracting entities from chunks: 31%|███▏ | 11/35 [00:52<01:19, 3.31s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
385
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
386
  ]
387
  },
 
389
  "name": "stdout",
390
  "output_type": "stream",
391
  "text": [
392
+ " Processed 12 chunks, 108 entities(duplicated), 85 relations(duplicated)\r"
393
  ]
394
  },
395
  {
 
397
  "output_type": "stream",
398
  "text": [
399
  "\n",
400
+ "Extracting entities from chunks: 34%|███▍ | 12/35 [00:54<01:11, 3.12s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
401
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
402
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
403
  ]
 
406
  "name": "stdout",
407
  "output_type": "stream",
408
  "text": [
409
+ " Processed 13 chunks, 120 entities(duplicated), 100 relations(duplicated)\r"
410
+ ]
411
+ },
412
+ {
413
+ "name": "stderr",
414
+ "output_type": "stream",
415
+ "text": [
416
+ "\n",
417
+ "Extracting entities from chunks: 37%|███▋ | 13/35 [00:59<01:18, 3.55s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
418
+ ]
419
+ },
420
+ {
421
+ "name": "stdout",
422
+ "output_type": "stream",
423
+ "text": [
424
+ "⠼ Processed 14 chunks, 131 entities(duplicated), 110 relations(duplicated)\r"
425
+ ]
426
+ },
427
+ {
428
+ "name": "stderr",
429
+ "output_type": "stream",
430
+ "text": [
431
+ "\n",
432
+ "Extracting entities from chunks: 40%|████ | 14/35 [01:00<00:59, 2.82s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
433
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
434
+ ]
435
+ },
436
+ {
437
+ "name": "stdout",
438
+ "output_type": "stream",
439
+ "text": [
440
+ "⠴ Processed 15 chunks, 143 entities(duplicated), 110 relations(duplicated)\r"
441
  ]
442
  },
443
  {
 
445
  "output_type": "stream",
446
  "text": [
447
  "\n",
448
+ "Extracting entities from chunks: 43%|████▎ | 15/35 [01:02<00:52, 2.64s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
449
  ]
450
  },
451
  {
452
  "name": "stdout",
453
  "output_type": "stream",
454
  "text": [
455
+ " Processed 16 chunks, 162 entities(duplicated), 124 relations(duplicated)\r"
456
  ]
457
  },
458
  {
 
460
  "output_type": "stream",
461
  "text": [
462
  "\n",
463
+ "Extracting entities from chunks: 46%|████▌ | 16/35 [01:05<00:53, 2.80s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
464
+ ]
465
+ },
466
+ {
467
+ "name": "stdout",
468
+ "output_type": "stream",
469
+ "text": [
470
+ "⠧ Processed 17 chunks, 174 entities(duplicated), 132 relations(duplicated)\r"
471
+ ]
472
+ },
473
+ {
474
+ "name": "stderr",
475
+ "output_type": "stream",
476
+ "text": [
477
+ "\n",
478
+ "Extracting entities from chunks: 49%|████▊ | 17/35 [01:06<00:39, 2.19s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
479
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
480
+ ]
481
+ },
482
+ {
483
+ "name": "stdout",
484
+ "output_type": "stream",
485
+ "text": [
486
+ "⠇ Processed 18 chunks, 185 entities(duplicated), 137 relations(duplicated)\r"
487
+ ]
488
+ },
489
+ {
490
+ "name": "stderr",
491
+ "output_type": "stream",
492
+ "text": [
493
+ "\n",
494
+ "Extracting entities from chunks: 51%|█████▏ | 18/35 [01:12<00:53, 3.15s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
495
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
496
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
497
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
498
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
 
503
  "name": "stdout",
504
  "output_type": "stream",
505
  "text": [
506
+ " Processed 19 chunks, 193 entities(duplicated), 149 relations(duplicated)\r"
507
  ]
508
  },
509
  {
 
511
  "output_type": "stream",
512
  "text": [
513
  "\n",
514
+ "Extracting entities from chunks: 54%|█████▍ | 19/35 [01:18<01:06, 4.14s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
515
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
516
  ]
517
  },
518
  {
519
  "name": "stdout",
520
  "output_type": "stream",
521
  "text": [
522
+ " Processed 20 chunks, 205 entities(duplicated), 158 relations(duplicated)\r"
523
  ]
524
  },
525
  {
 
527
  "output_type": "stream",
528
  "text": [
529
  "\n",
530
+ "Extracting entities from chunks: 57%|█████▋ | 20/35 [01:19<00:50, 3.35s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
531
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
532
  ]
533
  },
 
535
  "name": "stdout",
536
  "output_type": "stream",
537
  "text": [
538
+ " Processed 21 chunks, 220 entities(duplicated), 187 relations(duplicated)\r"
539
  ]
540
  },
541
  {
 
543
  "output_type": "stream",
544
  "text": [
545
  "\n",
546
+ "Extracting entities from chunks: 60%|██████ | 21/35 [01:27<01:02, 4.47s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
547
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
548
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
549
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
550
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
551
  ]
552
  },
553
  {
554
  "name": "stdout",
555
  "output_type": "stream",
556
  "text": [
557
+ " Processed 22 chunks, 247 entities(duplicated), 216 relations(duplicated)\r"
558
  ]
559
  },
560
  {
 
562
  "output_type": "stream",
563
  "text": [
564
  "\n",
565
+ "Extracting entities from chunks: 63%|██████▎ | 22/35 [01:30<00:54, 4.16s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
566
  ]
567
  },
568
  {
569
  "name": "stdout",
570
  "output_type": "stream",
571
  "text": [
572
+ " Processed 23 chunks, 260 entities(duplicated), 230 relations(duplicated)\r"
573
  ]
574
  },
575
  {
 
577
  "output_type": "stream",
578
  "text": [
579
  "\n",
580
+ "Extracting entities from chunks: 66%|██████▌ | 23/35 [01:34<00:48, 4.05s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
581
  ]
582
  },
583
  {
584
  "name": "stdout",
585
  "output_type": "stream",
586
  "text": [
587
+ " Processed 24 chunks, 291 entities(duplicated), 253 relations(duplicated)\r"
588
  ]
589
  },
590
  {
 
592
  "output_type": "stream",
593
  "text": [
594
  "\n",
595
+ "Extracting entities from chunks: 69%|██████▊ | 24/35 [01:38<00:44, 4.03s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
596
  ]
597
  },
598
  {
599
  "name": "stdout",
600
  "output_type": "stream",
601
  "text": [
602
+ " Processed 25 chunks, 304 entities(duplicated), 262 relations(duplicated)\r"
603
  ]
604
  },
605
  {
 
607
  "output_type": "stream",
608
  "text": [
609
  "\n",
610
+ "Extracting entities from chunks: 71%|███████▏ | 25/35 [01:41<00:36, 3.67s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
611
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
612
  ]
613
  },
 
615
  "name": "stdout",
616
  "output_type": "stream",
617
  "text": [
618
+ " Processed 26 chunks, 313 entities(duplicated), 271 relations(duplicated)\r"
619
  ]
620
  },
621
  {
 
623
  "output_type": "stream",
624
  "text": [
625
  "\n",
626
+ "Extracting entities from chunks: 74%|███████▍ | 26/35 [01:41<00:24, 2.76s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
627
  ]
628
  },
629
  {
630
  "name": "stdout",
631
  "output_type": "stream",
632
  "text": [
633
+ " Processed 27 chunks, 321 entities(duplicated), 283 relations(duplicated)\r"
634
  ]
635
  },
636
  {
 
638
  "output_type": "stream",
639
  "text": [
640
  "\n",
641
+ "Extracting entities from chunks: 77%|███████▋ | 27/35 [01:47<00:28, 3.52s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
642
+ ]
643
+ },
644
+ {
645
+ "name": "stdout",
646
+ "output_type": "stream",
647
+ "text": [
648
+ "⠇ Processed 28 chunks, 333 entities(duplicated), 290 relations(duplicated)\r"
649
+ ]
650
+ },
651
+ {
652
+ "name": "stderr",
653
+ "output_type": "stream",
654
+ "text": [
655
+ "\n",
656
+ "Extracting entities from chunks: 80%|████████ | 28/35 [01:52<00:28, 4.08s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
657
  ]
658
  },
659
  {
660
  "name": "stdout",
661
  "output_type": "stream",
662
  "text": [
663
+ " Processed 29 chunks, 348 entities(duplicated), 307 relations(duplicated)\r"
664
  ]
665
  },
666
  {
 
668
  "output_type": "stream",
669
  "text": [
670
  "\n",
671
+ "Extracting entities from chunks: 83%|████████▎ | 29/35 [01:59<00:29, 4.88s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
672
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
673
  ]
674
  },
 
676
  "name": "stdout",
677
  "output_type": "stream",
678
  "text": [
679
+ " Processed 30 chunks, 362 entities(duplicated), 329 relations(duplicated)\r"
680
+ ]
681
+ },
682
+ {
683
+ "name": "stderr",
684
+ "output_type": "stream",
685
+ "text": [
686
+ "\n",
687
+ "Extracting entities from chunks: 86%|████████▌ | 30/35 [02:02<00:21, 4.29s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
688
+ ]
689
+ },
690
+ {
691
+ "name": "stdout",
692
+ "output_type": "stream",
693
+ "text": [
694
+ "⠙ Processed 31 chunks, 373 entities(duplicated), 337 relations(duplicated)\r"
695
+ ]
696
+ },
697
+ {
698
+ "name": "stderr",
699
+ "output_type": "stream",
700
+ "text": [
701
+ "\n",
702
+ "Extracting entities from chunks: 89%|████████▊ | 31/35 [02:03<00:13, 3.28s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
703
+ ]
704
+ },
705
+ {
706
+ "name": "stdout",
707
+ "output_type": "stream",
708
+ "text": [
709
+ "⠹ Processed 32 chunks, 390 entities(duplicated), 369 relations(duplicated)\r"
710
+ ]
711
+ },
712
+ {
713
+ "name": "stderr",
714
+ "output_type": "stream",
715
+ "text": [
716
+ "\n",
717
+ "Extracting entities from chunks: 91%|█████████▏| 32/35 [02:03<00:07, 2.55s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
718
+ ]
719
+ },
720
+ {
721
+ "name": "stdout",
722
+ "output_type": "stream",
723
+ "text": [
724
+ "⠸ Processed 33 chunks, 405 entities(duplicated), 378 relations(duplicated)\r"
725
  ]
726
  },
727
  {
 
729
  "output_type": "stream",
730
  "text": [
731
  "\n",
732
+ "Extracting entities from chunks: 94%|█████████▍| 33/35 [02:07<00:05, 2.84s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
733
  ]
734
  },
735
  {
736
  "name": "stdout",
737
  "output_type": "stream",
738
  "text": [
739
+ " Processed 34 chunks, 435 entities(duplicated), 395 relations(duplicated)\r"
740
  ]
741
  },
742
  {
 
744
  "output_type": "stream",
745
  "text": [
746
  "\n",
747
+ "Extracting entities from chunks: 97%|█████████▋| 34/35 [02:10<00:02, 2.94s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
748
+ ]
749
+ },
750
+ {
751
+ "name": "stdout",
752
+ "output_type": "stream",
753
+ "text": [
754
+ "⠴ Processed 35 chunks, 456 entities(duplicated), 440 relations(duplicated)\r"
755
+ ]
756
+ },
757
+ {
758
+ "name": "stderr",
759
+ "output_type": "stream",
760
+ "text": [
761
+ "\n",
762
+ "Extracting entities from chunks: 100%|██████████| 35/35 [02:23<00:00, 4.10s/chunk]\u001b[A\n",
763
  "INFO:lightrag:Inserting entities into storage...\n",
764
  "\n",
765
+ "Inserting entities: 100%|██████████| 324/324 [00:00<00:00, 17456.96entity/s]\n",
766
  "INFO:lightrag:Inserting relationships into storage...\n",
767
  "\n",
768
+ "Inserting relationships: 100%|██████████| 427/427 [00:00<00:00, 29956.31relationship/s]\n",
769
+ "INFO:lightrag:Inserting 324 vectors to entities\n",
770
  "\n",
771
  "Generating embeddings: 0%| | 0/11 [00:00<?, ?batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
772
  "\n",
773
+ "Generating embeddings: 9%|▉ | 1/11 [00:00<00:06, 1.48batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
 
 
774
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
775
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
776
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
 
778
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
779
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
780
  "\n",
781
+ "Generating embeddings: 18%|█▊ | 2/11 [00:02<00:11, 1.25s/batch]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
782
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
783
  "\n",
784
+ "Generating embeddings: 27%|██▋ | 3/11 [00:02<00:06, 1.17batch/s]\u001b[A\n",
785
+ "Generating embeddings: 36%|███▋ | 4/11 [00:03<00:04, 1.50batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
786
  "\n",
787
+ "Generating embeddings: 45%|████▌ | 5/11 [00:03<00:03, 1.78batch/s]\u001b[A\n",
788
+ "Generating embeddings: 55%|█████▍ | 6/11 [00:03<00:02, 2.01batch/s]\u001b[A\n",
789
+ "Generating embeddings: 64%|██████▎ | 7/11 [00:04<00:01, 2.19batch/s]\u001b[A\n",
790
+ "Generating embeddings: 73%|███████▎ | 8/11 [00:04<00:01, 2.31batch/s]\u001b[A\n",
791
+ "Generating embeddings: 82%|████████▏ | 9/11 [00:04<00:00, 2.41batch/s]\u001b[A\n",
792
+ "Generating embeddings: 91%|█████████ | 10/11 [00:05<00:00, 2.48batch/s]\u001b[A\n",
793
+ "Generating embeddings: 100%|██████████| 11/11 [00:05<00:00, 1.91batch/s]\u001b[A\n",
794
+ "INFO:lightrag:Inserting 427 vectors to relationships\n",
795
  "\n",
796
+ "Generating embeddings: 0%| | 0/14 [00:00<?, ?batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
797
  "\n",
798
+ "Generating embeddings: 7%|▋ | 1/14 [00:01<00:14, 1.11s/batch]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
799
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
800
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
801
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
802
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
803
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
804
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
805
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
806
  "\n",
807
+ "Generating embeddings: 14%|█▍ | 2/14 [00:02<00:14, 1.18s/batch]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
808
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
809
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
810
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
811
  "\n",
812
+ "Generating embeddings: 21%|██▏ | 3/14 [00:02<00:08, 1.23batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
813
  "\n",
814
+ "Generating embeddings: 29%|██▊ | 4/14 [00:03<00:06, 1.56batch/s]\u001b[A\n",
815
+ "Generating embeddings: 36%|███▌ | 5/14 [00:03<00:04, 1.85batch/s]\u001b[A\n",
816
+ "Generating embeddings: 43%|████▎ | 6/14 [00:03<00:03, 2.05batch/s]\u001b[A\n",
817
+ "Generating embeddings: 50%|█████ | 7/14 [00:04<00:03, 2.23batch/s]\u001b[A\n",
818
+ "Generating embeddings: 57%|█████▋ | 8/14 [00:04<00:02, 2.37batch/s]\u001b[A\n",
819
+ "Generating embeddings: 64%|██████▍ | 9/14 [00:04<00:02, 2.46batch/s]\u001b[A\n",
820
+ "Generating embeddings: 71%|███████▏ | 10/14 [00:05<00:01, 2.54batch/s]\u001b[A\n",
821
+ "Generating embeddings: 79%|███████▊ | 11/14 [00:05<00:01, 2.59batch/s]\u001b[A\n",
822
+ "Generating embeddings: 86%|████████▌ | 12/14 [00:06<00:00, 2.64batch/s]\u001b[A\n",
823
+ "Generating embeddings: 93%|█████████▎| 13/14 [00:06<00:00, 2.65batch/s]\u001b[A\n",
824
+ "Generating embeddings: 100%|██████████| 14/14 [00:06<00:00, 2.05batch/s]\u001b[A\n",
825
+ "INFO:lightrag:Writing graph with 333 nodes, 427 edges\n",
826
+ "Processing batch 1: 100%|██████████| 1/1 [02:45<00:00, 165.90s/it]\n"
827
  ]
828
  }
829
  ],
 
834
  },
835
  {
836
  "cell_type": "code",
837
+ "execution_count": 9,
838
  "id": "c4f9ae517151a01d",
839
  "metadata": {
840
  "ExecuteTime": {
841
+ "end_time": "2025-01-09T03:45:11.668987Z",
842
+ "start_time": "2025-01-09T03:45:11.664744Z"
843
  }
844
  },
845
  "outputs": [],
846
  "source": [
847
+ "prompt1 = \"\"\"你是一名经验丰富的论文分析科学家,你的任务是对一篇英文学术研究论文进行关键信息提取并深入分析。\n",
 
 
848
  "请按照以下步骤进行分析:\n",
849
+ "1. 该文献主要研究的问题是什么?\n",
850
+ "2. 该文献采用什么方法进行分析?\n",
851
+ "3. 该文献的主要结论是什么?\n",
 
 
 
 
 
 
852
  "首先在<分析>标签中,针对每个问题详细分析你的思考过程。然后在<回答>标签中给出所有问题的最终答案。\"\"\""
853
  ]
854
  },
855
  {
856
  "cell_type": "code",
857
+ "execution_count": 10,
858
  "id": "7a6491385b050095",
859
  "metadata": {
860
  "ExecuteTime": {
861
+ "end_time": "2025-01-09T03:45:40.829111Z",
862
+ "start_time": "2025-01-09T03:45:13.530298Z"
863
+ }
864
+ },
865
+ "outputs": [
866
+ {
867
+ "name": "stderr",
868
+ "output_type": "stream",
869
+ "text": [
870
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
871
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
872
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
873
+ "INFO:lightrag:Local query uses 5 entites, 12 relations, 3 text units\n",
874
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
875
+ "INFO:lightrag:Global query uses 8 entites, 5 relations, 4 text units\n",
876
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
877
+ ]
878
+ },
879
+ {
880
+ "name": "stdout",
881
+ "output_type": "stream",
882
+ "text": [
883
+ "<分析>\n",
884
+ "1. **该文献主要研究的问题是什么?**\n",
885
+ " - 思考过程:通过浏览论文内容,查找作者明确阐述研究目的的部分。文中多处提及“Our study was performed to explore whether folic acid treatment was associated with cancer outcomes and all-cause mortality after extended follow-up”,表明作者旨在探究叶酸治疗与癌症结局及全因死亡率之间的关系,尤其是在经过长期随访后。\n",
886
+ "2. **该文献采用什么方法进行分析?**\n",
887
+ " - 思考过程:寻找描述研究方法和数据分析过程的段落。文中提到“Survival curves were constructed using the Kaplan-Meier method and differences in survival between groups were analyzed using the log-rank test. Estimates of hazard ratios (HRs) with 95% CIs were obtained by using Cox proportional hazards regression models stratified by trial”,可以看出作者使用了Kaplan-Meier法构建生存曲线、log-rank检验分析组间生存差异以及Cox比例风险回归模型估计风险比等方法。\n",
888
+ "3. **该文献的主要结论是什么?**\n",
889
+ " - 思考过程:定位到论文中总结结论的部分,如“Conclusion Treatment with folic acid plus vitamin $\\mathsf{B}_{12}$ was associated with increased cancer outcomes and all-cause mortality in patients with ischemic heart disease in Norway, where there is no folic acid fortification of foods”,可知作者得出叶酸加维生素$\\mathsf{B}_{12}$治疗与癌症结局和全因死亡率增加有关的结论。\n",
890
+ "<回答>\n",
891
+ "1. 该文献主要研究的问题是:叶酸治疗与癌症结局及全因死亡率之间的关系,尤其是在经过长期随访后,叶酸治疗是否与癌症结局和全因死亡率相关。\n",
892
+ "2. 该文献采用的分析方法包括:使用Kaplan-Meier法构建生存曲线、log-rank检验分析组间生存差异、Cox比例风险回归模型估计风险比等。\n",
893
+ "3. 该文献的主要结论是:在挪威没有叶酸强化食品的情况下,叶酸加维生素$\\mathsf{B}_{12}$治疗与缺血性心脏病患者的癌症结局和全因死亡率增加有关。\n",
894
+ "\n",
895
+ "**参考文献**\n",
896
+ "- [VD] In2Norwegianhomocysteine-lowering trialsamongpatientswithischemicheart disease, there was a statistically nonsignificantincreaseincancerincidenceinthe groupsassignedtofolicacidtreatment.15,16 Our study was performed to explore whetherfolicacidtreatmentwasassociatedwithcanceroutcomesandall-cause mortality after extended follow-up.\n",
897
+ "- [VD] Survivalcurveswereconstructedusing theKaplan-Meiermethodanddifferences insurvivalbetweengroupswereanalyzed usingthelog-ranktest.Estimatesofhazard ratios (HRs) with $95\\%$ CIs were obtainedbyusingCoxproportionalhazards regressionmodelsstratifiedbytrial.\n",
898
+ "- [VD] Conclusion Treatment with folic acid plus vitamin $\\mathsf{B}_{12}$ was associated with increased cancer outcomes and all-cause mortality in patients with ischemic heart disease in Norway, where there is no folic acid fortification of foods.\n"
899
+ ]
900
+ }
901
+ ],
902
+ "source": [
903
+ "resp = rag.query(prompt1, param=QueryParam(mode=\"mix\", top_k=5))\n",
904
+ "print(resp)"
905
+ ]
906
+ },
907
+ {
908
+ "cell_type": "markdown",
909
+ "id": "4e5bfad24cb721a8",
910
+ "metadata": {},
911
+ "source": "#### split by character only"
912
+ },
913
+ {
914
+ "cell_type": "code",
915
+ "execution_count": 11,
916
+ "id": "44e2992dc95f8ce0",
917
+ "metadata": {
918
+ "ExecuteTime": {
919
+ "end_time": "2025-01-09T03:47:40.988796Z",
920
+ "start_time": "2025-01-09T03:47:40.982648Z"
921
+ }
922
+ },
923
+ "outputs": [],
924
+ "source": [
925
+ "WORKING_DIR = \"../../llm_rag/paper_db/R000088_test2\"\n",
926
+ "if not os.path.exists(WORKING_DIR):\n",
927
+ " os.mkdir(WORKING_DIR)"
928
+ ]
929
+ },
930
+ {
931
+ "cell_type": "code",
932
+ "execution_count": 12,
933
+ "id": "62c63385d2d973d5",
934
+ "metadata": {
935
+ "ExecuteTime": {
936
+ "end_time": "2025-01-09T03:51:39.951329Z",
937
+ "start_time": "2025-01-09T03:49:15.218976Z"
938
  }
939
  },
940
  "outputs": [
 
942
  "name": "stderr",
943
  "output_type": "stream",
944
  "text": [
945
+ "INFO:lightrag:Logger initialized for working directory: ../../llm_rag/paper_db/R000088_test2\n",
946
+ "INFO:lightrag:Load KV llm_response_cache with 0 data\n",
947
+ "INFO:lightrag:Load KV full_docs with 0 data\n",
948
+ "INFO:lightrag:Load KV text_chunks with 0 data\n",
949
+ "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test2/vdb_entities.json'} 0 data\n",
950
+ "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test2/vdb_relationships.json'} 0 data\n",
951
+ "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test2/vdb_chunks.json'} 0 data\n",
952
+ "INFO:lightrag:Loaded document status storage with 0 records\n",
953
+ "INFO:lightrag:Processing 1 new unique documents\n",
954
+ "Processing batch 1: 0%| | 0/1 [00:00<?, ?it/s]INFO:lightrag:Inserting 12 vectors to chunks\n",
955
+ "\n",
956
+ "Generating embeddings: 0%| | 0/1 [00:00<?, ?batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
957
+ "\n",
958
+ "Generating embeddings: 100%|██████████| 1/1 [00:02<00:00, 2.95s/batch]\u001b[A\n",
959
+ "\n",
960
+ "Extracting entities from chunks: 0%| | 0/12 [00:00<?, ?chunk/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
961
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
962
+ ]
963
+ },
964
+ {
965
+ "name": "stdout",
966
+ "output_type": "stream",
967
+ "text": [
968
+ "⠙ Processed 1 chunks, 0 entities(duplicated), 0 relations(duplicated)\r"
969
+ ]
970
+ },
971
+ {
972
+ "name": "stderr",
973
+ "output_type": "stream",
974
+ "text": [
975
+ "\n",
976
+ "Extracting entities from chunks: 8%|▊ | 1/12 [00:03<00:43, 3.93s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
977
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
978
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
979
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
980
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
981
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
982
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
983
  ]
984
  },
985
  {
986
  "name": "stdout",
987
  "output_type": "stream",
988
  "text": [
989
+ "⠹ Processed 2 chunks, 8 entities(duplicated), 8 relations(duplicated)\r"
 
 
 
990
  ]
991
  },
992
+ {
993
+ "name": "stderr",
994
+ "output_type": "stream",
995
+ "text": [
996
+ "\n",
997
+ "Extracting entities from chunks: 17%|█▋ | 2/12 [00:29<02:44, 16.46s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
998
+ ]
999
+ },
1000
+ {
1001
+ "name": "stdout",
1002
+ "output_type": "stream",
1003
+ "text": [
1004
+ "⠸ Processed 3 chunks, 17 entities(duplicated), 15 relations(duplicated)\r"
1005
+ ]
1006
+ },
1007
+ {
1008
+ "name": "stderr",
1009
+ "output_type": "stream",
1010
+ "text": [
1011
+ "\n",
1012
+ "Extracting entities from chunks: 25%|██▌ | 3/12 [00:30<01:25, 9.45s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
1013
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
1014
+ ]
1015
+ },
1016
+ {
1017
+ "name": "stdout",
1018
+ "output_type": "stream",
1019
+ "text": [
1020
+ "⠼ Processed 4 chunks, 27 entities(duplicated), 22 relations(duplicated)\r"
1021
+ ]
1022
+ },
1023
+ {
1024
+ "name": "stderr",
1025
+ "output_type": "stream",
1026
+ "text": [
1027
+ "\n",
1028
+ "Extracting entities from chunks: 33%|███▎ | 4/12 [00:39<01:16, 9.52s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
1029
+ ]
1030
+ },
1031
+ {
1032
+ "name": "stdout",
1033
+ "output_type": "stream",
1034
+ "text": [
1035
+ "⠴ Processed 5 chunks, 36 entities(duplicated), 33 relations(duplicated)\r"
1036
+ ]
1037
+ },
1038
+ {
1039
+ "name": "stderr",
1040
+ "output_type": "stream",
1041
+ "text": [
1042
+ "\n",
1043
+ "Extracting entities from chunks: 42%|████▏ | 5/12 [00:40<00:43, 6.24s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
1044
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
1045
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
1046
+ ]
1047
+ },
1048
+ {
1049
+ "name": "stdout",
1050
+ "output_type": "stream",
1051
+ "text": [
1052
+ "⠦ Processed 6 chunks, 49 entities(duplicated), 42 relations(duplicated)\r"
1053
+ ]
1054
+ },
1055
+ {
1056
+ "name": "stderr",
1057
+ "output_type": "stream",
1058
+ "text": [
1059
+ "\n",
1060
+ "Extracting entities from chunks: 50%|█████ | 6/12 [00:49<00:43, 7.33s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
1061
+ ]
1062
+ },
1063
+ {
1064
+ "name": "stdout",
1065
+ "output_type": "stream",
1066
+ "text": [
1067
+ "⠧ Processed 7 chunks, 62 entities(duplicated), 65 relations(duplicated)\r"
1068
+ ]
1069
+ },
1070
+ {
1071
+ "name": "stderr",
1072
+ "output_type": "stream",
1073
+ "text": [
1074
+ "\n",
1075
+ "Extracting entities from chunks: 58%|█████▊ | 7/12 [01:05<00:50, 10.05s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
1076
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
1077
+ ]
1078
+ },
1079
+ {
1080
+ "name": "stdout",
1081
+ "output_type": "stream",
1082
+ "text": [
1083
+ "⠇ Processed 8 chunks, 81 entities(duplicated), 90 relations(duplicated)\r"
1084
+ ]
1085
+ },
1086
+ {
1087
+ "name": "stderr",
1088
+ "output_type": "stream",
1089
+ "text": [
1090
+ "\n",
1091
+ "Extracting entities from chunks: 67%|██████▋ | 8/12 [01:23<00:50, 12.69s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
1092
+ ]
1093
+ },
1094
+ {
1095
+ "name": "stdout",
1096
+ "output_type": "stream",
1097
+ "text": [
1098
+ "⠏ Processed 9 chunks, 99 entities(duplicated), 117 relations(duplicated)\r"
1099
+ ]
1100
+ },
1101
+ {
1102
+ "name": "stderr",
1103
+ "output_type": "stream",
1104
+ "text": [
1105
+ "\n",
1106
+ "Extracting entities from chunks: 75%|███████▌ | 9/12 [01:32<00:34, 11.54s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
1107
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
1108
+ ]
1109
+ },
1110
+ {
1111
+ "name": "stdout",
1112
+ "output_type": "stream",
1113
+ "text": [
1114
+ "⠋ Processed 10 chunks, 123 entities(duplicated), 140 relations(duplicated)\r"
1115
+ ]
1116
+ },
1117
+ {
1118
+ "name": "stderr",
1119
+ "output_type": "stream",
1120
+ "text": [
1121
+ "\n",
1122
+ "Extracting entities from chunks: 83%|████████▎ | 10/12 [01:48<00:25, 12.79s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
1123
+ ]
1124
+ },
1125
+ {
1126
+ "name": "stdout",
1127
+ "output_type": "stream",
1128
+ "text": [
1129
+ "⠙ Processed 11 chunks, 158 entities(duplicated), 174 relations(duplicated)\r"
1130
+ ]
1131
+ },
1132
+ {
1133
+ "name": "stderr",
1134
+ "output_type": "stream",
1135
+ "text": [
1136
+ "\n",
1137
+ "Extracting entities from chunks: 92%|█████████▏| 11/12 [02:03<00:13, 13.50s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
1138
+ ]
1139
+ },
1140
+ {
1141
+ "name": "stdout",
1142
+ "output_type": "stream",
1143
+ "text": [
1144
+ "⠹ Processed 12 chunks, 194 entities(duplicated), 221 relations(duplicated)\r"
1145
+ ]
1146
+ },
1147
+ {
1148
+ "name": "stderr",
1149
+ "output_type": "stream",
1150
+ "text": [
1151
+ "\n",
1152
+ "Extracting entities from chunks: 100%|██████████| 12/12 [02:13<00:00, 11.15s/chunk]\u001b[A\n",
1153
+ "INFO:lightrag:Inserting entities into storage...\n",
1154
+ "\n",
1155
+ "Inserting entities: 100%|██████████| 170/170 [00:00<00:00, 11610.25entity/s]\n",
1156
+ "INFO:lightrag:Inserting relationships into storage...\n",
1157
+ "\n",
1158
+ "Inserting relationships: 100%|██████████| 218/218 [00:00<00:00, 15913.51relationship/s]\n",
1159
+ "INFO:lightrag:Inserting 170 vectors to entities\n",
1160
+ "\n",
1161
+ "Generating embeddings: 0%| | 0/6 [00:00<?, ?batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1162
+ "\n",
1163
+ "Generating embeddings: 17%|█▋ | 1/6 [00:01<00:05, 1.10s/batch]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1164
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1165
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1166
+ "\n",
1167
+ "Generating embeddings: 33%|███▎ | 2/6 [00:02<00:04, 1.07s/batch]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1168
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1169
+ "\n",
1170
+ "Generating embeddings: 50%|█████ | 3/6 [00:02<00:02, 1.33batch/s]\u001b[A\n",
1171
+ "Generating embeddings: 67%|██████▋ | 4/6 [00:02<00:01, 1.67batch/s]\u001b[A\n",
1172
+ "Generating embeddings: 83%|████████▎ | 5/6 [00:03<00:00, 1.95batch/s]\u001b[A\n",
1173
+ "Generating embeddings: 100%|██████████| 6/6 [00:03<00:00, 1.66batch/s]\u001b[A\n",
1174
+ "INFO:lightrag:Inserting 218 vectors to relationships\n",
1175
+ "\n",
1176
+ "Generating embeddings: 0%| | 0/7 [00:00<?, ?batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1177
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1178
+ "\n",
1179
+ "Generating embeddings: 14%|█▍ | 1/7 [00:01<00:10, 1.74s/batch]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1180
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1181
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1182
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1183
+ "\n",
1184
+ "Generating embeddings: 29%|██▊ | 2/7 [00:02<00:05, 1.04s/batch]\u001b[A\n",
1185
+ "Generating embeddings: 43%|████▎ | 3/7 [00:02<00:02, 1.35batch/s]\u001b[A\n",
1186
+ "Generating embeddings: 57%|█████▋ | 4/7 [00:03<00:01, 1.69batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1187
+ "\n",
1188
+ "Generating embeddings: 71%|███████▏ | 5/7 [00:03<00:01, 1.96batch/s]\u001b[A\n",
1189
+ "Generating embeddings: 86%|████████▌ | 6/7 [00:03<00:00, 2.17batch/s]\u001b[A\n",
1190
+ "Generating embeddings: 100%|██████████| 7/7 [00:04<00:00, 1.68batch/s]\u001b[A\n",
1191
+ "INFO:lightrag:Writing graph with 174 nodes, 218 edges\n",
1192
+ "Processing batch 1: 100%|██████████| 1/1 [02:24<00:00, 144.69s/it]\n"
1193
+ ]
1194
+ }
1195
+ ],
1196
+ "source": [
1197
+ "rag = LightRAG(\n",
1198
+ " working_dir=WORKING_DIR,\n",
1199
+ " llm_model_func=llm_model_func,\n",
1200
+ " embedding_func=EmbeddingFunc(\n",
1201
+ " embedding_dim=4096, max_token_size=8192, func=embedding_func\n",
1202
+ " ),\n",
1203
+ " chunk_token_size=512,\n",
1204
+ ")\n",
1205
+ "\n",
1206
+ "# rag.insert(content)\n",
1207
+ "rag.insert(content, split_by_character=\"\\n#\", split_by_character_only=True)"
1208
+ ]
1209
+ },
1210
+ {
1211
+ "cell_type": "code",
1212
+ "execution_count": 13,
1213
+ "id": "3c7aa9836d8d43c7",
1214
+ "metadata": {
1215
+ "ExecuteTime": {
1216
+ "end_time": "2025-01-09T03:52:37.000418Z",
1217
+ "start_time": "2025-01-09T03:52:09.933584Z"
1218
+ }
1219
+ },
1220
+ "outputs": [
1221
  {
1222
  "name": "stderr",
1223
  "output_type": "stream",
1224
  "text": [
1225
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1226
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
1227
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1228
+ "INFO:lightrag:Local query uses 5 entites, 3 relations, 2 text units\n",
1229
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
1230
+ "INFO:lightrag:Global query uses 9 entites, 5 relations, 4 text units\n",
1231
  "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
1232
  ]
1233
  },
 
1236
  "output_type": "stream",
1237
  "text": [
1238
  "<分析>\n",
1239
+ "- **该文献主要研究的问题是什么?**\n",
1240
+ " - **思考过程**:通过浏览论文的标题、摘要、引言等部分,寻找关于研究目的和问题的描述。论文标题为“Cancer Incidence and Mortality After Treatment With Folic Acid and Vitamin B12”,摘要中的“Objective”部分明确指出研究目的是“To evaluate effects of treatment with B vitamins on cancer outcomes and all-cause mortality in 2 randomized controlled trials”。因此,可以确定该文献主要研究的问题是评估B族维生素治疗对两项随机对照试验中癌症结局和全因死亡率的影响。\n",
1241
+ "- **该文献采用什么方法进行分析?**\n",
1242
+ " - **思考过程**:在论文的“METHODS”部分详细描述了研究方法。文中提到这是一个对两项随机、双盲、安慰剂对照临床试验(Norwegian Vitamin [NORVIT] trial和Western Norway B Vitamin Intervention Trial [WENBIT])数据的联合分析,并进行了观察性的试验后随访。具体包括对参与者进行分组干预(不同剂量的叶酸、维生素B12、维生素B6或安慰剂),收集临床信息和血样,分析循环B族维生素、同型半胱氨酸和可替宁等指标,并进行基因分型等,还涉及到多种统计分析方法,如计算预期癌症发生率、构建生存曲线、进行Cox比例风险回归模型分析等。\n",
1243
+ "- **该文献的主要结论是什么?**\n",
1244
+ " - **思考过程**:在论文的“Results”和“Conclusion”部分寻找主要结论。研究结果表明,在治疗期间,接受叶酸加维生素B12治疗的参与者血清叶酸浓度显著增加,且在后续随访中,该组癌症发病率、癌症死亡率和全因死亡率均有所上升,主要是肺癌发病率增加,而维生素B6治疗未显示出显著影响。结论部分明确指出“Treatment with folic acid plus vitamin $\\mathsf{B}_{12}$ was associated with increased cancer outcomes and all-cause mortality in patients with ischemic heart disease in Norway, where there is no folic acid fortification of foods”。\n",
 
 
 
 
 
 
1245
  "</分析>\n",
1246
  "\n",
1247
  "<回答>\n",
1248
+ "- **主要研究问题**:评估B族维生素治疗对两项随机对照试验中癌症结局和全因死亡率的影响。\n",
1249
+ "- **研究方法**:采用对两项随机、双盲、安慰剂对照临床试验(Norwegian Vitamin [NORVIT] trial和Western Norway B Vitamin Intervention Trial [WENBIT])数据的联合分析,并进行观察性的试验后随访,涉及分组干预、多种指标检测以及多种统计分析方法。\n",
1250
+ "- **主要结论**:在挪威(食品中未添加叶酸),对于缺血性心脏病患者,叶酸加维生素B12治疗与癌症结局和全因死亡率的增加有关,而维生素B6治疗未显示出显著影响。\n",
1251
+ "\n",
1252
+ "**参考文献**\n",
1253
+ "- [VD] Cancer Incidence and Mortality After Treatment With Folic Acid and Vitamin B12\n",
1254
+ "- [VD] METHODS Study Design, Participants, and Study Intervention\n",
1255
+ "- [VD] RESULTS\n",
1256
+ "- [VD] Conclusion\n",
1257
+ "- [VD] Objective To evaluate effects of treatment with B vitamins on cancer outcomes and all-cause mortality in 2 randomized controlled trials.\n"
1258
  ]
1259
  }
1260
  ],
1261
  "source": [
1262
+ "resp = rag.query(prompt1, param=QueryParam(mode=\"mix\", top_k=5))\n",
1263
+ "print(resp)"
1264
  ]
1265
  },
1266
  {
1267
  "cell_type": "code",
1268
  "execution_count": null,
1269
+ "id": "7ba6fa79a2550d10",
1270
  "metadata": {},
1271
  "outputs": [],
1272
  "source": []
lightrag/lightrag.py CHANGED
@@ -314,18 +314,25 @@ class LightRAG:
314
  "JsonDocStatusStorage": JsonDocStatusStorage,
315
  }
316
 
317
- def insert(self, string_or_strings, split_by_character=None):
 
 
318
  loop = always_get_an_event_loop()
319
  return loop.run_until_complete(
320
- self.ainsert(string_or_strings, split_by_character)
321
  )
322
 
323
- async def ainsert(self, string_or_strings, split_by_character):
 
 
324
  """Insert documents with checkpoint support
325
 
326
  Args:
327
  string_or_strings: Single document string or list of document strings
328
- split_by_character: if split_by_character is not None, split the string by character
 
 
 
329
  """
330
  if isinstance(string_or_strings, str):
331
  string_or_strings = [string_or_strings]
@@ -384,6 +391,7 @@ class LightRAG:
384
  for dp in chunking_by_token_size(
385
  doc["content"],
386
  split_by_character=split_by_character,
 
387
  overlap_token_size=self.chunk_overlap_token_size,
388
  max_token_size=self.chunk_token_size,
389
  tiktoken_model=self.tiktoken_model_name,
 
314
  "JsonDocStatusStorage": JsonDocStatusStorage,
315
  }
316
 
317
+ def insert(
318
+ self, string_or_strings, split_by_character=None, split_by_character_only=False
319
+ ):
320
  loop = always_get_an_event_loop()
321
  return loop.run_until_complete(
322
+ self.ainsert(string_or_strings, split_by_character, split_by_character_only)
323
  )
324
 
325
+ async def ainsert(
326
+ self, string_or_strings, split_by_character, split_by_character_only
327
+ ):
328
  """Insert documents with checkpoint support
329
 
330
  Args:
331
  string_or_strings: Single document string or list of document strings
332
+ split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
333
+ chunk_size, split the sub chunk by token size.
334
+ split_by_character_only: if split_by_character_only is True, split the string by character only, when
335
+ split_by_character is None, this parameter is ignored.
336
  """
337
  if isinstance(string_or_strings, str):
338
  string_or_strings = [string_or_strings]
 
391
  for dp in chunking_by_token_size(
392
  doc["content"],
393
  split_by_character=split_by_character,
394
+ split_by_character_only=split_by_character_only,
395
  overlap_token_size=self.chunk_overlap_token_size,
396
  max_token_size=self.chunk_token_size,
397
  tiktoken_model=self.tiktoken_model_name,
lightrag/operate.py CHANGED
@@ -36,6 +36,7 @@ import time
36
  def chunking_by_token_size(
37
  content: str,
38
  split_by_character=None,
 
39
  overlap_token_size=128,
40
  max_token_size=1024,
41
  tiktoken_model="gpt-4o",
@@ -45,21 +46,26 @@ def chunking_by_token_size(
45
  if split_by_character:
46
  raw_chunks = content.split(split_by_character)
47
  new_chunks = []
48
- for chunk in raw_chunks:
49
- _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model)
50
- if len(_tokens) > max_token_size:
51
- for start in range(
52
- 0, len(_tokens), max_token_size - overlap_token_size
53
- ):
54
- chunk_content = decode_tokens_by_tiktoken(
55
- _tokens[start : start + max_token_size],
56
- model_name=tiktoken_model,
57
- )
58
- new_chunks.append(
59
- (min(max_token_size, len(_tokens) - start), chunk_content)
60
- )
61
- else:
62
  new_chunks.append((len(_tokens), chunk))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  for index, (_len, chunk) in enumerate(new_chunks):
64
  results.append(
65
  {
 
36
  def chunking_by_token_size(
37
  content: str,
38
  split_by_character=None,
39
+ split_by_character_only=False,
40
  overlap_token_size=128,
41
  max_token_size=1024,
42
  tiktoken_model="gpt-4o",
 
46
  if split_by_character:
47
  raw_chunks = content.split(split_by_character)
48
  new_chunks = []
49
+ if split_by_character_only:
50
+ for chunk in raw_chunks:
51
+ _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model)
 
 
 
 
 
 
 
 
 
 
 
52
  new_chunks.append((len(_tokens), chunk))
53
+ else:
54
+ for chunk in raw_chunks:
55
+ _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model)
56
+ if len(_tokens) > max_token_size:
57
+ for start in range(
58
+ 0, len(_tokens), max_token_size - overlap_token_size
59
+ ):
60
+ chunk_content = decode_tokens_by_tiktoken(
61
+ _tokens[start : start + max_token_size],
62
+ model_name=tiktoken_model,
63
+ )
64
+ new_chunks.append(
65
+ (min(max_token_size, len(_tokens) - start), chunk_content)
66
+ )
67
+ else:
68
+ new_chunks.append((len(_tokens), chunk))
69
  for index, (_len, chunk) in enumerate(new_chunks):
70
  results.append(
71
  {