Commit ·
32d216e
1
Parent(s): 56726f6
April 2026 data update
Browse files- README.md +24 -7
- config.json +22 -6
- documents +2 -2
- embeddings +2 -2
README.md
CHANGED
|
@@ -8,18 +8,20 @@ library_name: txtai
|
|
| 8 |
tags:
|
| 9 |
- sentence-similarity
|
| 10 |
datasets:
|
| 11 |
-
- NeuML/wikipedia-
|
| 12 |
---
|
| 13 |
|
| 14 |
# Wikipedia txtai embeddings index
|
| 15 |
|
| 16 |
This is a [txtai](https://github.com/neuml/txtai) embeddings index for the [English edition of Wikipedia](https://en.wikipedia.org/).
|
| 17 |
|
| 18 |
-
This index is built from the [Wikipedia
|
| 19 |
|
| 20 |
-
It
|
| 21 |
to only match commonly visited pages.
|
| 22 |
|
|
|
|
|
|
|
| 23 |
txtai must be [installed](https://neuml.github.io/txtai/install/) to use this model.
|
| 24 |
|
| 25 |
## Example
|
|
@@ -41,6 +43,12 @@ embeddings.search("""
|
|
| 41 |
SELECT id, text, score, percentile FROM txtai WHERE similar('Boston') AND
|
| 42 |
percentile >= 0.99
|
| 43 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
```
|
| 45 |
|
| 46 |
## Use Cases
|
|
@@ -65,7 +73,7 @@ Performance was evaluated using the [NDCG@10](https://en.wikipedia.org/wiki/Disc
|
|
| 65 |
|
| 66 |
## Build the index
|
| 67 |
|
| 68 |
-
The following steps show how to build this index. These scripts are using the latest data available as of
|
| 69 |
|
| 70 |
- Install required build dependencies
|
| 71 |
```bash
|
|
@@ -75,7 +83,7 @@ pip install ragdata mwparserfromhell
|
|
| 75 |
- Download and build pageviews database
|
| 76 |
```bash
|
| 77 |
mkdir -p pageviews/data
|
| 78 |
-
wget -P pageviews/data https://dumps.wikimedia.org/other/pageview_complete/monthly/
|
| 79 |
python -m ragdata.wikipedia.views -p en.wikipedia -v pageviews
|
| 80 |
```
|
| 81 |
|
|
@@ -85,17 +93,26 @@ python -m ragdata.wikipedia.views -p en.wikipedia -v pageviews
|
|
| 85 |
from datasets import load_dataset
|
| 86 |
|
| 87 |
# Data dump date from https://dumps.wikimedia.org/enwiki/
|
| 88 |
-
date = "
|
| 89 |
|
| 90 |
# Build and save dataset
|
| 91 |
ds = load_dataset("neuml/wikipedia", language="en", date=date)
|
| 92 |
ds.save_to_disk(f"wikipedia-{date}")
|
| 93 |
```
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
- Build txtai-wikipedia index
|
| 96 |
```bash
|
| 97 |
python -m ragdata.wikipedia.index \
|
| 98 |
-
-d wikipedia-
|
|
|
|
| 99 |
-o txtai-wikipedia \
|
| 100 |
-v pageviews/pageviews.sqlite
|
| 101 |
```
|
|
|
|
| 8 |
tags:
|
| 9 |
- sentence-similarity
|
| 10 |
datasets:
|
| 11 |
+
- NeuML/wikipedia-20260401
|
| 12 |
---
|
| 13 |
|
| 14 |
# Wikipedia txtai embeddings index
|
| 15 |
|
| 16 |
This is a [txtai](https://github.com/neuml/txtai) embeddings index for the [English edition of Wikipedia](https://en.wikipedia.org/).
|
| 17 |
|
| 18 |
+
This index is built from the [Wikipedia April 2026 dataset](https://huggingface.co/datasets/neuml/wikipedia-20260401). Only the first paragraph of the [lead section](https://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style/Lead_section) from each article is included in the index. This is similar to an abstract of the article.
|
| 19 |
|
| 20 |
+
It uses [Wikipedia Page Views](https://dumps.wikimedia.org/other/pageviews/readme.html) data to add a `percentile` field. The `percentile` field can be used
|
| 21 |
to only match commonly visited pages.
|
| 22 |
|
| 23 |
+
Domain labels are applied using [this model](https://huggingface.co/NeuML/domain-labeler), adding a `domain` field.
|
| 24 |
+
|
| 25 |
txtai must be [installed](https://neuml.github.io/txtai/install/) to use this model.
|
| 26 |
|
| 27 |
## Example
|
|
|
|
| 43 |
SELECT id, text, score, percentile FROM txtai WHERE similar('Boston') AND
|
| 44 |
percentile >= 0.99
|
| 45 |
""")
|
| 46 |
+
|
| 47 |
+
# Find most popular articles for a domain label
|
| 48 |
+
embeddings.search("""
|
| 49 |
+
SELECT id, text, score, domain FROM txtai WHERE domain = 'news'
|
| 50 |
+
ORDER BY percentile DESC
|
| 51 |
+
""")
|
| 52 |
```
|
| 53 |
|
| 54 |
## Use Cases
|
|
|
|
| 73 |
|
| 74 |
## Build the index
|
| 75 |
|
| 76 |
+
The following steps show how to build this index. These scripts use the latest data available as of 2026-04-01; update as appropriate.
|
| 77 |
|
| 78 |
- Install required build dependencies
|
| 79 |
```bash
|
|
|
|
| 83 |
- Download and build pageviews database
|
| 84 |
```bash
|
| 85 |
mkdir -p pageviews/data
|
| 86 |
+
wget -P pageviews/data https://dumps.wikimedia.org/other/pageview_complete/monthly/2026/2026-04/pageviews-202604-user.bz2
|
| 87 |
python -m ragdata.wikipedia.views -p en.wikipedia -v pageviews
|
| 88 |
```
|
| 89 |
|
|
|
|
| 93 |
from datasets import load_dataset
|
| 94 |
|
| 95 |
# Data dump date from https://dumps.wikimedia.org/enwiki/
|
| 96 |
+
date = "20260401"
|
| 97 |
|
| 98 |
# Build and save dataset
|
| 99 |
ds = load_dataset("neuml/wikipedia", language="en", date=date)
|
| 100 |
ds.save_to_disk(f"wikipedia-{date}")
|
| 101 |
```
|
| 102 |
|
| 103 |
+
- Generate domain labels
|
| 104 |
+
|
| 105 |
+
```bash
|
| 106 |
+
python -m ragdata.wikipedia.label \
|
| 107 |
+
-d wikipedia-20260401 \
|
| 108 |
+
-o labels.csv
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
- Build txtai-wikipedia index
|
| 112 |
```bash
|
| 113 |
python -m ragdata.wikipedia.index \
|
| 114 |
+
-d wikipedia-20260401 \
|
| 115 |
+
-l labels.csv \
|
| 116 |
-o txtai-wikipedia \
|
| 117 |
-v pageviews/pageviews.sqlite
|
| 118 |
```
|
config.json
CHANGED
|
@@ -12,17 +12,33 @@
|
|
| 12 |
"sample": 0.05
|
| 13 |
},
|
| 14 |
"content": true,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
"dimensions": 768,
|
| 16 |
"backend": "faiss",
|
| 17 |
-
"offset":
|
| 18 |
"build": {
|
| 19 |
-
"create": "
|
| 20 |
-
"python": "3.10.
|
| 21 |
"settings": {
|
| 22 |
-
"components": "
|
| 23 |
},
|
| 24 |
"system": "Linux (x86_64)",
|
| 25 |
-
"txtai": "9.
|
| 26 |
},
|
| 27 |
-
"update": "
|
| 28 |
}
|
|
|
|
| 12 |
"sample": 0.05
|
| 13 |
},
|
| 14 |
"content": true,
|
| 15 |
+
"columns": {
|
| 16 |
+
"store": [
|
| 17 |
+
"percentile",
|
| 18 |
+
"domain"
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
"expressions": [
|
| 22 |
+
{
|
| 23 |
+
"name": "percentile",
|
| 24 |
+
"index": true
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"name": "domain",
|
| 28 |
+
"index": true
|
| 29 |
+
}
|
| 30 |
+
],
|
| 31 |
"dimensions": 768,
|
| 32 |
"backend": "faiss",
|
| 33 |
+
"offset": 6527334,
|
| 34 |
"build": {
|
| 35 |
+
"create": "2026-04-19T04:36:14Z",
|
| 36 |
+
"python": "3.10.20",
|
| 37 |
"settings": {
|
| 38 |
+
"components": "IVF2285,SQ8"
|
| 39 |
},
|
| 40 |
"system": "Linux (x86_64)",
|
| 41 |
+
"txtai": "9.8.0"
|
| 42 |
},
|
| 43 |
+
"update": "2026-04-19T04:36:14Z"
|
| 44 |
}
|
documents
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4d28962a359692703c932dc0dccea5971952dbfcf5c2cdf60fba921f1a80223
|
| 3 |
+
size 3925708800
|
embeddings
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7dc16dfb26e4c93bd108176e879a24de47ea3fa37426c710a4bebb7d27bd77e3
|
| 3 |
+
size 5072255312
|