Spaces:
Runtime error
Runtime error
Commit
·
963c572
1
Parent(s):
53ffb10
update
Browse files
- app.py +7 -20
- web_docs_final/data-00000-of-00002.arrow +0 -3
- web_docs_final/data-00001-of-00002.arrow +0 -3
- web_docs_final/state.json +0 -3
- web_docs_final/dataset_info.json → web_docs_final_replaceimgbyurl/data-00000-of-00001.arrow +2 -2
- web_docs_final_replaceimgbyurl/dataset_info.json +30 -0
- web_docs_final_replaceimgbyurl/state.json +18 -0
app.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
import json
|
| 2 |
-
import random
|
| 3 |
|
| 4 |
import streamlit as st
|
| 5 |
from datasets import load_from_disk
|
|
@@ -19,29 +18,15 @@ class Visualization:
|
|
| 19 |
st.title("Visualization of web documents")
|
| 20 |
|
| 21 |
def load_dataset(self):
|
| 22 |
-
st.header("Select the size of the dataset")
|
| 23 |
-
|
| 24 |
self.dataset = load_from_disk(self.path_web_documents_dataset)
|
| 25 |
|
| 26 |
-
opt_sizes = ["100", "300", "1000", "3000"]
|
| 27 |
-
size_dataset = st.selectbox(
|
| 28 |
-
"Select the size of the dataset",
|
| 29 |
-
options=opt_sizes,
|
| 30 |
-
)
|
| 31 |
-
|
| 32 |
-
self.dataset = self.dataset.select(range(int(size_dataset)))
|
| 33 |
-
|
| 34 |
def choose_document(self):
|
| 35 |
st.header("Choose a document")
|
| 36 |
-
if st.button("Select a random document"):
|
| 37 |
-
dct_idx = random.randint(a=0, b=self.dataset.num_rows - 1)
|
| 38 |
-
else:
|
| 39 |
-
dct_idx = 0
|
| 40 |
idx = st.number_input(
|
| 41 |
f"Select a document among the first {self.dataset.num_rows} ones",
|
| 42 |
min_value=0,
|
| 43 |
max_value=self.dataset.num_rows - 1,
|
| 44 |
-
value=
|
| 45 |
step=1,
|
| 46 |
help=f"Index between 0 and {self.dataset.num_rows-1}",
|
| 47 |
)
|
|
@@ -54,13 +39,15 @@ class Visualization:
|
|
| 54 |
metadata = json.loads(self.current_doc["metadata"])
|
| 55 |
for text, image, meta in zip(texts, images, metadata):
|
| 56 |
if text:
|
| 57 |
-
|
|
|
|
| 58 |
elif image:
|
| 59 |
-
st.markdown(f"
|
| 60 |
-
|
|
|
|
| 61 |
|
| 62 |
if __name__ == "__main__":
|
| 63 |
st.set_page_config(layout="wide")
|
| 64 |
-
path_web_documents_dataset = "./
|
| 65 |
visualization = Visualization(path_web_documents_dataset=path_web_documents_dataset)
|
| 66 |
visualization.visualization()
|
|
|
|
| 1 |
import json
|
|
|
|
| 2 |
|
| 3 |
import streamlit as st
|
| 4 |
from datasets import load_from_disk
|
|
|
|
| 18 |
st.title("Visualization of web documents")
|
| 19 |
|
| 20 |
def load_dataset(self):
|
|
|
|
|
|
|
| 21 |
self.dataset = load_from_disk(self.path_web_documents_dataset)
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
def choose_document(self):
|
| 24 |
st.header("Choose a document")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
idx = st.number_input(
|
| 26 |
f"Select a document among the first {self.dataset.num_rows} ones",
|
| 27 |
min_value=0,
|
| 28 |
max_value=self.dataset.num_rows - 1,
|
| 29 |
+
value=0,
|
| 30 |
step=1,
|
| 31 |
help=f"Index between 0 and {self.dataset.num_rows-1}",
|
| 32 |
)
|
|
|
|
| 39 |
metadata = json.loads(self.current_doc["metadata"])
|
| 40 |
for text, image, meta in zip(texts, images, metadata):
|
| 41 |
if text:
|
| 42 |
+
display_text = f"{text}\n".replace("\n", "<br>") # .replace(" ", " ") Preserves white spaces, but creates text outside the width of the window
|
| 43 |
+
st.markdown(f"<pre>{display_text}</pre>", unsafe_allow_html=True)
|
| 44 |
elif image:
|
| 45 |
+
st.markdown(f'<img src="{meta["src"]}" style="max-width: 1000px; height: auto;" />', unsafe_allow_html=True)
|
| 46 |
+
st.text("\n")
|
| 47 |
+
|
| 48 |
|
| 49 |
if __name__ == "__main__":
|
| 50 |
st.set_page_config(layout="wide")
|
| 51 |
+
path_web_documents_dataset = "./web_docs_final_replaceimgbyurl"
|
| 52 |
visualization = Visualization(path_web_documents_dataset=path_web_documents_dataset)
|
| 53 |
visualization.visualization()
|
web_docs_final/data-00000-of-00002.arrow
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:0e5527454919de59e213c2f3ea7e7eab79a0ec9e32e6cd60987c89edc6768028
|
| 3 |
-
size 315163440
|
|
|
|
|
|
|
|
|
|
|
|
web_docs_final/data-00001-of-00002.arrow
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:b6a5b9f8f8155a2fb407afd071f2c4e72e93a030ae60261add15840583bcf5a1
|
| 3 |
-
size 264390800
|
|
|
|
|
|
|
|
|
|
|
|
web_docs_final/state.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:9bf0df57986410f1c06732db30d5cd49675992821a69824661ae1cafc223114e
|
| 3 |
-
size 393
|
|
|
|
|
|
|
|
|
|
|
|
web_docs_final/dataset_info.json → web_docs_final_replaceimgbyurl/data-00000-of-00001.arrow
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ab183c2859d5d97a347d3035d96c988a31c460d945bf8f1df2a56488f3525b56
|
| 3 |
+
size 5574096
|
web_docs_final_replaceimgbyurl/dataset_info.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"citation": "",
|
| 3 |
+
"description": "",
|
| 4 |
+
"features": {
|
| 5 |
+
"images": {
|
| 6 |
+
"feature": {
|
| 7 |
+
"dtype": "string",
|
| 8 |
+
"_type": "Value"
|
| 9 |
+
},
|
| 10 |
+
"_type": "Sequence"
|
| 11 |
+
},
|
| 12 |
+
"metadata": {
|
| 13 |
+
"dtype": "string",
|
| 14 |
+
"_type": "Value"
|
| 15 |
+
},
|
| 16 |
+
"general_metadata": {
|
| 17 |
+
"dtype": "string",
|
| 18 |
+
"_type": "Value"
|
| 19 |
+
},
|
| 20 |
+
"texts": {
|
| 21 |
+
"feature": {
|
| 22 |
+
"dtype": "string",
|
| 23 |
+
"_type": "Value"
|
| 24 |
+
},
|
| 25 |
+
"_type": "Sequence"
|
| 26 |
+
}
|
| 27 |
+
},
|
| 28 |
+
"homepage": "",
|
| 29 |
+
"license": ""
|
| 30 |
+
}
|
web_docs_final_replaceimgbyurl/state.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_data_files": [
|
| 3 |
+
{
|
| 4 |
+
"filename": "data-00000-of-00001.arrow"
|
| 5 |
+
}
|
| 6 |
+
],
|
| 7 |
+
"_fingerprint": "39738be005ac73bb",
|
| 8 |
+
"_format_columns": [
|
| 9 |
+
"general_metadata",
|
| 10 |
+
"images",
|
| 11 |
+
"metadata",
|
| 12 |
+
"texts"
|
| 13 |
+
],
|
| 14 |
+
"_format_kwargs": {},
|
| 15 |
+
"_format_type": null,
|
| 16 |
+
"_output_all_columns": false,
|
| 17 |
+
"_split": null
|
| 18 |
+
}
|