|
|
|
|
|
import gradio as gr |
|
import pandas as pd |
|
import requests |
|
import io |
|
import dask.dataframe as dd |
|
from datasets import load_dataset, Image |
|
from mlcroissant import Dataset as CroissantDataset |
|
from huggingface_hub import get_token |
|
import polars as pl |
|
import warnings |
|
import traceback |
|
|
|
|
|
# Silence third-party warnings so they do not clutter the console output.
warnings.filterwarnings("ignore")


# Registry of the datasets exposed in the UI, keyed by a short tab identifier.
# Per-entry fields:
#   name       -- Hugging Face Hub repository id
#   emoji      -- decoration used in the tab title and heading
#   search_col -- column name (str) or list of column names searched by keyword
#   methods    -- access-method labels offered in the radio selector; the first
#                 entry is the default selection
#   is_public  -- False marks the dataset as gated (auth headers are sent and a
#                 login note is displayed)
DATASET_CONFIG = {
    "caselaw": {
        "name": "common-pile/caselaw_access_project",
        "emoji": "βοΈ",
        "search_col": "text",
        "methods": ["π¨ API (requests)", "π§ Dask", "π₯ Croissant"],
        "is_public": True,
    },
    "prompts": {
        "name": "fka/awesome-chatgpt-prompts",
        "emoji": "π€",
        "search_col": ["act", "prompt"],
        "methods": ["πΌ Pandas", "π¨ API (requests)", "π₯ Croissant"],
        "is_public": True,
    },
    "finance": {
        "name": "snorkelai/agent-finance-reasoning",
        "emoji": "π°",
        "search_col": ["question", "answer"],
        "methods": ["πΌ Pandas", "π§ Polars", "π¨ API (requests)", "π₯ Croissant"],
        "is_public": False,
    },
    "medical": {
        "name": "FreedomIntelligence/medical-o1-reasoning-SFT",
        "emoji": "π©Ί",
        "search_col": "conversations",
        "methods": ["πΌ Pandas", "π§ Polars", "π¨ API (requests)", "π₯ Croissant"],
        "is_public": False,
    },
    "inscene": {
        "name": "peteromallet/InScene-Dataset",
        "emoji": "πΌοΈ",
        "search_col": "text",
        "methods": ["π€ Datasets", "πΌ Pandas", "π§ Polars", "π¨ API (requests)", "π₯ Croissant"],
        "is_public": False,
    },
}
|
|
|
|
|
|
|
def get_auth_headers():
    """Build HTTP authorization headers from the locally stored Hugging Face token.

    Returns:
        ``{"Authorization": "Bearer <token>"}`` when ``get_token()`` yields a
        truthy token, otherwise an empty dict.
    """
    hf_token = get_token()
    if not hf_token:
        return {}
    return {"Authorization": f"Bearer {hf_token}"}
|
|
|
def dataframe_to_outputs(df: pd.DataFrame):
    """Render a results DataFrame into the export formats shown in the UI.

    Args:
        df: The (possibly filtered) result rows.

    Returns:
        A 4-tuple ``(markdown, csv_path, xlsx_path, tab_delimited)``:

        * ``markdown`` -- the frame rendered as a Markdown table (all cells
          stringified first), or a "no results" message when ``df`` is empty.
        * ``csv_path`` / ``xlsx_path`` -- filesystem paths of the exported
          files, suitable as values for ``gr.File`` outputs; ``None`` when
          ``df`` is empty.
        * ``tab_delimited`` -- the frame as tab-separated text for copy/paste,
          or a placeholder message when ``df`` is empty.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    if df.empty:
        return "No results found. π€·", None, None, "No results to copy."

    df_str = df.astype(str)
    markdown_output = df_str.to_markdown(index=False)

    # gr.File outputs take a file path, so the exports are written to disk.
    # A fresh directory per call lets both files keep their friendly names
    # without concurrent requests overwriting each other.
    export_dir = tempfile.mkdtemp(prefix="hf_dataset_explorer_")
    csv_path = os.path.join(export_dir, "results.csv")
    xlsx_path = os.path.join(export_dir, "results.xlsx")

    df.to_csv(csv_path, index=False)

    try:
        df.to_excel(xlsx_path, index=False, engine='openpyxl')
    except (TypeError, ValueError):
        # Cells holding lists/dicts/arrays cannot be written to a worksheet;
        # fall back to the stringified frame rather than failing the request.
        df_str.to_excel(xlsx_path, index=False, engine='openpyxl')

    tab_delimited_output = df.to_csv(sep='\t', index=False)

    return markdown_output, csv_path, xlsx_path, tab_delimited_output
|
|
|
|
|
def handle_error(e: Exception):
    """Turn an exception into the 7-value output tuple expected by the UI.

    Must be called while the exception is being handled, because the full
    traceback is taken from ``traceback.format_exc()``. The traceback is also
    printed to stdout.

    Args:
        e: The exception that was caught.

    Returns:
        A tuple matching the outputs of the fetch callback: an empty
        DataFrame, an empty gallery, a fenced Markdown block with the message
        and traceback, ``None`` for both download slots, the plain error
        message, and a Python-fenced snippet noting the error.
    """
    trace_text = traceback.format_exc()
    print(trace_text)

    reason = str(e)
    message = f"π¨ An error occurred: {reason}\n\n"

    # Authentication failures get an extra hint about logging in.
    looks_like_auth_failure = "401" in reason or "Gated" in reason
    if looks_like_auth_failure:
        message += "π For gated datasets, did you log in? Try `huggingface-cli login` in your terminal."

    empty_frame = pd.DataFrame()
    empty_gallery = gr.Gallery(None, label="πΌοΈ Image Results")
    markdown_report = f"```\n{message}\n\n{trace_text}\n```"
    snippet_report = f"```python\n# π¨ Error during code generation:\n# {e}\n```"

    return (
        empty_frame,
        empty_gallery,
        markdown_report,
        None,
        None,
        message,
        snippet_report,
    )
|
|
|
|
|
|
|
def _read_first_available(module, attempts):
    """Try ``module.<reader_name>(path)`` for each attempt and return the first success.

    Args:
        module: Object exposing the reader callables (e.g. ``pd`` or ``pl``).
        attempts: Ordered iterable of ``(reader_name, path)`` pairs.

    Returns:
        ``(result, reader_name, path)`` for the first attempt that succeeds.

    Raises:
        The exception raised by the last attempt when every attempt fails,
        or ``ValueError`` when ``attempts`` is empty.
    """
    last_error = None
    for reader_name, path in attempts:
        try:
            return getattr(module, reader_name)(path), reader_name, path
        except Exception as exc:  # any load failure means "try the next layout"
            last_error = exc
    if last_error is None:
        raise ValueError("no read attempts were supplied")
    raise last_error


def _filter_rows(df, dataset_key, search_cols, query):
    """Return the rows of ``df`` that contain ``query`` (case-insensitive, literal match)."""
    if dataset_key == 'medical':
        needle = query.lower()

        def _second_turn_matches(turns):
            # Only the second entry of the conversation list is searched.
            return (
                isinstance(turns, list)
                and len(turns) > 1
                and needle in str(turns[1].get('value', '')).lower()
            )

        return df[df['conversations'].apply(_second_turn_matches)]

    # Build the mask on df's own index so boolean indexing aligns even when
    # the index is not a default RangeIndex.
    mask = pd.Series(False, index=df.index)
    for col in search_cols:
        if col in df.columns and pd.api.types.is_string_dtype(df[col]):
            # regex=False: the query is a user-typed keyword, so characters
            # such as "(" or "[" must not be interpreted as a pattern.
            mask |= df[col].str.contains(query, case=False, na=False, regex=False)
    return df[mask]


def _gallery_value(value):
    """Convert an image cell into something ``gr.Gallery`` can display, or ``None``.

    Handled shapes (anything else is skipped):
      * PIL image objects, detected by module name so PIL need not be imported;
      * ``http(s)`` URL strings;
      * dicts with a ``src`` URL -- presumably what the rows API returns for
        image cells (TODO confirm);
      * dicts with raw ``bytes`` -- presumably what the parquet / ``to_pandas``
        image cells look like (TODO confirm); written to a temp file because
        the gallery takes paths, not byte strings.
    """
    if value is None:
        return None
    if isinstance(value, str):
        return value if value.startswith(("http://", "https://")) else None
    if isinstance(value, dict):
        if value.get('src'):
            return value['src']
        raw = value.get('bytes')
        if isinstance(raw, (bytes, bytearray)):
            import os
            import tempfile

            suffix = os.path.splitext(str(value.get('path') or ''))[1] or '.png'
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
                handle.write(raw)
            return handle.name
        return None
    if type(value).__module__.partition('.')[0] == 'PIL':
        return value
    return None


def fetch_data(dataset_key: str, access_method: str, query: str):
    """Load a preview of a dataset, optionally filter it, and format it for the UI.

    Args:
        dataset_key: Key into ``DATASET_CONFIG`` selecting the dataset.
        access_method: Label of the chosen access method; the branch is picked
            by substring ("API", "Pandas", "Polars", "Datasets", "Dask",
            "Croissant").
        query: Keyword to search for; an empty value disables filtering.

    Returns:
        A 7-tuple ``(df, gallery, markdown, csv, xlsx, tab_delimited,
        code_snippet)`` matching the outputs wired to the fetch button. Any
        exception is converted into the equivalent error tuple by
        ``handle_error``.
    """
    request_timeout = 30  # seconds; keeps a stalled request from hanging the UI

    try:
        config = DATASET_CONFIG[dataset_key]
        repo_id = config["name"]
        search_cols = [config["search_col"]] if isinstance(config["search_col"], str) else config["search_col"]
        df = pd.DataFrame()
        code_snippet = ""

        if "API" in access_method:
            url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset=0&length=100"
            headers = get_auth_headers() if not config["is_public"] else {}
            response = requests.get(url, headers=headers, timeout=request_timeout)
            response.raise_for_status()
            data = response.json()
            # Each entry of data['rows'] wraps the record in a 'row' dict, so
            # json_normalize(record_path='row') fails ("Must be list or null").
            df = pd.DataFrame([item['row'] for item in data.get('rows', [])])

            code_snippet = f"""
# π» Generated Code: API (requests)
import requests
import pandas as pd

# For gated datasets, get your token from https://huggingface.co/settings/tokens
# Make sure to `huggingface-cli login` first.
headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}}
url = "{url}"
response = requests.get(url, headers=headers) # Pass headers for gated datasets
data = response.json()
df = pd.DataFrame([item['row'] for item in data['rows']])
print(df.head())
"""

        elif "Pandas" in access_method:
            base_path = f"hf://datasets/{repo_id}/"
            if repo_id == "fka/awesome-chatgpt-prompts":
                attempts = [("read_csv", f"{base_path}prompts.csv")]
            else:
                attempts = [
                    ("read_parquet", f"{base_path}data/train-00000-of-00001.parquet"),
                    ("read_parquet", f"{base_path}train.parquet"),
                    ("read_json", f"{base_path}medical_o1_sft.json"),
                ]
            df, reader_name, loaded_path = _read_first_available(pd, attempts)

            # The snippet shows the reader and path that actually succeeded.
            code_snippet = f"""
# π» Generated Code: Pandas
import pandas as pd

# Make sure to `huggingface-cli login` for gated datasets.
file_path = "{loaded_path}"
df = pd.{reader_name}(file_path)
print(df.head())
"""

        elif "Polars" in access_method:
            base_path = f"hf://datasets/{repo_id}/"
            attempts = [
                ("read_parquet", f"{base_path}data/train-00000-of-00001.parquet"),
                ("read_parquet", f"{base_path}train.parquet"),
                ("read_json", f"{base_path}medical_o1_sft.json"),
            ]
            frame, reader_name, loaded_path = _read_first_available(pl, attempts)
            df = frame.to_pandas()

            code_snippet = f"""
# π» Generated Code: Polars
import polars as pl

# Make sure to `huggingface-cli login` for gated datasets.
file_path = "{loaded_path}"
df = pl.{reader_name}(file_path)
print(df.head())
"""

        elif "Datasets" in access_method:
            ds = load_dataset(repo_id, split='train[:100]')
            df = ds.to_pandas()
            code_snippet = f"""
# π» Generated Code: Datasets
from datasets import load_dataset

# Make sure to `huggingface-cli login` for gated datasets.
ds = load_dataset("{repo_id}", split='train')
print(ds)
"""

        elif "Dask" in access_method:
            df = dd.read_json(f"hf://datasets/{repo_id}/**/*.jsonl.gz").head(100)
            code_snippet = f"""
# π» Generated Code: Dask
import dask.dataframe as dd

# Make sure to `huggingface-cli login` for gated datasets.
ddf = dd.read_json("hf://datasets/{repo_id}/**/*.jsonl.gz")
print(ddf.head())
"""

        elif "Croissant" in access_method:
            from itertools import islice

            headers = get_auth_headers() if not config["is_public"] else {}
            jsonld_url = f"https://huggingface.co/api/datasets/{repo_id}/croissant"
            jsonld_response = requests.get(jsonld_url, headers=headers, timeout=request_timeout)
            # Surface HTTP failures (e.g. 401 on gated datasets) instead of
            # handing an error payload to the Croissant parser.
            jsonld_response.raise_for_status()
            jsonld = jsonld_response.json()
            ds = CroissantDataset(jsonld=jsonld)
            records = ds.records("default")
            # Only the first 100 records are pulled for the preview.
            df = pd.DataFrame(list(islice(records, 100)))
            code_snippet = f"""
# π» Generated Code: Croissant
import requests
from mlcroissant import Dataset as CroissantDataset
import pandas as pd

# For gated datasets, get your token from https://huggingface.co/settings/tokens
headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}}
jsonld_url = "{jsonld_url}"
jsonld = requests.get(jsonld_url, headers=headers).json()
ds = CroissantDataset(jsonld=jsonld)
records = ds.records("default") # This is a generator

# To preview data:
preview_rows = [row for _, row in zip(range(100), records)]
df = pd.DataFrame(preview_rows)
print(df.head())
"""

        if query and not df.empty:
            df = _filter_rows(df, dataset_key, search_cols, query)

        gallery_output = None
        if dataset_key == 'inscene' and not df.empty:
            gallery_data = []
            for _, row in df.iterrows():
                image = _gallery_value(row.get('image'))
                if image is None:
                    continue
                caption = row.get('text', '')
                gallery_data.append((image, caption if isinstance(caption, str) else ''))
            gallery_output = gr.Gallery(gallery_data, label="πΌοΈ Image Results", height=400)

        md, csv, xlsx, tab = dataframe_to_outputs(df)
        return df, gallery_output, md, csv, xlsx, tab, code_snippet

    except Exception as e:
        return handle_error(e)
|
|
|
|
|
|
|
|
|
def create_dataset_tab(dataset_key: str):
    """Build the UI tab for one dataset and wire its fetch button.

    Must be called inside an active ``gr.Blocks`` / ``gr.Tabs`` context. The
    tab contains the access-method selector, the query box, the results table,
    an image gallery (visible only for the ``inscene`` dataset), the export
    accordion and the generated code snippet.

    Args:
        dataset_key: Key into ``DATASET_CONFIG`` identifying the dataset.
    """
    cfg = DATASET_CONFIG[dataset_key]
    emoji = cfg['emoji']
    method_choices = cfg['methods']
    is_gated = not cfg['is_public']
    shows_images = dataset_key == 'inscene'

    with gr.Tab(f"{emoji} {dataset_key.capitalize()}"):
        gr.Markdown(f"## {emoji} Query the `{cfg['name']}` Dataset")
        if is_gated:
            gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")

        # Inputs: the first configured method is pre-selected.
        with gr.Row():
            access_method = gr.Radio(method_choices, label="π Access Method", value=method_choices[0])
            query = gr.Textbox(label="π Search Query", placeholder="Enter a keyword to search...")

        fetch_button = gr.Button("π Go Fetch!")

        # Primary result views.
        df_output = gr.DataFrame(label="π Results DataFrame", interactive=False, wrap=True)
        gallery_output = gr.Gallery(visible=shows_images, label="πΌοΈ Image Results")

        # Secondary export views, collapsed by default.
        with gr.Accordion("π View/Export Full Results", open=False):
            markdown_output = gr.Markdown(label="π Markdown View")
            with gr.Row():
                csv_output = gr.File(label="β¬οΈ Download CSV")
                xlsx_output = gr.File(label="β¬οΈ Download XLSX")

            copy_output = gr.Code(label="π Copy-Paste (Tab-Delimited)")

        code_output = gr.Code(label="π» Python Code Snippet", language="python")

        # The dataset key is passed to the callback through a State component.
        key_state = gr.State(dataset_key)
        callback_inputs = [key_state, access_method, query]
        callback_outputs = [
            df_output,
            gallery_output,
            markdown_output,
            csv_output,
            xlsx_output,
            copy_output,
            code_output,
        ]
        fetch_button.click(fn=fetch_data, inputs=callback_inputs, outputs=callback_outputs)
|
|
|
|
|
# Assemble the application: a title, a short description, and one tab per
# configured dataset (in DATASET_CONFIG insertion order).
with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
    gr.Markdown("# π€ Hugging Face Dataset Explorer")
    gr.Markdown(
        "Select a dataset, choose an access method, type a query, and see the results instantly. The app demonstrates various ways to access and search Hugging Face datasets and generates the code for you!"
    )

    with gr.Tabs():
        for key in DATASET_CONFIG:
            create_dataset_tab(key)


# Launch the server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)
|
|