# Uploaded by awacke1
# Commit message: Create app.py.v1
# Commit c42c89f (verified)
# app.py
import gradio as gr
import pandas as pd
import requests
import io
import dask.dataframe as dd
from datasets import load_dataset, Image
from mlcroissant import Dataset as CroissantDataset
from huggingface_hub import get_token
import polars as pl
import warnings
import traceback
# 🤫 Let's ignore those pesky warnings, shall we?
warnings.filterwarnings("ignore")
# --- ⚙️ Configuration & Constants ---
# 🎨 Let's give our datasets some personality with emojis and names!
# Access-method labels, defined once because several datasets share them.
# fetch_data picks a loader by looking for the plain word ("API", "Pandas", ...)
# inside the selected label, so the wording of each label matters.
_METHOD_API = "πŸ’¨ API (requests)"
_METHOD_DASK = "🧊 Dask"
_METHOD_CROISSANT = "πŸ₯ Croissant"
_METHOD_PANDAS = "🐼 Pandas"
_METHOD_POLARS = "🧊 Polars"
_METHOD_DATASETS = "πŸ€— Datasets"

# One entry per dataset tab. Each entry carries:
#   name       - repository id on the Hugging Face Hub
#   emoji      - decoration used in the tab title and heading
#   search_col - column name, or list of column names, scanned by the search box
#   methods    - access methods offered in the tab (the first one is preselected)
#   is_public  - when False, requests send an auth header and the tab shows a login note
DATASET_CONFIG = {
    "caselaw": dict(
        name="common-pile/caselaw_access_project",
        emoji="βš–οΈ",
        search_col="text",
        methods=[_METHOD_API, _METHOD_DASK, _METHOD_CROISSANT],
        is_public=True,
    ),
    "prompts": dict(
        name="fka/awesome-chatgpt-prompts",
        emoji="πŸ€–",
        search_col=["act", "prompt"],
        methods=[_METHOD_PANDAS, _METHOD_API, _METHOD_CROISSANT],
        is_public=True,
    ),
    "finance": dict(
        name="snorkelai/agent-finance-reasoning",
        emoji="πŸ’°",
        search_col=["question", "answer"],
        methods=[_METHOD_PANDAS, _METHOD_POLARS, _METHOD_API, _METHOD_CROISSANT],
        is_public=False,
    ),
    "medical": dict(
        name="FreedomIntelligence/medical-o1-reasoning-SFT",
        emoji="🩺",
        search_col="conversations",
        methods=[_METHOD_PANDAS, _METHOD_POLARS, _METHOD_API, _METHOD_CROISSANT],
        is_public=False,
    ),
    "inscene": dict(
        name="peteromallet/InScene-Dataset",
        emoji="πŸ–ΌοΈ",
        search_col="text",
        methods=[
            _METHOD_DATASETS,
            _METHOD_PANDAS,
            _METHOD_POLARS,
            _METHOD_API,
            _METHOD_CROISSANT,
        ],
        is_public=False,
    ),
}
# --- Helpers & Utility Functions ---
def get_auth_headers():
    """Build the HTTP headers that authenticate a request to the Hugging Face Hub.

    Returns:
        A dict holding a single ``Authorization: Bearer <token>`` entry when
        ``get_token()`` yields a token, otherwise an empty dict.
    """
    token = get_token()
    if not token:
        # No stored token: send the request anonymously.
        return {}
    return {"Authorization": f"Bearer {token}"}
def dataframe_to_outputs(df: pd.DataFrame):
    """Render a result DataFrame in the four export formats shown in the UI.

    Args:
        df: The (already filtered) results. ``None`` is treated like an empty
            frame.

    Returns:
        A 4-tuple ``(markdown, csv_path, xlsx_path, tab_delimited)``:

        * ``markdown`` - the frame as a Markdown table (every cell stringified),
          or a "no results" message when there is nothing to show;
        * ``csv_path`` / ``xlsx_path`` - paths of freshly written
          ``results.csv`` / ``results.xlsx`` files, or ``None`` when there is
          nothing to export;
        * ``tab_delimited`` - the frame as tab-separated text for copy/paste,
          or a placeholder message when there is nothing to copy.
    """
    # Local imports keep this function self-contained; both are stdlib.
    import os
    import tempfile

    if df is None or df.empty:
        return "No results found. 🀷", None, None, "No results to copy."

    # Stringify first so nested values (lists, dicts, ...) render in the table.
    df_str = df.astype(str)
    markdown_output = df_str.to_markdown(index=False)
    tab_delimited_output = df.to_csv(sep='\t', index=False)

    # File outputs are handed over as paths on disk. A dedicated directory per
    # call lets the downloads keep stable, friendly names without clashing
    # between concurrent requests.
    export_dir = tempfile.mkdtemp(prefix="dataset_explorer_")
    csv_path = os.path.join(export_dir, "results.csv")
    xlsx_path = os.path.join(export_dir, "results.xlsx")

    df.to_csv(csv_path, index=False)
    try:
        df.to_excel(xlsx_path, index=False, engine='openpyxl')
    except (TypeError, ValueError):
        # Some cell values (nested containers, tz-aware timestamps, ...) cannot
        # be written to a worksheet; fall back to their string form rather
        # than losing the whole export.
        df_str.to_excel(xlsx_path, index=False, engine='openpyxl')

    return markdown_output, csv_path, xlsx_path, tab_delimited_output
def handle_error(e: Exception):
    """Convert an exception into the seven-element tuple the fetch handler returns.

    Must be called while the exception is being handled, because the traceback
    is taken from ``traceback.format_exc()``. The traceback is also printed to
    stdout.

    Args:
        e: The exception that interrupted the fetch.

    Returns:
        ``(empty DataFrame, empty gallery, fenced message + traceback, None,
        None, plain message, commented Python snippet)``. A login hint is
        appended to the message when its text contains "401" or "Gated".
    """
    reason = str(e)
    full_trace = traceback.format_exc()
    print(full_trace)

    error_message = f"🚨 An error occurred: {reason}\n\n"
    if any(marker in reason for marker in ("401", "Gated")):
        # Looks like an authentication problem: tell the user how to log in.
        error_message += "πŸ”‘ For gated datasets, did you log in? Try `huggingface-cli login` in your terminal."

    empty_table = pd.DataFrame()
    empty_gallery = gr.Gallery(None, label="πŸ–ΌοΈ Image Results")
    markdown_report = f"```\n{error_message}\n\n{full_trace}\n```"
    snippet_placeholder = f"```python\n# 🚨 Error during code generation:\n# {reason}\n```"

    # Slots 4 and 5 are the CSV/XLSX downloads: nothing to offer on failure.
    return (
        empty_table,
        empty_gallery,
        markdown_report,
        None,
        None,
        error_message,
        snippet_placeholder,
    )
# --- 🎣 Data Fetching & Processing Functions ---
def _read_first_available(candidates):
    """Load the first candidate file that can be read.

    Args:
        candidates: Non-empty sequence of ``(path, reader)`` pairs, tried in
            order; ``reader`` is called with ``path`` as its only argument.

    Returns:
        ``(path, data)`` for the first pair whose reader succeeds.

    Raises:
        ValueError: If ``candidates`` is empty.
        Exception: Whatever the last reader raised when every candidate fails.
    """
    last_error = None
    for path, reader in candidates:
        try:
            return path, reader(path)
        except Exception as err:
            # A missing or unreadable file surfaces as many different error
            # types depending on the backend; remember it and try the next one.
            last_error = err
    if last_error is None:
        raise ValueError("no candidate files were given")
    raise last_error


def _filter_rows(df, dataset_key, search_cols, query):
    """Keep only the rows of *df* that contain *query* (case-insensitive).

    For the ``medical`` dataset the query is looked up in the ``value`` of the
    second entry of each ``conversations`` list. For every other dataset it is
    looked up, as a literal substring, in each configured string column that
    exists in the frame; a row is kept when any column matches.
    """
    if not query or df.empty:
        return df

    if dataset_key == 'medical':
        needle = query.lower()

        def second_turn_matches(turns):
            if not isinstance(turns, list) or len(turns) < 2:
                return False
            return needle in str(turns[1].get('value', '')).lower()

        return df[df['conversations'].apply(second_turn_matches)]

    # Share the frame's index so the boolean mask lines up with its rows even
    # when the index is not 0..n-1.
    mask = pd.Series(False, index=df.index)
    for col in search_cols:
        if col in df.columns and pd.api.types.is_string_dtype(df[col]):
            # regex=False: the search box holds plain text, and characters such
            # as "(" or "[" would otherwise be parsed as a regular expression.
            mask |= df[col].str.contains(query, case=False, na=False, regex=False)
    return df[mask]


def _gallery_items(df):
    """Collect ``(image, caption)`` pairs for the gallery from *df*.

    The ``image`` cell looks different per access method, so several shapes
    are accepted: a string (path or URL), a dict with a ``src`` URL, a dict
    with a ``path`` that exists or raw ``bytes``, or an image object. Image
    objects are recognised by duck typing because no imaging library is
    imported in this file. Rows whose image cannot be interpreted are skipped.
    """
    import os
    import tempfile

    items = []
    for _, row in df.iterrows():
        image = row.get('image')
        shown = None
        if isinstance(image, str):
            shown = image or None
        elif isinstance(image, dict):
            if image.get('src'):
                shown = image['src']
            elif image.get('path') and os.path.exists(image['path']):
                shown = image['path']
            elif image.get('bytes'):
                # Raw bytes: spill to a temp file the gallery can serve.
                suffix = os.path.splitext(image.get('path') or '')[1] or '.png'
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
                    handle.write(image['bytes'])
                shown = handle.name
        elif hasattr(image, 'save') and hasattr(image, 'size'):
            shown = image
        if shown is None:
            continue
        caption = row.get('text', '')
        items.append((shown, caption if isinstance(caption, str) else ''))
    return items


def fetch_data(dataset_key: str, access_method: str, query: str):
    """Load a dataset preview, filter it by *query* and prepare every UI output.

    Args:
        dataset_key: Key into ``DATASET_CONFIG``.
        access_method: Label of the chosen access method. The loader is picked
            by the word it contains ("API", "Pandas", "Polars", "Datasets",
            "Dask" or "Croissant"); an unrecognised label yields an empty
            result and an empty snippet.
        query: Text to search for; an empty value disables filtering.

    Returns:
        A 7-tuple ``(DataFrame, gallery, markdown, csv, xlsx, tab_delimited,
        code_snippet)``. ``gallery`` is ``None`` unless the dataset is
        ``inscene`` and rows remain after filtering. Any exception is caught
        and turned into the equivalent error tuple by ``handle_error``.
    """
    try:
        config = DATASET_CONFIG[dataset_key]
        repo_id = config["name"]
        configured_cols = config["search_col"]
        search_cols = [configured_cols] if isinstance(configured_cols, str) else configured_cols
        df = pd.DataFrame()
        code_snippet = ""
        if "API" in access_method:
            url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset=0&length=100"
            headers = get_auth_headers() if not config["is_public"] else {}
            # A timeout keeps a stalled server from hanging the UI forever.
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()
            # Each entry of "rows" wraps the record in a "row" dict (next to
            # "row_idx" / "truncated_cells"); only the record itself is kept.
            df = pd.DataFrame([item['row'] for item in data.get('rows', [])])
            code_snippet = f"""
# πŸ’» Generated Code: API (requests)
import requests
import pandas as pd
# For gated datasets, get your token from https://huggingface.co/settings/tokens
# Make sure to `huggingface-cli login` first.
headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}}
url = "{url}"
response = requests.get(url, headers=headers) # Pass headers for gated datasets
data = response.json()
df = pd.DataFrame([item['row'] for item in data['rows']])
print(df.head())
"""
        elif "Pandas" in access_method:
            base_path = f"hf://datasets/{repo_id}/"
            if repo_id == "fka/awesome-chatgpt-prompts":
                file_path = f"{base_path}prompts.csv"
                df = pd.read_csv(file_path)
            else:
                # The file layout differs per repository: try the known
                # locations in order and remember which one worked.
                file_path, df = _read_first_available([
                    (f"{base_path}data/train-00000-of-00001.parquet", pd.read_parquet),
                    (f"{base_path}train.parquet", pd.read_parquet),
                    (f"{base_path}medical_o1_sft.json", pd.read_json),
                ])
            if file_path.endswith('.csv'):
                reader_name = 'read_csv'
            elif file_path.endswith('.json'):
                reader_name = 'read_json'
            else:
                reader_name = 'read_parquet'
            code_snippet = f"""
# πŸ’» Generated Code: Pandas
import pandas as pd
# Make sure to `huggingface-cli login` for gated datasets.
file_path = "{file_path}"
df = pd.{reader_name}(file_path)
print(df.head())
"""
        elif "Polars" in access_method:
            base_path = f"hf://datasets/{repo_id}/"
            file_path, polars_frame = _read_first_available([
                (f"{base_path}data/train-00000-of-00001.parquet", pl.read_parquet),
                (f"{base_path}train.parquet", pl.read_parquet),
                (f"{base_path}medical_o1_sft.json", pl.read_json),
            ])
            df = polars_frame.to_pandas()
            reader_name = 'read_json' if file_path.endswith('.json') else 'read_parquet'
            code_snippet = f"""
# πŸ’» Generated Code: Polars
import polars as pl
# Make sure to `huggingface-cli login` for gated datasets.
file_path = "{file_path}"
df = pl.{reader_name}(file_path)
print(df.head())
"""
        elif "Datasets" in access_method:
            ds = load_dataset(repo_id, split='train[:100]')
            df = ds.to_pandas()
            code_snippet = f"""
# πŸ’» Generated Code: Datasets
from datasets import load_dataset
# Make sure to `huggingface-cli login` for gated datasets.
ds = load_dataset("{repo_id}", split='train')
print(ds)
"""
        elif "Dask" in access_method:
            # head() materialises the first rows as a pandas DataFrame.
            df = dd.read_json(f"hf://datasets/{repo_id}/**/*.jsonl.gz").head(100)
            code_snippet = f"""
# πŸ’» Generated Code: Dask
import dask.dataframe as dd
# Make sure to `huggingface-cli login` for gated datasets.
ddf = dd.read_json("hf://datasets/{repo_id}/**/*.jsonl.gz")
print(ddf.head())
"""
        elif "Croissant" in access_method:
            headers = get_auth_headers() if not config["is_public"] else {}
            jsonld_url = f"https://huggingface.co/api/datasets/{repo_id}/croissant"
            metadata_response = requests.get(jsonld_url, headers=headers, timeout=30)
            # Fail here with the HTTP status instead of later on a confusing
            # JSON or Croissant parsing error.
            metadata_response.raise_for_status()
            jsonld = metadata_response.json()
            ds = CroissantDataset(jsonld=jsonld)
            records = ds.records("default")
            # zip with range() stops the record stream after 100 rows.
            data_rows = [row for _, row in zip(range(100), records)]
            df = pd.DataFrame(data_rows)
            code_snippet = f"""
# πŸ’» Generated Code: Croissant
import requests
from mlcroissant import Dataset as CroissantDataset
import pandas as pd
# For gated datasets, get your token from https://huggingface.co/settings/tokens
headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}}
jsonld_url = "{jsonld_url}"
jsonld = requests.get(jsonld_url, headers=headers).json()
ds = CroissantDataset(jsonld=jsonld)
records = ds.records("default") # This is a generator
# To preview data:
preview_rows = [row for _, row in zip(range(100), records)]
df = pd.DataFrame(preview_rows)
print(df.head())
"""
        # --- Search ---
        df = _filter_rows(df, dataset_key, search_cols, query)
        # --- Image gallery (only the image dataset has one) ---
        gallery_output = None
        if dataset_key == 'inscene' and not df.empty:
            gallery_output = gr.Gallery(_gallery_items(df), label="πŸ–ΌοΈ Image Results", height=400)
        md, csv_file, xlsx_file, tab = dataframe_to_outputs(df)
        return df, gallery_output, md, csv_file, xlsx_file, tab, code_snippet
    except Exception as e:
        # UI boundary: report the failure in the interface instead of crashing.
        return handle_error(e)
# --- 🖼️ UI Generation ---
def create_dataset_tab(dataset_key: str):
    """Add one tab to the enclosing Gradio layout for the given dataset.

    The tab holds an access-method selector, a search box, a fetch button and
    the result widgets, and wires the button to ``fetch_data``.

    Args:
        dataset_key: Key into ``DATASET_CONFIG``; also used, capitalised, as
            the tab title.
    """
    config = DATASET_CONFIG[dataset_key]
    emoji = config['emoji']
    methods = config['methods']
    show_login_note = not config['is_public']
    has_images = dataset_key == 'inscene'

    with gr.Tab(f"{emoji} {dataset_key.capitalize()}"):
        gr.Markdown(f"## {emoji} Query the `{config['name']}` Dataset")
        if show_login_note:
            gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")

        # Controls: method selector and search box side by side.
        with gr.Row():
            method_picker = gr.Radio(methods, label="πŸ”‘ Access Method", value=methods[0])
            query_box = gr.Textbox(label="πŸ” Search Query", placeholder="Enter a keyword to search...")
        go_button = gr.Button("πŸš€ Go Fetch!")

        # Primary results; the gallery is only visible for the image dataset.
        results_table = gr.DataFrame(label="πŸ“Š Results DataFrame", interactive=False, wrap=True)
        results_gallery = gr.Gallery(visible=has_images, label="πŸ–ΌοΈ Image Results")

        # Secondary views and downloads, collapsed by default.
        with gr.Accordion("πŸ“‚ View/Export Full Results", open=False):
            markdown_view = gr.Markdown(label="πŸ“ Markdown View")
            with gr.Row():
                csv_download = gr.File(label="⬇️ Download CSV")
                xlsx_download = gr.File(label="⬇️ Download XLSX")
            # Created without a `language` argument (it was dropped earlier
            # for compatibility).
            copy_pane = gr.Code(label="πŸ“‹ Copy-Paste (Tab-Delimited)")
        snippet_pane = gr.Code(label="πŸ’» Python Code Snippet", language="python")

        # The dataset key travels as hidden state so one handler serves every
        # tab; the outputs are listed in the order fetch_data returns them.
        key_state = gr.State(dataset_key)
        go_button.click(
            fn=fetch_data,
            inputs=[key_state, method_picker, query_box],
            outputs=[
                results_table,
                results_gallery,
                markdown_view,
                csv_download,
                xlsx_download,
                copy_pane,
                snippet_pane,
            ],
        )
# --- 🚀 Main App ---
# Assemble the page: title, short introduction, then one tab per entry of
# DATASET_CONFIG (in the order the entries are defined).
with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
    gr.Markdown("# πŸ€— Hugging Face Dataset Explorer")
    gr.Markdown(
        "Select a dataset, choose an access method, type a query, and see the results instantly. "
        "The app demonstrates various ways to access and search Hugging Face datasets and generates the code for you!"
    )
    with gr.Tabs():
        # Iterating a dict yields its keys, so .keys() is not needed.
        for key in DATASET_CONFIG:
            create_dataset_tab(key)

# Start the server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch(debug=True)