Spaces:

awacke1
/

Gradio-Med-Law-Fin-Scene-Gemini

Running

App Files Files Community

Gradio-Med-Law-Fin-Scene-Gemini / app.py.v1

awacke1

Create app.py.v1

c42c89f verified 12 days ago

raw

history blame contribute delete

12.4 kB


	# app.py
	import gradio as gr
	import pandas as pd
	import requests
	import io
	import dask.dataframe as dd
	from datasets import load_dataset, Image
	from mlcroissant import Dataset as CroissantDataset
	from huggingface_hub import get_token
	import polars as pl
	import warnings
	import traceback

	# 🤫 Let's ignore those pesky warnings, shall we?
	# Installs a process-wide "ignore" filter with no category or module
	# restriction, so every warning raised after this point is suppressed —
	# including deprecation notices emitted by the libraries imported above.
	warnings.filterwarnings("ignore")

	# --- ⚙️ Configuration & Constants ---

	# 🎨 Let's give our datasets some personality with emojis and names!
	#
	# Registry of the datasets exposed by the app, keyed by a short dataset key.
	# One UI tab is created per key, and fetch_data() looks its settings up here.
	# Fields of each entry:
	#   "name"       - Hugging Face repo id, interpolated into API URLs and hf:// paths.
	#   "emoji"      - decoration used in the tab title and heading.
	#   "search_col" - a column name or a list of column names matched against the query.
	#   "methods"    - labels offered in the "Access Method" radio; fetch_data() dispatches
	#                  on the substrings "API", "Pandas", "Polars", "Datasets", "Dask"
	#                  and "Croissant", so a label must contain exactly one of them.
	#   "is_public"  - when False, auth headers are sent on API/Croissant requests and the
	#                  tab shows a "gated dataset" notice.
	# NOTE(review): column names and gated/public flags are not verifiable from this
	# file — confirm them against the actual dataset cards.
	DATASET_CONFIG = {
	"caselaw": {
	"name": "common-pile/caselaw_access_project",
	"emoji": "⚖️",
	"search_col": "text",
	"methods": ["💨 API (requests)", "🧊 Dask", "🥐 Croissant"],
	"is_public": True,
	},
	"prompts": {
	"name": "fka/awesome-chatgpt-prompts",
	"emoji": "🤖",
	"search_col": ["act", "prompt"],
	"methods": ["🐼 Pandas", "💨 API (requests)", "🥐 Croissant"],
	"is_public": True,
	},
	"finance": {
	"name": "snorkelai/agent-finance-reasoning",
	"emoji": "💰",
	"search_col": ["question", "answer"],
	"methods": ["🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"],
	"is_public": False,
	},
	# The "medical" key gets special-cased search logic in fetch_data().
	"medical": {
	"name": "FreedomIntelligence/medical-o1-reasoning-SFT",
	"emoji": "🩺",
	"search_col": "conversations",
	"methods": ["🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"],
	"is_public": False,
	},
	# The "inscene" key additionally drives the image gallery output.
	"inscene": {
	"name": "peteromallet/InScene-Dataset",
	"emoji": "🖼️",
	"search_col": "text",
	"methods": ["🤗 Datasets", "🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"],
	"is_public": False,
	},
	}

	# --- 🧰 Helpers & Utility Functions ---

	def get_auth_headers():
	    """🔑 Return a Bearer ``Authorization`` header dict, or ``{}`` when no token is found.

	    The token comes from ``huggingface_hub.get_token()``.
	    """
	    hf_token = get_token()
	    if not hf_token:
	        # No stored/env token: send the request anonymously.
	        return {}
	    return {"Authorization": f"Bearer {hf_token}"}

	def dataframe_to_outputs(df: pd.DataFrame):
	    """
	    📜 Render a DataFrame into the four export formats shown in the UI.

	    Args:
	        df: The (already filtered) result rows.

	    Returns:
	        A 4-tuple ``(markdown, csv_file, xlsx_file, tab_delimited)``:
	        - ``markdown``: the frame as a Markdown table (all cells stringified).
	        - ``csv_file`` / ``xlsx_file``: paths of freshly written ``results.csv``
	          and ``results.xlsx`` files, suitable as values for ``gr.File``
	          outputs; ``None`` when there is nothing to export.
	        - ``tab_delimited``: the frame as tab-separated text for copy/paste.

	        For an empty frame the text slots carry "no results" messages instead.

	    Raises:
	        ImportError: if the Markdown (``tabulate``) or Excel (``openpyxl``)
	            backends required by pandas are not installed.
	    """
	    # Local imports keep this block self-contained; both are standard library.
	    import os
	    import tempfile

	    if df.empty:
	        return "No results found. 🤷", None, None, "No results to copy."

	    # Stringify first so nested/object cells cannot break the Markdown renderer.
	    markdown_output = df.astype(str).to_markdown(index=False)
	    tab_delimited_output = df.to_csv(sep='\t', index=False)

	    # gr.File has no ``from_bytes`` constructor; a File output expects a path.
	    # Each call gets its own directory so concurrent requests never overwrite
	    # each other's "results.*" files. The directory is left for the OS temp
	    # cleaner to reclaim.
	    export_dir = tempfile.mkdtemp(prefix="dataset_explorer_")
	    csv_path = os.path.join(export_dir, "results.csv")
	    xlsx_path = os.path.join(export_dir, "results.xlsx")
	    df.to_csv(csv_path, index=False)
	    df.to_excel(xlsx_path, index=False, engine='openpyxl')

	    return markdown_output, csv_path, xlsx_path, tab_delimited_output


	def handle_error(e: Exception):
	    """
	    😱 Turn an exception into the 7-value tuple the fetch callback must return.

	    Must be called while the exception is being handled, because the traceback
	    is taken from ``traceback.format_exc()``. The traceback is also printed to
	    stdout.

	    Args:
	        e: The exception that was caught.

	    Returns:
	        ``(empty DataFrame, empty gallery, markdown report, None, None,
	        plain error message, code-panel error comment)`` — one value per
	        output component, in the order the UI wires them.
	    """
	    reason = str(e)
	    trace_text = traceback.format_exc()
	    print(trace_text)

	    error_message = f"🚨 An error occurred: {reason}\n\n"
	    # Authentication-looking failures get a hint about logging in.
	    if any(marker in reason for marker in ("401", "Gated")):
	        error_message += "🔑 For gated datasets, did you log in? Try `huggingface-cli login` in your terminal."

	    empty_frame = pd.DataFrame()
	    empty_gallery = gr.Gallery(None, label="🖼️ Image Results")
	    markdown_report = f"```\n{error_message}\n\n{trace_text}\n```"
	    snippet_report = f"```python\n# 🚨 Error during code generation:\n# {reason}\n```"

	    return (
	        empty_frame,
	        empty_gallery,
	        markdown_report,
	        None,
	        None,
	        error_message,
	        snippet_report,
	    )

	# --- 🎣 Data Fetching & Processing Functions ---

	# Upper bound (seconds) for each HTTP request so a stalled server cannot hang the UI.
	_HTTP_TIMEOUT_SECONDS = 30


	def _format_snippet(lines):
	    """Join snippet lines into the newline-wrapped text shown in the code panel."""
	    return "\n" + "\n".join(lines) + "\n"


	def _reader_name(path):
	    """Return the reader function name (read_csv / read_json / read_parquet) for a file path."""
	    if path.endswith(".csv"):
	        return "read_csv"
	    if path.endswith(".json"):
	        return "read_json"
	    return "read_parquet"


	def _candidate_files(repo_id):
	    """List the hf:// file locations probed, in order, for a dataset repo."""
	    base = f"hf://datasets/{repo_id}/"
	    if repo_id == "fka/awesome-chatgpt-prompts":
	        return [f"{base}prompts.csv"]
	    return [
	        f"{base}data/train-00000-of-00001.parquet",
	        f"{base}train.parquet",
	        f"{base}medical_o1_sft.json",
	    ]


	def _read_first_available(read, paths):
	    """Call ``read(path)`` for each path in order; return ``(result, path)`` of the first success."""
	    last_error = None
	    for path in paths:
	        try:
	            return read(path), path
	        except Exception as err:  # any read failure just means "try the next location"
	            last_error = err
	    # Every location failed: surface the last failure to the caller.
	    raise last_error


	def _load_via_api(repo_id, headers):
	    """Fetch the first 100 rows from the datasets-server ``/rows`` endpoint."""
	    url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset=0&length=100"
	    response = requests.get(url, headers=headers, timeout=_HTTP_TIMEOUT_SECONDS)
	    response.raise_for_status()
	    payload = response.json()
	    # Each entry of "rows" wraps the record in a "row" dict, so the records
	    # are collected directly; json_normalize(record_path='row') rejects a
	    # dict at that path.
	    df = pd.DataFrame([entry["row"] for entry in payload.get("rows", [])])
	    snippet = _format_snippet([
	        "# 💻 Generated Code: API (requests)",
	        "import requests",
	        "import pandas as pd",
	        "",
	        "# For gated datasets, get your token from https://huggingface.co/settings/tokens",
	        "# Make sure to `huggingface-cli login` first.",
	        'headers = {"Authorization": "Bearer YOUR_HF_TOKEN"}',
	        f'url = "{url}"',
	        "response = requests.get(url, headers=headers) # Pass headers for gated datasets",
	        "data = response.json()",
	        "df = pd.DataFrame([item['row'] for item in data['rows']])",
	        "print(df.head())",
	    ])
	    return df, snippet


	def _load_via_pandas(repo_id):
	    """Read the dataset file with pandas from the first hf:// location that works."""
	    df, used_path = _read_first_available(
	        lambda path: getattr(pd, _reader_name(path))(path),
	        _candidate_files(repo_id),
	    )
	    # The snippet shows the file and reader that actually succeeded.
	    snippet = _format_snippet([
	        "# 💻 Generated Code: Pandas",
	        "import pandas as pd",
	        "",
	        "# Make sure to `huggingface-cli login` for gated datasets.",
	        f'file_path = "{used_path}"',
	        f"df = pd.{_reader_name(used_path)}(file_path)",
	        "print(df.head())",
	    ])
	    return df, snippet


	def _load_via_polars(repo_id):
	    """Read the dataset file with Polars (converted to pandas) from the first location that works."""
	    base = f"hf://datasets/{repo_id}/"
	    paths = [
	        f"{base}data/train-00000-of-00001.parquet",
	        f"{base}train.parquet",
	        f"{base}medical_o1_sft.json",
	    ]
	    df, used_path = _read_first_available(
	        lambda path: getattr(pl, _reader_name(path))(path).to_pandas(),
	        paths,
	    )
	    snippet = _format_snippet([
	        "# 💻 Generated Code: Polars",
	        "import polars as pl",
	        "",
	        "# Make sure to `huggingface-cli login` for gated datasets.",
	        f'file_path = "{used_path}"',
	        f"df = pl.{_reader_name(used_path)}(file_path)",
	        "print(df.head())",
	    ])
	    return df, snippet


	def _load_via_datasets(repo_id):
	    """Load the first 100 training rows through the ``datasets`` library."""
	    df = load_dataset(repo_id, split='train[:100]').to_pandas()
	    snippet = _format_snippet([
	        "# 💻 Generated Code: Datasets",
	        "from datasets import load_dataset",
	        "",
	        "# Make sure to `huggingface-cli login` for gated datasets.",
	        f"ds = load_dataset(\"{repo_id}\", split='train')",
	        "print(ds)",
	    ])
	    return df, snippet


	def _load_via_dask(repo_id):
	    """Read the repo's gzipped JSON-lines files with Dask and keep up to 100 head rows."""
	    # NOTE(review): "*/.jsonl.gz" only matches files literally named ".jsonl.gz"
	    # one directory deep; "*/*.jsonl.gz" may have been intended — confirm
	    # against the repository layout before changing.
	    pattern = f"hf://datasets/{repo_id}/*/.jsonl.gz"
	    df = dd.read_json(pattern).head(100)
	    snippet = _format_snippet([
	        "# 💻 Generated Code: Dask",
	        "import dask.dataframe as dd",
	        "",
	        "# Make sure to `huggingface-cli login` for gated datasets.",
	        f'ddf = dd.read_json("{pattern}")',
	        "print(ddf.head())",
	    ])
	    return df, snippet


	def _load_via_croissant(repo_id, headers):
	    """Stream up to 100 records of the "default" record set described by Croissant metadata."""
	    jsonld_url = f"https://huggingface.co/api/datasets/{repo_id}/croissant"
	    response = requests.get(jsonld_url, headers=headers, timeout=_HTTP_TIMEOUT_SECONDS)
	    # Fail with the HTTP status instead of a confusing JSON/parse error.
	    response.raise_for_status()
	    records = CroissantDataset(jsonld=response.json()).records("default")
	    # zip() against range(100) stops pulling from the generator after 100 rows.
	    df = pd.DataFrame([row for _, row in zip(range(100), records)])
	    snippet = _format_snippet([
	        "# 💻 Generated Code: Croissant",
	        "import requests",
	        "from mlcroissant import Dataset as CroissantDataset",
	        "import pandas as pd",
	        "",
	        "# For gated datasets, get your token from https://huggingface.co/settings/tokens",
	        'headers = {"Authorization": "Bearer YOUR_HF_TOKEN"}',
	        f'jsonld_url = "{jsonld_url}"',
	        "jsonld = requests.get(jsonld_url, headers=headers).json()",
	        "ds = CroissantDataset(jsonld=jsonld)",
	        'records = ds.records("default") # This is a generator',
	        "",
	        "# To preview data:",
	        "preview_rows = [row for _, row in zip(range(100), records)]",
	        "df = pd.DataFrame(preview_rows)",
	        "print(df.head())",
	    ])
	    return df, snippet


	def _filter_rows(df, dataset_key, search_cols, query):
	    """Keep only rows whose searchable text contains ``query`` (case-insensitive, literal match)."""
	    if not query or df.empty:
	        return df

	    if dataset_key == 'medical':
	        needle = query.lower()

	        # Matches against the 'value' of the second conversation turn.
	        # NOTE(review): assumes each cell is a list of dicts — confirm
	        # against the actual dataset schema.
	        def second_turn_matches(turns):
	            return (
	                isinstance(turns, list)
	                and len(turns) > 1
	                and needle in str(turns[1].get('value', '')).lower()
	            )

	        return df[df['conversations'].apply(second_turn_matches)]

	    # Build the mask on df's own index so boolean indexing aligns even when
	    # the frame does not carry a default 0..n-1 index.
	    mask = pd.Series(False, index=df.index)
	    for col in search_cols:
	        if col not in df.columns:
	            continue
	        column = df[col]
	        # regex=False: the query is user text, not a pattern, so characters
	        # like "(" or "*" must not raise. Missing values never match.
	        mask |= column.notna() & column.astype(str).str.contains(query, case=False, regex=False)
	    return df[mask]


	def _is_pil_image(value):
	    """Report whether ``value`` is an object defined by the PIL package (checked without importing PIL)."""
	    return type(value).__module__.partition(".")[0] == "PIL"


	def _build_gallery(df):
	    """Create the image gallery from rows whose 'image' cell holds a PIL image."""
	    items = [
	        (row['image'], row.get('text', ''))
	        for _, row in df.iterrows()
	        if _is_pil_image(row.get('image'))
	    ]
	    return gr.Gallery(items, label="🖼️ Image Results", height=400)


	def fetch_data(dataset_key: str, access_method: str, query: str):
	    """
	    🚀 Fetch a dataset preview, filter it by ``query`` and format every UI output.

	    Args:
	        dataset_key: Key into ``DATASET_CONFIG``.
	        access_method: Radio label; the loader is picked by the substring it
	            contains ("API", "Pandas", "Polars", "Datasets", "Dask",
	            "Croissant"). An unrecognised label yields an empty result.
	        query: Text to search for; empty/None disables filtering.

	    Returns:
	        A 7-tuple ``(df, gallery, markdown, csv_file, xlsx_file,
	        tab_delimited, code_snippet)`` matching the click handler's outputs.
	        Any exception is caught and converted by ``handle_error`` into an
	        error-shaped tuple of the same length, so this function never raises
	        for ordinary failures.
	    """
	    try:
	        config = DATASET_CONFIG[dataset_key]
	        repo_id = config["name"]
	        configured_cols = config["search_col"]
	        search_cols = [configured_cols] if isinstance(configured_cols, str) else configured_cols

	        df, code_snippet = pd.DataFrame(), ""

	        if "API" in access_method:
	            headers = get_auth_headers() if not config["is_public"] else {}
	            df, code_snippet = _load_via_api(repo_id, headers)
	        elif "Pandas" in access_method:
	            df, code_snippet = _load_via_pandas(repo_id)
	        elif "Polars" in access_method:
	            df, code_snippet = _load_via_polars(repo_id)
	        elif "Datasets" in access_method:
	            df, code_snippet = _load_via_datasets(repo_id)
	        elif "Dask" in access_method:
	            df, code_snippet = _load_via_dask(repo_id)
	        elif "Croissant" in access_method:
	            headers = get_auth_headers() if not config["is_public"] else {}
	            df, code_snippet = _load_via_croissant(repo_id, headers)

	        # --- 🔍 Universal Search Logic ---
	        df = _filter_rows(df, dataset_key, search_cols, query)

	        # --- 🖼️ Special Image Handling ---
	        gallery_output = None
	        if dataset_key == 'inscene' and not df.empty:
	            gallery_output = _build_gallery(df)

	        md, csv_file, xlsx_file, tab = dataframe_to_outputs(df)
	        return df, gallery_output, md, csv_file, xlsx_file, tab, code_snippet

	    except Exception as e:
	        # Top-level UI boundary: show the failure instead of crashing the app.
	        return handle_error(e)


	# --- 🖼️ UI Generation ---

	def create_dataset_tab(dataset_key: str):
	    """
	    🏗️ Build one Gradio tab (inputs, outputs and click wiring) for a dataset.

	    Must be called inside an active ``gr.Blocks``/``gr.Tabs`` context, since
	    the components register themselves where they are created.

	    Args:
	        dataset_key: Key into ``DATASET_CONFIG`` selecting the dataset.
	    """
	    config = DATASET_CONFIG[dataset_key]
	    emoji = config['emoji']
	    method_choices = config['methods']
	    shows_images = dataset_key == 'inscene'

	    with gr.Tab(f"{emoji} {dataset_key.capitalize()}"):
	        gr.Markdown(f"## {emoji} Query the `{config['name']}` Dataset")
	        if not config['is_public']:
	            gr.Markdown("Note: This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")

	        # Inputs: how to load the data and what to look for.
	        with gr.Row():
	            access_method = gr.Radio(method_choices, label="🔑 Access Method", value=method_choices[0])
	            query = gr.Textbox(label="🔍 Search Query", placeholder="Enter a keyword to search...")

	        fetch_button = gr.Button("🚀 Go Fetch!")

	        # Primary outputs; the gallery is only visible on the image dataset's tab.
	        df_output = gr.DataFrame(label="📊 Results DataFrame", interactive=False, wrap=True)
	        gallery_output = gr.Gallery(visible=shows_images, label="🖼️ Image Results")

	        with gr.Accordion("📂 View/Export Full Results", open=False):
	            markdown_output = gr.Markdown(label="📝 Markdown View")
	            with gr.Row():
	                csv_output = gr.File(label="⬇️ Download CSV")
	                xlsx_output = gr.File(label="⬇️ Download XLSX")
	            # No `language` argument here, on purpose, for maximum compatibility.
	            copy_output = gr.Code(label="📋 Copy-Paste (Tab-Delimited)")

	        code_output = gr.Code(label="💻 Python Code Snippet", language="python")

	        # The dataset key travels as hidden state so fetch_data knows which tab called it.
	        dataset_state = gr.State(dataset_key)
	        click_inputs = [dataset_state, access_method, query]
	        # Order must match the 7-tuple returned by fetch_data / handle_error.
	        click_outputs = [
	            df_output,
	            gallery_output,
	            markdown_output,
	            csv_output,
	            xlsx_output,
	            copy_output,
	            code_output,
	        ]
	        fetch_button.click(fn=fetch_data, inputs=click_inputs, outputs=click_outputs)

	# --- 🚀 Main App ---
	with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
	    # Page header and a short description of what the app does.
	    gr.Markdown("# 🤗 Hugging Face Dataset Explorer")
	    intro_text = (
	        "Select a dataset, choose an access method, type a query, and see the results instantly. "
	        "The app demonstrates various ways to access and search Hugging Face datasets and generates the code for you!"
	    )
	    gr.Markdown(intro_text)

	    # One tab per configured dataset, in the order DATASET_CONFIG declares them.
	    with gr.Tabs():
	        for key in DATASET_CONFIG:
	            create_dataset_tab(key)

	# Launch the server only when executed as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch(debug=True)