import gradio as gr
import pandas as pd
import polars as pl
import logging

logging.basicConfig(format='%(name)s - %(asctime)s - %(message)s', level=logging.INFO)

logging.info("loading data")
# Read the precomputed string statistics straight from the Hub (polars resolves hf:// paths).
# Columns used below: dataset, config, split, column_name, num_examples, null_count,
# null_proportion, min, max, mean, median, std, partial, histogram.
data = pl.read_parquet("hf://datasets/polinaeterna/hub_datasets_string_statistics/data/*.parquet")
logging.info("data loaded")
# Global min/max of every statistic; these become the slider bounds in the UI below.
min_num_examples = data["num_examples"].min()
max_num_examples = data["num_examples"].max()
min_null_count = data["null_count"].min()
max_null_count = data["null_count"].max()
min_null_prop = data["null_proportion"].min()
max_null_prop = data["null_proportion"].max()
min_min = data["min"].min()
max_min = data["min"].max()
min_max = data["max"].min()
max_max = data["max"].max()
min_mean = data["mean"].min()
max_mean = data["mean"].max()
min_median = data["median"].min()
max_median = data["median"].max()
min_std = data["std"].min()
max_std = data["std"].max()

def urlize(dataset_name):
    return f"[{dataset_name}](https://huggingface.co/datasets/{dataset_name})"

def filter_data(
    min_num_examples_input, max_num_examples_input,
    min_null_count_input, max_null_count_input,
    min_null_prop_input, max_null_prop_input,
    min_min_input, max_min_input,
    min_max_input, max_max_input,
    min_mean_input, max_mean_input,
    min_median_input, max_median_input,
    min_std_input, max_std_input,
    sort_by,
    column_name,
    include_partial=False,
):
    """Filter the statistics table by the selected ranges, optionally restrict it to a single
    column name and sort it; return a (dataframe, summary markdown) pair for the UI."""
    df = data.filter(
        (pl.col("num_examples") >= min_num_examples_input) & (pl.col("num_examples") <= max_num_examples_input) &
        (pl.col("null_count") >= min_null_count_input) & (pl.col("null_count") <= max_null_count_input) &
        (pl.col("null_proportion") >= min_null_prop_input) & (pl.col("null_proportion") <= max_null_prop_input) &
        (pl.col("min") >= min_min_input) & (pl.col("min") <= max_min_input) &
        (pl.col("max") >= min_max_input) & (pl.col("max") <= max_max_input) &
        (pl.col("mean") >= min_mean_input) & (pl.col("mean") <= max_mean_input) &
        (pl.col("median") >= min_median_input) & (pl.col("median") <= max_median_input) &
        (pl.col("std") >= min_std_input) & (pl.col("std") <= max_std_input)
    )
    if not include_partial:
        # Keep only splits whose statistics were computed on the full data.
        df = df.filter(pl.col("partial") == False)
    if column_name:
        df = df.filter(pl.col("column_name") == column_name)
    if sort_by:
        try:
            sort_cols, sort_descs = parse_sort_by(sort_by)
        except Exception:
            return pd.DataFrame(), "incorrect sort string format"
        logging.info(sort_cols)
        logging.info(sort_descs)
        df = df.sort(
            *sort_cols, descending=sort_descs if len(sort_descs) > 1 else sort_descs[0],
        )
    n_rows = df.shape[0]
    n_splits = df.group_by(["dataset", "config", "split"]).len().shape[0]
    n_datasets = df["dataset"].n_unique()
    max_rows = 100
    note = f" (first {max_rows} rows displayed)." if n_rows > max_rows else "."
    text = f"{n_rows} rows / {n_splits} unique splits / {n_datasets} unique datasets found{note} \n"
    df = df.to_pandas()
    # Turn dataset names into markdown links for the results table.
    df["dataset"] = df["dataset"].apply(urlize)
    # The histogram column is not shown in the results table.
    df = df.drop("histogram", axis=1)
    logging.info(df.head(2))
    if df.shape[0] > max_rows:
        return df.head(max_rows), text
    return df, text

def parse_sort_by(sort_string):
    """Parse a '<column>:asc/desc;<column>:asc/desc' string into column names and descending flags."""
    args = sort_string.split(";")
    col_names, descs = [], []
    for arg in args:
        col_name, desc = arg.split(":")
        col_names.append(col_name)
        descs.append(desc == "desc")
    return col_names, descs
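# Example, matching the placeholder shown in the sort textbox below:
# parse_sort_by("num_examples:desc;std:asc") -> (["num_examples", "std"], [True, False])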

with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Filter text datasets by string statistics
        ### The raw data is here:
        """)
    html_code = """
    <iframe
      src="https://huggingface.co/datasets/polinaeterna/hub_datasets_string_statistics/embed/viewer/default/train"
      frameborder="0"
      width="100%"
      height="560px"
    ></iframe>
    """
    gr.HTML(value=html_code)
| gr.Markdown("- Number of examples range") | |
| with gr.Row(): | |
| with gr.Column(): | |
| min_num_examples_input = gr.Slider(min_num_examples, max_num_examples, min_num_examples, step=1, label="Min num examples value") | |
| with gr.Column(): | |
| max_num_examples_input = gr.Slider(min_num_examples, max_num_examples, max_num_examples, step=1, label="Max num examples value") | |
| gr.Markdown("- Null count range") | |
| with gr.Row(): | |
| with gr.Column(): | |
| min_null_count_input = gr.Slider(min_null_count, max_null_count, min_null_count, step=1, label="Min null count value") | |
| with gr.Column(): | |
| max_null_count_input = gr.Slider(min_null_count, max_null_count, max_null_count, step=1, label="Max null count value") | |
| gr.Markdown("- Null proportion range") | |
| with gr.Row(): | |
| with gr.Column(): | |
| min_null_prop_input = gr.Slider(min_null_prop, max_null_prop, min_null_prop, step=0.01, label="Min null proportion value") | |
| with gr.Column(): | |
| max_null_prop_input = gr.Slider(min_null_prop, max_null_prop, max_null_prop, step=0.01, label="Max null proportion value") | |
| gr.Markdown("- Minimum string length (in symbols) range") | |
| with gr.Row(): | |
| with gr.Column(): | |
| min_min_input = gr.Slider(min_min, max_min, min_min, step=1, label="Min min value") | |
| with gr.Column(): | |
| max_min_input = gr.Slider(min_min, max_min, max_min, step=1, label="Max min value") | |
| gr.Markdown("- Maximum string length (in symbols) range") | |
| with gr.Row(): | |
| with gr.Column(): | |
| min_max_input = gr.Slider(min_max, max_max, min_max, step=1, label="Min max value") | |
| with gr.Column(): | |
| max_max_input = gr.Slider(min_max, max_max, max_max, step=1, label="Max max value") | |
| gr.Markdown("- Mean string length (in symbols) range") | |
| with gr.Row(): | |
| with gr.Column(): | |
| min_mean_input = gr.Slider(min_mean, max_mean, min_mean, step=1, label="Min mean value") | |
| with gr.Column(): | |
| max_mean_input = gr.Slider(min_mean, max_mean, max_mean, step=1, label="Max mean value") | |
| gr.Markdown("- Median string length (in symbols) range") | |
| with gr.Row(): | |
| with gr.Column(): | |
| min_median_input = gr.Slider(min_median, max_median, min_median, step=1, label="Min median value") | |
| with gr.Column(): | |
| max_median_input = gr.Slider(min_median, max_median, max_median, step=1, label="Max median value") | |
| gr.Markdown("- Standard deviation of string length (in symbols) range") | |
| with gr.Row(): | |
| with gr.Column(): | |
| min_std_input = gr.Slider(min_std, max_std, min_std, step=1, label="Min std value") | |
| with gr.Column(): | |
| max_std_input = gr.Slider(min_std, max_std, max_std, step=1, label="Max std value") | |
    sort_by = gr.Textbox(placeholder="num_examples:desc;std:asc;null_proportion:asc", label="Sort by (optional), in the following format: '<column_name_1>:desc/asc;<column_name_2>:desc/asc'")
    column_name = gr.Textbox(placeholder="text", label="Column name, if you want to check only a specific column (optional)")
    include_partial = gr.Checkbox(False, label="Include partial datasets")
    # max_rows = gr.Number(100, )
    btn = gr.Button("Get datasets")
    summary = gr.Markdown()
    datasets = gr.DataFrame(datatype="markdown")
    btn.click(filter_data, inputs=[
        min_num_examples_input, max_num_examples_input,
        min_null_count_input, max_null_count_input,
        min_null_prop_input, max_null_prop_input,
        min_min_input, max_min_input,
        min_max_input, max_max_input,
        min_mean_input, max_mean_input,
        min_median_input, max_median_input,
        min_std_input, max_std_input,
        sort_by,
        column_name,
        include_partial,
    ], outputs=[datasets, summary])

demo.launch(debug=True)