Ahmed Ahmed committed
Commit · f02d36b
1 Parent(s): 3a2ac99

no more dynamic updates

Browse files:
- app.py                      +50 -37
- src/display/formatting.py    +2 -2
app.py CHANGED

@@ -1,5 +1,4 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard
 import pandas as pd
 from huggingface_hub import snapshot_download, create_repo
 from huggingface_hub.utils import RepositoryNotFoundError

@@ -21,24 +20,26 @@ from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN, OWNER
 from src.populate import get_leaderboard_df
 from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval
 
-def …
-    …
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            AutoEvalColumn.model_type.name,
-            AutoEvalColumn.precision.name,
-        ],
-    )
+def create_results_dataframe():
+    """Create and return the results DataFrame for display"""
+    df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+    if df is None or df.empty:
+        # Return empty DataFrame with proper columns
+        return pd.DataFrame(columns=["Model", "Perplexity", "Average Score", "Type", "Precision"])
+
+    # Select and rename columns for display
+    display_df = df[[
+        AutoEvalColumn.model.name,
+        "Perplexity",  # This matches the task column name from Tasks.task0.value.col_name
+        AutoEvalColumn.average.name,
+        AutoEvalColumn.model_type.name,
+        AutoEvalColumn.precision.name,
+    ]].copy()
+
+    # Rename columns for better display
+    display_df.columns = ["Model", "Perplexity", "Average Score", "Type", "Precision"]
+
+    return display_df
 
 def run_perplexity_test(model_name, revision, precision):
     """Run perplexity evaluation on demand."""

@@ -47,7 +48,7 @@ def run_perplexity_test(model_name, revision, precision):
     import gradio as gr
 
     if not model_name:
-        return "Please enter a model name."
+        return "Please enter a model name.", gr.update()
 
     try:
         # Use stderr for more reliable logging in HF Spaces

@@ -62,24 +63,22 @@ def run_perplexity_test(model_name, revision, precision):
         sys.stderr.flush()
 
         if success:
-            sys.stderr.write("Evaluation succeeded - results …
+            sys.stderr.write("Evaluation succeeded - updating results table\n")
             sys.stderr.flush()
 
-            …
+            # Get updated results
+            updated_df = create_results_dataframe()
+
+            success_msg = f"""**Perplexity evaluation completed successfully!**
 
 **Model**: {model_name}
 **Perplexity Score**: {result:.4f}
 
-**Results have been saved …
-…
-1. Click on the **Leaderboard** tab above
-2. Refresh the page (Ctrl+R or Cmd+R)
-3. Your model should now appear in the rankings!
-
-**Note**: Due to technical limitations with the leaderboard component, results cannot be updated dynamically. The refresh is necessary to see the latest rankings."""
+**Results have been saved and the table below has been updated!**"""
+
+            return success_msg, gr.update(value=updated_df)
         else:
-            return f"**Evaluation failed**: {result}"
+            return f"**Evaluation failed**: {result}", gr.update()
 
     except Exception as e:
         error_msg = str(e)

@@ -87,7 +86,7 @@ def run_perplexity_test(model_name, revision, precision):
         sys.stderr.write(f"Critical error in run_perplexity_test: {error_msg}\n")
         sys.stderr.write(f"Traceback: {traceback_str}\n")
         sys.stderr.flush()
-        return f"**Critical error**: {error_msg}"
+        return f"**Critical error**: {error_msg}", gr.update()
 
 # Initialize results repository and directory
 try:

@@ -117,8 +116,8 @@ except Exception as e:
     # Ensure local directory exists even if repo operations fail
     os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
 
-# Get initial …
-…
+# Get initial results data
+RESULTS_DF = create_results_dataframe()
 
 # Create the Gradio interface
 demo = gr.Blocks(css=custom_css)

@@ -127,8 +126,14 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("…
-            …
+        with gr.TabItem("Results", elem_id="results-tab", id=0):
+            gr.Markdown("## Model Evaluation Results")
+            results_table = gr.DataFrame(
+                value=RESULTS_DF,
+                headers=["Model", "Perplexity", "Average Score", "Type", "Precision"],
+                interactive=False,
+                wrap=False
+            )
 
         with gr.TabItem("About", elem_id="about-tab", id=1):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

@@ -151,10 +156,18 @@ with demo:
            test_button = gr.Button("Run Perplexity Test", variant="primary")
            result = gr.Markdown()
 
+           gr.Markdown("## Live Results")
+           live_results_table = gr.DataFrame(
+               value=RESULTS_DF,
+               headers=["Model", "Perplexity", "Average Score", "Type", "Precision"],
+               interactive=False,
+               wrap=False
+           )
+
            gr.Markdown("""
            ### Tips:
            - **Check stderr logs** in HF Spaces for detailed debugging information
-           - **…
+           - **Results will update automatically** in the table above after evaluation completes
            - **Example models to test**: `openai-community/gpt2`, `EleutherAI/gpt-neo-1.3B`, `openai-community/gpt2-large`
            - **Lower perplexity scores = better performance** (better at predicting text)

@@ -162,13 +175,13 @@ with demo:
            1. Enter a model name from Hugging Face Hub
            2. Click "Run Perplexity Test"
            3. Wait for evaluation to complete (may take a few minutes for large models)
-           4. …
+           4. Results will appear automatically in the table above!
            """)
 
            test_button.click(
                run_perplexity_test,
                [model_name, revision, precision],
-               [result]
+               [result, live_results_table]
            )
 
 demo.queue(default_concurrency_limit=5).launch()
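The pattern the new code relies on: `run_perplexity_test` now returns one value per output component (a Markdown status string plus a `gr.update(...)` for the `gr.DataFrame`), so the table refreshes in place after each run. Below is a minimal, self-contained sketch of that wiring; the `score_model` helper and the column names are placeholders for illustration, not part of this repo:

import gradio as gr
import pandas as pd

def score_model(name: str) -> float:
    """Placeholder: the real app calls run_dynamic_perplexity_eval() here."""
    return 42.0

rows = []  # accumulated results, one dict per evaluated model

def evaluate(name):
    if not name:
        # A bare gr.update() leaves the table output unchanged.
        return "Please enter a model name.", gr.update()
    ppl = score_model(name)
    rows.append({"Model": name, "Perplexity": ppl})
    # gr.update(value=...) swaps the DataFrame contents in place.
    return f"**{name}**: perplexity {ppl:.4f}", gr.update(value=pd.DataFrame(rows))

with gr.Blocks() as demo:
    name_box = gr.Textbox(label="Model name")
    run_btn = gr.Button("Run")
    status = gr.Markdown()
    table = gr.DataFrame(value=pd.DataFrame(columns=["Model", "Perplexity"]),
                         interactive=False)
    # One output slot per returned value: status message first, table second.
    run_btn.click(evaluate, [name_box], [status, table])

demo.queue().launch()

Every return path supplies both outputs; on the error paths a plain `gr.update()` leaves the existing table as-is, which is the same convention the commit uses.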
src/display/formatting.py CHANGED

@@ -3,8 +3,8 @@ def model_hyperlink(link, model_name):
 
 
 def make_clickable_model(model_name):
-    …
-    return …
+    # Just return the plain model name without HTML formatting
+    return model_name
 
 
 def styled_error(error):
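For context on the formatting.py change: a default `gr.DataFrame` renders cell values as plain text, so an HTML anchor built by `model_hyperlink` would show up as raw markup in the new Results table, and returning the bare model name avoids that. A small sketch of the two behaviours; the anchor-building body below is an assumption, since the removed lines are truncated in this view:

# Assumed, typical implementation of model_hyperlink (its body is not shown in this diff).
def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}">{model_name}</a>'

# Behaviour after this commit: plain text, safe for a default gr.DataFrame cell.
def make_clickable_model(model_name):
    return model_name

# If clickable names were wanted instead, the table column would need a
# markdown/html datatype, e.g. (column datatypes assumed for this app's five columns):
# gr.DataFrame(value=df, datatype=["markdown", "number", "number", "str", "str"], ...)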