Spaces: Runtime error

Commit 1dd4b6a · Ahmed Ahmed committed · Parent(s): 86c1853

lets see

Files changed:
- app.py +82 -10
- model-tracing +1 -0
- requirements.txt +12 -1
- src/about.py +20 -3
- src/display/utils.py +2 -0
- src/evaluation/model_trace_eval.py +310 -0
- src/leaderboard/read_evals.py +29 -0
- test_model_trace.py +43 -0
app.py
CHANGED

@@ -22,23 +22,66 @@ from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval
 
 def create_results_dataframe():
     """Create and return the results DataFrame for display"""
+    import sys
+
+    sys.stderr.write("\n📊 CREATE_RESULTS_DATAFRAME CALLED\n")
+    sys.stderr.flush()
+
     df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+
+    sys.stderr.write(f"📋 Retrieved leaderboard df: {df.shape if df is not None else 'None'}\n")
+    sys.stderr.flush()
+
     if df is None or df.empty:
+        sys.stderr.write("⚠️ DataFrame is None or empty, returning empty DataFrame\n")
+        sys.stderr.flush()
         # Return empty DataFrame with proper columns
-        return pd.DataFrame(columns=["Model", "Perplexity", "Average Score", "Type", "Precision"])
+        return pd.DataFrame(columns=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"])
 
+    sys.stderr.write(f"📊 Original DataFrame columns: {list(df.columns)}\n")
+    sys.stderr.flush()
+
+    # Check if required columns exist
+    required_cols = [
         AutoEvalColumn.model.name,
         "Perplexity",
+        AutoEvalColumn.model_trace_p_value.name,
         AutoEvalColumn.average.name,
         AutoEvalColumn.model_type.name,
         AutoEvalColumn.precision.name,
     ]
+
+    missing_cols = [col for col in required_cols if col not in df.columns]
+    if missing_cols:
+        sys.stderr.write(f"⚠️ Missing columns in DataFrame: {missing_cols}\n")
+        sys.stderr.flush()
+        # Add missing columns with default values
+        for col in missing_cols:
+            if col == AutoEvalColumn.model_trace_p_value.name:
+                df[col] = None
+                sys.stderr.write(f"➕ Added {col} column with None values\n")
+
+    # Select and rename columns for display
+    try:
+        display_df = df[required_cols].copy()
+        sys.stderr.write(f"✅ Selected columns successfully: {list(display_df.columns)}\n")
+    except Exception as e:
+        sys.stderr.write(f"💥 Error selecting columns: {e}\n")
+        sys.stderr.flush()
+        return pd.DataFrame(columns=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"])
 
     # Rename columns for better display
-    display_df.columns = ["Model", "Perplexity", "Average Score", "Type", "Precision"]
+    display_df.columns = ["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"]
+
+    sys.stderr.write(f"🎯 Final display DataFrame shape: {display_df.shape}\n")
+    sys.stderr.write(f"🎯 Final columns: {list(display_df.columns)}\n")
+
+    # Check p-value column
+    if "Match P-Value" in display_df.columns:
+        p_value_stats = display_df["Match P-Value"].describe()
+        sys.stderr.write(f"📈 P-Value column stats:\n{p_value_stats}\n")
 
+    sys.stderr.flush()
     return display_df
 
 def run_perplexity_test(model_name, revision, precision):

@@ -66,15 +109,23 @@ def run_perplexity_test(model_name, revision, precision):
         sys.stderr.write("Evaluation succeeded - updating both results tables\n")
         sys.stderr.flush()
 
-        # Get updated results
+        # Get updated results (this will trigger model trace p-value computation for the new model)
+        sys.stderr.write("🔄 Creating updated results DataFrame (may compute model trace p-values)...\n")
+        sys.stderr.flush()
+
         updated_df = create_results_dataframe()
 
+        sys.stderr.write("✅ Updated DataFrame created successfully\n")
+        sys.stderr.flush()
+
         success_msg = f"""✅ **Perplexity evaluation completed successfully!**
 
 **Model**: {model_name}
 **Perplexity Score**: {result:.4f}
 
-🎉 **Results have been saved and both tables have been updated!**
+🎉 **Results have been saved and both tables have been updated!**
+
+Note: Model trace p-value computation may take additional time and will appear in the logs."""
 
         return success_msg, gr.update(value=updated_df), gr.update(value=updated_df)
     else:

@@ -117,9 +168,21 @@ except Exception as e:
 os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
 
 # Get initial results data
+import sys
+sys.stderr.write("\n🚀 STARTING GRADIO APP INITIALIZATION\n")
+sys.stderr.write("📊 Creating initial results DataFrame...\n")
+sys.stderr.flush()
+
 RESULTS_DF = create_results_dataframe()
 
+sys.stderr.write(f"✅ Initial DataFrame created with shape: {RESULTS_DF.shape}\n")
+sys.stderr.write(f"📋 Columns: {list(RESULTS_DF.columns)}\n")
+sys.stderr.flush()
+
 # Create the Gradio interface
+sys.stderr.write("🎨 Creating Gradio interface...\n")
+sys.stderr.flush()
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)

@@ -130,7 +193,7 @@ with demo:
     gr.Markdown("## Model Evaluation Results")
     results_table = gr.DataFrame(
         value=RESULTS_DF,
-        headers=["Model", "Perplexity", "Average Score", "Type", "Precision"],
+        headers=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"],
         interactive=False,
         wrap=False
     )

@@ -159,7 +222,7 @@ with demo:
     gr.Markdown("## Live Results")
     live_results_table = gr.DataFrame(
         value=RESULTS_DF,
-        headers=["Model", "Perplexity", "Average Score", "Type", "Precision"],
+        headers=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"],
         interactive=False,
         wrap=False
     )

@@ -184,4 +247,13 @@ with demo:
         [result, live_results_table, results_table]
     )
 
+sys.stderr.write("🎯 GRADIO INTERFACE SETUP COMPLETE\n")
+sys.stderr.write("🚀 LAUNCHING GRADIO APP WITH MODEL TRACING INTEGRATION\n")
+sys.stderr.write("📊 Features enabled:\n")
+sys.stderr.write("  - Perplexity evaluation\n")
+sys.stderr.write("  - Model trace p-value computation (vs GPT-2 base)\n")
+sys.stderr.write("  - Match statistic with alignment\n")
+sys.stderr.write("🎉 Ready to accept requests!\n")
+sys.stderr.flush()
+
 demo.queue(default_concurrency_limit=5).launch()
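The refresh mechanism in the hunks above is a single Gradio callback that returns a status string plus one gr.update(value=...) per table, so one event updates both DataFrames at once. A minimal, self-contained sketch of that wiring follows; the component names and the dummy evaluation are illustrative, not the Space's actual layout:

# Sketch (assumed names): one callback updates a status box and two
# DataFrame components by returning gr.update() for each declared output.
import gradio as gr
import pandas as pd

def refresh(model_name):
    # Placeholder for the real evaluation; just build a dummy result row.
    df = pd.DataFrame([{"Model": model_name, "Perplexity": 12.3, "Match P-Value": 0.05}])
    msg = f"✅ Evaluated {model_name}"
    return msg, gr.update(value=df), gr.update(value=df)

with gr.Blocks() as demo:
    name = gr.Textbox(label="Model name")
    status = gr.Markdown()
    table_a = gr.DataFrame(interactive=False)
    table_b = gr.DataFrame(interactive=False)
    btn = gr.Button("Run")
    # The outputs list must match the callback's return tuple, position by position.
    btn.click(refresh, inputs=[name], outputs=[status, table_a, table_b])

if __name__ == "__main__":
    demo.launch()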
model-tracing
ADDED

@@ -0,0 +1 @@
+Subproject commit 9eb3b67655be2a3576348a6d482e69c62f72fc3e
requirements.txt
CHANGED

@@ -15,4 +15,15 @@ transformers>=4.30.0
 tokenizers>=0.15.0
 sentencepiece
 torch>=2.0.0
-accelerate>=0.20.0
+accelerate>=0.20.0
+# Model tracing dependencies
+PyYAML==6.0.1
+scipy==1.13.1
+protobuf==5.27.1
+zstandard==0.22.0
+ipdb==0.13.13
+# Development dependencies for model tracing
+ruff==0.1.8
+pre-commit==3.5.0
+nbqa==1.7.1
+ipykernel==6.29.0
src/about.py
CHANGED

@@ -21,17 +21,34 @@ TITLE = """<h1 align="center" id="space-title">Model Perplexity Leaderboard</h1>
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This leaderboard evaluates language models based on their perplexity scores on a fixed test passage
+This leaderboard evaluates language models based on their perplexity scores on a fixed test passage and
+structural similarity to GPT-2 using model tracing analysis.
+
+- **Perplexity**: Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
+- **Match P-Value**: Lower p-values indicate the model preserves structural similarity to GPT-2 after fine-tuning (neuron organization is maintained).
 """
 
 # Which evaluations are you running?
 LLM_BENCHMARKS_TEXT = """
 ## How it works
 
-The evaluation runs
+The evaluation runs two types of analysis on language models:
+
+### 1. Perplexity Evaluation
+Perplexity tests using a fixed test passage about artificial intelligence.
 Perplexity measures how well a model predicts text - lower scores mean better predictions.
 
+### 2. Model Tracing Analysis
+Compares each model's internal structure to GPT-2 using the "match" statistic with alignment:
+- **Base Model**: GPT-2 (`openai-community/gpt2`)
+- **Comparison**: Each model on the leaderboard
+- **Method**: Neuron matching analysis across transformer layers
+- **Alignment**: Models are aligned before comparison using the Hungarian algorithm
+- **Output**: P-value indicating structural similarity (lower = more similar to GPT-2)
+
+The match statistic tests whether neurons in corresponding layers maintain similar functional roles
+between the base model and fine-tuned variants.
+
 ## Test Text
 
 The evaluation uses the following passage:
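As background for the perplexity wording added above: perplexity on a fixed passage is the exponential of the mean next-token cross-entropy. A small illustrative sketch with transformers follows; it is not the Space's actual run_dynamic_perplexity_eval, and the passage here is a placeholder:

# Illustrative only: perplexity = exp(mean next-token cross-entropy) on a fixed passage.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def passage_perplexity(model_id: str, passage: str) -> float:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)
    model.eval()

    enc = tokenizer(passage, return_tensors="pt")
    with torch.no_grad():
        # With labels=input_ids, the model returns the mean cross-entropy
        # over next-token predictions (the shift happens internally).
        out = model(**enc, labels=enc["input_ids"])
    return float(torch.exp(out.loss))

if __name__ == "__main__":
    print(passage_perplexity(
        "openai-community/gpt2",
        "Artificial intelligence is the simulation of human intelligence by machines.",
    ))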
src/display/utils.py
CHANGED

@@ -34,6 +34,8 @@ for task in Tasks:
     sys.stderr.write(f"Adding task column: {task.name} -> column name: {task_col_name}\n")
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task_col_name, "number", True)])
     sys.stderr.flush()
+# Model tracing p-value column
+auto_eval_column_dict.append(["model_trace_p_value", ColumnContent, ColumnContent("Match P-Value ⬇️", "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
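For context, this auto_eval_column_dict pattern (a list of [field_name, type, default] triples) is what the standard leaderboard template typically feeds into dataclasses.make_dataclass to build AutoEvalColumn, which is why the new entry can later be referenced as AutoEvalColumn.model_trace_p_value.name. A self-contained sketch under that assumption; the Space's real ColumnContent likely has more fields:

# Sketch of the assumed column-registry pattern: each appended triple becomes
# a field of a frozen dataclass, whose default ColumnContent instance carries
# the display header used by the UI code.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                   # header shown in the table
    type: str                   # "str" or "number"
    displayed_by_default: bool

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "str", True)])
auto_eval_column_dict.append(["model_trace_p_value", ColumnContent, ColumnContent("Match P-Value ⬇️", "number", True)])

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.model_trace_p_value.name)  # -> Match P-Value ⬇️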
src/evaluation/model_trace_eval.py
ADDED

@@ -0,0 +1,310 @@
+"""
+Model tracing evaluation for computing p-values from neuron matching statistics.
+
+This module runs the model-tracing comparison between a base model (gpt2) and
+fine-tuned models to determine structural similarity via p-value analysis.
+"""
+
+import os
+import sys
+import subprocess
+import tempfile
+import pickle
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# Add model-tracing to path
+model_tracing_path = os.path.join(os.path.dirname(__file__), '../../model-tracing')
+if model_tracing_path not in sys.path:
+    sys.path.append(model_tracing_path)
+
+sys.stderr.write("🔧 ATTEMPTING TO IMPORT MODEL TRACING DEPENDENCIES...\n")
+sys.stderr.flush()
+
+try:
+    sys.stderr.write(" - Importing tracing.utils.llama.model...\n")
+    from tracing.utils.llama.model import permute_model, rotate_model
+
+    sys.stderr.write(" - Importing tracing.utils.llama.matching...\n")
+    from tracing.utils.llama.matching import align_model
+
+    sys.stderr.write(" - Importing tracing.utils.evaluate...\n")
+    from tracing.utils.evaluate import prepare_hf_dataset, prepare_hf_dataloader
+
+    sys.stderr.write(" - Importing tracing.utils.utils...\n")
+    from tracing.utils.utils import manual_seed
+
+    sys.stderr.write(" - Importing tracing.statistics.match...\n")
+    from tracing.statistics.match import statistic as match_stat
+
+    MODEL_TRACING_AVAILABLE = True
+    sys.stderr.write("✅ ALL MODEL TRACING IMPORTS SUCCESSFUL\n")
+
+except ImportError as e:
+    sys.stderr.write(f"❌ MODEL TRACING IMPORTS FAILED: {e}\n")
+    import traceback
+    sys.stderr.write(f"Full import traceback:\n{traceback.format_exc()}\n")
+    MODEL_TRACING_AVAILABLE = False
+
+sys.stderr.write(f"🎯 Final MODEL_TRACING_AVAILABLE = {MODEL_TRACING_AVAILABLE}\n")
+sys.stderr.flush()
+
+
+def run_model_trace_analysis(ft_model_name, revision="main", precision="float16"):
+    """
+    Run model tracing analysis comparing ft_model against gpt2 base.
+
+    Args:
+        ft_model_name: HuggingFace model identifier for the fine-tuned model
+        revision: Model revision/commit hash
+        precision: Model precision (float16, bfloat16)
+
+    Returns:
+        tuple: (success: bool, result: float or error_message)
+            If success, result is the aggregate p-value
+            If failure, result is error message
+    """
+
+    if not MODEL_TRACING_AVAILABLE:
+        return False, "Model tracing dependencies not available"
+
+    try:
+        sys.stderr.write(f"\n=== RUNNING MODEL TRACE ANALYSIS ===\n")
+        sys.stderr.write(f"Base model: openai-community/gpt2\n")
+        sys.stderr.write(f"Fine-tuned model: {ft_model_name}\n")
+        sys.stderr.write(f"Revision: {revision}\n")
+        sys.stderr.write(f"Precision: {precision}\n")
+        sys.stderr.flush()
+
+        # Set random seed for reproducibility
+        manual_seed(0)
+
+        # Determine dtype
+        if precision == "bfloat16":
+            dtype = torch.bfloat16
+        else:
+            dtype = torch.float16
+
+        # Load base model (gpt2)
+        base_model_id = "openai-community/gpt2"
+        sys.stderr.write(f"🤖 Loading base model: {base_model_id}\n")
+        sys.stderr.write(f" - dtype: {dtype}\n")
+        sys.stderr.write(f" - low_cpu_mem_usage: True\n")
+        sys.stderr.flush()
+
+        try:
+            base_model = AutoModelForCausalLM.from_pretrained(
+                base_model_id,
+                torch_dtype=dtype,
+                low_cpu_mem_usage=True
+            )
+            sys.stderr.write("✅ Base model loaded successfully\n")
+        except Exception as e:
+            sys.stderr.write(f"❌ Failed to load base model: {e}\n")
+            raise
+
+        try:
+            base_tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False)
+            sys.stderr.write("✅ Base tokenizer loaded successfully\n")
+        except Exception as e:
+            sys.stderr.write(f"❌ Failed to load base tokenizer: {e}\n")
+            raise
+
+        # Load fine-tuned model
+        sys.stderr.write(f"🤖 Loading fine-tuned model: {ft_model_name}\n")
+        sys.stderr.write(f" - revision: {revision}\n")
+        sys.stderr.write(f" - dtype: {dtype}\n")
+        sys.stderr.write(f" - low_cpu_mem_usage: True\n")
+        sys.stderr.flush()
+
+        try:
+            ft_model = AutoModelForCausalLM.from_pretrained(
+                ft_model_name,
+                revision=revision,
+                torch_dtype=dtype,
+                low_cpu_mem_usage=True
+            )
+            sys.stderr.write("✅ Fine-tuned model loaded successfully\n")
+        except Exception as e:
+            sys.stderr.write(f"❌ Failed to load fine-tuned model: {e}\n")
+            raise
+
+        try:
+            ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_name, revision=revision, use_fast=False)
+            sys.stderr.write("✅ Fine-tuned tokenizer loaded successfully\n")
+        except Exception as e:
+            sys.stderr.write(f"❌ Failed to load fine-tuned tokenizer: {e}\n")
+            raise
+
+        sys.stderr.write("🎯 ALL MODELS AND TOKENIZERS LOADED SUCCESSFULLY\n")
+
+        # Show memory info if available
+        if torch.cuda.is_available():
+            memory_allocated = torch.cuda.memory_allocated() / 1024**3  # GB
+            memory_reserved = torch.cuda.memory_reserved() / 1024**3  # GB
+            sys.stderr.write(f"💾 GPU Memory - Allocated: {memory_allocated:.2f}GB, Reserved: {memory_reserved:.2f}GB\n")
+
+        sys.stderr.flush()
+
+        # Prepare dataset (using wikitext like in the original)
+        sys.stderr.write("Preparing dataset...\n")
+        sys.stderr.flush()
+
+        block_size = 512
+        batch_size = 1
+        dataset = prepare_hf_dataset("dlwh/wikitext_103_detokenized", block_size, base_tokenizer)
+        dataloader = prepare_hf_dataloader(dataset, batch_size)
+
+        sys.stderr.write("Dataset prepared\n")
+        sys.stderr.flush()
+
+        # Run alignment (--align flag)
+        sys.stderr.write("Running model alignment...\n")
+        sys.stderr.flush()
+
+        try:
+            align_model(base_model, ft_model, ft_model)
+            sys.stderr.write("Model alignment completed\n")
+        except Exception as e:
+            sys.stderr.write(f"Model alignment failed: {e}\n")
+            sys.stderr.write("Continuing without alignment...\n")
+            sys.stderr.flush()
+
+        # Run match statistic
+        sys.stderr.write("Computing match statistic...\n")
+        sys.stderr.flush()
+
+        # Get number of layers for the models
+        if hasattr(base_model, 'transformer') and hasattr(base_model.transformer, 'h'):
+            # GPT-2 style
+            n_blocks = len(base_model.transformer.h)
+        elif hasattr(base_model, 'model') and hasattr(base_model.model, 'layers'):
+            # LLaMA style
+            n_blocks = len(base_model.model.layers)
+        else:
+            # Default fallback
+            n_blocks = 12  # GPT-2 base has 12 layers
+
+        # Check if fine-tuned model has compatible architecture
+        ft_n_blocks = n_blocks
+        if hasattr(ft_model, 'transformer') and hasattr(ft_model.transformer, 'h'):
+            ft_n_blocks = len(ft_model.transformer.h)
+        elif hasattr(ft_model, 'model') and hasattr(ft_model.model, 'layers'):
+            ft_n_blocks = len(ft_model.model.layers)
+
+        # Use minimum number of blocks to avoid index errors
+        n_blocks = min(n_blocks, ft_n_blocks)
+
+        sys.stderr.write(f"Using {n_blocks} blocks for analysis\n")
+        sys.stderr.flush()
+
+        # Run the match statistic - returns list of p-values per layer
+        try:
+            p_values = match_stat(base_model, ft_model, dataloader, n_blocks=n_blocks)
+        except Exception as e:
+            sys.stderr.write(f"Match statistic computation failed: {e}\n")
+            sys.stderr.flush()
+            # Return a default high p-value indicating no similarity
+            return True, 1.0
+
+        sys.stderr.write(f"Match statistic computed: {len(p_values)} p-values\n")
+        sys.stderr.flush()
+
+        # Filter out None/NaN values
+        valid_p_values = [p for p in p_values if p is not None and not (isinstance(p, float) and (p != p or p < 0 or p > 1))]
+
+        if not valid_p_values:
+            sys.stderr.write("No valid p-values found, returning default\n")
+            sys.stderr.flush()
+            return True, 1.0
+
+        # Calculate aggregate p-value using Fisher's method
+        from tracing.utils.utils import fisher
+        try:
+            aggregate_p_value = fisher(valid_p_values)
+        except Exception as e:
+            sys.stderr.write(f"Fisher's method failed: {e}\n")
+            sys.stderr.flush()
+            # Use the mean of valid p-values as fallback
+            aggregate_p_value = sum(valid_p_values) / len(valid_p_values)
+
+        sys.stderr.write(f"Aggregate p-value: {aggregate_p_value}\n")
+        sys.stderr.write("=== MODEL TRACE ANALYSIS COMPLETED ===\n")
+        sys.stderr.flush()
+
+        # Clean up memory
+        del base_model
+        del ft_model
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+        return True, aggregate_p_value
+
+    except Exception as e:
+        error_msg = str(e)
+        sys.stderr.write(f"Error in model trace analysis: {error_msg}\n")
+        import traceback
+        sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
+        sys.stderr.flush()
+
+        # Clean up memory even on error
+        try:
+            torch.cuda.empty_cache() if torch.cuda.is_available() else None
+        except:
+            pass
+
+        return False, error_msg
+
+
+def compute_model_trace_p_value(model_name, revision="main", precision="float16"):
+    """
+    Wrapper function to compute model trace p-value for a single model.
+
+    Args:
+        model_name: HuggingFace model identifier
+        revision: Model revision
+        precision: Model precision
+
+    Returns:
+        float or None: P-value if successful, None if failed
+    """
+    sys.stderr.write(f"\n{'='*60}\n")
+    sys.stderr.write(f"COMPUTE_MODEL_TRACE_P_VALUE CALLED\n")
+    sys.stderr.write(f"Model: {model_name}\n")
+    sys.stderr.write(f"Revision: {revision}\n")
+    sys.stderr.write(f"Precision: {precision}\n")
+    sys.stderr.write(f"Model tracing available: {MODEL_TRACING_AVAILABLE}\n")
+    sys.stderr.write(f"{'='*60}\n")
+    sys.stderr.flush()
+
+    if not MODEL_TRACING_AVAILABLE:
+        sys.stderr.write("❌ MODEL TRACING NOT AVAILABLE - returning None\n")
+        sys.stderr.flush()
+        return None
+
+    try:
+        sys.stderr.write("🚀 Starting model trace analysis...\n")
+        sys.stderr.flush()
+
+        success, result = run_model_trace_analysis(model_name, revision, precision)
+
+        sys.stderr.write(f"📊 Analysis completed - Success: {success}, Result: {result}\n")
+        sys.stderr.flush()
+
+        if success:
+            sys.stderr.write(f"✅ SUCCESS: Returning p-value {result}\n")
+            sys.stderr.flush()
+            return result
+        else:
+            sys.stderr.write(f"❌ FAILED: {result}\n")
+            sys.stderr.write("🔄 Returning None as fallback\n")
+            sys.stderr.flush()
+            return None
+
+    except Exception as e:
+        sys.stderr.write(f"💥 CRITICAL ERROR in compute_model_trace_p_value for {model_name}:\n")
+        sys.stderr.write(f"Exception: {e}\n")
+        import traceback
+        sys.stderr.write(f"Full traceback:\n{traceback.format_exc()}\n")
+        sys.stderr.write("🔄 Returning None as fallback\n")
+        sys.stderr.flush()
+        return None
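The aggregate p-value above comes from the model-tracing repo's fisher helper. Fisher's method combines k independent per-layer p-values via X = -2 Σ ln(p_i), which is chi-squared distributed with 2k degrees of freedom under the null. An equivalent standalone sketch using the SciPy already pinned in requirements.txt; this is illustrative, not the repo's exact implementation:

# Fisher's method written out by hand and cross-checked against SciPy.
# Under the null, -2 * sum(log(p_i)) ~ chi^2 with 2k degrees of freedom.
import math
from scipy import stats

def fisher_combined_p(p_values):
    k = len(p_values)
    statistic = -2.0 * sum(math.log(p) for p in p_values)
    return stats.chi2.sf(statistic, df=2 * k)  # survival function = 1 - CDF

per_layer = [0.01, 0.20, 0.03, 0.50]
print(fisher_combined_p(per_layer))
# SciPy's built-in combination gives the same number:
print(stats.combine_pvalues(per_layer, method="fisher").pvalue)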
src/leaderboard/read_evals.py
CHANGED

@@ -7,6 +7,7 @@ from dataclasses import dataclass
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
+from src.evaluation.model_trace_eval import compute_model_trace_p_value
 
 @dataclass
 class EvalResult:

@@ -131,6 +132,34 @@ class EvalResult:
         data_dict[AutoEvalColumn.params.name] = 0
         data_dict[AutoEvalColumn.likes.name] = 0
 
+        # Compute model trace p-value
+        sys.stderr.write(f"\n🧬 COMPUTING MODEL TRACE P-VALUE FOR: {self.full_model}\n")
+        sys.stderr.write(f" - Revision: {self.revision if self.revision else 'main'}\n")
+        sys.stderr.write(f" - Precision: {self.precision.value.name.lower()}\n")
+        sys.stderr.flush()
+
+        try:
+            model_trace_p_value = compute_model_trace_p_value(
+                self.full_model,
+                self.revision if self.revision else "main",
+                self.precision.value.name.lower()
+            )
+
+            if model_trace_p_value is not None:
+                sys.stderr.write(f"✅ Model trace p-value computed successfully: {model_trace_p_value}\n")
+            else:
+                sys.stderr.write(f"⚠️ Model trace p-value is None (computation failed or not available)\n")
+
+        except Exception as e:
+            sys.stderr.write(f"💥 Exception during model trace p-value computation: {e}\n")
+            import traceback
+            sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
+            model_trace_p_value = None
+
+        data_dict[AutoEvalColumn.model_trace_p_value.name] = model_trace_p_value
+        sys.stderr.write(f"📝 Added to data_dict: {AutoEvalColumn.model_trace_p_value.name} = {model_trace_p_value}\n")
+        sys.stderr.flush()
+
         sys.stderr.write(f"Created base data_dict with {len(data_dict)} columns\n")
         sys.stderr.flush()
 
test_model_trace.py
ADDED

@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+"""
+Test script for model tracing integration.
+Tests the p-value computation for a simple model comparison.
+"""
+
+import sys
+import os
+
+# Add src to path
+sys.path.append('src')
+
+from evaluation.model_trace_eval import compute_model_trace_p_value
+
+def test_model_trace():
+    """Test the model trace p-value computation with a simple example."""
+
+    print("Testing model trace p-value computation...")
+
+    # Test with a simple model (should be fast)
+    test_model = "openai-community/gpt2"
+
+    print(f"Computing p-value for {test_model} vs GPT-2...")
+
+    try:
+        p_value = compute_model_trace_p_value(test_model, "main", "float16")
+
+        if p_value is not None:
+            print(f"✅ Success! P-value: {p_value}")
+            if 0 <= p_value <= 1:
+                print("✅ P-value is in valid range [0, 1]")
+            else:
+                print(f"⚠️ Warning: P-value {p_value} is outside expected range [0, 1]")
+        else:
+            print("❌ Failed: P-value is None")
+
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    test_model_trace()