File size: 9,599 Bytes
2c21cf7
4d13673
9002fc2
2c21cf7
 
 
b8cbeff
 
0a30811
b8cbeff
0a30811
b8cbeff
 
 
 
 
 
 
d91b022
2c21cf7
 
 
 
d91b022
2f9dee1
 
 
4d13673
9002fc2
eb1696c
 
 
9002fc2
9c2c019
 
 
913253a
9c2c019
b0aa389
7c06aef
549360a
9c2c019
eb1696c
9002fc2
941d5c5
 
 
 
 
 
 
 
 
 
 
 
 
 
4e8cb1a
7c06aef
4e8cb1a
 
 
 
7c06aef
4e8cb1a
 
 
 
 
 
 
7c06aef
4e8cb1a
 
 
 
7c06aef
 
 
 
53d2039
 
 
7c06aef
98c6811
4e8cb1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb1696c
 
 
7c06aef
 
 
 
 
eb1696c
 
7c06aef
 
 
eb1696c
 
 
 
 
4e8cb1a
7c06aef
4e8cb1a
 
 
 
7c06aef
4e8cb1a
 
 
 
 
 
 
7c06aef
4e8cb1a
 
 
 
7c06aef
 
 
 
53d2039
 
 
7c06aef
98c6811
4e8cb1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb1696c
 
7c06aef
 
 
 
 
eb1696c
 
7c06aef
 
 
eb1696c
 
 
 
2c21cf7
 
 
 
 
 
 
 
 
 
 
 
d91b022
 
 
 
7c06aef
d91b022
eb1696c
4d13673
7c06aef
 
 
 
 
 
 
 
566c57e
92d8154
566c57e
32d50b0
 
 
 
 
 
2c21cf7
 
 
 
 
7c06aef
2c21cf7
 
 
9002fc2
ebaf279
 
 
 
 
 
 
2c21cf7
 
4d13673
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
import json
import os

import numpy as np
import pandas as pd
import uvicorn

# Robust import so this file works both as a package module and as a script
try:
    # When executed as a package module (recommended): `python -m uvicorn evals.backend:app`
    from .countries import make_country_table
except Exception:
    try:
        # When executed from project root with package path available
        from evals.countries import make_country_table
    except Exception:
        # When executed directly from evals/ directory
        from countries import make_country_table
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles

# Benchmark artifacts loaded once at import time, from the working directory.
scores = pd.read_json("results.json")  # long-form scores; grouped later by model/bcp_47/task/metric/origin
languages = pd.read_json("languages.json")  # language metadata (bcp_47, names, speakers, family, in_benchmark)
models = pd.read_json("models.json")  # model metadata; joined on its "id" column


def mean(lst):
    """Return the arithmetic mean of *lst*, or None when it is empty."""
    if not lst:
        return None
    return sum(lst) / len(lst)


# Headline "{task}_{metric}" column names that feed the normalized average
# and the per-metric machine flags in the model and language tables.
task_metrics = [
    "translation_from_bleu",
    "translation_to_bleu",
    "classification_accuracy",
    "mmlu_accuracy",
    "arc_accuracy",
    "truthfulqa_accuracy",
    "mgsm_accuracy",
]


def compute_normalized_average(df, metrics):
    """Average each row's min-max normalized metric columns.

    Every column in *metrics* is independently rescaled to [0, 1]; the
    row-wise mean uses skipna=False, so a NaN in any non-constant column
    propagates to the result.

    NOTE(review): a constant (or entirely-NaN) column is replaced wholesale
    with 0, which also turns its NaN entries into 0 — presumably intentional
    so averages stay computable when a metric is absent; confirm.
    """
    scaled = df[metrics].copy()
    for name in metrics:
        if name not in scaled.columns:
            continue
        lo = scaled[name].min()
        hi = scaled[name].max()
        if hi > lo:  # guard against division by zero
            scaled[name] = (scaled[name] - lo) / (hi - lo)
        else:
            scaled[name] = 0
    return scaled.mean(axis=1, skipna=False)


def make_model_table(scores_df, models):
    """Build the per-model leaderboard table.

    Pivots long-form *scores_df* (model/task/metric/origin/score) into one row
    per model with "{task}_{metric}" columns, origin-specific
    "{task}_{metric}_{origin}" columns, a min-max-normalized "average", a
    per-metric "_is_machine" flag (True only when exclusively machine-origin
    data contributed), and metadata joined from *models* (on its "id" column).
    Rows are ranked by descending average.

    Note: mutates *scores_df* by adding "task_metric_origin"/"task_metric"
    helper columns.
    """
    # Origin-specific column names, e.g. "mmlu_accuracy_machine".
    scores_df["task_metric_origin"] = (
        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
    )

    # Pivot to get scores for each origin-specific metric.
    scores_pivot = scores_df.pivot_table(
        index="model",
        columns="task_metric_origin",
        values="score",
        aggfunc="mean",
    )

    # Plain "{task}_{metric}" columns used for the main average calculation.
    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
    main_pivot = scores_df.pivot_table(
        index="model", columns="task_metric", values="score", aggfunc="mean"
    )

    # Merge the two pivots.
    df = pd.merge(main_pivot, scores_pivot, on="model", how="outer")

    # Ensure every headline metric column exists so the average is defined.
    for metric in task_metrics:
        if metric not in df.columns:
            df[metric] = np.nan

    df["average"] = compute_normalized_average(df, task_metrics)

    # Count contributing scores per (model, metric) pair for each origin.
    origin_presence = (
        scores_df.groupby(["model", "task_metric", "origin"]).size().unstack(fill_value=0)
    )
    # Hoisted out of the metric loop: these depend only on origin_presence,
    # which does not change inside the loop.
    human_col_name = "human" if "human" in origin_presence.columns else None
    machine_col_name = "machine" if "machine" in origin_presence.columns else None
    # Boolean flags: show asterisk only if exclusively machine-origin contributed.
    for metric in task_metrics:
        if human_col_name or machine_col_name:
            flags = []
            for model in df.index:
                try:
                    counts = origin_presence.loc[(model, metric)]
                except KeyError:
                    # No scores at all for this (model, metric) pair.
                    flags.append(False)
                    continue
                human_count = counts.get(human_col_name, 0) if human_col_name else 0
                machine_count = counts.get(machine_col_name, 0) if machine_col_name else 0
                flags.append(machine_count > 0 and human_count == 0)
            df[f"{metric}_is_machine"] = flags
        else:
            df[f"{metric}_is_machine"] = False
    df = df.sort_values(by="average", ascending=False).reset_index()
    df = pd.merge(df, models, left_on="model", right_on="id", how="left")
    df["rank"] = df.index + 1

    # Keep every column related to a headline metric (origin variants, flags).
    metric_cols = [c for c in df.columns if any(tm in c for tm in task_metrics)]

    df = df[
        [
            "rank", "model", "name", "provider_name", "hf_id", "creation_date",
            "size", "type", "license", "cost", "average",
            *sorted(set(metric_cols)),
        ]
    ]
    return df


def make_language_table(scores_df, languages):
    """Build the per-language results table.

    Pivots long-form *scores_df* (bcp_47/task/metric/origin/score) into one
    row per language with "{task}_{metric}" columns, origin-specific
    "{task}_{metric}_{origin}" columns, a min-max-normalized "average", and a
    per-metric "_is_machine" flag (True when any machine-origin score exists
    for the language). Joined with *languages* metadata and sorted by
    descending speaker count.

    Note: mutates *scores_df* by adding "task_metric_origin"/"task_metric"
    helper columns.
    """
    # Origin-specific column names, e.g. "mmlu_accuracy_machine".
    scores_df["task_metric_origin"] = (
        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
    )

    # Pivot to get scores for each origin-specific metric.
    scores_pivot = scores_df.pivot_table(
        index="bcp_47",
        columns="task_metric_origin",
        values="score",
        aggfunc="mean",
    )

    # Plain "{task}_{metric}" columns used for the main average calculation.
    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
    main_pivot = scores_df.pivot_table(
        index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
    )

    # Merge the two pivots.
    df = pd.merge(main_pivot, scores_pivot, on="bcp_47", how="outer")

    # Ensure every headline metric column exists so the average is defined.
    for metric in task_metrics:
        if metric not in df.columns:
            df[metric] = np.nan

    df["average"] = compute_normalized_average(df, task_metrics)

    # Per-row machine-origin flags: True if any machine-origin score exists
    # for the language. (An earlier pass computing "exclusively machine"
    # flags from a groupby was dead code — this loop unconditionally
    # overwrote it for every metric — and has been removed.)
    for metric in task_metrics:
        machine_col = f"{metric}_machine"
        if machine_col in df.columns:
            df[f"{metric}_is_machine"] = df[machine_col].notna()
        else:
            df[f"{metric}_is_machine"] = False
    df = pd.merge(languages, df, on="bcp_47", how="outer")
    df = df.sort_values(by="speakers", ascending=False)

    # Keep every column related to a headline metric (origin variants, flags).
    metric_cols = [c for c in df.columns if any(tm in c for tm in task_metrics)]

    df = df[
        [
            "bcp_47", "language_name", "autonym", "speakers", "family",
            "average", "in_benchmark",
            *sorted(set(metric_cols)),
        ]
    ]
    return df


app = FastAPI()

# NOTE(review): CORS allows every origin — acceptable for a public read-only
# API; confirm that is intended before shipping anything authenticated.
app.add_middleware(CORSMiddleware, allow_origins=["*"])
# Gzip-compress responses larger than 1000 bytes (the tables can be large).
app.add_middleware(GZipMiddleware, minimum_size=1000)


def serialize(df):
    """Convert a DataFrame to a JSON-safe list of row dicts, NaN -> None."""
    cleaned = df.replace({np.nan: None})
    return cleaned.to_dict(orient="records")


@app.post("/api/data")
async def data(request: Request):
    body = await request.body()
    data = json.loads(body)
    selected_languages = data.get("selectedLanguages", {})
    df = scores.groupby(["model", "bcp_47", "task", "metric", "origin"]).mean().reset_index()
    # lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
    language_table = make_language_table(df, languages)
    datasets_df = pd.read_json("datasets.json")
    
    # Identify which metrics have machine translations available
    machine_translated_metrics = set()
    for _, row in df.iterrows():
        if row["origin"] == "machine":
            metric_name = f"{row['task']}_{row['metric']}"
            machine_translated_metrics.add(metric_name)
    
    if selected_languages:
        # the filtering is only applied for the model table and the country data
        df = df[df["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]
    if len(df) == 0:
        model_table = pd.DataFrame()
        countries = pd.DataFrame()
    else:
        model_table = make_model_table(df, models)
        countries = make_country_table(make_language_table(df, languages))
    all_tables = {
        "model_table": serialize(model_table),
        "language_table": serialize(language_table),
        "dataset_table": serialize(datasets_df),
        "countries": serialize(countries),
        "machine_translated_metrics": list(machine_translated_metrics),
    }
    return JSONResponse(content=all_tables)


# Only serve static files if build directory exists (production mode);
# otherwise assume the frontend dev server is running separately.
if os.path.exists("frontend/build"):
    app.mount("/", StaticFiles(directory="frontend/build", html=True), name="frontend")
else:
    print("πŸ§ͺ Development mode: frontend/build directory not found")
    print("🌐 Frontend should be running on http://localhost:3000")
    print("πŸ“‘ API available at http://localhost:8000/api/data")

if __name__ == "__main__":
    # Bind on all interfaces; port comes from $PORT (default 8000).
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 8000)))