ItsMeDevRoland committed on
Commit
abd9b6a
·
verified ·
1 Parent(s): cd113ea

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +641 -0
app.py ADDED
@@ -0,0 +1,641 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+ from plotly.subplots import make_subplots
6
+ import numpy as np
7
+ from PIL import Image
8
+ import base64
9
+ from io import BytesIO
10
+
11
# Browser-tab title/icon and overall page layout.
PAGE_SETTINGS = {
    "page_title": "AI Roleplay Performance Leaderboard",
    "page_icon": "🤖",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**PAGE_SETTINGS)

# Stylesheet shared by every tab: card, tab-bar, highlight-box and footer
# classes referenced by the raw HTML snippets emitted further down.
CUSTOM_CSS = """
<style>
    .main {
        background-color: #f0f2f6;
    }
    .stApp {
        max-width: 1200px;
        margin: 0 auto;
    }
    h1, h2, h3 {
        color: #1E3A8A;
    }
    .metric-card {
        background-color: white;
        border-radius: 10px;
        padding: 20px;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        margin-bottom: 20px;
    }
    .header-container {
        display: flex;
        align-items: center;
        justify-content: space-between;
        margin-bottom: 20px;
    }
    .logo {
        height: 60px;
    }
    .stTabs [data-baseweb="tab-list"] {
        gap: 24px;
    }
    .stTabs [data-baseweb="tab"] {
        height: 50px;
        white-space: pre-wrap;
        background-color: white;
        border-radius: 5px 5px 0 0;
        padding: 10px 20px;
        font-weight: 500;
    }
    .stTabs [aria-selected="true"] {
        background-color: #1E3A8A;
        color: white;
    }
    .grid-container {
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
        gap: 20px;
        margin-bottom: 30px;
    }
    .model-card {
        background: white;
        padding: 15px;
        border-radius: 10px;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        transition: transform 0.3s ease;
    }
    .model-card:hover {
        transform: translateY(-5px);
    }
    .footer {
        text-align: center;
        margin-top: 30px;
        padding: 20px;
        font-size: 0.8em;
        color: #666;
    }
    .highlight {
        background-color: #f0f7ff;
        padding: 20px;
        border-radius: 10px;
        margin: 20px 0;
        border-left: 5px solid #1E3A8A;
    }
    .stButton>button {
        background-color: #1E3A8A;
        color: white;
        font-weight: 500;
    }
    .metric-value {
        font-size: 2.5rem;
        font-weight: bold;
        color: #1E3A8A;
    }
    .metric-label {
        font-size: 1rem;
        color: #666;
    }
</style>
"""
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
109
+
110
# Benchmark results keyed by model name. Scores are fractions in [0, 1];
# the remaining fields are descriptive metadata shown in tooltips and tables.
model_data = {
    "OpenElla-Llama-3-8B": {
        "Length Score": 0.97,
        "Character Consistency": 0.83,
        "Immersion": 0.67,
        "Overall Score": 0.83,
        "Release Date": "2023-11-15",
        "Parameters": "8B",
        "Architecture": "Llama-3",
        "Category": "OpenElla",
    },
    "DeepSeek-Coder-V2-Instruct": {
        "Length Score": 1.0,
        "Character Consistency": 1.0,
        "Immersion": 0.63,
        "Overall Score": 0.88,
        "Release Date": "2023-09-20",
        "Parameters": "33B",
        "Architecture": "DeepSeek",
        "Category": "Competitor",
    },
    "Dolphin": {
        "Length Score": 1.0,
        "Character Consistency": 0.83,
        "Immersion": 0.47,
        "Overall Score": 0.76,
        "Release Date": "2023-10-05",
        "Parameters": "7B",
        "Architecture": "Mistral",
        "Category": "Competitor",
    },
    "Hermes-3-GGUF": {
        "Length Score": 0.8,
        "Character Consistency": 0.82,
        "Immersion": 0.43,
        "Overall Score": 0.75,
        "Release Date": "2023-10-10",
        "Parameters": "7B",
        "Architecture": "Mistral",
        "Category": "Competitor",
    },
    "MiniMaid-L1": {
        "Length Score": 0.9,
        "Character Consistency": 0.5,
        "Immersion": 0.13,
        "Overall Score": 0.51,
        "Release Date": "2023-12-01",
        "Parameters": "3B",
        "Architecture": "Custom",
        "Category": "MiniMaid",
    },
    "MiniMaid-L2": {
        "Length Score": 1.0,
        "Character Consistency": 0.53,
        "Immersion": 0.6,
        "Overall Score": 0.71,
        "Release Date": "2024-01-15",
        "Parameters": "6B",
        "Architecture": "Custom",
        "Category": "MiniMaid",
    },
    "MiniMaid-L3": {
        "Length Score": 1.0,
        "Character Consistency": 0.54,
        "Immersion": 0.73,
        "Overall Score": 0.76,
        "Release Date": "2024-02-20",
        "Parameters": "12B",
        "Architecture": "Custom",
        "Category": "MiniMaid",
    },
}

# One row per model with the name in a leading "Model" column.
# Building from a list of records (rather than transposing a dict-of-dicts)
# lets pandas infer a dtype per column, so the score columns are float64
# instead of generic `object`; row order follows `model_data` insertion order.
df = pd.DataFrame(
    [{"Model": name, **attributes} for name, attributes in model_data.items()]
)

# Colour used for each model category in the charts.
category_colors = {
    "OpenElla": "#FF6B6B",
    "MiniMaid": "#4ECDC4",
    "Competitor": "#9D84B7",
}
194
+
195
# Title banner rendered as raw HTML so it picks up the .header-container style.
HEADER_HTML = """
<div class="header-container">
<h1>🤖 AI Roleplay Performance Leaderboard</h1>
</div>
"""
st.markdown(HEADER_HTML, unsafe_allow_html=True)

# The four top-level sections of the app, in display order.
TAB_LABELS = [
    "📊 Leaderboard",
    "📈 Detailed Analysis",
    "🔍 Model Comparison",
    "ℹ️ About",
]
tab1, tab2, tab3, tab4 = st.tabs(TAB_LABELS)
204
+
205
# Leaderboard tab: category filter, ranked bar chart, top-3 cards and the
# complete score table.
with tab1:
    st.header("Model Rankings")

    # The category filter is placed in the sidebar.
    st.sidebar.header("Filter Models")
    all_categories = df["Category"].unique()
    selected_categories = st.sidebar.multiselect(
        "Model Categories",
        options=all_categories,
        default=all_categories,
    )

    # Keep only the chosen categories, best overall score first.
    filtered_df = df[df["Category"].isin(selected_categories)]
    sorted_df = filtered_df.sort_values("Overall Score", ascending=False)

    if sorted_df.empty:
        # Every category was deselected: explain why nothing is shown rather
        # than rendering an empty chart, empty cards and an empty table.
        st.info(
            "No models match the selected categories. "
            "Select at least one category in the sidebar."
        )
    else:
        # Interactive ranking chart.
        fig = px.bar(
            sorted_df,
            x="Model",
            y="Overall Score",
            color="Category",
            color_discrete_map=category_colors,
            hover_data=["Parameters", "Architecture", "Release Date"],
            labels={"Overall Score": "Roleplay Performance Score"},
            height=500,
        )
        fig.update_layout(
            title="Models Ranked by Overall Roleplay Performance",
            xaxis_title="",
            yaxis_title="Score",
            legend_title="Category",
            font=dict(size=14),
            plot_bgcolor="rgba(0,0,0,0)",
            xaxis=dict(tickangle=-45),
            yaxis=dict(range=[0, 1]),
            margin=dict(l=20, r=20, t=60, b=80),
        )
        st.plotly_chart(fig, use_container_width=True)

        # Cards for the (up to) three best models; zip() stops early when
        # fewer than three models survive the filter.
        st.subheader("🏆 Top Performing Models")
        for column, (_, row) in zip(st.columns(3), sorted_df.head(3).iterrows()):
            with column:
                st.markdown(f"""
                <div class="model-card">
                <h3>{row['Model']}</h3>
                <div class="metric-value">{row['Overall Score']:.2f}</div>
                <div class="metric-label">Overall Score</div>
                <hr>
                <p><strong>Category:</strong> {row['Category']}</p>
                <p><strong>Parameters:</strong> {row['Parameters']}</p>
                <p><strong>Architecture:</strong> {row['Architecture']}</p>
                </div>
                """, unsafe_allow_html=True)

        # Full table; each score column is drawn as a 0-1 progress bar.
        st.subheader("Complete Rankings")
        score_help = {
            "Overall Score": "Overall roleplay performance score",
            "Length Score": "Score for response length appropriateness",
            "Character Consistency": "Score for character persona consistency",
            "Immersion": "Score for immersive quality of roleplay",
        }
        st.dataframe(
            sorted_df[
                [
                    "Model",
                    "Category",
                    "Overall Score",
                    "Length Score",
                    "Character Consistency",
                    "Immersion",
                    "Parameters",
                ]
            ],
            use_container_width=True,
            height=400,
            column_config={
                label: st.column_config.ProgressColumn(
                    label,
                    help=help_text,
                    format="%.2f",
                    min_value=0,
                    max_value=1,
                )
                for label, help_text in score_help.items()
            },
        )
307
+
308
def _hex_to_rgba(hex_color, alpha):
    """Convert a ``#RRGGBB`` colour to an ``rgba(r, g, b, alpha)`` string."""
    digits = hex_color.lstrip("#")
    red, green, blue = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
    return f"rgba({red}, {green}, {blue}, {alpha})"


# Detailed-analysis tab: radar profile, metric cards and metadata for one
# selected model, plus a series trend chart for MiniMaid models.
with tab2:
    st.header("Detailed Performance Analysis")

    selected_model = st.selectbox(
        "Select model to analyze:",
        options=df["Model"].tolist(),
        index=0,
    )

    model_df = df[df["Model"] == selected_model]
    model_row = model_df.iloc[0]

    # Radar chart of the four scores for the selected model.
    categories = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]
    values = [model_row[metric] for metric in categories]
    base_color = category_colors[model_row["Category"]]

    fig = go.Figure()
    fig.add_trace(go.Scatterpolar(
        r=values,
        theta=categories,
        fill='toself',
        name=selected_model,
        line_color=base_color,
        # Semi-transparent fill. Expressed as rgba() because an 8-digit hex
        # string ("#RRGGBB" + "50") is not one of the colour formats plotly.py
        # documents for colour properties; 0x50 / 255 is roughly 0.31.
        fillcolor=_hex_to_rgba(base_color, 0.31),
    ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1],
            )
        ),
        showlegend=False,
        title=f"Performance Profile: {selected_model}",
        height=500,
    )
    st.plotly_chart(fig, use_container_width=True)

    # One card per metric, laid out side by side.
    st.subheader("Performance Metrics")
    for column, metric in zip(st.columns(4), categories):
        with column:
            st.markdown(f"""
            <div class="metric-card">
            <div class="metric-label">{metric}</div>
            <div class="metric-value">{model_row[metric]:.2f}</div>
            </div>
            """, unsafe_allow_html=True)

    # Descriptive metadata for the selected model.
    st.subheader("Model Information")
    st.markdown(f"""
    <div class="highlight">
    <table width="100%">
    <tr>
    <td width="33%"><strong>Category:</strong> {model_row['Category']}</td>
    <td width="33%"><strong>Parameters:</strong> {model_row['Parameters']}</td>
    <td width="33%"><strong>Architecture:</strong> {model_row['Architecture']}</td>
    </tr>
    <tr>
    <td colspan="3"><strong>Release Date:</strong> {model_row['Release Date']}</td>
    </tr>
    </table>
    </div>
    """, unsafe_allow_html=True)

    # Series trend, shown only when a MiniMaid model is selected.
    if model_row["Category"] == "MiniMaid":
        st.subheader("MiniMaid Series Performance Evolution")

        # Release dates are ISO "YYYY-MM-DD" strings, so a plain string sort
        # is also chronological.
        minimaid_df = df[df["Category"] == "MiniMaid"].sort_values("Release Date")

        fig = px.line(
            minimaid_df,
            x="Model",
            y=categories,
            markers=True,
            labels={"value": "Score", "variable": "Metric"},
            height=500,
        )
        fig.update_layout(
            title="MiniMaid Model Series Improvement Over Time",
            xaxis_title="Model Version",
            yaxis_title="Score",
            yaxis=dict(range=[0, 1]),
            legend_title="Metric",
            hovermode="x unified",
        )
        st.plotly_chart(fig, use_container_width=True)

        st.markdown("""
        <div class="highlight">
        <h4>MiniMaid Development Insights</h4>
        <p>The MiniMaid series shows clear progression across versions, with significant improvements in immersion
        capabilities from L1 to L3. While character consistency has remained relatively stable, the overall
        performance has steadily increased with each iteration.</p>
        </div>
        """, unsafe_allow_html=True)
440
+
441
def _strengths_card(model, rival, advantages):
    """Return one complete HTML card listing the metrics where *model* leads.

    *advantages* is a list of ``(metric, gap)`` pairs with positive gaps.
    The card is built as a single fragment so the ``<div>``/``<ul>``/``<li>``
    elements actually nest when rendered.
    """
    if advantages:
        items = "".join(
            f"<li>{metric}: +{gap:.2f} higher than {rival}</li>"
            for metric, gap in advantages
        )
    else:
        items = f"<li>No metric where {model} scores higher than {rival}</li>"
    return (
        '<div class="metric-card">'
        f"<h4>{model} Strengths</h4>"
        f"<ul>{items}</ul>"
        "</div>"
    )


# Comparison tab: grouped bars, overlaid radar profiles and a table for any
# selection of two or more models, plus a head-to-head summary for exactly two.
with tab3:
    st.header("Model Comparison")

    # Prefer a fixed default pair when both models exist; otherwise fall back
    # to the first two rows.
    all_models = df["Model"].tolist()
    preferred_pair = ["OpenElla-Llama-3-8B", "MiniMaid-L3"]
    if all(name in all_models for name in preferred_pair):
        default_models = preferred_pair
    else:
        default_models = all_models[:2]

    selected_models = st.multiselect(
        "Select models to compare:",
        options=all_models,
        default=default_models,
    )

    if len(selected_models) < 2:
        st.warning("Please select at least two models to compare.")
    else:
        comparison_df = df[df["Model"].isin(selected_models)]
        categories = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]

        # Grouped bar chart: one group per model, one bar per metric.
        fig = px.bar(
            comparison_df,
            x="Model",
            y=categories,
            barmode="group",
            labels={"value": "Score", "variable": "Metric"},
            height=600,
            color_discrete_sequence=px.colors.qualitative.Bold,
        )
        fig.update_layout(
            title="Side-by-Side Metric Comparison",
            xaxis_title="",
            yaxis_title="Score",
            yaxis=dict(range=[0, 1]),
            legend_title="Metric",
            xaxis=dict(tickangle=-45),
            hovermode="x unified",
        )
        st.plotly_chart(fig, use_container_width=True)

        # Overlaid radar profiles, one trace per selected model.
        fig = go.Figure()
        for model in selected_models:
            # Named model_rows (not model_data) so the module-level score
            # dictionary is not overwritten.
            model_rows = comparison_df[comparison_df["Model"] == model]
            fig.add_trace(go.Scatterpolar(
                r=model_rows[categories].values.flatten().tolist(),
                theta=categories,
                fill='toself',
                name=model,
            ))
        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 1],
                )
            ),
            showlegend=True,
            legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5),
            title="Performance Profile Comparison",
            height=600,
        )
        st.plotly_chart(fig, use_container_width=True)

        # Side-by-side table of scores and metadata.
        st.subheader("Detailed Comparison")
        comparison_table = comparison_df.set_index("Model")[
            ["Overall Score", "Length Score", "Character Consistency", "Immersion", "Parameters", "Architecture", "Category"]
        ]
        st.dataframe(comparison_table, use_container_width=True)

        # Head-to-head breakdown, only meaningful for exactly two models.
        if len(selected_models) == 2:
            model1, model2 = selected_models
            model1_row = comparison_df[comparison_df["Model"] == model1].iloc[0]
            model2_row = comparison_df[comparison_df["Model"] == model2].iloc[0]

            # Positive value: model1 leads on that metric; negative: model2.
            diff = {
                metric: model1_row[metric] - model2_row[metric]
                for metric in categories
            }

            st.subheader(f"Comparative Analysis: {model1} vs {model2}")

            col1, col2 = st.columns(2)
            with col1:
                st.markdown(
                    _strengths_card(
                        model1,
                        model2,
                        [(m, gap) for m, gap in diff.items() if gap > 0],
                    ),
                    unsafe_allow_html=True,
                )
            with col2:
                st.markdown(
                    _strengths_card(
                        model2,
                        model1,
                        [(m, -gap) for m, gap in diff.items() if gap < 0],
                    ),
                    unsafe_allow_html=True,
                )

            # Overall summary. Differences below the tolerance are treated as
            # a tie so the text never claims a win "by 0.00 points".
            tolerance = 1e-9
            overall_diff = diff["Overall Score"]
            if abs(overall_diff) < tolerance:
                headline = (
                    f"Overall, <strong>{model1}</strong> and <strong>{model2}</strong> "
                    "are tied in the combined roleplay score."
                )
            else:
                better_model, worse_model = (
                    (model1, model2) if overall_diff > 0 else (model2, model1)
                )
                headline = (
                    f"Overall, <strong>{better_model}</strong> outperforms "
                    f"<strong>{worse_model}</strong> by {abs(overall_diff):.2f} points "
                    "in the combined roleplay score."
                )

            widest_metric = max(diff, key=lambda metric: abs(diff[metric]))
            if abs(diff[widest_metric]) < tolerance:
                detail = "The two models score identically on every metric."
            else:
                detail = f"The most significant difference is in the {widest_metric} metric."

            st.markdown(f"""
            <div class="highlight">
            <h4>Summary</h4>
            <p>{headline} {detail}</p>
            </div>
            """, unsafe_allow_html=True)
576
+
577
# About tab: explanation of the metrics and methodology, plus a CSV export
# of the full dataset.
with tab4:
    st.header("About This Leaderboard")

    st.markdown("""
    ## Understanding the Metrics

    This leaderboard evaluates AI models on their roleplay capabilities using four key metrics:

    - **Length Score**: Measures the model's ability to provide responses of appropriate length for roleplay scenarios. Higher scores indicate better response length management.

    - **Character Consistency**: Evaluates how well the model maintains a consistent character persona throughout the interaction. Higher scores indicate better adherence to character traits and background.

    - **Immersion**: Assesses the model's ability to create an immersive roleplay experience, including environmental details, emotional depth, and narrative engagement.

    - **Overall Score**: A composite score reflecting the model's overall roleplay performance, combining all metrics.

    ## Methodology

    Models are evaluated through a standardized testing protocol involving multiple roleplay scenarios across different genres and contexts. Each model is tested with identical prompts to ensure fair comparison.

    The evaluation process involves:

    1. Running models through a standardized set of roleplay scenarios
    2. Expert evaluation of responses against established criteria
    3. Quantitative scoring based on objective metrics
    4. Normalization of scores across model sizes and architectures

    ## Data Updates

    This leaderboard is regularly updated as new models are released or existing models are improved. The most recent update was in April 2025.

    ## Contact Information

    For questions about the methodology or to submit a model for evaluation, please contact: [[email protected]]
    """)

    # Export the complete, unfiltered dataset. Streamlit's download widget
    # serves the file with a proper text/csv MIME type, replacing the previous
    # hand-built base64 "data:file/csv" anchor tag.
    st.download_button(
        label="Download Full Dataset (CSV)",
        data=df.to_csv(index=False),
        file_name="ai_roleplay_leaderboard.csv",
        mime="text/csv",
    )
618
+
619
# Footer
st.markdown("""
<div class="footer">
<p>© 2025 AI Roleplay Performance Leaderboard | Created with Streamlit | Data last updated: April 2025</p>
</div>
""", unsafe_allow_html=True)

# Hover emphasis for the model cards (lift plus deeper shadow).
# This was previously a <script> block attaching mouseenter/mouseleave
# handlers, but script tags passed through st.markdown are not executed, so
# the handlers never ran. The same visual effect is expressed in CSS instead.
st.markdown("""
<style>
.model-card {
    transition: transform 0.3s ease, box-shadow 0.3s ease;
}
.model-card:hover {
    transform: translateY(-10px);
    box-shadow: 0 10px 20px rgba(0, 0, 0, 0.2);
}
</style>
""", unsafe_allow_html=True)