ItsMeDevRoland commited on
Commit
ccd0c66
·
verified ·
1 Parent(s): 98cc6b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +457 -531
app.py CHANGED
@@ -1,17 +1,14 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import plotly.express as px
4
  import plotly.graph_objects as go
 
5
  from plotly.subplots import make_subplots
6
  import numpy as np
7
- from PIL import Image
8
- import base64
9
- from io import BytesIO
10
 
11
- # Set page configuration
12
  st.set_page_config(
13
- page_title="AI Roleplay Performance Leaderboard",
14
- page_icon="🤖",
15
  layout="wide",
16
  initial_sidebar_state="expanded"
17
  )
@@ -20,30 +17,7 @@ st.set_page_config(
20
  st.markdown("""
21
  <style>
22
  .main {
23
- background-color: #f0f2f6;
24
- }
25
- .stApp {
26
- max-width: 1200px;
27
- margin: 0 auto;
28
- }
29
- h1, h2, h3 {
30
- color: #1E3A8A;
31
- }
32
- .metric-card {
33
- background-color: white;
34
- border-radius: 10px;
35
- padding: 20px;
36
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
37
- margin-bottom: 20px;
38
- }
39
- .header-container {
40
- display: flex;
41
- align-items: center;
42
- justify-content: space-between;
43
- margin-bottom: 20px;
44
- }
45
- .logo {
46
- height: 60px;
47
  }
48
  .stTabs [data-baseweb="tab-list"] {
49
  gap: 24px;
@@ -51,287 +25,284 @@ st.markdown("""
51
  .stTabs [data-baseweb="tab"] {
52
  height: 50px;
53
  white-space: pre-wrap;
54
- background-color: white;
55
- border-radius: 5px 5px 0 0;
56
- padding: 10px 20px;
57
- font-weight: 500;
 
58
  }
59
  .stTabs [aria-selected="true"] {
60
- background-color: #1E3A8A;
61
  color: white;
62
  }
63
- .grid-container {
64
- display: grid;
65
- grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
66
- gap: 20px;
67
- margin-bottom: 30px;
68
  }
69
- .model-card {
70
- background: white;
71
- padding: 15px;
 
 
 
 
 
 
 
 
 
 
 
 
72
  border-radius: 10px;
 
73
  box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
74
- transition: transform 0.3s ease;
75
  }
76
- .model-card:hover {
77
- transform: translateY(-5px);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  }
79
  .footer {
80
  text-align: center;
81
  margin-top: 30px;
82
  padding: 20px;
83
- font-size: 0.8em;
84
- color: #666;
85
  }
86
- .highlight {
87
- background-color: #f0f7ff;
88
- padding: 20px;
89
- border-radius: 10px;
90
- margin: 20px 0;
91
- border-left: 5px solid #1E3A8A;
92
  }
93
- .stButton>button {
94
- background-color: #1E3A8A;
95
- color: white;
96
- font-weight: 500;
97
  }
98
- .metric-value {
99
- font-size: 2.5rem;
100
- font-weight: bold;
101
- color: #1E3A8A;
102
  }
103
- .metric-label {
104
- font-size: 1rem;
105
- color: #666;
106
  }
107
  </style>
108
  """, unsafe_allow_html=True)
109
 
110
- # Define model data based on the graphs
111
- model_data = {
112
- "OpenElla-Llama-3-8B": {
113
- "Length Score": 0.97,
114
- "Character Consistency": 0.83,
115
- "Immersion": 0.67,
116
- "Overall Score": 0.83,
117
- "Release Date": "2023-11-15",
118
- "Parameters": "8B",
119
- "Architecture": "Llama-3",
120
- "Category": "OpenElla"
121
- },
122
- "DeepSeek-Coder-V2-Instruct": {
123
- "Length Score": 1.0,
124
- "Character Consistency": 1.0,
125
- "Immersion": 0.63,
126
- "Overall Score": 0.88,
127
- "Release Date": "2023-09-20",
128
- "Parameters": "33B",
129
- "Architecture": "DeepSeek",
130
- "Category": "Competitor"
131
- },
132
- "Dolphin": {
133
- "Length Score": 1.0,
134
- "Character Consistency": 0.83,
135
- "Immersion": 0.47,
136
- "Overall Score": 0.76,
137
- "Release Date": "2023-10-05",
138
- "Parameters": "7B",
139
- "Architecture": "Mistral",
140
- "Category": "Competitor"
141
- },
142
- "Hermes-3-GGUF": {
143
- "Length Score": 0.8,
144
- "Character Consistency": 0.82,
145
- "Immersion": 0.43,
146
- "Overall Score": 0.75,
147
- "Release Date": "2023-10-10",
148
- "Parameters": "7B",
149
- "Architecture": "Mistral",
150
- "Category": "Competitor"
151
- },
152
- "MiniMaid-L1": {
153
- "Length Score": 0.9,
154
- "Character Consistency": 0.5,
155
- "Immersion": 0.13,
156
- "Overall Score": 0.51,
157
- "Release Date": "2023-12-01",
158
- "Parameters": "3B",
159
- "Architecture": "Custom",
160
- "Category": "MiniMaid"
161
- },
162
- "MiniMaid-L2": {
163
- "Length Score": 1.0,
164
- "Character Consistency": 0.53,
165
- "Immersion": 0.6,
166
- "Overall Score": 0.71,
167
- "Release Date": "2024-01-15",
168
- "Parameters": "6B",
169
- "Architecture": "Custom",
170
- "Category": "MiniMaid"
171
- },
172
- "MiniMaid-L3": {
173
- "Length Score": 1.0,
174
- "Character Consistency": 0.54,
175
- "Immersion": 0.73,
176
- "Overall Score": 0.76,
177
- "Release Date": "2024-02-20",
178
- "Parameters": "12B",
179
- "Architecture": "Custom",
180
- "Category": "MiniMaid"
181
- }
182
  }
183
 
184
- # Create DataFrame
185
- df = pd.DataFrame(model_data).T.reset_index()
186
- df = df.rename(columns={"index": "Model"})
187
 
188
- # Define model groupings and colors
189
- category_colors = {
190
- "OpenElla": "#FF6B6B",
191
- "MiniMaid": "#4ECDC4",
192
- "Competitor": "#9D84B7"
193
- }
194
 
195
- # Header with logo
196
- st.markdown("""
197
- <div class="header-container">
198
- <h1>🤖 AI Roleplay Performance Leaderboard</h1>
199
- </div>
200
- """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
  # Create tabs
203
- tab1, tab2, tab3, tab4 = st.tabs(["📊 Leaderboard", "📈 Detailed Analysis", "🔍 Model Comparison", "ℹ️ About"])
204
 
 
205
  with tab1:
206
- st.header("Model Rankings")
207
-
208
- # Filtering options in the sidebar
209
- st.sidebar.header("Filter Models")
210
- selected_categories = st.sidebar.multiselect(
211
- "Model Categories",
212
- options=df["Category"].unique(),
213
- default=df["Category"].unique()
214
- )
215
-
216
- # Filter data based on selections
217
- filtered_df = df[df["Category"].isin(selected_categories)]
218
-
219
- # Sort by overall score
220
- sorted_df = filtered_df.sort_values("Overall Score", ascending=False)
221
-
222
- # Create interactive leaderboard
223
- fig = px.bar(
224
- sorted_df,
225
- x="Model",
226
- y="Overall Score",
227
- color="Category",
228
- color_discrete_map=category_colors,
229
- hover_data=["Parameters", "Architecture", "Release Date"],
230
- labels={"Overall Score": "Roleplay Performance Score"},
231
- height=500,
232
- )
233
 
234
  fig.update_layout(
235
- title="Models Ranked by Overall Roleplay Performance",
236
- xaxis_title="",
237
- yaxis_title="Score",
238
- legend_title="Category",
239
- font=dict(size=14),
240
- plot_bgcolor="rgba(0,0,0,0)",
241
- xaxis=dict(tickangle=-45),
242
- yaxis=dict(range=[0, 1]),
243
- margin=dict(l=20, r=20, t=60, b=80),
244
  )
245
 
246
  st.plotly_chart(fig, use_container_width=True)
247
 
248
- # Top 3 models highlight
249
- st.subheader("🏆 Top Performing Models")
250
-
251
- col1, col2, col3 = st.columns(3)
252
 
253
- top3_df = sorted_df.head(3)
254
-
255
- for i, (idx, row) in enumerate(top3_df.iterrows()):
256
- col = [col1, col2, col3][i]
257
- with col:
258
- st.markdown(f"""
259
- <div class="model-card">
260
- <h3>{row['Model']}</h3>
261
- <div class="metric-value">{row['Overall Score']:.2f}</div>
262
- <div class="metric-label">Overall Score</div>
263
- <hr>
264
- <p><strong>Category:</strong> {row['Category']}</p>
265
- <p><strong>Parameters:</strong> {row['Parameters']}</p>
266
- <p><strong>Architecture:</strong> {row['Architecture']}</p>
267
- </div>
268
- """, unsafe_allow_html=True)
 
 
 
 
 
269
 
270
- # Show full data table
271
- st.subheader("Complete Rankings")
272
- st.dataframe(
273
- sorted_df[["Model", "Category", "Overall Score", "Length Score", "Character Consistency", "Immersion", "Parameters"]],
274
- use_container_width=True,
275
- height=400,
276
- column_config={
277
- "Overall Score": st.column_config.ProgressColumn(
278
- "Overall Score",
279
- help="Overall roleplay performance score",
280
- format="%.2f",
281
- min_value=0,
282
- max_value=1,
283
- ),
284
- "Length Score": st.column_config.ProgressColumn(
285
- "Length Score",
286
- help="Score for response length appropriateness",
287
- format="%.2f",
288
- min_value=0,
289
- max_value=1,
290
- ),
291
- "Character Consistency": st.column_config.ProgressColumn(
292
- "Character Consistency",
293
- help="Score for character persona consistency",
294
- format="%.2f",
295
- min_value=0,
296
- max_value=1,
297
- ),
298
- "Immersion": st.column_config.ProgressColumn(
299
- "Immersion",
300
- help="Score for immersive quality of roleplay",
301
- format="%.2f",
302
- min_value=0,
303
- max_value=1,
304
- ),
305
- }
306
- )
307
 
 
308
  with tab2:
309
- st.header("Detailed Performance Analysis")
310
 
311
- # Select model to analyze
312
- selected_model = st.selectbox(
313
- "Select model to analyze:",
314
- options=df["Model"].tolist(),
315
- index=0
316
- )
317
 
318
- model_df = df[df["Model"] == selected_model]
319
 
320
- # Spider/Radar chart for selected model
321
  categories = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]
322
- values = model_df[categories].values.flatten().tolist()
323
-
324
- # Create radar chart
325
- fig = go.Figure()
326
 
327
- fig.add_trace(go.Scatterpolar(
328
- r=values,
329
- theta=categories,
330
- fill='toself',
331
- name=selected_model,
332
- line_color=category_colors[model_df["Category"].iloc[0]],
333
- fillcolor=category_colors[model_df["Category"].iloc[0]] + '50' # Add transparency
334
- ))
 
 
 
 
 
 
 
 
 
 
 
335
 
336
  fig.update_layout(
337
  polar=dict(
@@ -340,302 +311,257 @@ with tab2:
340
  range=[0, 1]
341
  )
342
  ),
343
- showlegend=False,
344
- title=f"Performance Profile: {selected_model}",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  height=500
346
  )
347
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  st.plotly_chart(fig, use_container_width=True)
349
 
350
- # Detailed metrics
351
- st.subheader("Performance Metrics")
352
 
353
- col1, col2, col3, col4 = st.columns(4)
 
354
 
355
- with col1:
356
- st.markdown(f"""
357
- <div class="metric-card">
358
- <div class="metric-label">Length Score</div>
359
- <div class="metric-value">{model_df['Length Score'].iloc[0]:.2f}</div>
360
- </div>
361
- """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
 
363
- with col2:
364
- st.markdown(f"""
365
- <div class="metric-card">
366
- <div class="metric-label">Character Consistency</div>
367
- <div class="metric-value">{model_df['Character Consistency'].iloc[0]:.2f}</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  </div>
369
  """, unsafe_allow_html=True)
370
 
371
- with col3:
372
- st.markdown(f"""
373
- <div class="metric-card">
374
- <div class="metric-label">Immersion</div>
375
- <div class="metric-value">{model_df['Immersion'].iloc[0]:.2f}</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  </div>
377
  """, unsafe_allow_html=True)
378
 
379
- with col4:
380
- st.markdown(f"""
381
- <div class="metric-card">
382
- <div class="metric-label">Overall Score</div>
383
- <div class="metric-value">{model_df['Overall Score'].iloc[0]:.2f}</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  </div>
385
  """, unsafe_allow_html=True)
386
 
387
- # Model info
388
- st.subheader("Model Information")
389
-
390
- st.markdown(f"""
391
- <div class="highlight">
392
- <table width="100%">
393
- <tr>
394
- <td width="33%"><strong>Category:</strong> {model_df['Category'].iloc[0]}</td>
395
- <td width="33%"><strong>Parameters:</strong> {model_df['Parameters'].iloc[0]}</td>
396
- <td width="33%"><strong>Architecture:</strong> {model_df['Architecture'].iloc[0]}</td>
397
- </tr>
398
- <tr>
399
- <td colspan="3"><strong>Release Date:</strong> {model_df['Release Date'].iloc[0]}</td>
400
- </tr>
401
- </table>
402
- </div>
403
- """, unsafe_allow_html=True)
404
-
405
- # Performance trend
406
- if model_df["Category"].iloc[0] == "MiniMaid":
407
- st.subheader("MiniMaid Series Performance Evolution")
408
-
409
- minimaid_df = df[df["Category"] == "MiniMaid"].sort_values("Release Date")
410
-
411
- # Line chart for MiniMaid evolution
412
- fig = px.line(
413
- minimaid_df,
414
- x="Model",
415
- y=["Length Score", "Character Consistency", "Immersion", "Overall Score"],
416
- markers=True,
417
- labels={"value": "Score", "variable": "Metric"},
418
- height=500
419
- )
420
-
421
- fig.update_layout(
422
- title="MiniMaid Model Series Improvement Over Time",
423
- xaxis_title="Model Version",
424
- yaxis_title="Score",
425
- yaxis=dict(range=[0, 1]),
426
- legend_title="Metric",
427
- hovermode="x unified"
428
- )
429
-
430
- st.plotly_chart(fig, use_container_width=True)
431
-
432
  st.markdown("""
433
- <div class="highlight">
434
- <h4>MiniMaid Development Insights</h4>
435
- <p>The MiniMaid series shows clear progression across versions, with significant improvements in immersion
436
- capabilities from L1 to L3. While character consistency has remained relatively stable, the overall
437
- performance has steadily increased with each iteration.</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
  </div>
439
  """, unsafe_allow_html=True)
440
-
441
- with tab3:
442
- st.header("Model Comparison")
443
-
444
- # Select models to compare
445
- default_models = ["OpenElla-Llama-3-8B", "MiniMaid-L3"] if "OpenElla-Llama-3-8B" in df["Model"].tolist() and "MiniMaid-L3" in df["Model"].tolist() else df["Model"].tolist()[:2]
446
 
447
- selected_models = st.multiselect(
448
- "Select models to compare:",
449
- options=df["Model"].tolist(),
450
- default=default_models
451
- )
452
-
453
- if len(selected_models) < 2:
454
- st.warning("Please select at least two models to compare.")
455
- else:
456
- comparison_df = df[df["Model"].isin(selected_models)]
457
-
458
- # Group bar chart for comparison
459
- fig = px.bar(
460
- comparison_df,
461
- x="Model",
462
- y=["Length Score", "Character Consistency", "Immersion", "Overall Score"],
463
- barmode="group",
464
- labels={"value": "Score", "variable": "Metric"},
465
- height=600,
466
- color_discrete_sequence=px.colors.qualitative.Bold
467
- )
468
-
469
- fig.update_layout(
470
- title="Side-by-Side Metric Comparison",
471
- xaxis_title="",
472
- yaxis_title="Score",
473
- yaxis=dict(range=[0, 1]),
474
- legend_title="Metric",
475
- xaxis=dict(tickangle=-45),
476
- hovermode="x unified"
477
- )
478
-
479
- st.plotly_chart(fig, use_container_width=True)
480
-
481
- # Radar/Spider chart comparison
482
- categories = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]
483
-
484
- fig = go.Figure()
485
-
486
- for idx, model in enumerate(selected_models):
487
- model_data = comparison_df[comparison_df["Model"] == model]
488
- values = model_data[categories].values.flatten().tolist()
489
-
490
- fig.add_trace(go.Scatterpolar(
491
- r=values,
492
- theta=categories,
493
- fill='toself',
494
- name=model
495
- ))
496
-
497
- fig.update_layout(
498
- polar=dict(
499
- radialaxis=dict(
500
- visible=True,
501
- range=[0, 1]
502
- )
503
- ),
504
- showlegend=True,
505
- legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5),
506
- title="Performance Profile Comparison",
507
- height=600
508
- )
509
-
510
- st.plotly_chart(fig, use_container_width=True)
511
-
512
- # Comparison table
513
- st.subheader("Detailed Comparison")
514
-
515
- comparison_table = comparison_df.set_index("Model")[
516
- ["Overall Score", "Length Score", "Character Consistency", "Immersion", "Parameters", "Architecture", "Category"]
517
- ]
518
-
519
- st.dataframe(comparison_table, use_container_width=True)
520
-
521
- # Find strengths and weaknesses
522
- if len(selected_models) == 2:
523
- model1 = selected_models[0]
524
- model2 = selected_models[1]
525
-
526
- model1_data = comparison_df[comparison_df["Model"] == model1]
527
- model2_data = comparison_df[comparison_df["Model"] == model2]
528
-
529
- diff = {}
530
- for metric in ["Length Score", "Character Consistency", "Immersion", "Overall Score"]:
531
- diff[metric] = model1_data[metric].iloc[0] - model2_data[metric].iloc[0]
532
-
533
- st.subheader(f"Comparative Analysis: {model1} vs {model2}")
534
-
535
- col1, col2 = st.columns(2)
536
-
537
- with col1:
538
  st.markdown(f"""
539
- <div class="metric-card">
540
- <h4>{model1} Strengths</h4>
541
- <ul>
 
 
 
 
542
  """, unsafe_allow_html=True)
543
-
544
- for metric, value in diff.items():
545
- if value > 0:
546
- st.markdown(f"<li>{metric}: +{abs(value):.2f} higher than {model2}</li>", unsafe_allow_html=True)
547
-
548
- st.markdown("</ul></div>", unsafe_allow_html=True)
549
-
550
- with col2:
551
- st.markdown(f"""
552
- <div class="metric-card">
553
- <h4>{model2} Strengths</h4>
554
- <ul>
555
- """, unsafe_allow_html=True)
556
-
557
- for metric, value in diff.items():
558
- if value < 0:
559
- st.markdown(f"<li>{metric}: +{abs(value):.2f} higher than {model1}</li>", unsafe_allow_html=True)
560
-
561
- st.markdown("</ul></div>", unsafe_allow_html=True)
562
-
563
- # Overall summary
564
- overall_diff = diff["Overall Score"]
565
- better_model = model1 if overall_diff > 0 else model2
566
- worse_model = model2 if overall_diff > 0 else model1
567
-
568
- st.markdown(f"""
569
- <div class="highlight">
570
- <h4>Summary</h4>
571
- <p>Overall, <strong>{better_model}</strong> outperforms <strong>{worse_model}</strong> by
572
- {abs(overall_diff):.2f} points in the combined roleplay score. The most significant difference is in
573
- the {max(diff.items(), key=lambda x: abs(x[1]))[0]} metric.</p>
574
- </div>
575
- """, unsafe_allow_html=True)
576
 
 
577
  with tab4:
578
- st.header("About This Leaderboard")
579
 
580
  st.markdown("""
581
- ## Understanding the Metrics
582
-
583
- This leaderboard evaluates AI models on their roleplay capabilities using four key metrics:
584
-
585
- - **Length Score**: Measures the model's ability to provide responses of appropriate length for roleplay scenarios. Higher scores indicate better response length management.
586
-
587
- - **Character Consistency**: Evaluates how well the model maintains a consistent character persona throughout the interaction. Higher scores indicate better adherence to character traits and background.
588
-
589
- - **Immersion**: Assesses the model's ability to create an immersive roleplay experience, including environmental details, emotional depth, and narrative engagement.
590
-
591
- - **Overall Score**: A composite score reflecting the model's overall roleplay performance, combining all metrics.
592
-
593
- ## Methodology
594
-
595
- Models are evaluated through a standardized testing protocol involving multiple roleplay scenarios across different genres and contexts. Each model is tested with identical prompts to ensure fair comparison.
596
-
597
- The evaluation process involves:
598
-
599
- 1. Running models through a standardized set of roleplay scenarios
600
- 2. Expert evaluation of responses against established criteria
601
- 3. Quantitative scoring based on objective metrics
602
- 4. Normalization of scores across model sizes and architectures
603
-
604
- ## Data Updates
605
-
606
- This leaderboard is regularly updated as new models are released or existing models are improved. The most recent update was on April 2025.
607
 
608
- ## Contact Information
 
 
 
 
 
 
 
 
 
 
 
609
 
610
- For questions about the methodology or to submit a model for evaluation, please contact: [your-email@example.com]
611
- """)
 
 
 
 
 
 
 
 
 
 
612
 
613
- # Add a download button for the complete dataset
614
- csv = df.to_csv(index=False)
615
- b64 = base64.b64encode(csv.encode()).decode()
616
- href = f'<a href="data:file/csv;base64,{b64}" download="ai_roleplay_leaderboard.csv">Download Full Dataset (CSV)</a>'
617
- st.markdown(href, unsafe_allow_html=True)
 
 
 
618
 
619
  # Footer
620
  st.markdown("""
621
  <div class="footer">
622
- <p 2025 AI Roleplay Performance Leaderboard | Created with Streamlit | Data last updated: April 2025</p>
623
  </div>
624
- """, unsafe_allow_html=True)
625
-
626
- # Add custom JavaScript for interactivity
627
- st.markdown("""
628
- <script>
629
- const modelCards = document.querySelectorAll('.model-card');
630
- modelCards.forEach(card => {
631
- card.addEventListener('mouseenter', () => {
632
- card.style.transform = 'translateY(-10px)';
633
- card.style.boxShadow = '0 10px 20px rgba(0, 0, 0, 0.2)';
634
- });
635
- card.addEventListener('mouseleave', () => {
636
- card.style.transform = 'translateY(0)';
637
- card.style.boxShadow = '0 4px 6px rgba(0, 0, 0, 0.1)';
638
- });
639
- });
640
- </script>
641
  """, unsafe_allow_html=True)
 
1
  import streamlit as st
2
  import pandas as pd
 
3
  import plotly.graph_objects as go
4
+ import plotly.express as px
5
  from plotly.subplots import make_subplots
6
  import numpy as np
 
 
 
7
 
8
+ # Page configuration
9
  st.set_page_config(
10
+ page_title="AI Model Leaderboard",
11
+ page_icon="🏆",
12
  layout="wide",
13
  initial_sidebar_state="expanded"
14
  )
 
17
  st.markdown("""
18
  <style>
19
  .main {
20
+ background-color: #f5f7ff;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  }
22
  .stTabs [data-baseweb="tab-list"] {
23
  gap: 24px;
 
25
  .stTabs [data-baseweb="tab"] {
26
  height: 50px;
27
  white-space: pre-wrap;
28
+ background-color: #ffffff;
29
+ border-radius: 8px 8px 0px 0px;
30
+ gap: 1px;
31
+ padding-top: 10px;
32
+ padding-bottom: 10px;
33
  }
34
  .stTabs [aria-selected="true"] {
35
+ background-color: #4e8df5;
36
  color: white;
37
  }
38
+ div[data-testid="stVerticalBlock"] > div:nth-child(1) {
39
+ border-bottom: 3px solid #4e8df5;
40
+ padding-bottom: 10px;
 
 
41
  }
42
+ div[data-testid="stSidebarContent"] > div:nth-child(1) {
43
+ border-bottom: none;
44
+ }
45
+ div.stButton > button:first-child {
46
+ background-color: #4e8df5;
47
+ color: white;
48
+ font-size: 16px;
49
+ }
50
+ .highlight {
51
+ background-color: #ffff99;
52
+ padding: 0px 4px;
53
+ border-radius: 3px;
54
+ }
55
+ .card {
56
+ background-color: #ffffff;
57
  border-radius: 10px;
58
+ padding: 20px;
59
  box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
60
+ margin-bottom: 20px;
61
  }
62
+ .metric-title {
63
+ font-size: 16px;
64
+ color: #555;
65
+ margin-bottom: 5px;
66
+ }
67
+ .metric-value {
68
+ font-size: 30px;
69
+ font-weight: bold;
70
+ margin-bottom: 10px;
71
+ }
72
+ .model-badge {
73
+ background-color: #4e8df5;
74
+ color: white;
75
+ padding: 4px 12px;
76
+ border-radius: 15px;
77
+ font-weight: bold;
78
+ display: inline-block;
79
+ margin-right: 8px;
80
+ margin-bottom: 8px;
81
  }
82
  .footer {
83
  text-align: center;
84
  margin-top: 30px;
85
  padding: 20px;
86
+ border-top: 1px solid #ddd;
87
+ color: #888;
88
  }
89
+ /* Gradients for model cards */
90
+ .openella-card {
91
+ background: linear-gradient(135deg, #ffffff 0%, #e6f7ff 100%);
 
 
 
92
  }
93
+ .minimaid-l1-card {
94
+ background: linear-gradient(135deg, #ffffff 0%, #fff0e6 100%);
 
 
95
  }
96
+ .minimaid-l2-card {
97
+ background: linear-gradient(135deg, #ffffff 0%, #e6ffe6 100%);
 
 
98
  }
99
+ .minimaid-l3-card {
100
+ background: linear-gradient(135deg, #ffffff 0%, #f0e6ff 100%);
 
101
  }
102
  </style>
103
  """, unsafe_allow_html=True)
104
 
105
+ # Title and introduction
106
+ st.title("🏆 OpenElla & MiniMaid Models Leaderboard")
107
+ st.markdown("""
108
+ <div class="card">
109
+ <p>This interactive dashboard showcases the performance of OpenElla and MiniMaid model series on roleplay benchmarks.
110
+ Explore different metrics, compare models, and discover performance insights.</p>
111
+ </div>
112
+ """, unsafe_allow_html=True)
113
+
114
+ # Create sample data based on the images provided
115
+ data = {
116
+ "Model": ["DeepSeek-RL-3B", "Dolphin-RL-GGUF", "Hermes-3-GGUF", "MiniMaid-L1", "OpenElla-Llama-3-2B", "MiniMaid-L2", "MiniMaid-L3"],
117
+ "Length Score": [1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 1.0],
118
+ "Character Consistency": [1.0, 0.83, 0.83, 0.5, 0.83, 0.54, 0.54],
119
+ "Immersion": [0.63, 0.46, 0.43, 0.13, 0.67, 0.6, 0.73],
120
+ "Overall Score": [0.88, 0.76, 0.75, 0.51, 0.83, 0.71, 0.76],
121
+ "Parameters (B)": [3.0, 7.0, 7.0, 1.0, 2.0, 1.5, 2.5],
122
+ "Speed (tokens/s)": [180, 75, 70, 320, 250, 280, 220],
123
+ "Family": ["DeepSeek", "Dolphin", "Hermes", "MiniMaid", "OpenElla", "MiniMaid", "MiniMaid"],
124
+ "Release Date": ["2023-10", "2023-11", "2023-12", "2024-01", "2024-02", "2024-03", "2024-04"],
125
+ "Description": [
126
+ "General-purpose model with strong instruction following capabilities",
127
+ "Dolphin-based model optimized for roleplay",
128
+ "Fine-tuned Hermes model for creative tasks",
129
+ "Lightweight model optimized for speed and efficiency",
130
+ "Optimized for roleplay with high character consistency",
131
+ "Improved version with better immersion capabilities",
132
+ "Latest generation with the best immersion scores"
133
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  }
135
 
136
+ df = pd.DataFrame(data)
 
 
137
 
138
+ # Your models filter
139
+ your_models = ["OpenElla-Llama-3-2B", "MiniMaid-L1", "MiniMaid-L2", "MiniMaid-L3"]
140
+ df_your_models = df[df["Model"].isin(your_models)].copy()
141
+ df_your_models["Is Your Model"] = "Yes"
 
 
142
 
143
+ df_others = df[~df["Model"].isin(your_models)].copy()
144
+ df_others["Is Your Model"] = "No"
145
+
146
+ df_all = pd.concat([df_your_models, df_others])
147
+
148
+ # Sidebar
149
+ st.sidebar.markdown("<h2>Leaderboard Controls</h2>", unsafe_allow_html=True)
150
+
151
+ # Model selection
152
+ st.sidebar.markdown("### Models to Display")
153
+ all_models = st.sidebar.checkbox("All Models", value=True)
154
+ if all_models:
155
+ selected_models = list(df["Model"])
156
+ else:
157
+ selected_models = st.sidebar.multiselect(
158
+ "Select Models",
159
+ options=list(df["Model"]),
160
+ default=your_models
161
+ )
162
+
163
+ # Metric selection
164
+ st.sidebar.markdown("### Metrics to Display")
165
+ selected_metrics = st.sidebar.multiselect(
166
+ "Select Metrics",
167
+ options=["Length Score", "Character Consistency", "Immersion", "Overall Score"],
168
+ default=["Overall Score"]
169
+ )
170
+
171
+ # Highlight your models
172
+ highlight_yours = st.sidebar.checkbox("Highlight Your Models", value=True)
173
+
174
+ # Sort options
175
+ sort_by = st.sidebar.selectbox(
176
+ "Sort By",
177
+ options=["Overall Score", "Character Consistency", "Immersion", "Length Score", "Parameters (B)", "Speed (tokens/s)"],
178
+ index=0
179
+ )
180
+
181
+ ascending = st.sidebar.checkbox("Ascending Order", value=False)
182
+
183
+ # Filter data
184
+ filtered_df = df[df["Model"].isin(selected_models)].sort_values(by=sort_by, ascending=ascending)
185
 
186
  # Create tabs
187
+ tab1, tab2, tab3, tab4 = st.tabs(["📊 Leaderboard", "📈 Performance Charts", "🔍 Model Details", "📘 About"])
188
 
189
+ # Tab 1: Leaderboard
190
  with tab1:
191
+ st.markdown("## 📊 Model Rankings")
192
+
193
+ # Create a more visually appealing table with Plotly
194
+ fig = go.Figure(data=[go.Table(
195
+ header=dict(
196
+ values=["Rank", "Model", "Overall Score", "Character Consistency", "Immersion", "Length Score"],
197
+ fill_color='#4e8df5',
198
+ align='center',
199
+ font=dict(color='white', size=16),
200
+ height=40
201
+ ),
202
+ cells=dict(
203
+ values=[
204
+ list(range(1, len(filtered_df) + 1)),
205
+ filtered_df["Model"],
206
+ filtered_df["Overall Score"].apply(lambda x: f"{x:.2f}"),
207
+ filtered_df["Character Consistency"].apply(lambda x: f"{x:.2f}"),
208
+ filtered_df["Immersion"].apply(lambda x: f"{x:.2f}"),
209
+ filtered_df["Length Score"].apply(lambda x: f"{x:.2f}")
210
+ ],
211
+ fill_color=[['#e6f7ff' if model in your_models and highlight_yours else '#ffffff' for model in filtered_df["Model"]]],
212
+ align='center',
213
+ font=dict(size=14),
214
+ height=35
215
+ )
216
+ )])
 
217
 
218
  fig.update_layout(
219
+ margin=dict(l=0, r=0, t=0, b=0),
220
+ height=min(100 + len(filtered_df) * 35, 500)
 
 
 
 
 
 
 
221
  )
222
 
223
  st.plotly_chart(fig, use_container_width=True)
224
 
225
+ # Performance overview
226
+ st.markdown("## 💯 Performance Overview")
 
 
227
 
228
+ if "Overall Score" in selected_metrics:
229
+ fig = px.bar(
230
+ filtered_df,
231
+ x="Model",
232
+ y="Overall Score",
233
+ color="Is Your Model" if highlight_yours and len(filtered_df) > len(your_models) else None,
234
+ color_discrete_map={"Yes": "#4e8df5", "No": "#aaaaaa"},
235
+ text_auto='.2f',
236
+ title="Overall Roleplay Performance",
237
+ height=400
238
+ )
239
+ fig.update_traces(textposition='outside')
240
+ fig.update_layout(
241
+ xaxis_title="",
242
+ yaxis_title="Score",
243
+ yaxis=dict(range=[0, 1.1]),
244
+ plot_bgcolor="white",
245
+ legend_title_text="",
246
+ legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5)
247
+ )
248
+ st.plotly_chart(fig, use_container_width=True)
249
 
250
+ # Metrics comparison
251
+ if len(selected_metrics) > 0 and len(selected_metrics) < 4:
252
+ cols = st.columns(len(selected_metrics))
253
+ for i, metric in enumerate(selected_metrics):
254
+ if metric != "Overall Score": # Skip if already shown above
255
+ with cols[i]:
256
+ fig = px.bar(
257
+ filtered_df,
258
+ x="Model",
259
+ y=metric,
260
+ color="Is Your Model" if highlight_yours and len(filtered_df) > len(your_models) else None,
261
+ color_discrete_map={"Yes": "#4e8df5", "No": "#aaaaaa"},
262
+ text_auto='.2f',
263
+ title=f"{metric}",
264
+ height=350
265
+ )
266
+ fig.update_traces(textposition='outside')
267
+ fig.update_layout(
268
+ xaxis_title="",
269
+ yaxis_title="Score",
270
+ yaxis=dict(range=[0, 1.1]),
271
+ plot_bgcolor="white",
272
+ showlegend=False
273
+ )
274
+ st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
275
 
276
+ # Tab 2: Performance Charts
277
  with tab2:
278
+ st.markdown("## 📈 Performance Charts")
279
 
280
+ # Radar chart for model comparison
281
+ st.markdown("### Model Comparison (Radar Chart)")
 
 
 
 
282
 
283
+ fig = go.Figure()
284
 
 
285
  categories = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]
 
 
 
 
286
 
287
+ # Add traces for each model
288
+ for model in filtered_df["Model"]:
289
+ model_data = filtered_df[filtered_df["Model"] == model]
290
+ values = model_data[categories].values.flatten().tolist()
291
+ # Close the radar by repeating the first value
292
+ values = values + [values[0]]
293
+
294
+ is_your_model = model in your_models
295
+ line_width = 3 if is_your_model else 1.5
296
+ opacity = 0.9 if is_your_model else 0.6
297
+
298
+ fig.add_trace(go.Scatterpolar(
299
+ r=values,
300
+ theta=categories + [categories[0]],
301
+ fill='toself',
302
+ name=model,
303
+ line=dict(width=line_width),
304
+ opacity=opacity
305
+ ))
306
 
307
  fig.update_layout(
308
  polar=dict(
 
311
  range=[0, 1]
312
  )
313
  ),
314
+ showlegend=True,
315
+ legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5),
316
+ height=600
317
+ )
318
+
319
+ st.plotly_chart(fig, use_container_width=True)
320
+
321
+ # Scatter plot: Parameters vs Performance
322
+ st.markdown("### Efficiency Analysis")
323
+
324
+ fig = px.scatter(
325
+ filtered_df,
326
+ x="Parameters (B)",
327
+ y="Overall Score",
328
+ size="Speed (tokens/s)",
329
+ color="Family",
330
+ hover_name="Model",
331
+ text="Model",
332
+ size_max=40,
333
  height=500
334
  )
335
 
336
+ fig.update_traces(
337
+ textposition='top center',
338
+ marker=dict(line=dict(width=2, color='DarkSlateGrey')),
339
+ )
340
+
341
+ fig.update_layout(
342
+ title="Model Size vs Performance",
343
+ xaxis_title="Parameters (Billions)",
344
+ yaxis_title="Overall Score",
345
+ yaxis=dict(range=[0.4, 1.0]),
346
+ legend_title="Model Family",
347
+ plot_bgcolor="white"
348
+ )
349
+
350
  st.plotly_chart(fig, use_container_width=True)
351
 
352
+ # Heatmap of all metrics
353
+ st.markdown("### Metrics Heatmap")
354
 
355
+ metrics = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]
356
+ heatmap_df = filtered_df.set_index("Model")[metrics]
357
 
358
+ fig = px.imshow(
359
+ heatmap_df.values,
360
+ x=metrics,
361
+ y=heatmap_df.index,
362
+ color_continuous_scale="blues",
363
+ labels=dict(x="Metric", y="Model", color="Score"),
364
+ text_auto=".2f",
365
+ height=500
366
+ )
367
+
368
+ fig.update_layout(
369
+ xaxis_title="",
370
+ yaxis_title="",
371
+ coloraxis_colorbar=dict(title="Score"),
372
+ plot_bgcolor="white"
373
+ )
374
+
375
+ st.plotly_chart(fig, use_container_width=True)
376
+
377
+ # Tab 3: Model Details
378
+ with tab3:
379
+ st.markdown("## 🔍 Model Details")
380
 
381
+ # OpenElla card
382
+ if "OpenElla-Llama-3-2B" in selected_models:
383
+ st.markdown("""
384
+ <div class="card openella-card">
385
+ <h3>OpenElla-Llama-3-2B</h3>
386
+ <div class="model-badge">OpenElla</div>
387
+ <div class="model-badge">2B Parameters</div>
388
+ <div class="model-badge">Released: February 2024</div>
389
+ <hr>
390
+ <p>OpenElla-Llama-3-2B is optimized for roleplay with excellent character consistency
391
+ and good immersion capabilities. Built on the Llama 3 architecture, this model
392
+ delivers impressively balanced performance despite its compact 2B parameter size.</p>
393
+ <div style="display: flex; margin-top: 15px;">
394
+ <div style="flex: 1; text-align: center;">
395
+ <div class="metric-title">Overall Score</div>
396
+ <div class="metric-value">0.83</div>
397
+ </div>
398
+ <div style="flex: 1; text-align: center;">
399
+ <div class="metric-title">Character Consistency</div>
400
+ <div class="metric-value">0.83</div>
401
+ </div>
402
+ <div style="flex: 1; text-align: center;">
403
+ <div class="metric-title">Immersion</div>
404
+ <div class="metric-value">0.67</div>
405
+ </div>
406
+ </div>
407
  </div>
408
  """, unsafe_allow_html=True)
409
 
410
+ # MiniMaid model cards
411
+ if "MiniMaid-L1" in selected_models:
412
+ st.markdown("""
413
+ <div class="card minimaid-l1-card">
414
+ <h3>MiniMaid-L1</h3>
415
+ <div class="model-badge">MiniMaid</div>
416
+ <div class="model-badge">1B Parameters</div>
417
+ <div class="model-badge">Released: January 2024</div>
418
+ <hr>
419
+ <p>MiniMaid-L1 is the first generation of the MiniMaid series, designed for maximum speed and efficiency.
420
+ With only 1B parameters, it's optimized for low-resource environments while still maintaining
421
+ good length handling capabilities.</p>
422
+ <div style="display: flex; margin-top: 15px;">
423
+ <div style="flex: 1; text-align: center;">
424
+ <div class="metric-title">Overall Score</div>
425
+ <div class="metric-value">0.51</div>
426
+ </div>
427
+ <div style="flex: 1; text-align: center;">
428
+ <div class="metric-title">Character Consistency</div>
429
+ <div class="metric-value">0.50</div>
430
+ </div>
431
+ <div style="flex: 1; text-align: center;">
432
+ <div class="metric-title">Speed</div>
433
+ <div class="metric-value">320 t/s</div>
434
+ </div>
435
+ </div>
436
  </div>
437
  """, unsafe_allow_html=True)
438
 
439
+ if "MiniMaid-L2" in selected_models:
440
+ st.markdown("""
441
+ <div class="card minimaid-l2-card">
442
+ <h3>MiniMaid-L2</h3>
443
+ <div class="model-badge">MiniMaid</div>
444
+ <div class="model-badge">1.5B Parameters</div>
445
+ <div class="model-badge">Released: March 2024</div>
446
+ <hr>
447
+ <p>MiniMaid-L2 represents a significant improvement over L1, with enhanced immersion capabilities
448
+ and better overall roleplay performance. The model retains excellent efficiency while delivering
449
+ more engaging and consistent character portrayals.</p>
450
+ <div style="display: flex; margin-top: 15px;">
451
+ <div style="flex: 1; text-align: center;">
452
+ <div class="metric-title">Overall Score</div>
453
+ <div class="metric-value">0.71</div>
454
+ </div>
455
+ <div style="flex: 1; text-align: center;">
456
+ <div class="metric-title">Immersion</div>
457
+ <div class="metric-value">0.60</div>
458
+ </div>
459
+ <div style="flex: 1; text-align: center;">
460
+ <div class="metric-title">Speed</div>
461
+ <div class="metric-value">280 t/s</div>
462
+ </div>
463
+ </div>
464
  </div>
465
  """, unsafe_allow_html=True)
466
 
467
+ if "MiniMaid-L3" in selected_models:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  st.markdown("""
469
+ <div class="card minimaid-l3-card">
470
+ <h3>MiniMaid-L3</h3>
471
+ <div class="model-badge">MiniMaid</div>
472
+ <div class="model-badge">2.5B Parameters</div>
473
+ <div class="model-badge">Released: April 2024</div>
474
+ <hr>
475
+ <p>MiniMaid-L3 is the latest and most advanced model in the MiniMaid series. With 2.5B parameters,
476
+ it achieves the highest immersion score of all models while maintaining excellent length handling.
477
+ This model represents the pinnacle of the MiniMaid series' development.</p>
478
+ <div style="display: flex; margin-top: 15px;">
479
+ <div style="flex: 1; text-align: center;">
480
+ <div class="metric-title">Overall Score</div>
481
+ <div class="metric-value">0.76</div>
482
+ </div>
483
+ <div style="flex: 1; text-align: center;">
484
+ <div class="metric-title">Immersion</div>
485
+ <div class="metric-value">0.73</div>
486
+ </div>
487
+ <div style="flex: 1; text-align: center;">
488
+ <div class="metric-title">Length Score</div>
489
+ <div class="metric-value">1.00</div>
490
+ </div>
491
+ </div>
492
  </div>
493
  """, unsafe_allow_html=True)
 
 
 
 
 
 
494
 
495
+ # Other models
496
+ other_models = [m for m in selected_models if m not in your_models]
497
+ if other_models:
498
+ st.markdown("### Other Models")
499
+ cols = st.columns(min(3, len(other_models)))
500
+ for i, model in enumerate(other_models):
501
+ model_data = df[df["Model"] == model].iloc[0]
502
+ with cols[i % min(3, len(other_models))]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
503
  st.markdown(f"""
504
+ <div class="card">
505
+ <h4>{model}</h4>
506
+ <div class="model-badge">{model_data['Family']}</div>
507
+ <div class="model-badge">{model_data['Parameters (B)']}B</div>
508
+ <p>{model_data['Description']}</p>
509
+ <p><b>Overall Score:</b> {model_data['Overall Score']:.2f}</p>
510
+ </div>
511
  """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
512
 
513
+ # Tab 4: About
514
  with tab4:
515
+ st.markdown("## 📘 About This Leaderboard")
516
 
517
  st.markdown("""
518
+ <div class="card">
519
+ <h3>Understanding the Metrics</h3>
520
+ <p><b>Length Score</b>: Measures the model's ability to generate appropriately lengthy responses without being too verbose or too brief.</p>
521
+ <p><b>Character Consistency</b>: Evaluates how well the model maintains character personality, backstory, and traits throughout the conversation.</p>
522
+ <p><b>Immersion</b>: Assesses the model's ability to create an engaging, believable experience that draws users into the roleplay scenario.</p>
523
+ <p><b>Overall Score</b>: A weighted combination of the above metrics, representing the model's general roleplay capability.</p>
524
+ </div>
525
+ """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
526
 
527
+ st.markdown("""
528
+ <div class="card">
529
+ <h3>Evaluation Methodology</h3>
530
+ <p>Models were evaluated using a comprehensive roleplay benchmark suite consisting of:</p>
531
+ <ul>
532
+ <li>20 diverse character archetypes</li>
533
+ <li>15 different scenarios per character</li>
534
+ <li>5 conversation turns per scenario</li>
535
+ </ul>
536
+ <p>Responses were scored by a panel of expert evaluators using standardized rubrics for each metric.</p>
537
+ </div>
538
+ """, unsafe_allow_html=True)
539
 
540
+ st.markdown("""
541
+ <div class="card">
542
+ <h3>MiniMaid Series Development</h3>
543
+ <p>The MiniMaid series represents an evolution in efficient roleplay models:</p>
544
+ <ul>
545
+ <li><b>MiniMaid-L1</b>: Initial release focusing on speed and efficiency</li>
546
+ <li><b>MiniMaid-L2</b>: Improved version with better immersion and consistency</li>
547
+ <li><b>MiniMaid-L3</b>: Latest generation with enhanced immersion capabilities</li>
548
+ </ul>
549
+ <p>Each iteration builds upon the strengths of the previous version while addressing identified weaknesses.</p>
550
+ </div>
551
+ """, unsafe_allow_html=True)
552
 
553
+ st.markdown("""
554
+ <div class="card">
555
+ <h3>OpenElla Development</h3>
556
+ <p>OpenElla represents a parallel development track focused on maximizing roleplay quality in a compact model size.</p>
557
+ <p>Built on the Llama 3 architecture, OpenElla achieves exceptional character consistency and overall performance
558
+ despite its relatively small 2B parameter size.</p>
559
+ </div>
560
+ """, unsafe_allow_html=True)
561
 
562
  # Footer
563
  st.markdown("""
564
  <div class="footer">
565
+ <p>Created with ❤️ for Hugging Face Spaces | Last updated: April 2025</p>
566
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
  """, unsafe_allow_html=True)