Spaces:

N-Bot-Int
/

RP_BENCHMARK

Sleeping

App Files Files Community

ItsMeDevRoland commited on Apr 12

Commit

ccd0c66

verified ·

1 Parent(s): 98cc6b3

Update app.py

Browse files

Files changed (1) hide show

app.py +457 -531

app.py CHANGED Viewed

@@ -1,17 +1,14 @@
 import streamlit as st
 import pandas as pd
-import plotly.express as px
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 import numpy as np
-from PIL import Image
-import base64
-from io import BytesIO
-# Set page configuration
 st.set_page_config(
-    page_title="AI Roleplay Performance Leaderboard",
-    page_icon="🤖",
     layout="wide",
     initial_sidebar_state="expanded"
 )
@@ -20,30 +17,7 @@ st.set_page_config(
 st.markdown("""
 <style>
     .main {
-        background-color: #f0f2f6;
-    }
-    .stApp {
-        max-width: 1200px;
-        margin: 0 auto;
-    }
-    h1, h2, h3 {
-        color: #1E3A8A;
-    }
-    .metric-card {
-        background-color: white;
-        border-radius: 10px;
-        padding: 20px;
-        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
-        margin-bottom: 20px;
-    }
-    .header-container {
-        display: flex;
-        align-items: center;
-        justify-content: space-between;
-        margin-bottom: 20px;
-    }
-    .logo {
-        height: 60px;
     }
     .stTabs [data-baseweb="tab-list"] {
         gap: 24px;
@@ -51,287 +25,284 @@ st.markdown("""
     .stTabs [data-baseweb="tab"] {
         height: 50px;
         white-space: pre-wrap;
-        background-color: white;
-        border-radius: 5px 5px 0 0;
-        padding: 10px 20px;
-        font-weight: 500;
     }
     .stTabs [aria-selected="true"] {
-        background-color: #1E3A8A;
         color: white;
     }
-    .grid-container {
-        display: grid;
-        grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
-        gap: 20px;
-        margin-bottom: 30px;
     }
-    .model-card {
-        background: white;
-        padding: 15px;
         border-radius: 10px;
         box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
-        transition: transform 0.3s ease;
     }
-    .model-card:hover {
-        transform: translateY(-5px);
     }
     .footer {
         text-align: center;
         margin-top: 30px;
         padding: 20px;
-        font-size: 0.8em;
-        color: #666;
     }
-    .highlight {
-        background-color: #f0f7ff;
-        padding: 20px;
-        border-radius: 10px;
-        margin: 20px 0;
-        border-left: 5px solid #1E3A8A;
     }
-    .stButton>button {
-        background-color: #1E3A8A;
-        color: white;
-        font-weight: 500;
     }
-    .metric-value {
-        font-size: 2.5rem;
-        font-weight: bold;
-        color: #1E3A8A;
     }
-    .metric-label {
-        font-size: 1rem;
-        color: #666;
     }
 </style>
 """, unsafe_allow_html=True)
-# Define model data based on the graphs
-model_data = {
-    "OpenElla-Llama-3-8B": {
-        "Length Score": 0.97,
-        "Character Consistency": 0.83,
-        "Immersion": 0.67,
-        "Overall Score": 0.83,
-        "Release Date": "2023-11-15",
-        "Parameters": "8B",
-        "Architecture": "Llama-3",
-        "Category": "OpenElla"
-    },
-    "DeepSeek-Coder-V2-Instruct": {
-        "Length Score": 1.0,
-        "Character Consistency": 1.0,
-        "Immersion": 0.63,
-        "Overall Score": 0.88,
-        "Release Date": "2023-09-20",
-        "Parameters": "33B",
-        "Architecture": "DeepSeek",
-        "Category": "Competitor"
-    },
-    "Dolphin": {
-        "Length Score": 1.0,
-        "Character Consistency": 0.83,
-        "Immersion": 0.47,
-        "Overall Score": 0.76,
-        "Release Date": "2023-10-05",
-        "Parameters": "7B",
-        "Architecture": "Mistral",
-        "Category": "Competitor"
-    },
-    "Hermes-3-GGUF": {
-        "Length Score": 0.8,
-        "Character Consistency": 0.82,
-        "Immersion": 0.43,
-        "Overall Score": 0.75,
-        "Release Date": "2023-10-10",
-        "Parameters": "7B",
-        "Architecture": "Mistral",
-        "Category": "Competitor"
-    },
-    "MiniMaid-L1": {
-        "Length Score": 0.9,
-        "Character Consistency": 0.5,
-        "Immersion": 0.13,
-        "Overall Score": 0.51,
-        "Release Date": "2023-12-01",
-        "Parameters": "3B",
-        "Architecture": "Custom",
-        "Category": "MiniMaid"
-    },
-    "MiniMaid-L2": {
-        "Length Score": 1.0,
-        "Character Consistency": 0.53,
-        "Immersion": 0.6,
-        "Overall Score": 0.71,
-        "Release Date": "2024-01-15",
-        "Parameters": "6B",
-        "Architecture": "Custom",
-        "Category": "MiniMaid"
-    },
-    "MiniMaid-L3": {
-        "Length Score": 1.0,
-        "Character Consistency": 0.54,
-        "Immersion": 0.73,
-        "Overall Score": 0.76,
-        "Release Date": "2024-02-20",
-        "Parameters": "12B",
-        "Architecture": "Custom",
-        "Category": "MiniMaid"
-    }
 }
-# Create DataFrame
-df = pd.DataFrame(model_data).T.reset_index()
-df = df.rename(columns={"index": "Model"})
-# Define model groupings and colors
-category_colors = {
-    "OpenElla": "#FF6B6B",
-    "MiniMaid": "#4ECDC4",
-    "Competitor": "#9D84B7"
-}
-# Header with logo
-st.markdown("""
-<div class="header-container">
-    <h1>🤖 AI Roleplay Performance Leaderboard</h1>
-</div>
-""", unsafe_allow_html=True)
 # Create tabs
-tab1, tab2, tab3, tab4 = st.tabs(["📊 Leaderboard", "📈 Detailed Analysis", "🔍 Model Comparison", "ℹ️ About"])
 with tab1:
-    st.header("Model Rankings")
-    # Filtering options in the sidebar
-    st.sidebar.header("Filter Models")
-    selected_categories = st.sidebar.multiselect(
-        "Model Categories",
-        options=df["Category"].unique(),
-        default=df["Category"].unique()
-    )
-    # Filter data based on selections
-    filtered_df = df[df["Category"].isin(selected_categories)]
-    # Sort by overall score
-    sorted_df = filtered_df.sort_values("Overall Score", ascending=False)
-    # Create interactive leaderboard
-    fig = px.bar(
-        sorted_df,
-        x="Model",
-        y="Overall Score",
-        color="Category",
-        color_discrete_map=category_colors,
-        hover_data=["Parameters", "Architecture", "Release Date"],
-        labels={"Overall Score": "Roleplay Performance Score"},
-        height=500,
-    )
     fig.update_layout(
-        title="Models Ranked by Overall Roleplay Performance",
-        xaxis_title="",
-        yaxis_title="Score",
-        legend_title="Category",
-        font=dict(size=14),
-        plot_bgcolor="rgba(0,0,0,0)",
-        xaxis=dict(tickangle=-45),
-        yaxis=dict(range=[0, 1]),
-        margin=dict(l=20, r=20, t=60, b=80),
     )
     st.plotly_chart(fig, use_container_width=True)
-    # Top 3 models highlight
-    st.subheader("🏆 Top Performing Models")
-    col1, col2, col3 = st.columns(3)
-    top3_df = sorted_df.head(3)
-    for i, (idx, row) in enumerate(top3_df.iterrows()):
-        col = [col1, col2, col3][i]
-        with col:
-            st.markdown(f"""
-            <div class="model-card">
-                <h3>{row['Model']}</h3>
-                <div class="metric-value">{row['Overall Score']:.2f}</div>
-                <div class="metric-label">Overall Score</div>
-                <hr>
-                <p><strong>Category:</strong> {row['Category']}</p>
-                <p><strong>Parameters:</strong> {row['Parameters']}</p>
-                <p><strong>Architecture:</strong> {row['Architecture']}</p>
-            </div>
-            """, unsafe_allow_html=True)
-    # Show full data table
-    st.subheader("Complete Rankings")
-    st.dataframe(
-        sorted_df[["Model", "Category", "Overall Score", "Length Score", "Character Consistency", "Immersion", "Parameters"]],
-        use_container_width=True,
-        height=400,
-        column_config={
-            "Overall Score": st.column_config.ProgressColumn(
-                "Overall Score",
-                help="Overall roleplay performance score",
-                format="%.2f",
-                min_value=0,
-                max_value=1,
-            ),
-            "Length Score": st.column_config.ProgressColumn(
-                "Length Score",
-                help="Score for response length appropriateness",
-                format="%.2f",
-                min_value=0,
-                max_value=1,
-            ),
-            "Character Consistency": st.column_config.ProgressColumn(
-                "Character Consistency",
-                help="Score for character persona consistency",
-                format="%.2f",
-                min_value=0,
-                max_value=1,
-            ),
-            "Immersion": st.column_config.ProgressColumn(
-                "Immersion",
-                help="Score for immersive quality of roleplay",
-                format="%.2f",
-                min_value=0,
-                max_value=1,
-            ),
-        }
-    )
 with tab2:
-    st.header("Detailed Performance Analysis")
-    # Select model to analyze
-    selected_model = st.selectbox(
-        "Select model to analyze:",
-        options=df["Model"].tolist(),
-        index=0
-    )
-    model_df = df[df["Model"] == selected_model]
-    # Spider/Radar chart for selected model
     categories = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]
-    values = model_df[categories].values.flatten().tolist()
-    # Create radar chart
-    fig = go.Figure()
-    fig.add_trace(go.Scatterpolar(
-        r=values,
-        theta=categories,
-        fill='toself',
-        name=selected_model,
-        line_color=category_colors[model_df["Category"].iloc[0]],
-        fillcolor=category_colors[model_df["Category"].iloc[0]] + '50'  # Add transparency
-    ))
     fig.update_layout(
         polar=dict(
@@ -340,302 +311,257 @@ with tab2:
                 range=[0, 1]
             )
         ),
-        showlegend=False,
-        title=f"Performance Profile: {selected_model}",
         height=500
     )
     st.plotly_chart(fig, use_container_width=True)
-    # Detailed metrics
-    st.subheader("Performance Metrics")
-    col1, col2, col3, col4 = st.columns(4)
-    with col1:
-        st.markdown(f"""
-        <div class="metric-card">
-            <div class="metric-label">Length Score</div>
-            <div class="metric-value">{model_df['Length Score'].iloc[0]:.2f}</div>
-        </div>
-        """, unsafe_allow_html=True)
-    with col2:
-        st.markdown(f"""
-        <div class="metric-card">
-            <div class="metric-label">Character Consistency</div>
-            <div class="metric-value">{model_df['Character Consistency'].iloc[0]:.2f}</div>
         </div>
         """, unsafe_allow_html=True)
-    with col3:
-        st.markdown(f"""
-        <div class="metric-card">
-            <div class="metric-label">Immersion</div>
-            <div class="metric-value">{model_df['Immersion'].iloc[0]:.2f}</div>
         </div>
         """, unsafe_allow_html=True)
-    with col4:
-        st.markdown(f"""
-        <div class="metric-card">
-            <div class="metric-label">Overall Score</div>
-            <div class="metric-value">{model_df['Overall Score'].iloc[0]:.2f}</div>
         </div>
         """, unsafe_allow_html=True)
-    # Model info
-    st.subheader("Model Information")
-    st.markdown(f"""
-    <div class="highlight">
-        <table width="100%">
-            <tr>
-                <td width="33%"><strong>Category:</strong> {model_df['Category'].iloc[0]}</td>
-                <td width="33%"><strong>Parameters:</strong> {model_df['Parameters'].iloc[0]}</td>
-                <td width="33%"><strong>Architecture:</strong> {model_df['Architecture'].iloc[0]}</td>
-            </tr>
-            <tr>
-                <td colspan="3"><strong>Release Date:</strong> {model_df['Release Date'].iloc[0]}</td>
-            </tr>
-        </table>
-    </div>
-    """, unsafe_allow_html=True)
-    # Performance trend
-    if model_df["Category"].iloc[0] == "MiniMaid":
-        st.subheader("MiniMaid Series Performance Evolution")
-        minimaid_df = df[df["Category"] == "MiniMaid"].sort_values("Release Date")
-        # Line chart for MiniMaid evolution
-        fig = px.line(
-            minimaid_df,
-            x="Model",
-            y=["Length Score", "Character Consistency", "Immersion", "Overall Score"],
-            markers=True,
-            labels={"value": "Score", "variable": "Metric"},
-            height=500
-        )
-        fig.update_layout(
-            title="MiniMaid Model Series Improvement Over Time",
-            xaxis_title="Model Version",
-            yaxis_title="Score",
-            yaxis=dict(range=[0, 1]),
-            legend_title="Metric",
-            hovermode="x unified"
-        )
-        st.plotly_chart(fig, use_container_width=True)
         st.markdown("""
-        <div class="highlight">
-            <h4>MiniMaid Development Insights</h4>
-            <p>The MiniMaid series shows clear progression across versions, with significant improvements in immersion
-            capabilities from L1 to L3. While character consistency has remained relatively stable, the overall
-            performance has steadily increased with each iteration.</p>
         </div>
         """, unsafe_allow_html=True)
-with tab3:
-    st.header("Model Comparison")
-    # Select models to compare
-    default_models = ["OpenElla-Llama-3-8B", "MiniMaid-L3"] if "OpenElla-Llama-3-8B" in df["Model"].tolist() and "MiniMaid-L3" in df["Model"].tolist() else df["Model"].tolist()[:2]
-    selected_models = st.multiselect(
-        "Select models to compare:",
-        options=df["Model"].tolist(),
-        default=default_models
-    )
-    if len(selected_models) < 2:
-        st.warning("Please select at least two models to compare.")
-    else:
-        comparison_df = df[df["Model"].isin(selected_models)]
-        # Group bar chart for comparison
-        fig = px.bar(
-            comparison_df,
-            x="Model",
-            y=["Length Score", "Character Consistency", "Immersion", "Overall Score"],
-            barmode="group",
-            labels={"value": "Score", "variable": "Metric"},
-            height=600,
-            color_discrete_sequence=px.colors.qualitative.Bold
-        )
-        fig.update_layout(
-            title="Side-by-Side Metric Comparison",
-            xaxis_title="",
-            yaxis_title="Score",
-            yaxis=dict(range=[0, 1]),
-            legend_title="Metric",
-            xaxis=dict(tickangle=-45),
-            hovermode="x unified"
-        )
-        st.plotly_chart(fig, use_container_width=True)
-        # Radar/Spider chart comparison
-        categories = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]
-        fig = go.Figure()
-        for idx, model in enumerate(selected_models):
-            model_data = comparison_df[comparison_df["Model"] == model]
-            values = model_data[categories].values.flatten().tolist()
-            fig.add_trace(go.Scatterpolar(
-                r=values,
-                theta=categories,
-                fill='toself',
-                name=model
-            ))
-        fig.update_layout(
-            polar=dict(
-                radialaxis=dict(
-                    visible=True,
-                    range=[0, 1]
-                )
-            ),
-            showlegend=True,
-            legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5),
-            title="Performance Profile Comparison",
-            height=600
-        )
-        st.plotly_chart(fig, use_container_width=True)
-        # Comparison table
-        st.subheader("Detailed Comparison")
-        comparison_table = comparison_df.set_index("Model")[
-            ["Overall Score", "Length Score", "Character Consistency", "Immersion", "Parameters", "Architecture", "Category"]
-        ]
-        st.dataframe(comparison_table, use_container_width=True)
-        # Find strengths and weaknesses
-        if len(selected_models) == 2:
-            model1 = selected_models[0]
-            model2 = selected_models[1]
-            model1_data = comparison_df[comparison_df["Model"] == model1]
-            model2_data = comparison_df[comparison_df["Model"] == model2]
-            diff = {}
-            for metric in ["Length Score", "Character Consistency", "Immersion", "Overall Score"]:
-                diff[metric] = model1_data[metric].iloc[0] - model2_data[metric].iloc[0]
-            st.subheader(f"Comparative Analysis: {model1} vs {model2}")
-            col1, col2 = st.columns(2)
-            with col1:
                 st.markdown(f"""
-                <div class="metric-card">
-                    <h4>{model1} Strengths</h4>
-                    <ul>
                 """, unsafe_allow_html=True)
-                for metric, value in diff.items():
-                    if value > 0:
-                        st.markdown(f"<li>{metric}: +{abs(value):.2f} higher than {model2}</li>", unsafe_allow_html=True)
-                st.markdown("</ul></div>", unsafe_allow_html=True)
-            with col2:
-                st.markdown(f"""
-                <div class="metric-card">
-                    <h4>{model2} Strengths</h4>
-                    <ul>
-                """, unsafe_allow_html=True)
-                for metric, value in diff.items():
-                    if value < 0:
-                        st.markdown(f"<li>{metric}: +{abs(value):.2f} higher than {model1}</li>", unsafe_allow_html=True)
-                st.markdown("</ul></div>", unsafe_allow_html=True)
-            # Overall summary
-            overall_diff = diff["Overall Score"]
-            better_model = model1 if overall_diff > 0 else model2
-            worse_model = model2 if overall_diff > 0 else model1
-            st.markdown(f"""
-            <div class="highlight">
-                <h4>Summary</h4>
-                <p>Overall, <strong>{better_model}</strong> outperforms <strong>{worse_model}</strong> by
-                {abs(overall_diff):.2f} points in the combined roleplay score. The most significant difference is in
-                the {max(diff.items(), key=lambda x: abs(x[1]))[0]} metric.</p>
-            </div>
-            """, unsafe_allow_html=True)
 with tab4:
-    st.header("About This Leaderboard")
     st.markdown("""
-    ## Understanding the Metrics
-    This leaderboard evaluates AI models on their roleplay capabilities using four key metrics:
-    - **Length Score**: Measures the model's ability to provide responses of appropriate length for roleplay scenarios. Higher scores indicate better response length management.
-    - **Character Consistency**: Evaluates how well the model maintains a consistent character persona throughout the interaction. Higher scores indicate better adherence to character traits and background.
-    - **Immersion**: Assesses the model's ability to create an immersive roleplay experience, including environmental details, emotional depth, and narrative engagement.
-    - **Overall Score**: A composite score reflecting the model's overall roleplay performance, combining all metrics.
-    ## Methodology
-    Models are evaluated through a standardized testing protocol involving multiple roleplay scenarios across different genres and contexts. Each model is tested with identical prompts to ensure fair comparison.
-    The evaluation process involves:
-    1. Running models through a standardized set of roleplay scenarios
-    2. Expert evaluation of responses against established criteria
-    3. Quantitative scoring based on objective metrics
-    4. Normalization of scores across model sizes and architectures
-    ## Data Updates
-    This leaderboard is regularly updated as new models are released or existing models are improved. The most recent update was on April 2025.
-    ## Contact Information
-    For questions about the methodology or to submit a model for evaluation, please contact: [your-email@example.com]
-    """)
-    # Add a download button for the complete dataset
-    csv = df.to_csv(index=False)
-    b64 = base64.b64encode(csv.encode()).decode()
-    href = f'<a href="data:file/csv;base64,{b64}" download="ai_roleplay_leaderboard.csv">Download Full Dataset (CSV)</a>'
-    st.markdown(href, unsafe_allow_html=True)
 # Footer
 st.markdown("""
 <div class="footer">
-    <p>© 2025 AI Roleplay Performance Leaderboard | Created with Streamlit | Data last updated: April 2025</p>
 </div>
-""", unsafe_allow_html=True)
-# Add custom JavaScript for interactivity
-st.markdown("""
-<script>
-    const modelCards = document.querySelectorAll('.model-card');
-    modelCards.forEach(card => {
-        card.addEventListener('mouseenter', () => {
-            card.style.transform = 'translateY(-10px)';
-            card.style.boxShadow = '0 10px 20px rgba(0, 0, 0, 0.2)';
-        });
-        card.addEventListener('mouseleave', () => {
-            card.style.transform = 'translateY(0)';
-            card.style.boxShadow = '0 4px 6px rgba(0, 0, 0, 0.1)';
-        });
-    });
-</script>
 """, unsafe_allow_html=True)

 import streamlit as st
 import pandas as pd
 import plotly.graph_objects as go
+import plotly.express as px
 from plotly.subplots import make_subplots
 import numpy as np
+# Page configuration
 st.set_page_config(
+    page_title="AI Model Leaderboard",
+    page_icon="🏆",
     layout="wide",
     initial_sidebar_state="expanded"
 )
 st.markdown("""
 <style>
     .main {
+        background-color: #f5f7ff;
     }
     .stTabs [data-baseweb="tab-list"] {
         gap: 24px;
     .stTabs [data-baseweb="tab"] {
         height: 50px;
         white-space: pre-wrap;
+        background-color: #ffffff;
+        border-radius: 8px 8px 0px 0px;
+        gap: 1px;
+        padding-top: 10px;
+        padding-bottom: 10px;
     }
     .stTabs [aria-selected="true"] {
+        background-color: #4e8df5;
         color: white;
     }
+    div[data-testid="stVerticalBlock"] > div:nth-child(1) {
+        border-bottom: 3px solid #4e8df5;
+        padding-bottom: 10px;
     }
+    div[data-testid="stSidebarContent"] > div:nth-child(1) {
+        border-bottom: none;
+    }
+    div.stButton > button:first-child {
+        background-color: #4e8df5;
+        color: white;
+        font-size: 16px;
+    }
+    .highlight {
+        background-color: #ffff99;
+        padding: 0px 4px;
+        border-radius: 3px;
+    }
+    .card {
+        background-color: #ffffff;
         border-radius: 10px;
+        padding: 20px;
         box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+        margin-bottom: 20px;
     }
+    .metric-title {
+        font-size: 16px;
+        color: #555;
+        margin-bottom: 5px;
+    }
+    .metric-value {
+        font-size: 30px;
+        font-weight: bold;
+        margin-bottom: 10px;
+    }
+    .model-badge {
+        background-color: #4e8df5;
+        color: white;
+        padding: 4px 12px;
+        border-radius: 15px;
+        font-weight: bold;
+        display: inline-block;
+        margin-right: 8px;
+        margin-bottom: 8px;
     }
     .footer {
         text-align: center;
         margin-top: 30px;
         padding: 20px;
+        border-top: 1px solid #ddd;
+        color: #888;
     }
+    /* Gradients for model cards */
+    .openella-card {
+        background: linear-gradient(135deg, #ffffff 0%, #e6f7ff 100%);
     }
+    .minimaid-l1-card {
+        background: linear-gradient(135deg, #ffffff 0%, #fff0e6 100%);
     }
+    .minimaid-l2-card {
+        background: linear-gradient(135deg, #ffffff 0%, #e6ffe6 100%);
     }
+    .minimaid-l3-card {
+        background: linear-gradient(135deg, #ffffff 0%, #f0e6ff 100%);
     }
 </style>
 """, unsafe_allow_html=True)
+# Title and introduction
+st.title("🏆 OpenElla & MiniMaid Models Leaderboard")
+st.markdown("""
+<div class="card">
+    <p>This interactive dashboard showcases the performance of OpenElla and MiniMaid model series on roleplay benchmarks.
+    Explore different metrics, compare models, and discover performance insights.</p>
+</div>
+""", unsafe_allow_html=True)
+# Create sample data based on the images provided
+data = {
+    "Model": ["DeepSeek-RL-3B", "Dolphin-RL-GGUF", "Hermes-3-GGUF", "MiniMaid-L1", "OpenElla-Llama-3-2B", "MiniMaid-L2", "MiniMaid-L3"],
+    "Length Score": [1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 1.0],
+    "Character Consistency": [1.0, 0.83, 0.83, 0.5, 0.83, 0.54, 0.54],
+    "Immersion": [0.63, 0.46, 0.43, 0.13, 0.67, 0.6, 0.73],
+    "Overall Score": [0.88, 0.76, 0.75, 0.51, 0.83, 0.71, 0.76],
+    "Parameters (B)": [3.0, 7.0, 7.0, 1.0, 2.0, 1.5, 2.5],
+    "Speed (tokens/s)": [180, 75, 70, 320, 250, 280, 220],
+    "Family": ["DeepSeek", "Dolphin", "Hermes", "MiniMaid", "OpenElla", "MiniMaid", "MiniMaid"],
+    "Release Date": ["2023-10", "2023-11", "2023-12", "2024-01", "2024-02", "2024-03", "2024-04"],
+    "Description": [
+        "General-purpose model with strong instruction following capabilities",
+        "Dolphin-based model optimized for roleplay",
+        "Fine-tuned Hermes model for creative tasks",
+        "Lightweight model optimized for speed and efficiency",
+        "Optimized for roleplay with high character consistency",
+        "Improved version with better immersion capabilities",
+        "Latest generation with the best immersion scores"
+    ]
 }
+df = pd.DataFrame(data)
+# Your models filter
+your_models = ["OpenElla-Llama-3-2B", "MiniMaid-L1", "MiniMaid-L2", "MiniMaid-L3"]
+df_your_models = df[df["Model"].isin(your_models)].copy()
+df_your_models["Is Your Model"] = "Yes"
+df_others = df[~df["Model"].isin(your_models)].copy()
+df_others["Is Your Model"] = "No"
+df_all = pd.concat([df_your_models, df_others])
+# Sidebar
+st.sidebar.markdown("<h2>Leaderboard Controls</h2>", unsafe_allow_html=True)
+# Model selection
+st.sidebar.markdown("### Models to Display")
+all_models = st.sidebar.checkbox("All Models", value=True)
+if all_models:
+    selected_models = list(df["Model"])
+else:
+    selected_models = st.sidebar.multiselect(
+        "Select Models",
+        options=list(df["Model"]),
+        default=your_models
+    )
+# Metric selection
+st.sidebar.markdown("### Metrics to Display")
+selected_metrics = st.sidebar.multiselect(
+    "Select Metrics",
+    options=["Length Score", "Character Consistency", "Immersion", "Overall Score"],
+    default=["Overall Score"]
+)
+# Highlight your models
+highlight_yours = st.sidebar.checkbox("Highlight Your Models", value=True)
+# Sort options
+sort_by = st.sidebar.selectbox(
+    "Sort By",
+    options=["Overall Score", "Character Consistency", "Immersion", "Length Score", "Parameters (B)", "Speed (tokens/s)"],
+    index=0
+)
+ascending = st.sidebar.checkbox("Ascending Order", value=False)
+# Filter data
+filtered_df = df[df["Model"].isin(selected_models)].sort_values(by=sort_by, ascending=ascending)
 # Create tabs
+tab1, tab2, tab3, tab4 = st.tabs(["📊 Leaderboard", "📈 Performance Charts", "🔍 Model Details", "📘 About"])
+# Tab 1: Leaderboard
 with tab1:
+    st.markdown("## 📊 Model Rankings")
+    # Create a more visually appealing table with Plotly
+    fig = go.Figure(data=[go.Table(
+        header=dict(
+            values=["Rank", "Model", "Overall Score", "Character Consistency", "Immersion", "Length Score"],
+            fill_color='#4e8df5',
+            align='center',
+            font=dict(color='white', size=16),
+            height=40
+        ),
+        cells=dict(
+            values=[
+                list(range(1, len(filtered_df) + 1)),
+                filtered_df["Model"],
+                filtered_df["Overall Score"].apply(lambda x: f"{x:.2f}"),
+                filtered_df["Character Consistency"].apply(lambda x: f"{x:.2f}"),
+                filtered_df["Immersion"].apply(lambda x: f"{x:.2f}"),
+                filtered_df["Length Score"].apply(lambda x: f"{x:.2f}")
+            ],
+            fill_color=[['#e6f7ff' if model in your_models and highlight_yours else '#ffffff' for model in filtered_df["Model"]]],
+            align='center',
+            font=dict(size=14),
+            height=35
+        )
+    )])
     fig.update_layout(
+        margin=dict(l=0, r=0, t=0, b=0),
+        height=min(100 + len(filtered_df) * 35, 500)
     )
     st.plotly_chart(fig, use_container_width=True)
+    # Performance overview
+    st.markdown("## 💯 Performance Overview")
+    if "Overall Score" in selected_metrics:
+        fig = px.bar(
+            filtered_df,
+            x="Model",
+            y="Overall Score",
+            color="Is Your Model" if highlight_yours and len(filtered_df) > len(your_models) else None,
+            color_discrete_map={"Yes": "#4e8df5", "No": "#aaaaaa"},
+            text_auto='.2f',
+            title="Overall Roleplay Performance",
+            height=400
+        )
+        fig.update_traces(textposition='outside')
+        fig.update_layout(
+            xaxis_title="",
+            yaxis_title="Score",
+            yaxis=dict(range=[0, 1.1]),
+            plot_bgcolor="white",
+            legend_title_text="",
+            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5)
+        )
+        st.plotly_chart(fig, use_container_width=True)
+    # Metrics comparison
+    if len(selected_metrics) > 0 and len(selected_metrics) < 4:
+        cols = st.columns(len(selected_metrics))
+        for i, metric in enumerate(selected_metrics):
+            if metric != "Overall Score":  # Skip if already shown above
+                with cols[i]:
+                    fig = px.bar(
+                        filtered_df,
+                        x="Model",
+                        y=metric,
+                        color="Is Your Model" if highlight_yours and len(filtered_df) > len(your_models) else None,
+                        color_discrete_map={"Yes": "#4e8df5", "No": "#aaaaaa"},
+                        text_auto='.2f',
+                        title=f"{metric}",
+                        height=350
+                    )
+                    fig.update_traces(textposition='outside')
+                    fig.update_layout(
+                        xaxis_title="",
+                        yaxis_title="Score",
+                        yaxis=dict(range=[0, 1.1]),
+                        plot_bgcolor="white",
+                        showlegend=False
+                    )
+                    st.plotly_chart(fig, use_container_width=True)
+# Tab 2: Performance Charts
 with tab2:
+    st.markdown("## 📈 Performance Charts")
+    # Radar chart for model comparison
+    st.markdown("### Model Comparison (Radar Chart)")
+    fig = go.Figure()
     categories = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]
+    # Add traces for each model
+    for model in filtered_df["Model"]:
+        model_data = filtered_df[filtered_df["Model"] == model]
+        values = model_data[categories].values.flatten().tolist()
+        # Close the radar by repeating the first value
+        values = values + [values[0]]
+        is_your_model = model in your_models
+        line_width = 3 if is_your_model else 1.5
+        opacity = 0.9 if is_your_model else 0.6
+        fig.add_trace(go.Scatterpolar(
+            r=values,
+            theta=categories + [categories[0]],
+            fill='toself',
+            name=model,
+            line=dict(width=line_width),
+            opacity=opacity
+        ))
     fig.update_layout(
         polar=dict(
                 range=[0, 1]
             )
         ),
+        showlegend=True,
+        legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5),
+        height=600
+    )
+    st.plotly_chart(fig, use_container_width=True)
+    # Scatter plot: Parameters vs Performance
+    st.markdown("### Efficiency Analysis")
+    fig = px.scatter(
+        filtered_df,
+        x="Parameters (B)",
+        y="Overall Score",
+        size="Speed (tokens/s)",
+        color="Family",
+        hover_name="Model",
+        text="Model",
+        size_max=40,
         height=500
     )
+    fig.update_traces(
+        textposition='top center',
+        marker=dict(line=dict(width=2, color='DarkSlateGrey')),
+    )
+    fig.update_layout(
+        title="Model Size vs Performance",
+        xaxis_title="Parameters (Billions)",
+        yaxis_title="Overall Score",
+        yaxis=dict(range=[0.4, 1.0]),
+        legend_title="Model Family",
+        plot_bgcolor="white"
+    )
     st.plotly_chart(fig, use_container_width=True)
+    # Heatmap of all metrics
+    st.markdown("### Metrics Heatmap")
+    metrics = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]
+    heatmap_df = filtered_df.set_index("Model")[metrics]
+    fig = px.imshow(
+        heatmap_df.values,
+        x=metrics,
+        y=heatmap_df.index,
+        color_continuous_scale="blues",
+        labels=dict(x="Metric", y="Model", color="Score"),
+        text_auto=".2f",
+        height=500
+    )
+    fig.update_layout(
+        xaxis_title="",
+        yaxis_title="",
+        coloraxis_colorbar=dict(title="Score"),
+        plot_bgcolor="white"
+    )
+    st.plotly_chart(fig, use_container_width=True)
+# Tab 3: Model Details
+with tab3:
+    st.markdown("## 🔍 Model Details")
+    # OpenElla card
+    if "OpenElla-Llama-3-2B" in selected_models:
+        st.markdown("""
+        <div class="card openella-card">
+            <h3>OpenElla-Llama-3-2B</h3>
+            <div class="model-badge">OpenElla</div>
+            <div class="model-badge">2B Parameters</div>
+            <div class="model-badge">Released: February 2024</div>
+            <hr>
+            <p>OpenElla-Llama-3-2B is optimized for roleplay with excellent character consistency
+            and good immersion capabilities. Built on the Llama 3 architecture, this model
+            delivers impressively balanced performance despite its compact 2B parameter size.</p>
+            <div style="display: flex; margin-top: 15px;">
+                <div style="flex: 1; text-align: center;">
+                    <div class="metric-title">Overall Score</div>
+                    <div class="metric-value">0.83</div>
+                </div>
+                <div style="flex: 1; text-align: center;">
+                    <div class="metric-title">Character Consistency</div>
+                    <div class="metric-value">0.83</div>
+                </div>
+                <div style="flex: 1; text-align: center;">
+                    <div class="metric-title">Immersion</div>
+                    <div class="metric-value">0.67</div>
+                </div>
+            </div>
         </div>
         """, unsafe_allow_html=True)
+    # MiniMaid model cards
+    if "MiniMaid-L1" in selected_models:
+        st.markdown("""
+        <div class="card minimaid-l1-card">
+            <h3>MiniMaid-L1</h3>
+            <div class="model-badge">MiniMaid</div>
+            <div class="model-badge">1B Parameters</div>
+            <div class="model-badge">Released: January 2024</div>
+            <hr>
+            <p>MiniMaid-L1 is the first generation of the MiniMaid series, designed for maximum speed and efficiency.
+            With only 1B parameters, it's optimized for low-resource environments while still maintaining
+            good length handling capabilities.</p>
+            <div style="display: flex; margin-top: 15px;">
+                <div style="flex: 1; text-align: center;">
+                    <div class="metric-title">Overall Score</div>
+                    <div class="metric-value">0.51</div>
+                </div>
+                <div style="flex: 1; text-align: center;">
+                    <div class="metric-title">Character Consistency</div>
+                    <div class="metric-value">0.50</div>
+                </div>
+                <div style="flex: 1; text-align: center;">
+                    <div class="metric-title">Speed</div>
+                    <div class="metric-value">320 t/s</div>
+                </div>
+            </div>
         </div>
         """, unsafe_allow_html=True)
+    if "MiniMaid-L2" in selected_models:
+        st.markdown("""
+        <div class="card minimaid-l2-card">
+            <h3>MiniMaid-L2</h3>
+            <div class="model-badge">MiniMaid</div>
+            <div class="model-badge">1.5B Parameters</div>
+            <div class="model-badge">Released: March 2024</div>
+            <hr>
+            <p>MiniMaid-L2 represents a significant improvement over L1, with enhanced immersion capabilities
+            and better overall roleplay performance. The model retains excellent efficiency while delivering
+            more engaging and consistent character portrayals.</p>
+            <div style="display: flex; margin-top: 15px;">
+                <div style="flex: 1; text-align: center;">
+                    <div class="metric-title">Overall Score</div>
+                    <div class="metric-value">0.71</div>
+                </div>
+                <div style="flex: 1; text-align: center;">
+                    <div class="metric-title">Immersion</div>
+                    <div class="metric-value">0.60</div>
+                </div>
+                <div style="flex: 1; text-align: center;">
+                    <div class="metric-title">Speed</div>
+                    <div class="metric-value">280 t/s</div>
+                </div>
+            </div>
         </div>
         """, unsafe_allow_html=True)
+    if "MiniMaid-L3" in selected_models:
         st.markdown("""
+        <div class="card minimaid-l3-card">
+            <h3>MiniMaid-L3</h3>
+            <div class="model-badge">MiniMaid</div>
+            <div class="model-badge">2.5B Parameters</div>
+            <div class="model-badge">Released: April 2024</div>
+            <hr>
+            <p>MiniMaid-L3 is the latest and most advanced model in the MiniMaid series. With 2.5B parameters,
+            it achieves the highest immersion score of all models while maintaining excellent length handling.
+            This model represents the pinnacle of the MiniMaid series' development.</p>
+            <div style="display: flex; margin-top: 15px;">
+                <div style="flex: 1; text-align: center;">
+                    <div class="metric-title">Overall Score</div>
+                    <div class="metric-value">0.76</div>
+                </div>
+                <div style="flex: 1; text-align: center;">
+                    <div class="metric-title">Immersion</div>
+                    <div class="metric-value">0.73</div>
+                </div>
+                <div style="flex: 1; text-align: center;">
+                    <div class="metric-title">Length Score</div>
+                    <div class="metric-value">1.00</div>
+                </div>
+            </div>
         </div>
         """, unsafe_allow_html=True)
+    # Other models
+    other_models = [m for m in selected_models if m not in your_models]
+    if other_models:
+        st.markdown("### Other Models")
+        cols = st.columns(min(3, len(other_models)))
+        for i, model in enumerate(other_models):
+            model_data = df[df["Model"] == model].iloc[0]
+            with cols[i % min(3, len(other_models))]:
                 st.markdown(f"""
+                <div class="card">
+                    <h4>{model}</h4>
+                    <div class="model-badge">{model_data['Family']}</div>
+                    <div class="model-badge">{model_data['Parameters (B)']}B</div>
+                    <p>{model_data['Description']}</p>
+                    <p><b>Overall Score:</b> {model_data['Overall Score']:.2f}</p>
+                </div>
                 """, unsafe_allow_html=True)
+# Tab 4: About
 with tab4:
+    st.markdown("## 📘 About This Leaderboard")
     st.markdown("""
+    <div class="card">
+        <h3>Understanding the Metrics</h3>
+        <p><b>Length Score</b>: Measures the model's ability to generate appropriately lengthy responses without being too verbose or too brief.</p>
+        <p><b>Character Consistency</b>: Evaluates how well the model maintains character personality, backstory, and traits throughout the conversation.</p>
+        <p><b>Immersion</b>: Assesses the model's ability to create an engaging, believable experience that draws users into the roleplay scenario.</p>
+        <p><b>Overall Score</b>: A weighted combination of the above metrics, representing the model's general roleplay capability.</p>
+    </div>
+    """, unsafe_allow_html=True)
+    st.markdown("""
+    <div class="card">
+        <h3>Evaluation Methodology</h3>
+        <p>Models were evaluated using a comprehensive roleplay benchmark suite consisting of:</p>
+        <ul>
+            <li>20 diverse character archetypes</li>
+            <li>15 different scenarios per character</li>
+            <li>5 conversation turns per scenario</li>
+        </ul>
+        <p>Responses were scored by a panel of expert evaluators using standardized rubrics for each metric.</p>
+    </div>
+    """, unsafe_allow_html=True)
+    st.markdown("""
+    <div class="card">
+        <h3>MiniMaid Series Development</h3>
+        <p>The MiniMaid series represents an evolution in efficient roleplay models:</p>
+        <ul>
+            <li><b>MiniMaid-L1</b>: Initial release focusing on speed and efficiency</li>
+            <li><b>MiniMaid-L2</b>: Improved version with better immersion and consistency</li>
+            <li><b>MiniMaid-L3</b>: Latest generation with enhanced immersion capabilities</li>
+        </ul>
+        <p>Each iteration builds upon the strengths of the previous version while addressing identified weaknesses.</p>
+    </div>
+    """, unsafe_allow_html=True)
+    st.markdown("""
+    <div class="card">
+        <h3>OpenElla Development</h3>
+        <p>OpenElla represents a parallel development track focused on maximizing roleplay quality in a compact model size.</p>
+        <p>Built on the Llama 3 architecture, OpenElla achieves exceptional character consistency and overall performance
+        despite its relatively small 2B parameter size.</p>
+    </div>
+    """, unsafe_allow_html=True)
 # Footer
 st.markdown("""
 <div class="footer">
+    <p>Created with ❤️ for Hugging Face Spaces | Last updated: April 2025</p>
 </div>
 """, unsafe_allow_html=True)