Spaces:

N-Bot-Int
/

RP_BENCHMARK

Sleeping

App Files Files Community

ItsMeDevRoland committed on Apr 12

Commit

abd9b6a

verified ·

1 Parent(s): cd113ea

Create app.py

Browse files

Files changed (1) hide show

app.py +641 -0

app.py ADDED Viewed

	@@ -0,0 +1,641 @@

+import streamlit as st
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+import numpy as np
+from PIL import Image
+import base64
+from io import BytesIO
+# Set page configuration
+st.set_page_config(
+    page_title="AI Roleplay Performance Leaderboard",
+    page_icon="🤖",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# Custom CSS injected once for the whole page. It styles the Streamlit tab bar
+# and buttons and defines the helper classes used by the raw-HTML snippets
+# rendered further down in this script (header-container, model-card,
+# metric-card, metric-value, metric-label, highlight, footer).
+# NOTE(review): .logo and .grid-container are not referenced by any HTML in the
+# visible part of this script - confirm whether they are still needed.
+st.markdown("""
+<style>
+    .main {
+        background-color: #f0f2f6;
+    }
+    .stApp {
+        max-width: 1200px;
+        margin: 0 auto;
+    }
+    h1, h2, h3 {
+        color: #1E3A8A;
+    }
+    .metric-card {
+        background-color: white;
+        border-radius: 10px;
+        padding: 20px;
+        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+        margin-bottom: 20px;
+    }
+    .header-container {
+        display: flex;
+        align-items: center;
+        justify-content: space-between;
+        margin-bottom: 20px;
+    }
+    .logo {
+        height: 60px;
+    }
+    .stTabs [data-baseweb="tab-list"] {
+        gap: 24px;
+    }
+    .stTabs [data-baseweb="tab"] {
+        height: 50px;
+        white-space: pre-wrap;
+        background-color: white;
+        border-radius: 5px 5px 0 0;
+        padding: 10px 20px;
+        font-weight: 500;
+    }
+    .stTabs [aria-selected="true"] {
+        background-color: #1E3A8A;
+        color: white;
+    }
+    .grid-container {
+        display: grid;
+        grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
+        gap: 20px;
+        margin-bottom: 30px;
+    }
+    .model-card {
+        background: white;
+        padding: 15px;
+        border-radius: 10px;
+        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+        transition: transform 0.3s ease;
+    }
+    .model-card:hover {
+        transform: translateY(-5px);
+    }
+    .footer {
+        text-align: center;
+        margin-top: 30px;
+        padding: 20px;
+        font-size: 0.8em;
+        color: #666;
+    }
+    .highlight {
+        background-color: #f0f7ff;
+        padding: 20px;
+        border-radius: 10px;
+        margin: 20px 0;
+        border-left: 5px solid #1E3A8A;
+    }
+    .stButton>button {
+        background-color: #1E3A8A;
+        color: white;
+        font-weight: 500;
+    }
+    .metric-value {
+        font-size: 2.5rem;
+        font-weight: bold;
+        color: #1E3A8A;
+    }
+    .metric-label {
+        font-size: 1rem;
+        color: #666;
+    }
+</style>
+""", unsafe_allow_html=True)
+# Define model data based on the graphs.
+# Hard-coded leaderboard content keyed by model name. Each entry holds the
+# three metric scores, an overall score, and descriptive metadata (release
+# date, parameter count, architecture, category). The "Category" values must
+# match the keys of category_colors, which is used to colour the charts.
+# NOTE(review): "Overall Score" is stored as a literal and is not computed from
+# the other three scores anywhere in this script - confirm each value against
+# the source graphs.
+# NOTE(review): nothing in this file shows where the release dates, parameter
+# counts and architectures come from - verify them before relying on them.
+model_data = {
+    "OpenElla-Llama-3-8B": {
+        "Length Score": 0.97,
+        "Character Consistency": 0.83,
+        "Immersion": 0.67,
+        "Overall Score": 0.83,
+        "Release Date": "2023-11-15",
+        "Parameters": "8B",
+        "Architecture": "Llama-3",
+        "Category": "OpenElla"
+    },
+    "DeepSeek-Coder-V2-Instruct": {
+        "Length Score": 1.0,
+        "Character Consistency": 1.0,
+        "Immersion": 0.63,
+        "Overall Score": 0.88,
+        "Release Date": "2023-09-20",
+        "Parameters": "33B",
+        "Architecture": "DeepSeek",
+        "Category": "Competitor"
+    },
+    "Dolphin": {
+        "Length Score": 1.0,
+        "Character Consistency": 0.83,
+        "Immersion": 0.47,
+        "Overall Score": 0.76,
+        "Release Date": "2023-10-05",
+        "Parameters": "7B",
+        "Architecture": "Mistral",
+        "Category": "Competitor"
+    },
+    "Hermes-3-GGUF": {
+        "Length Score": 0.8,
+        "Character Consistency": 0.82,
+        "Immersion": 0.43,
+        "Overall Score": 0.75,
+        "Release Date": "2023-10-10",
+        "Parameters": "7B",
+        "Architecture": "Mistral",
+        "Category": "Competitor"
+    },
+    "MiniMaid-L1": {
+        "Length Score": 0.9,
+        "Character Consistency": 0.5,
+        "Immersion": 0.13,
+        "Overall Score": 0.51,
+        "Release Date": "2023-12-01",
+        "Parameters": "3B",
+        "Architecture": "Custom",
+        "Category": "MiniMaid"
+    },
+    "MiniMaid-L2": {
+        "Length Score": 1.0,
+        "Character Consistency": 0.53,
+        "Immersion": 0.6,
+        "Overall Score": 0.71,
+        "Release Date": "2024-01-15",
+        "Parameters": "6B",
+        "Architecture": "Custom",
+        "Category": "MiniMaid"
+    },
+    "MiniMaid-L3": {
+        "Length Score": 1.0,
+        "Character Consistency": 0.54,
+        "Immersion": 0.73,
+        "Overall Score": 0.76,
+        "Release Date": "2024-02-20",
+        "Parameters": "12B",
+        "Architecture": "Custom",
+        "Category": "MiniMaid"
+    }
+}
+# Create DataFrame
+df = pd.DataFrame(model_data).T.reset_index()
+df = df.rename(columns={"index": "Model"})
+# Define model groupings and colors
+category_colors = {
+    "OpenElla": "#FF6B6B",
+    "MiniMaid": "#4ECDC4",
+    "Competitor": "#9D84B7"
+}
+# Header with logo
+st.markdown("""
+<div class="header-container">
+    <h1>🤖 AI Roleplay Performance Leaderboard</h1>
+</div>
+""", unsafe_allow_html=True)
+# Create tabs
+tab1, tab2, tab3, tab4 = st.tabs(["📊 Leaderboard", "📈 Detailed Analysis", "🔍 Model Comparison", "ℹ️ About"])
+with tab1:
+    st.header("Model Rankings")
+    # Filtering options in the sidebar
+    st.sidebar.header("Filter Models")
+    selected_categories = st.sidebar.multiselect(
+        "Model Categories",
+        options=df["Category"].unique(),
+        default=df["Category"].unique()
+    )
+    # Filter data based on selections
+    filtered_df = df[df["Category"].isin(selected_categories)]
+    # Sort by overall score
+    sorted_df = filtered_df.sort_values("Overall Score", ascending=False)
+    # Create interactive leaderboard
+    fig = px.bar(
+        sorted_df,
+        x="Model",
+        y="Overall Score",
+        color="Category",
+        color_discrete_map=category_colors,
+        hover_data=["Parameters", "Architecture", "Release Date"],
+        labels={"Overall Score": "Roleplay Performance Score"},
+        height=500,
+    )
+    fig.update_layout(
+        title="Models Ranked by Overall Roleplay Performance",
+        xaxis_title="",
+        yaxis_title="Score",
+        legend_title="Category",
+        font=dict(size=14),
+        plot_bgcolor="rgba(0,0,0,0)",
+        xaxis=dict(tickangle=-45),
+        yaxis=dict(range=[0, 1]),
+        margin=dict(l=20, r=20, t=60, b=80),
+    )
+    st.plotly_chart(fig, use_container_width=True)
+    # Top 3 models highlight
+    st.subheader("🏆 Top Performing Models")
+    col1, col2, col3 = st.columns(3)
+    top3_df = sorted_df.head(3)
+    for i, (idx, row) in enumerate(top3_df.iterrows()):
+        col = [col1, col2, col3][i]
+        with col:
+            st.markdown(f"""
+            <div class="model-card">
+                <h3>{row['Model']}</h3>
+                <div class="metric-value">{row['Overall Score']:.2f}</div>
+                <div class="metric-label">Overall Score</div>
+                <hr>
+                <p><strong>Category:</strong> {row['Category']}</p>
+                <p><strong>Parameters:</strong> {row['Parameters']}</p>
+                <p><strong>Architecture:</strong> {row['Architecture']}</p>
+            </div>
+            """, unsafe_allow_html=True)
+    # Show full data table
+    st.subheader("Complete Rankings")
+    st.dataframe(
+        sorted_df[["Model", "Category", "Overall Score", "Length Score", "Character Consistency", "Immersion", "Parameters"]],
+        use_container_width=True,
+        height=400,
+        column_config={
+            "Overall Score": st.column_config.ProgressColumn(
+                "Overall Score",
+                help="Overall roleplay performance score",
+                format="%.2f",
+                min_value=0,
+                max_value=1,
+            ),
+            "Length Score": st.column_config.ProgressColumn(
+                "Length Score",
+                help="Score for response length appropriateness",
+                format="%.2f",
+                min_value=0,
+                max_value=1,
+            ),
+            "Character Consistency": st.column_config.ProgressColumn(
+                "Character Consistency",
+                help="Score for character persona consistency",
+                format="%.2f",
+                min_value=0,
+                max_value=1,
+            ),
+            "Immersion": st.column_config.ProgressColumn(
+                "Immersion",
+                help="Score for immersive quality of roleplay",
+                format="%.2f",
+                min_value=0,
+                max_value=1,
+            ),
+        }
+    )
+with tab2:
+    st.header("Detailed Performance Analysis")
+    # Select model to analyze
+    selected_model = st.selectbox(
+        "Select model to analyze:",
+        options=df["Model"].tolist(),
+        index=0
+    )
+    model_df = df[df["Model"] == selected_model]
+    # Spider/Radar chart for selected model
+    categories = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]
+    values = model_df[categories].values.flatten().tolist()
+    # Create radar chart
+    fig = go.Figure()
+    fig.add_trace(go.Scatterpolar(
+        r=values,
+        theta=categories,
+        fill='toself',
+        name=selected_model,
+        line_color=category_colors[model_df["Category"].iloc[0]],
+        fillcolor=category_colors[model_df["Category"].iloc[0]] + '50'  # Add transparency
+    ))
+    fig.update_layout(
+        polar=dict(
+            radialaxis=dict(
+                visible=True,
+                range=[0, 1]
+            )
+        ),
+        showlegend=False,
+        title=f"Performance Profile: {selected_model}",
+        height=500
+    )
+    st.plotly_chart(fig, use_container_width=True)
+    # Detailed metrics
+    st.subheader("Performance Metrics")
+    col1, col2, col3, col4 = st.columns(4)
+    with col1:
+        st.markdown(f"""
+        <div class="metric-card">
+            <div class="metric-label">Length Score</div>
+            <div class="metric-value">{model_df['Length Score'].iloc[0]:.2f}</div>
+        </div>
+        """, unsafe_allow_html=True)
+    with col2:
+        st.markdown(f"""
+        <div class="metric-card">
+            <div class="metric-label">Character Consistency</div>
+            <div class="metric-value">{model_df['Character Consistency'].iloc[0]:.2f}</div>
+        </div>
+        """, unsafe_allow_html=True)
+    with col3:
+        st.markdown(f"""
+        <div class="metric-card">
+            <div class="metric-label">Immersion</div>
+            <div class="metric-value">{model_df['Immersion'].iloc[0]:.2f}</div>
+        </div>
+        """, unsafe_allow_html=True)
+    with col4:
+        st.markdown(f"""
+        <div class="metric-card">
+            <div class="metric-label">Overall Score</div>
+            <div class="metric-value">{model_df['Overall Score'].iloc[0]:.2f}</div>
+        </div>
+        """, unsafe_allow_html=True)
+    # Model info
+    st.subheader("Model Information")
+    st.markdown(f"""
+    <div class="highlight">
+        <table width="100%">
+            <tr>
+                <td width="33%"><strong>Category:</strong> {model_df['Category'].iloc[0]}</td>
+                <td width="33%"><strong>Parameters:</strong> {model_df['Parameters'].iloc[0]}</td>
+                <td width="33%"><strong>Architecture:</strong> {model_df['Architecture'].iloc[0]}</td>
+            </tr>
+            <tr>
+                <td colspan="3"><strong>Release Date:</strong> {model_df['Release Date'].iloc[0]}</td>
+            </tr>
+        </table>
+    </div>
+    """, unsafe_allow_html=True)
+    # Performance trend
+    if model_df["Category"].iloc[0] == "MiniMaid":
+        st.subheader("MiniMaid Series Performance Evolution")
+        minimaid_df = df[df["Category"] == "MiniMaid"].sort_values("Release Date")
+        # Line chart for MiniMaid evolution
+        fig = px.line(
+            minimaid_df,
+            x="Model",
+            y=["Length Score", "Character Consistency", "Immersion", "Overall Score"],
+            markers=True,
+            labels={"value": "Score", "variable": "Metric"},
+            height=500
+        )
+        fig.update_layout(
+            title="MiniMaid Model Series Improvement Over Time",
+            xaxis_title="Model Version",
+            yaxis_title="Score",
+            yaxis=dict(range=[0, 1]),
+            legend_title="Metric",
+            hovermode="x unified"
+        )
+        st.plotly_chart(fig, use_container_width=True)
+        st.markdown("""
+        <div class="highlight">
+            <h4>MiniMaid Development Insights</h4>
+            <p>The MiniMaid series shows clear progression across versions, with significant improvements in immersion
+            capabilities from L1 to L3. While character consistency has remained relatively stable, the overall
+            performance has steadily increased with each iteration.</p>
+        </div>
+        """, unsafe_allow_html=True)
+with tab3:
+    st.header("Model Comparison")
+    # Select models to compare
+    default_models = ["OpenElla-Llama-3-8B", "MiniMaid-L3"] if "OpenElla-Llama-3-8B" in df["Model"].tolist() and "MiniMaid-L3" in df["Model"].tolist() else df["Model"].tolist()[:2]
+    selected_models = st.multiselect(
+        "Select models to compare:",
+        options=df["Model"].tolist(),
+        default=default_models
+    )
+    if len(selected_models) < 2:
+        st.warning("Please select at least two models to compare.")
+    else:
+        comparison_df = df[df["Model"].isin(selected_models)]
+        # Group bar chart for comparison
+        fig = px.bar(
+            comparison_df,
+            x="Model",
+            y=["Length Score", "Character Consistency", "Immersion", "Overall Score"],
+            barmode="group",
+            labels={"value": "Score", "variable": "Metric"},
+            height=600,
+            color_discrete_sequence=px.colors.qualitative.Bold
+        )
+        fig.update_layout(
+            title="Side-by-Side Metric Comparison",
+            xaxis_title="",
+            yaxis_title="Score",
+            yaxis=dict(range=[0, 1]),
+            legend_title="Metric",
+            xaxis=dict(tickangle=-45),
+            hovermode="x unified"
+        )
+        st.plotly_chart(fig, use_container_width=True)
+        # Radar/Spider chart comparison
+        categories = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]
+        fig = go.Figure()
+        for idx, model in enumerate(selected_models):
+            model_data = comparison_df[comparison_df["Model"] == model]
+            values = model_data[categories].values.flatten().tolist()
+            fig.add_trace(go.Scatterpolar(
+                r=values,
+                theta=categories,
+                fill='toself',
+                name=model
+            ))
+        fig.update_layout(
+            polar=dict(
+                radialaxis=dict(
+                    visible=True,
+                    range=[0, 1]
+                )
+            ),
+            showlegend=True,
+            legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5),
+            title="Performance Profile Comparison",
+            height=600
+        )
+        st.plotly_chart(fig, use_container_width=True)
+        # Comparison table
+        st.subheader("Detailed Comparison")
+        comparison_table = comparison_df.set_index("Model")[
+            ["Overall Score", "Length Score", "Character Consistency", "Immersion", "Parameters", "Architecture", "Category"]
+        ]
+        st.dataframe(comparison_table, use_container_width=True)
+        # Find strengths and weaknesses
+        if len(selected_models) == 2:
+            model1 = selected_models[0]
+            model2 = selected_models[1]
+            model1_data = comparison_df[comparison_df["Model"] == model1]
+            model2_data = comparison_df[comparison_df["Model"] == model2]
+            diff = {}
+            for metric in ["Length Score", "Character Consistency", "Immersion", "Overall Score"]:
+                diff[metric] = model1_data[metric].iloc[0] - model2_data[metric].iloc[0]
+            st.subheader(f"Comparative Analysis: {model1} vs {model2}")
+            col1, col2 = st.columns(2)
+            with col1:
+                st.markdown(f"""
+                <div class="metric-card">
+                    <h4>{model1} Strengths</h4>
+                    <ul>
+                """, unsafe_allow_html=True)
+                for metric, value in diff.items():
+                    if value > 0:
+                        st.markdown(f"<li>{metric}: +{abs(value):.2f} higher than {model2}</li>", unsafe_allow_html=True)
+                st.markdown("</ul></div>", unsafe_allow_html=True)
+            with col2:
+                st.markdown(f"""
+                <div class="metric-card">
+                    <h4>{model2} Strengths</h4>
+                    <ul>
+                """, unsafe_allow_html=True)
+                for metric, value in diff.items():
+                    if value < 0:
+                        st.markdown(f"<li>{metric}: +{abs(value):.2f} higher than {model1}</li>", unsafe_allow_html=True)
+                st.markdown("</ul></div>", unsafe_allow_html=True)
+            # Overall summary
+            overall_diff = diff["Overall Score"]
+            better_model = model1 if overall_diff > 0 else model2
+            worse_model = model2 if overall_diff > 0 else model1
+            st.markdown(f"""
+            <div class="highlight">
+                <h4>Summary</h4>
+                <p>Overall, <strong>{better_model}</strong> outperforms <strong>{worse_model}</strong> by
+                {abs(overall_diff):.2f} points in the combined roleplay score. The most significant difference is in
+                the {max(diff.items(), key=lambda x: abs(x[1]))[0]} metric.</p>
+            </div>
+            """, unsafe_allow_html=True)
+with tab4:
+    st.header("About This Leaderboard")
+    st.markdown("""
+    ## Understanding the Metrics
+    This leaderboard evaluates AI models on their roleplay capabilities using four key metrics:
+    - **Length Score**: Measures the model's ability to provide responses of appropriate length for roleplay scenarios. Higher scores indicate better response length management.
+    - **Character Consistency**: Evaluates how well the model maintains a consistent character persona throughout the interaction. Higher scores indicate better adherence to character traits and background.
+    - **Immersion**: Assesses the model's ability to create an immersive roleplay experience, including environmental details, emotional depth, and narrative engagement.
+    - **Overall Score**: A composite score reflecting the model's overall roleplay performance, combining all metrics.
+    ## Methodology
+    Models are evaluated through a standardized testing protocol involving multiple roleplay scenarios across different genres and contexts. Each model is tested with identical prompts to ensure fair comparison.
+    The evaluation process involves:
+    1. Running models through a standardized set of roleplay scenarios
+    2. Expert evaluation of responses against established criteria
+    3. Quantitative scoring based on objective metrics
+    4. Normalization of scores across model sizes and architectures
+    ## Data Updates
+    This leaderboard is regularly updated as new models are released or existing models are improved. The most recent update was on April 2025.
+    ## Contact Information
+    For questions about the methodology or to submit a model for evaluation, please contact: [[email protected]]
+    """)
+    # Add a download button for the complete dataset
+    csv = df.to_csv(index=False)
+    b64 = base64.b64encode(csv.encode()).decode()
+    href = f'<a href="data:file/csv;base64,{b64}" download="ai_roleplay_leaderboard.csv">Download Full Dataset (CSV)</a>'
+    st.markdown(href, unsafe_allow_html=True)
+# Footer
+st.markdown("""
+<div class="footer">
+    <p>© 2025 AI Roleplay Performance Leaderboard | Created with Streamlit | Data last updated: April 2025</p>
+</div>
+""", unsafe_allow_html=True)
+# Add custom JavaScript for interactivity
+st.markdown("""
+<script>
+    const modelCards = document.querySelectorAll('.model-card');
+    modelCards.forEach(card => {
+        card.addEventListener('mouseenter', () => {
+            card.style.transform = 'translateY(-10px)';
+            card.style.boxShadow = '0 10px 20px rgba(0, 0, 0, 0.2)';
+        });
+        card.addEventListener('mouseleave', () => {
+            card.style.transform = 'translateY(0)';
+            card.style.boxShadow = '0 4px 6px rgba(0, 0, 0, 0.1)';
+        });
+    });
+</script>
+""", unsafe_allow_html=True)