| """ |
| DevRev Search Evaluation Leaderboard |
| |
| An interactive leaderboard for benchmarking search and retrieval systems |
| on enterprise knowledge bases. Built with Gradio and ready for Hugging Face Spaces. |
| |
| Uses MTEB-style standardized JSON format for evaluation results. |
| """ |
|
|
| import base64 |
| import io |
| import json |
| import os |
| from datetime import datetime |
| from pathlib import Path |
|
|
| import gradio as gr |
| import matplotlib.pyplot as plt |
| import pandas as pd |
| from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns |
|
|
|
|
def load_results_from_json():
    """Load evaluation results from standardized JSON files.

    Searches the candidate directories ``results``, ``leaderboard/results``,
    then the current directory, and uses the first one that contains at least
    one ``*.json`` file.  Each file is expected to hold a single result
    object; only objects containing both a ``model_name`` and a ``metrics``
    key are kept.  ``RESULT_SCHEMA.json`` describes the format and is never
    treated as a result.

    Returns:
        list[dict]: parsed result objects; empty when no suitable directory
        or no valid files are found.
    """
    results = []

    # The first candidate directory that actually contains JSON files wins.
    results_dirs = ["results", "leaderboard/results", "."]
    results_dir = None

    for dir_path in results_dirs:
        if os.path.exists(dir_path):
            temp_dir = Path(dir_path)
            if any(temp_dir.glob("*.json")):
                results_dir = temp_dir
                break

    if not results_dir:
        print(
            "No results directory found. Please create a 'results' directory with JSON files."
        )
        return []

    # Sort for a deterministic load order: Path.glob yields entries in
    # filesystem-dependent order, which is not stable across platforms.
    for json_file in sorted(results_dir.glob("*.json")):
        # The schema file documents the expected format; skip it.
        if json_file.name == "RESULT_SCHEMA.json":
            continue

        try:
            # Explicit UTF-8: result files may contain non-ASCII model names
            # and the platform default encoding is not UTF-8 everywhere
            # (notably Windows).
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Keep only well-formed result objects; drop anything else.
            if "model_name" in data and "metrics" in data:
                results.append(data)
                print(f"Loaded: {json_file.name}")
        except Exception as e:
            # Best-effort loading: one malformed file must not take the
            # whole leaderboard down, so report it and continue.
            print(f"Error loading {json_file}: {e}")

    return results
|
|
|
|
def create_leaderboard_data():
    """Create the leaderboard dataframe from JSON results.

    One row is produced per loaded result.  Rows are sorted by Recall@10
    (Precision@10 breaks ties), re-ranked 1..N, and returned with a fixed
    display column order.  An empty DataFrame is returned when no results
    could be loaded.
    """
    # NOTE(review): the "π ..." column labels look like mojibake of emoji
    # (e.g. a trophy for Rank) — verify the file encoding; the literals are
    # reproduced here unchanged because they are shared keys across functions.
    results = load_results_from_json()

    if not results:
        print(
            "No evaluation results found. Please add JSON files to the 'results' directory."
        )
        return pd.DataFrame()

    def _format_paper(paper_field):
        # Semicolon-separated references; bare URLs become markdown links,
        # anything else is shown verbatim.
        if not paper_field or paper_field == "N/A":
            return "N/A"
        parts = [ref.strip() for ref in paper_field.split(";")]
        rendered = [
            f"[{ref}]({ref})" if ref.startswith("http") else ref for ref in parts
        ]
        return " | ".join(rendered)

    rows = []
    for entry in results:
        scores = entry.get("metrics", {})

        row = {
            "π Rank": 0,  # placeholder; real ranks assigned after sorting
            "π§ Method": entry.get("model_name", "Unknown"),
            "π Paper/Details": _format_paper(entry.get("paper", "N/A")),
            "π·οΈ Type": entry.get("model_type", "Unknown"),
        }
        # Metric columns share the same K cut-offs for recall and precision.
        for cutoff in (5, 10, 25, 50):
            row[f"π Recall@{cutoff}"] = scores.get(f"recall@{cutoff}", 0)
        for cutoff in (5, 10, 25, 50):
            row[f"π Precision@{cutoff}"] = scores.get(f"precision@{cutoff}", 0)
        # NOTE(review): both branches below render as the same garbled glyph
        # in this copy of the file — originals were presumably distinct
        # check/cross marks; confirm against the canonical source.
        row["π Open Source"] = "β" if entry.get("open_source", False) else "β"
        row["π Date"] = entry.get("evaluation_date", "N/A")
        rows.append(row)

    df = pd.DataFrame(rows)

    # Primary key Recall@10, secondary Precision@10, both descending.
    df = df.sort_values(["π Recall@10", "π Precision@10"], ascending=False)

    # Ranks follow the sorted order.
    df["π Rank"] = range(1, len(df) + 1)

    # Fixed display order expected by the leaderboard component.
    ordered_columns = [
        "π Rank",
        "π§ Method",
        "π Paper/Details",
        "π·οΈ Type",
        "π Recall@5",
        "π Recall@10",
        "π Recall@25",
        "π Recall@50",
        "π Precision@5",
        "π Precision@10",
        "π Precision@25",
        "π Precision@50",
        "π Open Source",
        "π Date",
    ]
    return df[ordered_columns]
|
|
|
|
def create_comparison_plot():
    """Create performance comparison visualizations.

    Renders side-by-side horizontal bar charts of Recall@50 and Precision@50
    and returns them as an inline base64-encoded ``<img>`` tag; a placeholder
    paragraph is returned when there is no data to plot.
    """
    df = create_leaderboard_data()

    if df.empty:
        return "<p style='text-align: center; color: #666;'>No data available for visualization. Please add evaluation results to the 'results' directory.</p>"

    fig, (recall_ax, precision_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # Ascending sort puts the best system at the top of the horizontal bars.
    ranked = df.sort_values("π Recall@50", ascending=True)
    names = ranked["π§ Method"].tolist()
    # Highlight DevRev systems; everything else gets the neutral teal.
    bar_colors = ["#ff6b6b" if "DevRev" in name else "#4ecdc4" for name in names]

    # The two panels are identical apart from the metric column and labels.
    panels = (
        (recall_ax, "π Recall@50", "Recall@50 (%)", "Recall@50 Comparison"),
        (precision_ax, "π Precision@50", "Precision@50 (%)", "Precision@50 Comparison"),
    )
    for ax, column, xlabel, title in panels:
        values = ranked[column].tolist()
        ax.barh(names, values, color=bar_colors, alpha=0.8)
        ax.set_xlabel(xlabel, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight="bold")
        ax.grid(True, axis="x", alpha=0.3)
        # Annotate each bar with its exact value, just past the bar's end.
        for position, value in enumerate(values):
            ax.text(value + 0.5, position, f"{value:.1f}%", va="center", fontsize=10)

    plt.tight_layout()

    # Serialize to an in-memory PNG and embed it inline so the Space needs
    # no static file hosting.
    buffer = io.BytesIO()
    plt.savefig(buffer, format="png", dpi=150, bbox_inches="tight")
    buffer.seek(0)
    encoded = base64.b64encode(buffer.read()).decode()
    plt.close()

    return f'<img src="data:image/png;base64,{encoded}" style="width: 100%; max-width: 1000px; margin: 20px auto; display: block;">'
|
|
|
|
def create_interface():
    """Create the Gradio interface with leaderboard and visualizations.

    Builds a two-tab Blocks app: the main leaderboard (interactive table
    with column selection and filters) and an About tab, plus a footer with
    the build timestamp.  A small JS snippet makes the "#about" URL hash
    deep-link to the About tab.

    Returns:
        gr.Blocks: the assembled (not yet launched) Gradio app.
    """

    # Client-side deep-link support: when the URL hash is "#about" (e.g. via
    # the in-page [About](#about) link), switch to the About tab and scroll
    # to its content.  Runs on load and on every hashchange.
    deep_link_js = r"""
    () => {
        function openAboutAndScroll() {
            if (window.location.hash !== "#about") return;

            // Switch to the About tab (Gradio tabs are rendered as role="tab" buttons)
            const tabs = Array.from(document.querySelectorAll('button[role="tab"]'));
            const aboutTab = tabs.find((b) => (b.innerText || "").includes("About"));
            if (aboutTab) aboutTab.click();

            // The About content is mounted after tab switch; retry briefly.
            let attempts = 0;
            const timer = setInterval(() => {
                const el = document.getElementById("about");
                if (el) {
                    el.scrollIntoView({ behavior: "smooth", block: "start" });
                    clearInterval(timer);
                }
                attempts += 1;
                if (attempts > 25) clearInterval(timer);
            }, 200);
        }

        window.addEventListener("hashchange", openAboutAndScroll);
        openAboutAndScroll();
        setTimeout(openAboutAndScroll, 600);
    }
    """

    # NOTE(review): several UI strings below contain what looks like mojibake
    # of emoji ("π ...", "βΉοΈ", ...) — verify the file's encoding against the
    # canonical source; the literals are kept exactly as found.
    with gr.Blocks(
        title="DevRev Search Evaluation Leaderboard", js=deep_link_js
    ) as demo:
        # Page header.
        gr.HTML(
            """
            <div style="text-align: center; margin-bottom: 30px;">
                <h1 style="font-size: 3em; font-weight: bold; margin-bottom: 10px;">
                    π DevRev Search Evaluation Leaderboard
                </h1>
                <p style="font-size: 1.2em; color: #666;">
                    Benchmarking Search and Retrieval Systems for Enterprise Knowledge Bases
                </p>
            </div>
            """
        )

        with gr.Tabs():
            # --- Tab 1: the leaderboard table ---------------------------------
            with gr.TabItem("π Main Leaderboard"):
                gr.Markdown(
                    """
                    ### Evaluation Overview
                    This leaderboard displays metrics of search systems on the test queries present in [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search).
                    All methods are evaluated on the same set of agent support queries with consistent evaluation protocols.

                    **Metrics**: Recall@K and Precision@K measure the effectiveness of retrieving relevant articles within the top K retrieved articles.

                    **Leaderboard ranking**: Sorted by **Recall@10** (primary) and **Precision@10** (secondary).

                    **To add your results**: Submission details are available in the [About](#about) section.
                    """
                )

                # Loaded once at interface-build time; the table is static
                # until the Space restarts.
                df = create_leaderboard_data()

                if not df.empty:
                    # Columns shown by default; the rest are opt-in via the
                    # column selector.
                    default_columns = [
                        "π Rank",
                        "π§ Method",
                        "π·οΈ Type",
                        "π Recall@10",
                        "π Recall@50",
                        "π Precision@10",
                        "π Precision@50",
                        "π Open Source",
                    ]

                    # Checkbox-group filters over the categorical columns.
                    type_column = ColumnFilter("π·οΈ Type", type="checkboxgroup")
                    open_source_column = ColumnFilter(
                        "π Open Source", type="checkboxgroup"
                    )

                    # One datatype entry per column, in the dataframe's
                    # column order (markdown enables the paper links).
                    Leaderboard(
                        value=df,
                        datatype=[
                            "number",
                            "markdown",
                            "markdown",
                            "str",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "str",
                            "str",
                        ],
                        select_columns=SelectColumns(
                            default_selection=default_columns,
                            cant_deselect=[
                                "π Rank",
                                "π§ Method",
                                "π Recall@10",
                            ],
                            label="Select Columns to Display",
                        ),
                        search_columns=[
                            "π§ Method",
                            "π Paper/Details",
                            "π·οΈ Type",
                        ],
                        hide_columns=["π Date"],
                        filter_columns=[type_column, open_source_column],
                        interactive=False,
                    )
                else:
                    # Empty-state placeholder when no result files exist.
                    gr.HTML(
                        """
                        <div style="text-align: center; padding: 50px; background: #f5f5f5; border-radius: 10px;">
                            <h3>No Results Found</h3>
                            <p>Please add JSON evaluation files to the 'results' directory.</p>
                            <p>See the About tab for the required format.</p>
                        </div>
                        """
                    )

            # --- Tab 2: about / submission instructions -----------------------
            # elem_id="about" is the scroll target of the deep-link JS above.
            with gr.TabItem("βΉοΈ About"):
                gr.Markdown(
                    """
                    ## About This Leaderboard

                    This leaderboard tracks the performance of various search and retrieval systems on the [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search).

                    ### π Evaluation Metrics

                    - **Recall@K**: The percentage of relevant article chunks retrieved in the top K article chunks
                    - **Precision@K**: The percentage of retrieved article chunks that are relevant among the top K article chunks

                    ### π€ How to Submit

                    1. Run your retrieval on the test queries in DevRev Search Dataset
                    2. Submit the results in same format as annotated_queries in the dataset through email to prateek.jain@devrev.ai
                    3. Also include a **one-line system detail/link**, the **system type**, and whether it is **open source**

                    ### π Resources

                    - [Computer by DevRev](https://devrev.ai/meet-computer)
                    - [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search)

                    ### π Acknowledgments

                    Inspired by:
                    - [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard)
                    - [Berkeley Function Calling Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard)

                    ### π Citation

                    ```bibtex
                    @misc{devrev_search_leaderboard_2026,
                        title={DevRev Search Leaderboard},
                        author={Research@DevRev},
                        year={2026},
                        url={https://huggingface.co/spaces/devrev/search}
                    }
                    ```
                    """,
                    elem_id="about",
                )

        # Footer with the interface build time.
        # NOTE(review): the label says "UTC" but datetime.now() returns local
        # time — consider datetime.now(timezone.utc) or changing the label.
        gr.HTML(
            f"""
            <div style="text-align: center; margin-top: 50px; padding: 20px; border-top: 1px solid #e0e0e0; color: #666;">
                <p>
                    Last updated: {datetime.now().strftime("%Y-%m-%d %H:%M UTC")}
                </p>
            </div>
            """
        )

    return demo
|
|
|
|
| |
if __name__ == "__main__":
    # Launch on all interfaces at port 7860 — the conventional Hugging Face
    # Spaces configuration.  The public API page is disabled since the app is
    # purely a display surface.
    # NOTE(review): share=True creates a public gradio.live tunnel when run
    # locally and is ignored on Spaces — confirm the tunnel is intended.
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True, show_api=False)
|
|