| import gradio as gr |
| import matplotlib.pyplot as plt |
| import numpy as np |
| import pandas as pd |
| import plotly.express as px |
|
|
| from strings import api_descriptions, func_definitions |
|
|
| |
| |
# HTML template for one sample-prompt "bubble". It is filled in with
# str.format: {style} receives the inline CSS and {text} the prompt text.
bubble_html = """
<div style="{style}" class="bubble">
{text}
</div>
"""
|
|
# Inline CSS for a prompt bubble; it is passed as the {style} value when
# bubble_html is formatted.
# NOTE(review): `font-size: calc(4vw + 4vh) / 2;` is not a valid CSS value
# (the division sits outside calc()), so browsers will most likely drop that
# declaration. `calc((4vw + 4vh) / 2)` may be what was intended, but it would
# change the rendered size considerably - confirm the desired look first.
bubble_style = """
padding: 10px;
margin: 5px;
background: linear-gradient(to bottom right, #FFFFFF, #E8E8E8); /* Lighter background for contrast */
border-radius: 15px;
border: 1px solid #a1a1a1; /* Lighter border for subtle definition */
box-shadow: 2px 2px 10px rgba(255,255,255,0.1); /* Softer shadow with a hint of white for depth */
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
font-size: calc(4vw + 4vh) / 2; /* Scales dynamically with the viewport */
text-align: center;
display: flex;
align-items: center; /* Centers text vertically */
justify-content: center; /* Centers text horizontally */
min-height: 50px; /* Adjust as needed */
max-height: 140px; /* Adjust as needed */
max-width: 100%;
color: #333333; /* Dark text for contrast against light background */
overflow-wrap: break-word; /* Allows long words to be broken and wrap onto the next line */
"""
|
|
| |
# <style> block that enlarges an element with class "bubble" while it is
# hovered. It is prepended to every formatted bubble when the sample rows are
# built, so each bubble carries its own copy of the rule.
hover_css = """
<style>
.bubble:hover {
transform: scale(1.05); /* Scales up the bubble */
z-index: 10; /* Ensures the scaled bubble is above others */
}
</style>
"""
|
|
|
|
| |
# Benchmark scores: task key -> {model name -> score}. Scores are stored as
# fractions; format_scores multiplies floats by 100 for display.
# 'Heldout_Combined' has no "Gorilla open-function-v1" entry.
# NOTE(review): key order is significant. The per-task tabs are created in this
# order, and calculate_averages drops the last row of each average positionally,
# which presumably relies on the model order of these inner dicts - verify
# before reordering or adding models.
RESULTS = {
    'Climate': {"NexusRaven-V2": 0.7021, "GPT4-1106": 0.6809, "GPT3.5": 0.2553, "Gorilla open-function-v1": 0.0213},
    'Heldout_Combined': {"NexusRaven-V2": 0.5990, "GPT4-1106": 0.4814, "GPT3.5": 0.4495},
    'Places_API': {"NexusRaven-V2": 0.5000, "GPT4-1106": 0.4375, "GPT3.5": 0.2500, "Gorilla open-function-v1": 0.0208},
    'OTX': {"NexusRaven-V2": 0.9022, "GPT4-1106": 0.9022, "GPT3.5": 0.8913, "Gorilla open-function-v1": 0.2935},
    'VirusTotal': {"GPT4-1106": 0.8800, "GPT3.5": 0.8100, "NexusRaven-V2": 0.8013, "Gorilla open-function-v1": 0.0728},
    'VT_Multi_Dependency': {"NexusRaven-V2": 0.3878, "GPT4-1106": 0.3673, "GPT3.5": 0.0204, "Gorilla open-function-v1": 0.0000},
    'VT_Multi_Disconnected': {"NexusRaven-V2": 0.4286, "GPT4-1106": 0.2857, "GPT3.5": 0.1429, "Gorilla open-function-v1": 0.0000},
    'CVECPE': {"GPT4-1106": 0.7700, "NexusRaven-V2": 0.6667, "GPT3.5": 0.4800, "Gorilla open-function-v1": 0.0897},
    'CVECPE_Multi_Dependency': {"NexusRaven-V2": 0.2500, "GPT4-1106": 0.0714, "GPT3.5": 0.0714, "Gorilla open-function-v1": 0.0000},
}
|
|
# Sample-data files: task key -> path of a .json or .jsonl file that is read
# with read_json_or_jsonl when the task's tab is built.
# There is no entry for 'Heldout_Combined'; the tab loop checks `key in SAMPLES`
# and simply shows no samples for tasks that are missing here.
SAMPLES = {
    'OTX': "data/OTX.json",
    'CVECPE' : "data/CVECPE.json",
    'CVECPE_Multi_Dependency' : "data/CVECPE_MultiAPIs.json",
    'VirusTotal' : 'data/VirusTotal.json',
    'VT_Multi_Dependency': 'data/VT_MultiAPIs_Nested.json',
    'VT_Multi_Disconnected': 'data/VT_MultiAPIs_Disconnected.json',
    'Climate' : 'data/Climate.jsonl',
    'Places_API' : 'data/Places_API.jsonl'
}
| import json |
| import random |
| import gradio as gr |
|
|
def read_json_or_jsonl(file_path):
    """
    Read a JSON or JSONL file and return its contents as a list of items.

    A path ending in ``.jsonl`` is parsed as one JSON document per line, with
    blank lines skipped; any other path is parsed as a single JSON document.
    Dict items that carry a lowercase ``"input"`` key get its value copied to
    ``"Input"``, which is the key the sample viewer reads.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        The parsed items, or ``[]`` when the file cannot be read or parsed.
        In that case the error is printed rather than raised.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, 'r', encoding='utf-8') as file:
            if file_path.endswith('.jsonl'):
                # A stray blank line (e.g. a trailing newline) must not make
                # the whole file unreadable.
                data = [json.loads(line) for line in file if line.strip()]
            else:
                data = json.load(file)

        for item in data:
            # Only dicts can carry the key; skip anything else instead of failing.
            if isinstance(item, dict) and "input" in item:
                item["Input"] = item["input"]

        return data
    except Exception as e:
        # Deliberately best-effort: this runs while the UI is being built, and
        # a broken sample file should only cost that task its samples.
        print(f"Error reading file: {e}")
        return []
|
|
def sample_data(data, sample_size=5):
    """
    Pick up to ``sample_size`` random items from ``data`` without replacement.

    Returns an empty list when ``data`` is empty or None; when fewer than
    ``sample_size`` items exist, all of them are returned in random order.
    """
    if data:
        available = len(data)
        count = available if available < sample_size else sample_size
        return random.sample(data, count)
    return []
|
|
def highlight_row(s, column, value, color='yellow'):
    """
    Highlight a whole row when one of its columns holds a given value.

    Written for row-wise styling, e.g.
    ``df.style.apply(highlight_row, axis=1, column='Model', value='GPT3.5')``.

    Args:
        s (pd.Series): Row of the DataFrame.
        column (str): Column name to check the value.
        value (any): Value to check against.
        color (str): Background color for highlighting. Default is yellow.

    Returns:
        [str]: One CSS string per cell in the row: the background rule for
        every cell when ``s[column] == value``, otherwise empty strings.
    """
    # s[column] is a single cell, so decide once and repeat the result for
    # every cell; iterating over the cell itself would give the wrong length.
    css = f'background-color: {color}' if s[column] == value else ''
    return [css] * len(s)
|
|
def create_bar_chart(data, title, theme):
    """
    Build a Plotly bar chart with one bar per model.

    Args:
        data (dict): Mapping of model name -> score.
        title (str): Chart title.
        theme (str): 'dark' selects the dark palette and a transparent
            background; any other value selects the light variant.

    Returns:
        The Plotly figure.
    """
    is_dark = theme == 'dark'

    # One row per model: the dict keys become the 'Model' column.
    frame = (
        pd.DataFrame.from_dict(data, orient='index', columns=['Score'])
        .reset_index()
        .rename(columns={'index': 'Model'})
    )

    palette = ['#636EFA', '#EF553B'] if is_dark else ['#00CC96', '#AB63FA']
    background = 'rgba(0,0,0,0)' if is_dark else 'rgba(255,255,255,1)'
    text_color = 'white' if is_dark else 'black'

    chart = px.bar(
        frame,
        x='Model',
        y='Score',
        title=title,
        color='Model',
        color_discrete_sequence=palette,
        text='Score',
        barmode='group',
    )
    chart.update_layout(
        plot_bgcolor=background,
        paper_bgcolor=background,
        font_color=text_color,
    )
    chart.update_traces(
        hoverinfo='all',
        hovertemplate='Model: %{x}<br>Score: %{y:.2f}',
    )

    # Leave 10% headroom above the tallest bar.
    top = frame['Score'].max()
    chart.update_yaxes(range=[0, top + top * 0.1])

    return chart
|
|
|
|
| |
# Task groupings read by calculate_averages for the "simple" and "challenging"
# columns of the averages table. Entries are keys of RESULTS.
simple_tasks = ['OTX', 'CVECPE', 'VirusTotal', 'VT_Multi_Disconnected', 'Heldout_Combined']
difficult_tasks = ['VT_Multi_Dependency', 'Climate', 'Places_API', 'CVECPE_Multi_Dependency']
|
|
| |
def format_scores(val):
    """
    Format a score cell for display.

    Floats are scaled by 100 and rendered with four significant digits
    (0.7021 -> "70.21"); every other value is returned unchanged.
    """
    if not isinstance(val, float):
        return val
    percent = val * 100
    return f"{percent:.4g}"
|
|
| |
def calculate_averages(results):
    """
    Average every model's score over all tasks and over the two task groups.

    Args:
        results (dict): Mapping of task key -> {model name -> score}.

    Returns:
        pd.DataFrame: One row per model with the columns 'Model', 'All Tasks',
        'Tasks with Single Call (simple)' and
        'Tasks with Nested/Parallel Calls (challenging)'.
    """
    def _row_means(table):
        # Mean across tasks for each model.
        # NOTE(review): the trailing [:-1] drops the LAST model row by position,
        # presumably to exclude a model that was not evaluated on every task -
        # it depends on the row order pandas derives from `results`; confirm.
        return pd.DataFrame(table).mean(axis=1)[:-1]

    columns = {
        'All Tasks': _row_means(results),
        'Tasks with Single Call (simple)': _row_means(
            {task: results[task] for task in simple_tasks}
        ),
        'Tasks with Nested/Parallel Calls (challenging)': _row_means(
            {task: results[task] for task in difficult_tasks}
        ),
    }

    summary = pd.DataFrame(columns).reset_index()
    return summary.rename(columns={'index': 'Model'})
|
|
| |
def display_averages():
    """Return the averages table computed from the module-level RESULTS."""
    return calculate_averages(RESULTS)
|
|
|
|
| |
# Task groupings read by calculate_capability_scores for the
# "general ability" radar chart. Entries are keys of RESULTS.
# NOTE(review): 'Heldout_Combined' is listed under both single_calls and
# nested_calls - confirm that counting it in both averages is intended.
single_calls = ['OTX', 'CVECPE', 'VirusTotal', 'Heldout_Combined']
nested_calls = ['VT_Multi_Dependency', 'Places_API', 'CVECPE_Multi_Dependency', 'Heldout_Combined']
parallel_calls = ['Climate', 'VT_Multi_Disconnected']


# One single-element list per task, read by calculate_capability_scores for the
# "many apis many args" radar chart (one axis per task).
otx = ["OTX"]
cvecpe = ['CVECPE']
virustotal = ['VirusTotal']
vt_multi_dependency = ['VT_Multi_Dependency']
places = ['Places_API']
cvecpe_multi_dependency = ['CVECPE_Multi_Dependency']
heldout = ['Heldout_Combined']
climate = ['Climate']
vt_multi_disconnected = ['VT_Multi_Disconnected']
|
|
| |
def _mean_scores(results, tasks):
    """Average each model's score over ``tasks``; returns a Series indexed by model name."""
    return pd.DataFrame({task: results[task] for task in tasks}).mean(axis=1)


def calculate_capability_scores(results, type):
    """
    Build the long-form score table behind the capability radar charts.

    Args:
        results (dict): Mapping of task key -> {model name -> score}.
        type (str): Which chart to prepare. "general ability" averages the
            single / nested / parallel task groups; "many apis many args"
            reports every task on its own axis. (The parameter name shadows
            the builtin but is kept for compatibility with existing callers.)

    Returns:
        pd.DataFrame: Columns 'Capability', 'Model' and 'Score', with one row
        per (model, capability) pair for NexusRaven-V2, GPT4-1106 and GPT3.5.

    Raises:
        ValueError: If ``type`` is not one of the two supported chart types.
    """
    # (axis label, task keys averaged for that axis), in display order.
    if type == "general ability":
        groups = [
            ('Single Calls', single_calls),
            ('Nested Calls', nested_calls),
            ('Parallel Calls', parallel_calls),
        ]
    elif type == "many apis many args":
        groups = [
            ('OTX (Single)', otx),
            ('VirusTotal (Single)', virustotal),
            ('VT_Multi (Nested)', vt_multi_dependency),
            ('VT_Multi (Parallel)', vt_multi_disconnected),
            ('NVDLibrary (Single)', cvecpe),
            ('NVDLibrary_Multi (Nested)', cvecpe_multi_dependency),
            ('Places (Nested)', places),
            ('Climate (Nested/Parallel)', climate),
            ('Stack (Mostly Single)', heldout),
        ]
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unknown capability chart type: {type!r} "
            "(expected 'general ability' or 'many apis many args')"
        )

    models = ('NexusRaven-V2', 'GPT4-1106', 'GPT3.5')
    averages = [_mean_scores(results, tasks) for _, tasks in groups]

    # Wide table first (one column per model), then melt to one row per
    # (model, capability) pair, which is the shape px.line_polar expects.
    wide = {'Capability': [label for label, _ in groups]}
    for model in models:
        wide[model] = [average[model] for average in averages]

    return pd.DataFrame(wide).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
|
|
| |
def display_radar_chart(type):
    """
    Build a radar (polar line) chart of model capabilities from RESULTS.

    Args:
        type (str): "general ability" for the single / nested / parallel
            overview, or "many apis many args" for one axis per task.
            (The parameter name shadows the builtin but is kept for
            compatibility with existing callers.)

    Returns:
        The Plotly figure.

    Raises:
        ValueError: If ``type`` is not one of the two supported chart types.
    """
    titles = {
        "general ability": 'Capability Radar Chart on Different Function Calling Types',
        "many apis many args": 'Capability Radar Chart on All Tasks',
    }
    if type not in titles:
        # Fail with a clear message instead of an UnboundLocalError on `fig`.
        raise ValueError(
            f"Unknown radar chart type: {type!r} (expected one of {sorted(titles)})"
        )

    data = calculate_capability_scores(RESULTS, type)
    fig = px.line_polar(
        data,
        r='Score',
        theta='Capability',
        color='Model',
        line_close=True,
        markers=True,
        color_discrete_sequence=px.colors.qualitative.Pastel,
        template='plotly_dark',
        title=titles[type],
    )

    # Larger markers and thicker lines than the Plotly defaults.
    fig.update_traces(marker=dict(size=10), line=dict(width=4))

    return fig
|
|
|
|
# Markdown introduction shown next to the logo at the top of the page; it is
# rendered through gr.Markdown with the "markdown-class" element class.
INTRO_TEXT = """
# Nexus Function Calling Leaderboard

Welcome to the Nexus Function Calling Leaderboard! We provide a focused benchmarking platform that evaluates a range of models on their ability to perform zero-shot function calling and API usage. Our leaderboard features the following highlights:

- **Nine Varied Tasks**: We cover a broad spectrum, from cybersecurity and climate APIs to recommendation systems, along with some pure Python functions.
- **Zero-Shot Challenges**: Models are tested on their innate ability to handle tasks they haven't seen before, showcasing their versatility and comprehension from the function definitions and user queries ONLY.
- **Diverse Model Participation**: We included a mix of both open-source and closed-source models. We initially benchmarked three models, and we are more than happy to work together with the community to involve more models.

This leaderboard is an exciting step towards understanding and improving the capabilities of large language models in diverse, real-world applications with building semantic interfaces around APIs!

The evaluation logic for the results here can be found here: https://github.com/nexusflowai/NexusRaven-V2/tree/master/evaluation_notebook
"""
|
|
# Stylesheet rules: a font size for the "intro-text" class and a rule that
# hides the page footer.
# NOTE(review): CSS is not referenced anywhere in the visible part of this file
# (gr.Blocks is created without a css= argument) - confirm whether it is still
# needed or was meant to be passed to gr.Blocks.
CSS = """
.intro-text {
font-size: 26px;
}
footer {
visibility: hidden;
}
"""
|
|
| |
# <style> block injected into the page through gr.HTML; it fixes the font size
# of elements carrying the "markdown-class" class (the intro Markdown).
custom_css = """
<style>
.markdown-class {
font-size: 16px !important; /* Adjust the font size as needed */
}
</style>
"""
|
|
# Display names for the per-task tabs; a task key that is missing here falls
# back to the raw key. Built once instead of on every loop iteration.
tab_names = {
    'OTX': 'OTX (Single)',
    'CVECPE': 'NVDLibrary (Single)',
    'VirusTotal': 'VirusTotal (Single)',
    'VT_Multi_Dependency': 'VT_Multi (Nested)',
    'Places_API': 'Places (Nested)',
    'CVECPE_Multi_Dependency': 'NVDLibrary_Multi (Nested)',
    'Heldout_Combined': 'Stack (Mostly Single)',
    'Climate': 'Climate (Nested/Parallel)',
    'VT_Multi_Disconnected': 'VT_Multi (Parallel)'
}


def _format_api_call(output):
    """Pretty-print a sample's expected call (a string, or a list of strings) with black."""
    # Imported lazily so black is only needed when samples are actually rendered.
    from black import Mode, format_str

    if isinstance(output, list):
        return ''.join(format_str(item, mode=Mode()) for item in output)
    return format_str(output, mode=Mode())


def _sample_row(sample):
    """Build one [prompt HTML, API-call Markdown] row for the samples gr.Dataset."""
    prompt_html = hover_css + bubble_html.format(style=bubble_style, text=sample['Input'])
    api_call_md = f"```python\n{_format_api_call(sample['Output'])}\n```".replace("; ", ";\n")
    return [prompt_html, api_call_md]


with gr.Blocks(theme="dark") as demo:
    # Page header: logo and company name.
    gr.HTML(
        """<img width="50" height="50" style="float:left; margin: 0px;" src="/file=logo.png">
<h1 style="overflow: hidden; padding-top: 17px; margin: 0px;">Nexusflow</h1>
"""
    )
    with gr.Row():
        gr.Image(
            "raven.png",
            show_label=False,
            show_share_button=True,
            min_width=40,
            scale=1,
        )
        with gr.Column(scale=4):
            gr.HTML(custom_css)
            gr.Markdown(INTRO_TEXT, elem_classes="markdown-class")

    # Overview tab: averages table plus the two radar charts.
    with gr.Tab("Overall"):
        with gr.Accordion("Task Averages:"):
            gr.Dataframe(display_averages().map(format_scores))

        with gr.Accordion("Model Capabilities:"):
            with gr.Row():
                gr.Plot(display_radar_chart("general ability"))
                gr.Plot(display_radar_chart("many apis many args"))

    # One tab per task: description, function definitions, scores, samples.
    for key, value in RESULTS.items():
        tab_name = tab_names.get(key, key)
        # The held-out task has no function definitions to list, so its
        # details accordion starts open and shows only the description.
        is_heldout = key == "Heldout_Combined"

        with gr.Tab(tab_name):
            with gr.Accordion("Details of the " + tab_name + ":", open=is_heldout):
                gr.Markdown(api_descriptions[key])
                if not is_heldout:
                    with gr.Group():
                        # Each entry is indexed as (title, Markdown body).
                        for definition in func_definitions[key]:
                            with gr.Accordion(definition[0], open=False):
                                gr.Markdown(definition[1])

            df = pd.DataFrame.from_dict(value, orient='index', columns=['Score']).reset_index()
            df = df.rename(columns={'index': 'Model'})
            gr.Dataframe(df.map(format_scores))

            if key in SAMPLES:
                # A few randomly chosen examples from the task's data file.
                samples = sample_data(read_json_or_jsonl(SAMPLES[key]))
                gr.Dataset(
                    components=[gr.HTML(), gr.Markdown()],
                    headers=["Prompt", "API Use"],
                    label=f"{key} Samples",
                    samples=[_sample_row(sample) for sample in samples],
                )

    # Force the dark theme by adding ?__theme=dark to the URL on first load.
    demo.load(
        None,
        None,
        js="""
() => {
const params = new URLSearchParams(window.location.search);
if (!params.has('__theme')) {
params.set('__theme', 'dark');
window.location.search = params.toString();
}
}"""
    )


demo.launch(share=True, allowed_paths=["logo.png", "raven.png"])
|
|
|
|