import json
import os

import gradio as gr

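# Paths to the JSON files for each SCALAR difficulty split.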
datasets = {}
difficulty_files = {
    "Easy": "data/scalar_easy.json",
    "Medium": "data/scalar_medium.json",
    "Hard": "data/scalar_hard.json"
}

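# Load every difficulty split into memory at startup.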
for difficulty, filepath in difficulty_files.items():
    with open(filepath) as f:
        datasets[difficulty] = json.load(f)

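# Viewer state: the currently selected split and item index.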
current_index = 0
current_difficulty = "Easy"
current_data = datasets[current_difficulty]


def update_display(index, difficulty=None):
    global current_index, current_difficulty, current_data

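    # Switching difficulty resets the viewer to the first item of the new split.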
    if difficulty is not None and difficulty != current_difficulty:
        current_difficulty = difficulty
        current_data = datasets[difficulty]
        current_index = 0
        index = 0

    current_index = index
    item = current_data[current_index]

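    # Escape angle brackets so the question renders as literal text rather than HTML.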
    question = item['question'].replace("<", "&lt;").replace(">", "&gt;")

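    # Highlight the [MASKED_CITATION] placeholder so it stands out in the question.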
    colored_question = question.replace(
        "[MASKED_CITATION]",
        '<span style="background-color: #ffebeb; padding: 2px 6px; border-radius: 4px; color: blue;">[MASKED_CITATION]</span>'
    )

    formatted_citation = json.dumps(item["citation"], indent=4, ensure_ascii=False)
    return colored_question, formatted_citation, item["reference_id"], item["label_id"], index


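# "Previous"/"Next" navigation; the index wraps around at either end of the split.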
def prev_item(difficulty):
    global current_index, current_data
    current_data = datasets[difficulty]
    current_index = (current_index - 1) % len(current_data)
    return update_display(current_index, difficulty)


def next_item(difficulty):
    global current_index, current_data
    current_data = datasets[difficulty]
    current_index = (current_index + 1) % len(current_data)
    return update_display(current_index, difficulty)


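# Build the Gradio UI: controls and citation metadata on top, the question text below.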
with gr.Blocks() as demo:
    gr.Markdown("""
    ## Data viewer for [SCALAR](https://arxiv.org/abs/2502.13753/)
    * SCALAR (Scientific Citation-based Live Assessment of Long-context Academic Reasoning) is a benchmark that leverages academic papers and their citation networks. It features automatic generation of high-quality ground-truth labels without human annotation, controllable difficulty levels, and a dynamic updating mechanism that prevents data contamination.
    * The benchmark provides a reliable and sustainable way to track progress in long-context understanding as LLM capabilities evolve.
    """)
    with gr.Column():
        with gr.Row():
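            # Left column: difficulty selector, index slider, and navigation buttons.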
            with gr.Column():
                difficulty_dropdown = gr.Dropdown(
                    choices=list(difficulty_files.keys()),
                    value="Easy",
                    label="Difficulty Level"
                )

                index_slider = gr.Slider(minimum=0, maximum=len(datasets["Easy"]) - 1, step=1, label="Select Index", value=0)
                with gr.Row():
                    prev_btn = gr.Button("Previous")
                    next_btn = gr.Button("Next")

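            # Right column: citation text and identifiers for the current item.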
            with gr.Column():
                citation_text = gr.TextArea(label="Citation", interactive=False)
                reference_id = gr.Textbox(label="Reference ID", interactive=False)
                label_id = gr.Textbox(label="Label ID", interactive=False)

        with gr.Row():
            question_md = gr.Markdown(label="Question")

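    # Changing the difficulty rebuilds the slider so its range matches the new split.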
    difficulty_dropdown.change(
        fn=lambda d: gr.Slider(minimum=0, maximum=len(datasets[d]) - 1, step=1, value=0),
        inputs=[difficulty_dropdown],
        outputs=[index_slider]
    )

    index_slider.change(
        update_display,
        inputs=[index_slider, difficulty_dropdown],
        outputs=[question_md, citation_text, reference_id, label_id, index_slider]
    )

    prev_btn.click(
        prev_item,
        inputs=[difficulty_dropdown],
        outputs=[question_md, citation_text, reference_id, label_id, index_slider]
    )

    next_btn.click(
        next_item,
        inputs=[difficulty_dropdown],
        outputs=[question_md, citation_text, reference_id, label_id, index_slider]
    )

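    # Populate the viewer with the first item when the page first loads.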
    demo.load(
        update_display,
        inputs=[index_slider, difficulty_dropdown],
        outputs=[question_md, citation_text, reference_id, label_id, index_slider]
    )


demo.launch()