# Hugging Face Space β€” commit fe7a57c: "Add privacy notice to UI and README"
"""
GitHub Notebook Fixer β€” Hugging Face Space
Fixes .ipynb files so they render correctly on GitHub.
Common issues fixed:
1. Missing 'state' key in metadata.widgets
2. Oversized cell outputs (GitHub has a ~1MB render limit)
3. Invalid/missing notebook metadata (kernelspec, language_info)
4. Large base64-encoded images in outputs
5. Widget metadata without required 'state' field
"""
import gradio as gr
import json
import copy
import base64
import uuid
import tempfile
from pathlib import Path
# ---------------------------------------------------------------------------
# Analysis helpers
# ---------------------------------------------------------------------------
# Size thresholds (in bytes) shared by the analyzer and the fixer.
MAX_OUTPUT_SIZE = 500_000  # ~500 KB per cell output is risky for GitHub
MAX_TOTAL_SIZE = 10_000_000  # ~10 MB total notebook size warning
MAX_IMAGE_SIZE = 1_000_000  # ~1 MB per embedded image

# Catalog of every problem this tool can detect.  Each entry carries a
# severity ("critical" / "warning" / "info"), a short title, and a longer
# description; analyze_notebook() copies an entry and attaches a
# per-occurrence 'location' (and sometimes a 'detail') field.
GITHUB_ISSUES = {
    "widget_state_missing": {
        "severity": "critical",
        "title": "Missing 'state' in metadata.widgets",
        "desc": (
            "GitHub requires a 'state' key inside metadata.widgets. "
            "Without it the notebook preview shows 'Invalid Notebook'."
        ),
    },
    "widgets_empty_state": {
        "severity": "info",
        "title": "metadata.widgets exists but 'state' is empty",
        "desc": "The widget state dict is present but empty β€” GitHub renders this fine.",
    },
    "no_kernelspec": {
        "severity": "warning",
        "title": "Missing kernelspec in metadata",
        "desc": "GitHub may not detect the notebook language correctly.",
    },
    "no_language_info": {
        "severity": "info",
        "title": "Missing language_info in metadata",
        "desc": "Minor β€” GitHub can usually infer the language from kernelspec.",
    },
    "oversized_output": {
        "severity": "warning",
        "title": "Cell output exceeds ~500 KB",
        "desc": "Very large outputs can cause GitHub to skip rendering the notebook.",
    },
    "oversized_notebook": {
        "severity": "critical",
        "title": "Notebook exceeds ~10 MB",
        "desc": "GitHub will refuse to render notebooks over ~10 MB.",
    },
    "large_embedded_image": {
        "severity": "warning",
        "title": "Large base64 image embedded in output",
        "desc": "Images over ~1 MB bloat the notebook and slow GitHub rendering.",
    },
    "invalid_nbformat": {
        "severity": "critical",
        "title": "Missing or invalid nbformat version",
        "desc": "GitHub needs nbformat >= 4 to render the notebook.",
    },
}
def _sizeof(obj) -> int:
"""Rough byte size of a JSON-serializable object."""
return len(json.dumps(obj, ensure_ascii=False).encode())
def analyze_notebook(nb: dict) -> list[dict]:
    """Scan a parsed notebook dict and collect GitHub-rendering problems.

    Returns a list of issue dicts: a GITHUB_ISSUES entry plus a 'location'
    string and, for size-related findings, a 'detail' string.
    """
    found: list[dict] = []

    def record(kind, location, detail=None):
        # Copy the catalog entry and attach per-occurrence context.
        entry = {**GITHUB_ISSUES[kind], "location": location}
        if detail is not None:
            entry["detail"] = detail
        found.append(entry)

    # ---- nbformat version (only int values are comparable) ----
    nbf = nb.get("nbformat")
    if nbf is None or (isinstance(nbf, int) and nbf < 4):
        record("invalid_nbformat", "root")

    # ---- metadata.widgets ----
    meta = nb.get("metadata", {})
    widgets = meta.get("widgets")
    if widgets is not None:
        standard_key = "application/vnd.jupyter.widget-state+json"
        if standard_key in widgets:
            ws = widgets[standard_key]
            if "state" not in ws:
                record("widget_state_missing", "metadata.widgets")
            elif not ws["state"]:
                record("widgets_empty_state", "metadata.widgets")
        else:
            # Non-standard widget entries: each dict still needs 'state'.
            for key, val in widgets.items():
                if isinstance(val, dict) and "state" not in val:
                    record("widget_state_missing", f"metadata.widgets['{key}']")

    # ---- kernelspec / language_info ----
    if "kernelspec" not in meta:
        record("no_kernelspec", "metadata")
    if "language_info" not in meta:
        record("no_language_info", "metadata")

    # ---- per-cell checks ----
    for idx, cell in enumerate(nb.get("cells", [])):
        for out in cell.get("outputs", []):
            out_size = _sizeof(out)
            if out_size > MAX_OUTPUT_SIZE:
                record("oversized_output", f"cells[{idx}]", f"{out_size / 1_000_000:.2f} MB")
            # Estimate decoded size of each embedded base64 image.
            for mime, content in out.get("data", {}).items():
                if not (mime.startswith("image/") and isinstance(content, str)):
                    continue
                try:
                    img_bytes = len(base64.b64decode(content, validate=False))
                except Exception:
                    # Not decodable β€” fall back to the raw string length.
                    img_bytes = len(content)
                if img_bytes > MAX_IMAGE_SIZE:
                    record("large_embedded_image", f"cells[{idx}] ({mime})", f"{img_bytes / 1_000_000:.2f} MB")

    # ---- total size ----
    total = _sizeof(nb)
    if total > MAX_TOTAL_SIZE:
        record("oversized_notebook", "entire file", f"{total / 1_000_000:.2f} MB")
    return found
def predict_github_render(issues: list[dict]) -> str:
    """Summarise an issue list as a one-line GitHub rendering prediction."""
    severities = {issue["severity"] for issue in issues}
    if "critical" in severities:
        return "❌ Will NOT render on GitHub"
    if "warning" in severities:
        return "⚠️ Might render, but with issues"
    return "βœ… Should render fine on GitHub"
# ---------------------------------------------------------------------------
# Fixer
# ---------------------------------------------------------------------------
def fix_notebook(nb: dict, strip_widgets: bool = False, strip_large_outputs: bool = True) -> dict:
    """Return a fixed deep copy of the notebook (the input is not mutated).

    Repairs applied:
      * forces nbformat to a valid integer >= 4 when missing or invalid
      * patches (or, with strip_widgets=True, removes) metadata.widgets
        entries that lack the 'state' key GitHub requires
      * fills in default kernelspec / language_info metadata
      * optionally replaces outputs larger than MAX_OUTPUT_SIZE with a
        small placeholder (strip_large_outputs)
      * always blanks base64 images larger than MAX_IMAGE_SIZE
      * ensures every cell has a usable id (required by nbformat >= 4.5)
    """
    nb = copy.deepcopy(nb)

    # ---- Ensure nbformat ----
    # Compare only when nbformat is an int: a string value (seen in
    # hand-edited notebooks) made the original `< 4` raise TypeError.
    nbf = nb.get("nbformat")
    if not isinstance(nbf, int) or nbf < 4:
        nb["nbformat"] = 4
    nb.setdefault("nbformat_minor", 5)

    # ---- metadata ----
    meta = nb.setdefault("metadata", {})

    # Fix widgets: add the missing 'state' key to every dict-valued entry
    # (covers the standard widget-state key too), or drop the whole thing.
    widgets = meta.get("widgets")
    if widgets is not None:
        if strip_widgets:
            del meta["widgets"]
        else:
            for val in widgets.values():
                if isinstance(val, dict):
                    val.setdefault("state", {})

    # Ensure kernelspec so GitHub can detect the language.
    if "kernelspec" not in meta:
        meta["kernelspec"] = {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3",
        }
    # Ensure language_info.
    if "language_info" not in meta:
        meta["language_info"] = {
            "name": "python",
            "version": "3.10.0",
        }

    # ---- Per-cell fixes ----
    for cell in nb.get("cells", []):
        # Only touch cells that already carry outputs: unconditionally
        # assigning cell["outputs"] (as the original did) injected an
        # 'outputs' key into markdown cells, which violates the nbformat
        # schema.
        if "outputs" in cell:
            new_outputs = []
            for out in cell["outputs"]:
                # Strip very large outputs if requested.
                if strip_large_outputs and _sizeof(out) > MAX_OUTPUT_SIZE:
                    new_outputs.append({
                        "output_type": "stream",
                        "name": "stdout",
                        "text": ["[Output removed β€” too large for GitHub rendering]\n"],
                    })
                    continue
                # Always strip oversized base64 images regardless of the checkbox.
                data = out.get("data", {})
                for mime in list(data):
                    if mime.startswith("image/") and isinstance(data[mime], str):
                        try:
                            img_bytes = len(base64.b64decode(data[mime], validate=False))
                        except Exception:
                            img_bytes = len(data[mime])
                        if img_bytes > MAX_IMAGE_SIZE:
                            data[mime] = ""  # clear the giant image
                            data.setdefault("text/plain", ["[Large image removed for GitHub compatibility]"])
                new_outputs.append(out)
            cell["outputs"] = new_outputs

        # Ensure every cell has a valid nbformat 4.5+ id (8-char hex).
        # Also replace non-string ids, on which len() would have crashed.
        existing_id = cell.get("id", "")
        if not isinstance(existing_id, str) or len(existing_id) < 8:
            cell["id"] = uuid.uuid4().hex[:8]
    return nb
# ---------------------------------------------------------------------------
# Report builder
# ---------------------------------------------------------------------------
def build_report(issues: list[dict], prediction: str) -> str:
    """Render the prediction and issue list as a Markdown report string."""
    out = [f"## {prediction}\n"]
    if not issues:
        out.append("No issues detected β€” this notebook looks good for GitHub!\n")
        return "\n".join(out)
    badge = {"critical": "πŸ”΄", "warning": "🟑", "info": "πŸ”΅"}
    out.append(f"**Found {len(issues)} issue(s):**\n")
    for num, issue in enumerate(issues, start=1):
        icon = badge.get(issue["severity"], "βšͺ")
        suffix = f" β€” {issue.get('detail', '')}" if "detail" in issue else ""
        out.append(f"{num}. {icon} **{issue['title']}**{suffix}")
        out.append(f" *Location:* `{issue['location']}`")
        out.append(f" {issue['desc']}\n")
    return "\n".join(out)
# ---------------------------------------------------------------------------
# Gradio handler
# ---------------------------------------------------------------------------
def process_notebook(file, strip_widgets: bool, strip_large_outputs: bool):
    """Main Gradio handler: analyze -> predict -> fix -> return.

    Parameters:
        file: value from gr.File(type="filepath") β€” a str path in current
            Gradio, or an object exposing .name in older versions.
        strip_widgets: forwarded to fix_notebook().
        strip_large_outputs: forwarded to fix_notebook().

    Returns:
        (markdown_report, path_to_fixed_notebook_or_None)
    """
    if file is None:
        return "Upload a `.ipynb` file first.", None

    # gr.File(type="filepath") passes a plain str; the original called
    # file.name unconditionally, which raises AttributeError on a str and
    # surfaced as "Could not read file" for every upload.
    path = file if isinstance(file, str) else file.name

    # Read the notebook
    try:
        with open(path, "r", encoding="utf-8") as f:
            nb = json.load(f)
    except json.JSONDecodeError:
        return "❌ The uploaded file is not valid JSON. Are you sure it's a `.ipynb`?", None
    except Exception as e:
        return f"❌ Could not read file: {e}", None

    # Analyse
    issues = analyze_notebook(nb)
    prediction = predict_github_render(issues)
    report = build_report(issues, prediction)

    # Fix
    fixed_nb = fix_notebook(nb, strip_widgets=strip_widgets, strip_large_outputs=strip_large_outputs)

    # Re-analyse fixed version
    fixed_issues = analyze_notebook(fixed_nb)
    fixed_prediction = predict_github_render(fixed_issues)
    report += "\n---\n"
    report += f"### After fix: {fixed_prediction}\n"
    if fixed_issues:
        remaining = [i for i in fixed_issues if i["severity"] in ("critical", "warning")]
        if remaining:
            report += f"⚠️ {len(remaining)} issue(s) remain (may need manual attention).\n"
        else:
            report += "Only informational notes remain β€” notebook should render on GitHub.\n"
    else:
        report += "All issues resolved! βœ…\n"

    # Write the fixed notebook through the NamedTemporaryFile's own handle.
    # The original left the handle open and re-opened the same path β€” a fd
    # leak that also fails outright on Windows.
    tmp = tempfile.NamedTemporaryFile(
        mode="w", suffix=".ipynb", delete=False, prefix="fixed_",
        dir=tempfile.gettempdir(), encoding="utf-8",
    )
    with tmp as f:
        json.dump(fixed_nb, f, ensure_ascii=False, indent=1)
    return report, tmp.name
# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
# Markdown shown at the top of the page (rendered via gr.Markdown below).
DESCRIPTION = """
# πŸ”§ GitHub Notebook Fixer
**Upload a `.ipynb` file** and this tool will:
1. **Predict** whether it will render on GitHub
2. **Diagnose** all issues (missing widget state, oversized outputs, bad metadata…)
3. **Fix** the problems and return a clean `.ipynb` you can push to GitHub
### Common issues fixed
- `metadata.widgets` missing the `state` key β†’ **"Invalid Notebook"** on GitHub
- Oversized cell outputs (>500 KB) that block rendering
- Missing `kernelspec` / `language_info` metadata
- Giant base64-encoded images bloating the file
"""
# Gradio UI: upload + fixer options on the left, diagnosis report and the
# fixed-file download on the right.  A single button wires the handler.
with gr.Blocks(
    title="GitHub Notebook Fixer",
    theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate"),
) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        with gr.Column(scale=1):
            # Input column: notebook upload plus the two fixer toggles.
            file_input = gr.File(
                label="Upload .ipynb file",
                file_types=[".ipynb"],
                type="filepath",
            )
            strip_widgets = gr.Checkbox(
                label="Remove widget metadata entirely (instead of fixing it)",
                value=False,
            )
            strip_large = gr.Checkbox(
                label="Strip oversized outputs (>500 KB per cell)",
                value=True,
            )
            btn = gr.Button("πŸ” Analyze & Fix", variant="primary", size="lg")
        with gr.Column(scale=2):
            # Output column: Markdown report plus the fixed-file download.
            report_output = gr.Markdown(label="Diagnosis Report")
            file_output = gr.File(label="Download Fixed Notebook")
    # Wire the single action: process_notebook returns (report, file path).
    btn.click(
        fn=process_notebook,
        inputs=[file_input, strip_widgets, strip_large],
        outputs=[report_output, file_output],
    )
    gr.Markdown(
        '---\n*Built to solve the classic GitHub "Invalid Notebook" error. '
        "Works for Colab, Jupyter, and any nbformat-4 notebook.*"
    )
    # Static privacy notice β€” informational only, no callbacks attached.
    with gr.Accordion("πŸ”’ Privacy & Data β€” your files are never stored", open=False):
        gr.Markdown("""
### How this tool works
1. You upload a `.ipynb` file β€” it is read **entirely in memory** on the server.
2. The tool analyses the JSON structure, fixes any issues, and writes a temporary output file.
3. You download the fixed file.
4. The temporary file is managed by the OS and is **automatically deleted** β€” it is never persisted beyond your session.
### What we do NOT do
- ❌ We do **not** save, log, or store your notebook file anywhere.
- ❌ We do **not** collect any personal information or usage metadata linked to your file.
- ❌ There is **no database** β€” nothing is written to permanent storage.
- ❌ Your notebook content is **never shared** with third parties.
### What happens to your data
Your file lives only in the server's RAM and a short-lived OS temp file for the duration of your request.
Once you close the session or the Space restarts, every trace of it is gone.
**You are the only person who ever sees your notebook.**
""")

# Launch only when executed as a script (Spaces runs this module directly).
if __name__ == "__main__":
    demo.launch()