Spaces:
Running
Running
| import os | |
| import sys | |
| from pathlib import Path | |
| import importlib | |
| import json | |
| import base64 | |
| import re | |
| import pandas as pd | |
| import plotly.express as px | |
| import streamlit as st | |
| sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) | |
| # sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) | |
| from preprocessing.text_extractor import extract_text_from_file | |
| from preprocessing.clause_extraction import extract_clauses | |
| from embeddings.sbert_encoder import generate_embeddings | |
| from storage.faiss_index import create_faiss_index | |
| from analysis.similarity_search import get_similar | |
| import analysis.common_analyzer | |
| importlib.reload(analysis.common_analyzer) | |
| from analysis.common_analyzer import analyze_pair | |
| from analysis.nli_verifier import NLIVerifier | |
| from analysis.llama_legal_verifier import LlamaLegalVerifier | |
| from output.pdf_generator import generate_pdf_report | |
| from auth.user_store import authenticate_user, create_user | |
# Title used for the browser tab (st.set_page_config) and the page heading (st.title).
APP_TITLE = "Legal Semantic Integrity"
# Model identifier that upload_page passes on as the Llama verifier's ``model_path``.
DEFAULT_MODEL_PATH = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Directory one level above the folder that contains this file.
# NOTE(review): not referenced by any code visible in this file — confirm it is still needed.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
def init_state():
    """Seed ``st.session_state`` with every key the app reads.

    Keys that already hold a value are left untouched, so calling this on
    each rerun is safe.
    """
    session_defaults = {
        "is_authenticated": False,
        "username": "",
        "analysis_done": False,
        "results": [],
        "line_issues": [],
        "uploaded_name": "",
        "uploaded_ext": "",
        "uploaded_bytes": b"",
    }
    for key, default in session_defaults.items():
        st.session_state.setdefault(key, default)
| def _extract_party_name(text: str, role: str) -> str: | |
| """ | |
| Try to extract a nearby party name for vendor/vendee from clause text. | |
| Falls back to role-present markers when exact name is not available. | |
| """ | |
| if not text: | |
| return "Not found" | |
| t = " ".join(str(text).split()) | |
| role_l = role.lower() | |
| # Pattern examples: | |
| # "Vendor Mr. Ravi Kumar", "Vendee: Sita Devi", "the vendor, John Doe" | |
| patterns = [ | |
| rf"\b{role_l}\b\s*[:,-]?\s*(?:mr\.?|mrs\.?|ms\.?)?\s*([A-Z][A-Za-z.\s]{{2,60}}?)(?=,|\.|;|\bson of\b|\bwife of\b|\bresiding\b|\baged\b|$)", | |
| rf"\bthe\s+{role_l}\b\s*[:,-]?\s*(?:is\s+)?(?:mr\.?|mrs\.?|ms\.?)?\s*([A-Z][A-Za-z.\s]{{2,60}}?)(?=,|\.|;|\bson of\b|\bwife of\b|\bresiding\b|\baged\b|$)", | |
| ] | |
| for pat in patterns: | |
| m = re.search(pat, t, flags=re.IGNORECASE) | |
| if m: | |
| name = " ".join(m.group(1).split()) | |
| # Filter generic captures like "hereinafter called" | |
| if name and not re.search( | |
| r"hereinafter|called|referred|party|agreement", name, re.IGNORECASE | |
| ): | |
| return name[:80] | |
| if re.search(rf"\b{role_l}\b", t, flags=re.IGNORECASE): | |
| return f"{role.title()} mentioned (name not parsed)" | |
| return "Not found" | |
| def _clean_candidate_name(name: str) -> str: | |
| name = re.sub(r"\s+", " ", str(name)).strip(" ,.;:-") | |
| if not name: | |
| return "" | |
| banned = r"hereinafter|called|referred|party|agreement|vendor|vendee|purchaser|buyer|seller" | |
| if re.search(banned, name, flags=re.IGNORECASE): | |
| return "" | |
| return name[:80] | |
def _extract_document_parties(text_data):
    """Find document-level Vendor and Vendee names in the extracted text.

    Two shapes are tried per role, in order:
      1. "<Name> son of ... hereinafter called the VENDOR" (name before role)
      2. "Vendor: <Name>," (name after role)

    Args:
        text_data: Iterable of dict-like chunks read via ``chunk.get("text")``;
            ``None`` or an empty iterable is accepted.

    Returns:
        Dict with keys "Vendor" and "Vendee". Each value is the cleaned name,
        "<Role> mentioned (name not parsed)" when only the role word occurs,
        or "Not found".
    """
    # ``or ""`` keeps a chunk whose text is None from breaking the join.
    full_text = "\n".join(
        str(chunk.get("text") or "") for chunk in (text_data or [])
    )
    compact = " ".join(full_text.split())
    parties = {"Vendor": "Not found", "Vendee": "Not found"}

    # Case-insensitivity is scoped to keywords so ``[A-Z]`` really requires a
    # capital first letter; a global re.IGNORECASE would void that check.
    honorific = r"(?:(?i:mrs|mr|ms)\.?)?"
    intro_cue = r"(?i:son of|wife of|daughter of|residing at|aged about|hereinafter)"
    stop = r"(?=,|\.|;|(?i:\bson of\b|\bwife of\b|\bresiding\b|\baged\b)|$)"

    for role in ("Vendor", "Vendee"):
        role_rx = rf"(?i:{role})"
        patterns = (
            rf"{honorific}\s*([A-Z][A-Za-z.\s]{{2,80}}?)\s+{intro_cue}\b[^.]{{0,120}}\b{role_rx}\b",
            # Lazy quantifier (as in _extract_party_name): a greedy one runs
            # through full stops and swallows the following sentences.
            rf"\b{role_rx}\b\s*[:,-]?\s*(?:(?i:is)\s+)?{honorific}\s*([A-Z][A-Za-z.\s]{{2,80}}?){stop}",
        )

        found = ""
        for pattern in patterns:
            # Walk all occurrences so one rejected capture does not end the
            # search for this pattern.
            for match in re.finditer(pattern, compact):
                found = _clean_candidate_name(match.group(1))
                if found:
                    break
            if found:
                break

        if found:
            parties[role] = found
        elif re.search(rf"\b{role_rx}\b", compact):
            # Secondary fallback: the role is present but no name was parsed.
            parties[role] = f"{role} mentioned (name not parsed)"

    return parties
def _extract_parties(text1: str, text2: str, doc_parties=None):
    """Resolve the (vendor, vendee) pair for a clause pair.

    Each role is looked up in ``text1`` first and in ``text2`` only when the
    first lookup returns "Not found". If the result is still a placeholder
    and ``doc_parties`` has a truthy entry for that role, the document-level
    value is used instead.

    Returns:
        Tuple ``(vendor, vendee)``.
    """
    placeholders = {
        "Vendor": ("Not found", "Vendor mentioned (name not parsed)"),
        "Vendee": ("Not found", "Vendee mentioned (name not parsed)"),
    }
    resolved = {}
    for doc_key, role_arg in (("Vendor", "vendor"), ("Vendee", "vendee")):
        party = _extract_party_name(text1, role_arg)
        if party == "Not found":
            party = _extract_party_name(text2, role_arg)
        if doc_parties and party in placeholders[doc_key]:
            doc_level = doc_parties.get(doc_key)
            if doc_level:
                party = doc_level
        resolved[doc_key] = party
    return resolved["Vendor"], resolved["Vendee"]
def load_verifier(backend: str, llama_model_path: str):
    """Build the clause-pair verifier for ``backend``.

    ``"llama"`` yields a ``LlamaLegalVerifier`` constructed with
    ``llama_model_path``; any other value yields an ``NLIVerifier`` using the
    ``cross-encoder/nli-distilroberta-base`` model.
    """
    if backend != "llama":
        return NLIVerifier(model_name="cross-encoder/nli-distilroberta-base")
    return LlamaLegalVerifier(model_path=llama_model_path)
def apply_theme():
    """Inject the app-wide CSS (fonts, colour variables, card/tag styles)."""
    theme_css = """
        <style>
        @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600;700&display=swap');
        @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500&display=swap');
        :root {
            --bg-soft: #f6fbff;
            --ink-900: #0b2f4a;
            --ink-700: #21506f;
            --accent-500: #0a84c6;
            --accent-700: #005b88;
            --mint-500: #2aa198;
            --warn-500: #c57b00;
            --danger-500: #c44736;
            --card-border: #dbeaf4;
        }
        html, body, [class*="css"] {
            font-family: 'Space Grotesk', sans-serif;
        }
        .stApp {
            background:
                radial-gradient(900px 420px at -15% -25%, #d7f0ff 0%, rgba(215,240,255,0) 62%),
                radial-gradient(900px 420px at 115% -20%, #fff2d8 0%, rgba(255,242,216,0) 62%),
                linear-gradient(180deg, #f8fcff 0%, #ffffff 55%);
        }
        .hero {
            border: 1px solid var(--card-border);
            background: linear-gradient(145deg, #f0f8ff 0%, #fffdf8 95%);
            border-radius: 18px;
            padding: 20px 22px;
            margin-bottom: 14px;
            box-shadow: 0 10px 24px rgba(9, 59, 102, 0.07);
            animation: fadeIn .45s ease-out;
        }
        .hero h2 {
            margin: 0;
            color: var(--ink-900);
            letter-spacing: .2px;
            font-weight: 700;
        }
        .hero p {
            margin: 8px 0 0 0;
            color: var(--ink-700);
        }
        .step {
            border-left: 4px solid var(--accent-500);
            background: #ffffff;
            border-radius: 8px;
            padding: 8px 12px;
            margin-bottom: 8px;
            font-weight: 500;
            color: #12344d;
            box-shadow: 0 6px 16px rgba(12, 53, 88, 0.05);
        }
        .mini-card {
            border: 1px solid var(--card-border);
            border-radius: 14px;
            background: #ffffff;
            padding: 14px 14px;
            margin-bottom: 10px;
            box-shadow: 0 6px 16px rgba(12, 53, 88, 0.04);
            animation: fadeIn .55s ease-out;
        }
        .mini-label {
            color: #43627c;
            font-size: 0.78rem;
            letter-spacing: .02em;
            text-transform: uppercase;
            margin-bottom: 6px;
        }
        .mini-value {
            color: #082d48;
            font-size: 1.45rem;
            font-weight: 700;
            line-height: 1.2;
        }
        .mono {
            font-family: 'IBM Plex Mono', monospace;
        }
        .tag {
            display: inline-block;
            border-radius: 999px;
            padding: 5px 10px;
            font-size: 0.75rem;
            font-weight: 600;
            margin-right: 6px;
            margin-top: 5px;
            border: 1px solid;
        }
        .tag-info { color: var(--accent-700); border-color: #b7def4; background: #ecf7ff; }
        .tag-ok { color: #186b64; border-color: #bceae5; background: #ecfffc; }
        .tag-warn { color: #8c5c00; border-color: #f2d9a4; background: #fff7e8; }
        .tag-risk { color: #9f3124; border-color: #efb5ad; background: #fff1ee; }
        [data-testid="stDataFrame"] div[role="table"] {
            border-radius: 12px;
            border: 1px solid #d6e8f4;
            overflow: hidden;
        }
        @keyframes fadeIn {
            from { opacity: 0; transform: translateY(8px); }
            to { opacity: 1; transform: translateY(0); }
        }
        </style>
        """
    # unsafe_allow_html is required for the <style> element to be emitted.
    st.markdown(theme_css, unsafe_allow_html=True)
def login_page():
    """Render the login screen: intro panel plus sign-in / sign-up forms.

    A successful sign-in stores the normalised username in session state,
    marks the session authenticated and reruns the script. Sign-up only
    reports the outcome of ``create_user``.
    """
    # NOTE(review): the source had lost its indentation; the nesting of the
    # submit handlers and of the final caption is reconstructed — confirm.
    intro_column, auth_column = st.columns([1.15, 1], gap="large")

    intro_html = """
        <div class="hero">
        <h2>Legal Semantic Integrity Portal</h2>
        <p>Interactive contract diagnostics with line-level visibility and legal conflict tracing.</p>
        <div>
        <span class="tag tag-info">Step 1: Secure Login</span>
        <span class="tag tag-ok">Step 2: Upload & Analyze</span>
        <span class="tag tag-warn">Step 3: Error-Line Dashboard</span>
        </div>
        </div>
        <div class="mini-card">
        <div class="mini-label">What You Get</div>
        <div class="mono">Duplicate clauses, legal contradictions, and exact page/line issue map.</div>
        </div>
        """
    with intro_column:
        st.markdown(intro_html, unsafe_allow_html=True)

    with auth_column:
        st.markdown(
            '<div class="step">Step 1 of 3: Login</div>', unsafe_allow_html=True
        )
        sign_in_tab, sign_up_tab = st.tabs(["Sign In", "Create Account"])

        with sign_in_tab:
            with st.form("login_form", clear_on_submit=False):
                username = st.text_input("Username")
                password = st.text_input("Password", type="password")
                login_clicked = st.form_submit_button("Login")
            if login_clicked:
                ok, message = authenticate_user(username, password)
                if not ok:
                    st.error(message)
                else:
                    st.session_state.is_authenticated = True
                    # Stored trimmed and lower-cased.
                    st.session_state.username = username.strip().lower()
                    st.success(message)
                    st.rerun()

        with sign_up_tab:
            with st.form("signup_form", clear_on_submit=True):
                new_username = st.text_input("New Username")
                new_password = st.text_input("New Password", type="password")
                confirm_password = st.text_input("Confirm Password", type="password")
                signup_clicked = st.form_submit_button("Create Account")
            if signup_clicked:
                if new_password == confirm_password:
                    ok, message = create_user(new_username, new_password)
                    report = st.success if ok else st.error
                    report(message)
                else:
                    st.error("Passwords do not match.")

        st.caption("Local accounts are saved in data/users.db")
def _line_issue_rows(results):
    """Expand each non-``NO_CONFLICT`` pair into one row per clause, sorted by (Page, Line)."""
    rows = []
    for finding in results:
        if finding["Label"] == "NO_CONFLICT":
            continue
        for side in ("1", "2"):
            rows.append(
                {
                    "Issue Type": finding["Label"],
                    "Confidence": round(finding["Confidence"], 4),
                    "Page": finding[f"Page {side}"],
                    "Line": finding[f"Line {side}"],
                    "Snippet": finding[f"Clause {side}"][:160],
                    "Reason": finding["Reason"],
                    "Vendor": finding.get("Vendor", "Not found"),
                    "Vendee": finding.get("Vendee", "Not found"),
                }
            )
    rows.sort(key=lambda row: (row["Page"], row["Line"]))
    return rows


def run_analysis(
    uploaded_file, sensitivity: float, backend: str, llama_model_path: str
):
    """Run the full pipeline on an uploaded document.

    Steps: extract text, extract clauses, embed and index them, look up the
    5 nearest neighbours of every clause, score each unordered pair once with
    ``analyze_pair``, then adjust the verdict with the selected verifier.

    Args:
        uploaded_file: Uploaded file object; ``.name`` supplies the extension.
        sensitivity: Threshold forwarded to ``analyze_pair``.
        backend: ``"llama"`` selects the Llama verifier (4-value ``predict``);
            anything else selects the NLI verifier (3-value ``predict``).
        llama_model_path: Model path forwarded to ``load_verifier``.

    Returns:
        ``(results, line_issues)``. ``results`` has one dict per analysed
        pair; ``line_issues`` has two rows per non-``NO_CONFLICT`` pair.
        Both are empty when no text or no clauses could be extracted.
    """
    # NOTE(review): the source had lost its indentation; block nesting here
    # (spinner scopes, the Contradiction branch) is reconstructed — confirm.
    file_ext = uploaded_file.name.split(".")[-1].lower()

    with st.spinner("Extracting text..."):
        text_data = extract_text_from_file(uploaded_file, file_ext)
    if not text_data:
        st.error("Could not extract text from this file.")
        return [], []

    with st.spinner("Extracting clauses..."):
        clauses = extract_clauses(text_data)
        doc_parties = _extract_document_parties(text_data)
    if not clauses:
        st.warning("No valid clauses were detected.")
        return [], []

    with st.spinner("Building semantic index..."):
        embeddings = generate_embeddings(clauses)
        index = create_faiss_index(embeddings)
        verifier = load_verifier(backend=backend, llama_model_path=llama_model_path)

    results = []
    seen_pairs = set()
    total = len(embeddings)
    progress = st.progress(0)
    try:
        for i, emb in enumerate(embeddings):
            idxs, dists = get_similar(index, emb, k=5)
            for j, dist in zip(idxs, dists):
                # Keep only i < j so each unordered pair is scored once and a
                # clause is never compared with itself.
                if i >= j or (i, j) in seen_pairs:
                    continue
                seen_pairs.add((i, j))

                # Map the index distance into (0, 1]; smaller distance means
                # higher similarity.
                similarity = 1 / (1 + dist)
                label, confidence, reason = analyze_pair(
                    clauses[i]["text"],
                    clauses[j]["text"],
                    similarity,
                    threshold=sensitivity,
                )
                if not label:
                    continue

                result = {
                    "Label": label,
                    "Confidence": float(confidence),
                    "Reason": reason,
                    "Clause 1": clauses[i]["text"],
                    "Clause 2": clauses[j]["text"],
                    "Page 1": clauses[i]["page"],
                    "Line 1": clauses[i]["line"],
                    "Page 2": clauses[j]["page"],
                    "Line 2": clauses[j]["line"],
                    "Location 1": f"Pg {clauses[i]['page']}, Ln {clauses[i]['line']}",
                    "Location 2": f"Pg {clauses[j]['page']}, Ln {clauses[j]['line']}",
                }
                vendor_name, vendee_name = _extract_parties(
                    result["Clause 1"], result["Clause 2"], doc_parties=doc_parties
                )
                result["Vendor"] = vendor_name
                result["Vendee"] = vendee_name

                if backend == "llama":
                    _, llm_conf, llm_label, llm_reason = verifier.predict(
                        result["Clause 1"], result["Clause 2"]
                    )
                else:
                    _, llm_conf, llm_label = verifier.predict(
                        result["Clause 1"], result["Clause 2"]
                    )
                    llm_reason = f"NLI label: {llm_label}"

                if llm_label == "Neutral":
                    # Do not erase strong rule-based findings just because the
                    # verifier is neutral.
                    if result["Label"] in ("NUMERIC_INCONSISTENCY", "LEGAL_CONFLICT"):
                        result["Reason"] = f"{result['Reason']} | LLM neutral review"
                    else:
                        result["Label"] = "NO_CONFLICT"
                        result["Reason"] = "LLM marked as neutral"
                elif llm_label == "Entailment":
                    result["Label"] = "DUPLICATION"
                    result["Reason"] = "LLM marked as entailment"
                elif llm_label == "Contradiction":
                    # NOTE(review): reason/confidence overrides are assumed to
                    # sit inside this guard with the label change — confirm.
                    if result["Label"] in ("CANDIDATE", "QUALIFICATION"):
                        result["Label"] = "LEGAL_CONFLICT"
                        result["Reason"] = llm_reason
                        result["Confidence"] = float(llm_conf)

                results.append(result)
            progress.progress((i + 1) / total)
    finally:
        # Clear the bar even when a step above raises, so a failed scan does
        # not leave a stale progress widget beside the caller's error message.
        progress.empty()

    return results, _line_issue_rows(results)
def upload_page():
    """Render step 2: scan settings, file upload and the run button.

    The chosen scan mode maps to the similarity threshold passed to
    ``run_analysis`` (Standard 0.60, Deep 0.50, Strict 0.85). The verifier
    backend is fixed to ``"llama"`` with ``DEFAULT_MODEL_PATH``. The uploaded
    file's name, extension and bytes are kept in session state for the
    dashboard's preview.
    """
    # NOTE(review): the source had lost its indentation; which widgets sit
    # inside the sidebar versus the main area is reconstructed — confirm.
    st.markdown(
        """
        <div class="hero">
        <h2>Upload And Scan</h2>
        <p>Drop your legal document, choose model/backend, and run full semantic integrity analysis.</p>
        </div>
        """,
        unsafe_allow_html=True,
    )
    st.markdown(
        '<div class="step">Step 2 of 3: Upload Document</div>', unsafe_allow_html=True
    )

    with st.sidebar:
        st.header("Scan Settings")
        scan_mode = st.radio(
            "Select scan mode",
            (
                "Standard Scan (Recommended)",
                "Deep Search (Fuzzy)",
                "Strict (Duplicates Only)",
            ),
            index=0,
        )
        if "Standard" in scan_mode:
            sensitivity = 0.60
        elif "Deep" in scan_mode:
            sensitivity = 0.50
        else:
            sensitivity = 0.85

        # Locked configuration: always use the Llama verifier and hide the
        # backend/model controls.
        model_backend = "llama"
        llama_model_path = DEFAULT_MODEL_PATH
        st.caption("Verifier backend: llama (fixed)")
        # Show the model that is actually passed on, not a hard-coded name
        # that can drift from DEFAULT_MODEL_PATH.
        st.caption(f"Model: {llama_model_path} (fixed)")

    st.markdown(
        f"""
        <div class="mini-card">
        <div class="mini-label">Active Mode</div>
        <div class="mini-value">{scan_mode.split("(")[0].strip()}</div>
        <div class="mono">Sensitivity: {sensitivity} | Backend: {model_backend}</div>
        </div>
        """,
        unsafe_allow_html=True,
    )

    col_left, col_right = st.columns([1.35, 1], gap="large")
    with col_left:
        uploaded_file = st.file_uploader(
            "Upload a legal document",
            type=["pdf", "docx", "txt"],
            help="Supported files: PDF, DOCX, TXT",
        )
    with col_right:
        st.markdown(
            """
            <div class="mini-card">
            <div class="mini-label">Supported Inputs</div>
            <div class="mono">PDF / DOCX / TXT</div>
            </div>
            <div class="mini-card">
            <div class="mini-label">Output</div>
            <div class="mono">Pair Findings + Error-Line Dashboard + PDF/JSON Export</div>
            </div>
            """,
            unsafe_allow_html=True,
        )

    if uploaded_file is None:
        st.info("Upload a file to continue.")
        return

    st.session_state.uploaded_name = uploaded_file.name
    st.session_state.uploaded_ext = uploaded_file.name.split(".")[-1].lower()
    st.session_state.uploaded_bytes = uploaded_file.getvalue()
    st.success(f"File ready: {uploaded_file.name}")

    if st.button("Run Full Analysis", type="primary"):
        # Keep the try body to the call that can fail; state updates and the
        # rerun belong to the success path.
        try:
            results, line_issues = run_analysis(
                uploaded_file=uploaded_file,
                sensitivity=sensitivity,
                backend=model_backend,
                llama_model_path=llama_model_path,
            )
        except Exception as exc:
            st.error(f"Analysis failed: {exc}")
        else:
            st.session_state.results = results
            st.session_state.line_issues = line_issues
            if results:
                st.session_state.analysis_done = True
                st.rerun()
            else:
                # Stay on this page: a rerun would erase any error/warning
                # run_analysis just displayed and open an empty dashboard.
                st.info("Analysis produced no results to show on the dashboard.")
def dashboard_page():
    """Render step 3: summary cards, analytics, findings tabs and exports.

    Reads ``results`` and ``line_issues`` from session state. Offers JSON and
    PDF downloads, an inline PDF preview for PDF uploads, and a button that
    clears the analysis state to start over.
    """
    # Local import keeps this block self-contained; html.escape is used on
    # user-supplied text before it is rendered with unsafe_allow_html=True.
    import html

    # NOTE(review): the source had lost its indentation; the pairing of the
    # two trailing ``else`` branches and the position of the final button are
    # reconstructed — confirm.

    def _restart_analysis():
        """Clear analysis state and return to the upload step."""
        st.session_state.analysis_done = False
        st.session_state.results = []
        st.session_state.line_issues = []
        st.rerun()

    st.markdown(
        """
        <div class="hero">
        <h2>Interactive Findings Dashboard</h2>
        <p>Trace conflicts by issue type, confidence, and exact line location.</p>
        </div>
        """,
        unsafe_allow_html=True,
    )
    st.markdown(
        '<div class="step">Step 3 of 3: Dashboard</div>', unsafe_allow_html=True
    )

    results = st.session_state.results
    line_issues = st.session_state.line_issues
    if not results:
        st.warning("No results found.")
        # Without this button the early return leaves no way back to the
        # upload page other than logging out.
        if st.button("Analyze Another Document"):
            _restart_analysis()
        return

    df = pd.DataFrame(results)
    df["Confidence"] = df["Confidence"].astype(float)
    issues_df = df[~df["Label"].isin(["NO_CONFLICT"])].copy()

    # The username is typed by the user at login; escape it so it cannot
    # inject markup into the card below.
    safe_username = html.escape(st.session_state.username or "N/A")

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.markdown(
            f"""
            <div class="mini-card">
            <div class="mini-label">User</div>
            <div class="mini-value">{safe_username}</div>
            </div>
            """,
            unsafe_allow_html=True,
        )
    with col2:
        st.markdown(
            f"""
            <div class="mini-card">
            <div class="mini-label">Pairs Reviewed</div>
            <div class="mini-value">{len(df)}</div>
            </div>
            """,
            unsafe_allow_html=True,
        )
    with col3:
        st.markdown(
            f"""
            <div class="mini-card">
            <div class="mini-label">Detected Issues</div>
            <div class="mini-value">{len(issues_df)}</div>
            </div>
            """,
            unsafe_allow_html=True,
        )
    with col4:
        max_conf = float(df["Confidence"].max()) if not df.empty else 0.0
        st.markdown(
            f"""
            <div class="mini-card">
            <div class="mini-label">Max Confidence</div>
            <div class="mini-value">{max_conf:.2f}</div>
            </div>
            """,
            unsafe_allow_html=True,
        )

    st.subheader("Issue Analytics Dashboard")
    if line_issues:
        line_df = pd.DataFrame(line_issues).copy()
        line_df["Page"] = line_df["Page"].astype(int)
        line_df["Line"] = line_df["Line"].astype(int)
        line_df["Confidence"] = line_df["Confidence"].astype(float)

        filter_col1, filter_col2, filter_col3 = st.columns([1.2, 1, 1], gap="large")
        with filter_col1:
            issue_types = sorted(line_df["Issue Type"].dropna().unique().tolist())
            issue_sel = st.multiselect("Issue Types", issue_types, default=issue_types)
        with filter_col2:
            conf_min = st.slider("Min Confidence (analytics)", 0.0, 1.0, 0.0, 0.01)
            page_min, page_max = int(line_df["Page"].min()), int(line_df["Page"].max())
            if page_min == page_max:
                # A range slider needs min < max, so show a caption instead.
                st.caption(f"Single issue page: {page_min}")
                page_sel = (page_min, page_max)
            else:
                page_sel = st.slider(
                    "Page Range (analytics)", page_min, page_max, (page_min, page_max)
                )
        with filter_col3:
            vendors = ["All"] + sorted(
                line_df["Vendor"].dropna().astype(str).unique().tolist()
            )
            vendees = ["All"] + sorted(
                line_df["Vendee"].dropna().astype(str).unique().tolist()
            )
            vendor_sel = st.selectbox("Vendor", vendors, index=0)
            vendee_sel = st.selectbox("Vendee", vendees, index=0)

        filtered = line_df.copy()
        # An empty selection means "no type filter", not "hide everything".
        if issue_sel:
            filtered = filtered[filtered["Issue Type"].isin(issue_sel)]
        filtered = filtered[filtered["Confidence"] >= conf_min]
        filtered = filtered[
            (filtered["Page"] >= page_sel[0]) & (filtered["Page"] <= page_sel[1])
        ]
        if vendor_sel != "All":
            filtered = filtered[filtered["Vendor"] == vendor_sel]
        if vendee_sel != "All":
            filtered = filtered[filtered["Vendee"] == vendee_sel]

        total_issues = len(filtered)
        # Conflict rate is computed over all analysed pairs, not the filter.
        conflict_rate = (len(issues_df) / len(df) * 100.0) if len(df) else 0.0
        top_issue = (
            filtered["Issue Type"].mode().iloc[0] if not filtered.empty else "N/A"
        )
        highest_risk_page = (
            int(filtered.groupby("Page")["Confidence"].mean().idxmax())
            if not filtered.empty
            else "N/A"
        )

        k1, k2, k3, k4 = st.columns(4)
        k1.metric("Filtered Issues", total_issues)
        k2.metric("Conflict Rate", f"{conflict_rate:.1f}%")
        k3.metric("Top Issue Type", top_issue)
        k4.metric("Highest Risk Page", highest_risk_page)

        if filtered.empty:
            st.warning("No analytics data for current filter.")
        else:
            pie_df = filtered["Issue Type"].value_counts().reset_index()
            pie_df.columns = ["Issue Type", "Count"]
            pie_fig = px.pie(
                pie_df,
                names="Issue Type",
                values="Count",
                title="Issue Type Split",
                hole=0.35,
            )
            pie_fig.update_layout(margin=dict(l=10, r=10, t=50, b=10))
            st.plotly_chart(pie_fig, use_container_width=True)

            top_lines = filtered.sort_values(by=["Confidence"], ascending=False).head(
                10
            )
            st.markdown("**Top 10 High-Risk Lines**")
            st.dataframe(
                top_lines[
                    [
                        "Issue Type",
                        "Confidence",
                        "Page",
                        "Line",
                        "Vendor",
                        "Vendee",
                        "Snippet",
                        "Reason",
                    ]
                ],
                use_container_width=True,
            )
    else:
        st.info("No issue analytics data available.")

    tab_findings, tab_line_map, tab_export = st.tabs(
        ["Findings Table", "Error Line Map", "Export"]
    )

    with tab_findings:
        st.subheader("Detected Issues")
        left, right = st.columns([1, 1.1])
        with left:
            display_mode = st.radio(
                "Display mode",
                ["Issues Only", "All Analyzed Pairs"],
                horizontal=True,
            )
        with right:
            conf_threshold = st.slider("Minimum confidence", 0.0, 1.0, 0.0, 0.01)

        display_df = issues_df if display_mode == "Issues Only" else df
        display_df = display_df[display_df["Confidence"] >= conf_threshold]
        if display_mode == "Issues Only" and display_df.empty:
            st.warning("No issues match this filter.")
            st.info("Try lower confidence or switch to 'All Analyzed Pairs'.")
        elif display_df.empty:
            st.info("No analyzed pairs match this filter.")
        else:
            display_df = display_df.copy().reset_index(drop=True)
            display_df.insert(0, "S.No", range(1, len(display_df) + 1))
            cols = [
                "S.No",
                "Label",
                "Confidence",
                "Reason",
                "Location 1",
                "Location 2",
                "Clause 1",
                "Clause 2",
            ]
            st.dataframe(display_df[cols], use_container_width=True)

    with tab_line_map:
        st.subheader("Error Line Dashboard")
        if line_issues:
            line_df = pd.DataFrame(line_issues)
            labels = sorted(line_df["Issue Type"].dropna().unique().tolist())
            selected = st.multiselect("Filter issue types", labels, default=labels)
            page_min = int(line_df["Page"].min()) if not line_df.empty else 1
            page_max = int(line_df["Page"].max()) if not line_df.empty else 1
            if page_min == page_max:
                st.caption(f"Only one page with issues: Page {page_min}")
                page_range = (page_min, page_max)
            else:
                page_range = st.slider(
                    "Page range", page_min, page_max, (page_min, page_max)
                )
            if selected:
                line_df = line_df[line_df["Issue Type"].isin(selected)]
            line_df = line_df[
                (line_df["Page"] >= page_range[0]) & (line_df["Page"] <= page_range[1])
            ]
            st.dataframe(line_df, use_container_width=True)

            st.markdown("**Issue Occurrence By Line With Parties**")
            by_line = line_df.copy()
            by_line = by_line.sort_values(
                by=["Page", "Line", "Confidence"], ascending=[True, True, False]
            )
            st.dataframe(
                by_line[
                    [
                        "Issue Type",
                        "Page",
                        "Line",
                        "Vendor",
                        "Vendee",
                        "Confidence",
                        "Reason",
                    ]
                ],
                use_container_width=True,
            )

            st.subheader("Jump To Error Line")
            if not line_df.empty:
                line_df = line_df.reset_index(drop=True)
                line_df.insert(0, "Item", range(1, len(line_df) + 1))
                # The running item number keeps every option label unique.
                line_df["Jump"] = line_df.apply(
                    lambda r: (
                        f"#{r['Item']} | Pg {int(r['Page'])}, Ln {int(r['Line'])} | {r['Issue Type']}"
                    ),
                    axis=1,
                )
                selected_jump = st.selectbox(
                    "Select issue line", line_df["Jump"].tolist()
                )
                chosen = line_df[line_df["Jump"] == selected_jump].iloc[0]

                c1, c2 = st.columns([1.1, 1], gap="large")
                with c1:
                    st.markdown(
                        f"""
                        <div class="mini-card">
                        <div class="mini-label">Selected Line</div>
                        <div class="mini-value">Pg {int(chosen["Page"])} · Ln {int(chosen["Line"])}</div>
                        <div class="mono">{chosen["Issue Type"]} | Confidence: {float(chosen["Confidence"]):.2f}</div>
                        </div>
                        """,
                        unsafe_allow_html=True,
                    )
                    st.caption("Snippet")
                    st.code(str(chosen["Snippet"]), language="text")
                    st.caption("Reason")
                    st.write(str(chosen["Reason"]))
                with c2:
                    is_pdf = st.session_state.uploaded_ext == "pdf"
                    if is_pdf and st.session_state.uploaded_bytes:
                        st.caption("PDF Preview (jumped to selected page)")
                        page_number = int(chosen["Page"])
                        # Embed the upload as a base64 data URI; the #page
                        # fragment asks the viewer to open at that page.
                        pdf_b64 = base64.b64encode(
                            st.session_state.uploaded_bytes
                        ).decode("utf-8")
                        pdf_html = f"""
                        <iframe
                        src="data:application/pdf;base64,{pdf_b64}#page={page_number}&zoom=110"
                        width="100%"
                        height="520"
                        style="border:1px solid #d6e8f4; border-radius: 10px;"
                        ></iframe>
                        """
                        st.markdown(pdf_html, unsafe_allow_html=True)
                    else:
                        st.info(
                            "Inline PDF preview is available for PDF uploads. Current file is not PDF."
                        )
        else:
            st.info("No line-level issues to display.")

    with tab_export:
        st.subheader("Download Reports")
        json_payload = json.dumps(results, indent=2)
        st.download_button(
            label="Download JSON Report",
            data=json_payload,
            file_name="semantic_integrity_report.json",
            mime="application/json",
        )
        # The PDF report lists only actual findings.
        pdf_bytes = generate_pdf_report(
            [r for r in results if r["Label"] != "NO_CONFLICT"]
        )
        st.download_button(
            label="Download PDF Report",
            data=pdf_bytes,
            file_name="semantic_integrity_report.pdf",
            mime="application/pdf",
        )

    if st.button("Analyze Another Document"):
        _restart_analysis()
def main():
    """Configure the page, then route to login, upload or dashboard.

    Routing is driven by the ``is_authenticated`` and ``analysis_done``
    session flags. Logging out clears both flags and the stored results.
    """
    st.set_page_config(page_title=APP_TITLE, layout="wide")
    apply_theme()
    init_state()

    state = st.session_state
    title_col, logout_col = st.columns([5, 1])
    with title_col:
        st.title(APP_TITLE)
    with logout_col:
        # The logout button is only rendered for signed-in users.
        if state.is_authenticated and st.button("Logout"):
            state.is_authenticated = False
            state.username = ""
            state.analysis_done = False
            state.results = []
            state.line_issues = []
            st.rerun()

    if not state.is_authenticated:
        login_page()
    elif state.analysis_done:
        dashboard_page()
    else:
        upload_page()


if __name__ == "__main__":
    main()