""" Strict Domain Analyzer for Legal Documents. Implements specific checks for: - Entity Roles (Vendor vs Vendee) - Domain Categories (Financial, Possession, Ownership, etc.) - Timeline Logic (Agreement vs Registration) - Numeric Consistency within context """ import re # ========================= # 1. STRICT CLASSIFICATION # ========================= def is_legal_boilerplate(text): """Detects standard legal headers, footers, and witness blocks.""" t = text.lower() patterns = [ "in witness whereof", "signed and delivered", "witnesses:", "schedule", "jurisdiction", "arbitration", "notice", "all that piece and parcel", "north by", "south by" ] # If it's very short (< 5 words) and contains a keyword words = t.split() if len(words) < 5 and any(p in t for p in patterns): return True # If it's just a signature block if "signed by" in t or "witness" in t: return True return False def get_clause_domain(text): """ Classify clause into strict legal domains. Returns: 'FINANCIAL', 'POSSESSION', 'OWNERSHIP', 'ENCUMBRANCE', 'ADMINISTRATIVE', 'RECITAL', 'DEFINITION', 'OPERATIVE' or 'GENERAL' """ t = text.lower() # 1. RECITAL (Background) if t.startswith("whereas") or "and whereas" in t: return "RECITAL" # 2. DEFINITION if "shall mean" in t or "expression vendor" in t or "expression vendee" in t: return "DEFINITION" # 3. FINANCIAL (Money, Consideration) if any(w in t for w in ["rs.", "rupees", "paid", "consideration", "sum of", "amount", "price", "cheque", "bank"]): return "FINANCIAL" # 4. POSSESSION (Handover, Vacant) if any(w in t for w in ["possession", "handed over", "delivered", "vacant"]): return "POSSESSION" # 5. OWNERSHIP / TITLE if any(w in t for w in ["owner", "title", "interest", "rights", "absolute", "fee simple"]): return "OWNERSHIP" # 6. ENCUMBRANCE (Loans, Mortgages) if any(w in t for w in ["encumbrance", "mortgage", "loan", "charge", "lien", "litigation"]): return "ENCUMBRANCE" # 7. ADMINISTRATIVE (Boilerplate) if any(w in t for w in ["witness", "signed", "schedule", "jurisdiction", "arbitration", "notice"]): return "ADMINISTRATIVE" # 8. OPERATIVE (Action) if t.startswith("that") or "hereby" in t or "now this deed" in t: return "OPERATIVE" return "GENERAL" def get_entities(text): """ Strictly detect if clause belongs to a specific entity. """ t = text.lower() entities = set() if "vendor" in t: entities.add("Vendor") if "vendee" in t: entities.add("Vendee") return entities # ========================= # 2. EXTRACTION HELPERS # ========================= def extract_numbers(text): """Extract numeric values for comparison.""" # Matches Rs. 100, 1,00,000, 500 sq ft (just the numbers) return [int(n.replace(",", "")) for n in re.findall(r'\b\d{1,3}(?:,\d{3})*\b', text)] def has_negation(text): neg_words = ["not", "never", "no", "cannot", "must not", "shall not"] return any(w in text.lower() for w in neg_words) def has_exception_language(text): """Detects legal exception/qualification identifiers.""" qualifiers = [ "subject to", "notwithstanding", "except as provided", "unless otherwise", "provided however", "without prejudice" ] return any(q in text.lower() for q in qualifiers) def is_definition(text): """Strictly checks if a clause is a definition.""" t = text.lower() if "shall mean" in t or "means" in t or "defined as" in t: return True return False def is_party_intro(text): """Detects if a clause is just listing a party description.""" t = text.lower() # Strong Indicators: Address patterns, Relations, IDs # Regex for "Door No", "D.No", "residing at" address_pattern = r"(door\s*no|d\.no|residing\s*at|post\s*,\s*village)" # Regex for relations: "son of", "wife of", "daughter of", "w/o", "s/o", or just "son", "wife" in context relation_pattern = r"\b(son|wife|daughter|husband|father|mother|s/o|w/o|d/o)\b" # Regex for IDs: "aadhaar", "pan no", "id card" id_pattern = r"(aadhaar|pan\s*no|id\s*card|mobile\s*no)" # Check for presence of these patterns has_address = re.search(address_pattern, t) has_relation = re.search(relation_pattern, t) has_id = re.search(id_pattern, t) # If it has at least 2 strong components (e.g. Relation + ID, or Address + Relation), it's a bio score = 0 if has_address: score += 1 if has_relation: score += 1 if has_id: score += 1 return score >= 2 # ========================= # 3. CORE LOGIC GATES # ========================= def analyze_pair(text1, text2, similarity, threshold=0.75): """ Strict Analyzer returning (Label, Score, Reason). Args: threshold: Minimum similarity score to consider as CANDIDATE (default 0.75) """ # Force Reload Trigger # --- GATE 0: BOILERPLATE CHECK --- if is_legal_boilerplate(text1) or is_legal_boilerplate(text2): return None, 0.0, "Boilerplate (Skipped)" # --- GATE 1: DOMAIN MISMATCH --- d1 = get_clause_domain(text1) d2 = get_clause_domain(text2) # If domains are totally different, SKIP. # Exception: OPERATIVE and GENERAL might overlap, but strictly FINANCIAL vs POSSESSION should skip. if d1 != "GENERAL" and d2 != "GENERAL" and d1 != d2: # RELAXATION: Only bypass if similarity is VERY high (suggesting misclassification). # Otherwise, DO NOT compare apples (Financial) to oranges (Possession), # even in Deep Search mode. if similarity < 0.85: return None, 0.0, "Domain Mismatch" # --- HARDENED CHECK: GENERAL vs SPECIFIC --- # Common source of noise: "Any other details" matching "The price is Rs 100" # Block GENERAL vs Specific unless similarity is high if (d1 == "GENERAL" and d2 != "GENERAL") or (d2 == "GENERAL" and d1 != "GENERAL"): if similarity < 0.80: return None, 0.0, "General vs Specific Domain (Skipped)" # --- SPECIFIC FILTER: MONEY vs TIMELINE --- # Prevents "Price is X" vs "Payment due on Date Y" (confusing numbers/dates) # Check if one clause is purely FINANCIAL and other is purely TIMELINE/DATE based is_financial = d1 == "FINANCIAL" or d2 == "FINANCIAL" has_date = re.search(r"\d{1,2}[./-]\d{1,2}[./-]\d{2,4}", text1) or \ re.search(r"\d{1,2}[./-]\d{1,2}[./-]\d{2,4}", text2) if is_financial and has_date: # If one talks about Price/Amount and other has a Date, # unless they are explicitly about "Payment Schedule", they are likely different. if "schedule" not in text1.lower() and "schedule" not in text2.lower(): if similarity < 0.85: return None, 0.0, "Financial vs Timeline Mismatch" # --- SPECIFIC FILTER: ELIGIBILITY vs ASSISTANCE --- # Prevents "Eligibility criteria" vs "Assistance details" (Common in schemes) # Check for keywords like "eligible", "qualify" vs "grant", "support", "help" t1_lower, t2_lower = text1.lower(), text2.lower() is_eligibility = any(w in t1_lower for w in ["eligible", "qualify", "criteria", "requirement"]) or \ any(w in t2_lower for w in ["eligible", "qualify", "criteria", "requirement"]) is_assistance = any(w in t1_lower for w in ["provide", "grant", "subsidy", "support", "assistance"]) or \ any(w in t2_lower for w in ["provide", "grant", "subsidy", "support", "assistance"]) if is_eligibility and is_assistance: # Unless precise overlap, these are distinct sections if similarity < 0.85: return None, 0.0, "Eligibility vs Assistance Mismatch" # --- GATE 1.5: PARTY DESCRIPTION CHECK --- # If both clauses are just descriptions of people (addresses, relations), skip. if is_party_intro(text1) and is_party_intro(text2): return None, 0.0, "Party Description (Skipped)" # --- GATE 2: ENTITY MISMATCH --- e1 = get_entities(text1) e2 = get_entities(text2) # If one is Vendor ONLY and other is Vendee ONLY -> SKIP if e1 and e2 and e1 != e2 and not (e1 & e2): # RELAXATION: Only bypass if similarity is VERY high. if similarity < 0.85: return None, 0.0, "Entity Role Mismatch" # --- GATE 2.5: DEFINITION GUARD --- # Don't compare definitions with operative clauses generally if is_definition(text1) or is_definition(text2): # Only compare if both are definitions (conflicting definitions) if not (is_definition(text1) and is_definition(text2)): return None, 0.0, "Definition vs Operative" # --- GATE 3: POSSESSION TIMELINE --- # "Possession at agreement" vs "Possession at registration" is NOT a contradiction. if d1 == "POSSESSION" and d2 == "POSSESSION": keywords_a = ["agreement", "earnest"] keywords_b = ["registration", "sale deed", "final"] has_a = any(k in text1.lower() for k in keywords_a) has_b = any(k in text2.lower() for k in keywords_b) # If one talks about start and other about end, it's a sequence. if (has_a and any(k in text2.lower() for k in keywords_b)) or \ (has_b and any(k in text1.lower() for k in keywords_a)): return None, 0.0, "Possession Timeline Sequence" # --- GATE 4: NUMERIC REASONING --- # Only compare numbers if context allows nums1 = extract_numbers(text1) nums2 = extract_numbers(text2) if nums1 and nums2 and nums1 != nums2: # MAGNITUDE CHECK: If numbers differ by > 100x, likely different units (e.g. Price vs Area) # e.g. 5,50,000 vs 1.25 -> Ratio is huge. max1, max2 = max(nums1), max(nums2) if max1 > 0 and max2 > 0: ratio = max1 / max2 if max1 > max2 else max2 / max1 if ratio > 100: return None, 0.0, "Numeric Magnitude Mismatch (Likely Unit Diff)" # Check if they are in the same domain (likely valid comparison) if d1 == d2 and d1 != "GENERAL": return "NUMERIC_INCONSISTENCY", 0.9, f"Mismatch in {d1} values" # If General, be careful. # But if similarity is VERY high, it might be a contradiction. if similarity > 0.9: return "NUMERIC_INCONSISTENCY", 0.85, "Numeric Mismatch in similar context" # --- GATE 4.5: EXCEPTION/HIERARCHY CHECK --- # If high similarity but one has exception language # We use a slightly lower threshold for exception detection to be safe exception_threshold = max(0.65, threshold - 0.05) if similarity > exception_threshold: has_ex1 = has_exception_language(text1) has_ex2 = has_exception_language(text2) if (has_ex1 and not has_ex2) or (has_ex2 and not has_ex1): return "QUALIFICATION", similarity, "Legal Exception/Qualification detected (Not a Conflict)" # --- GATE 5: LOGICAL NEGATION --- if (has_negation(text1) and not has_negation(text2)) or \ (has_negation(text2) and not has_negation(text1)): # Only flag if high similarity implies they are talking about the same thing # Negation check requires fairly high confidence they are related if similarity > 0.85: return "LEGAL_CONFLICT", 0.8, "Logical Negation detected" # --- FINAL GATE: CANDIDATE FOR NLI --- # If we are here, we passed the blocks. # If similarity is high, let NLI decide. if similarity > threshold: return "CANDIDATE", similarity, "High Similarity - Pending NLI" return None, 0.0, "Low Similarity"