| |
| """ |
| PubGuard gate for pipeline integration. |
| |
| Reads extracted PDF text from stdin or a file, screens it, and: |
| - Prints verdict JSON to STDERR (for debugging) |
| - Prints PASS/FAIL to STDERR |
| - Exits 0 (pass) or 1 (fail) |
| |
| Usage: |
| echo "$PDF_TEXT" | python3 pub_check/scripts/pubguard_gate.py |
| python3 pub_check/scripts/pubguard_gate.py extracted_text.txt |
| |
| Environment variables: |
| PUBGUARD_MODELS_DIR — Override models directory |
| PUBGUARD_STRICT β Set to "0" to warn instead of gate (exit 0 always) |
| """ |
|
|
| import json |
| import sys |
| import os |
| import logging |
|
|
# Root logger setup: only WARNING and above are emitted, with a short
# time-of-day timestamp. This runs before the pubguard import below,
# presumably so that anything logged while that package loads is already
# filtered and formatted -- TODO confirm.
logging.basicConfig(
    datefmt="%H:%M:%S",
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.WARNING,
)
|
|
| from pubguard import PubGuard, PubGuardConfig |
|
|
|
|
def _read_input(argv):
    """Return the text to screen, read from a file path or from stdin.

    Args:
        argv: Command-line argument list in ``sys.argv`` form. If a first
            argument is present and is not ``"-"``, it is opened as a file;
            otherwise the text is read from standard input.

    Returns:
        The full input text as a string.
    """
    if len(argv) > 1 and argv[1] != "-":
        # Decode as UTF-8 explicitly so the result does not depend on the
        # platform locale; errors="replace" keeps stray bytes from aborting
        # the run.
        with open(argv[1], encoding="utf-8", errors="replace") as f:
            return f.read()
    return sys.stdin.read()


def _failure_reasons(verdict):
    """Collect human-readable reasons from a failing verdict dict.

    Reads the ``doc_type``, ``ai_generated`` and ``toxicity`` entries of
    *verdict* and returns one string per entry whose label indicates a
    problem. The list is empty when none of the three entries is flagged.
    """
    reasons = []
    doc_label = verdict["doc_type"]["label"]
    if doc_label != "scientific_paper":
        reasons.append(f"doc_type={doc_label}")
    # (verdict key, label value that counts as a failure for that entry)
    for key, flagged in (("ai_generated", "ai_generated"), ("toxicity", "toxic")):
        head = verdict[key]
        if head["label"] == flagged:
            reasons.append(f"{flagged} (score={head['score']:.2f})")
    return reasons


def main():
    """Screen the input text with PubGuard and exit with the gate status.

    The text comes from the file named by the first command-line argument,
    or from stdin when there is no argument or the argument is ``"-"``.
    The verdict JSON and a PASS/FAIL line are written to stderr.

    Exit status:
        0 -- the verdict passed, or it failed while ``PUBGUARD_STRICT`` is
             set to ``"0"`` (non-strict mode).
        1 -- the input was empty/whitespace-only, or the verdict failed in
             strict mode (the default).
    """
    text = _read_input(sys.argv)

    # NOTE(review): empty input exits 1 even when PUBGUARD_STRICT=0, because
    # the strict flag is only consulted for failing verdicts. Confirm this
    # is intended for non-strict pipelines.
    if not text.strip():
        print("PUBGUARD: Empty input", file=sys.stderr)
        sys.exit(1)

    config = PubGuardConfig()
    # Any value other than the literal "0" keeps gating enabled.
    strict = os.environ.get("PUBGUARD_STRICT", "1") != "0"

    guard = PubGuard(config=config)
    guard.initialize()
    verdict = guard.screen(text)

    print(json.dumps(verdict), file=sys.stderr)

    if verdict["pass"]:
        print("PUBGUARD: PASS", file=sys.stderr)
        sys.exit(0)

    # A verdict can fail without any of the three entries being flagged;
    # never print an empty reason list in that case.
    reasons = _failure_reasons(verdict) or ["no specific reason reported"]
    # NOTE(review): the separator character in this message looks
    # mis-encoded; it is kept byte-for-byte until the intended character
    # is confirmed.
    print(f"PUBGUARD: FAIL β {', '.join(reasons)}", file=sys.stderr)

    if strict:
        sys.exit(1)
    print("PUBGUARD: Running in non-strict mode, continuing...", file=sys.stderr)
    sys.exit(0)
|
|
|
|
# Script entry point: run the gate only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|