Ryan Christian D. Deniega Claude Sonnet 4.6 committed on
Commit
affe2db
·
1 Parent(s): 8af997f

Add LDA topic inference — show detected topic label + defining words in UI

Browse files

LDAFeatureClassifier gains get_topic_info() which reuses the already-trained
_lda and _count_vec to infer the dominant topic for any new text. Returns
a human-assigned label (Health & Conspiracy, Breaking News, etc.), the top 6
words that define that topic, and the probability confidence.

_run_comparison now returns a (comparison_entries, LDATopicResult) tuple.
VerificationResponse gains lda_topic field. The LDA column in the comparison
panel expands to show the topic label in cyan, top-word chips, and match %.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

api/schemas.py CHANGED
@@ -103,6 +103,14 @@ class ClassifierComparisonEntry(BaseModel):
103
  top_features: list[str] = [] # up to 3 top features / lda_topic_N label
104
 
105
 
 
 
 
 
 
 
 
 
106
  # ── Main Response ─────────────────────────────────────────────────────────────
107
 
108
  class VerificationResponse(BaseModel):
@@ -124,6 +132,10 @@ class VerificationResponse(BaseModel):
124
  default_factory=list,
125
  description="Per-classifier results from all classical ML models (BoW, TF-IDF, NB, LDA)",
126
  )
 
 
 
 
127
 
128
 
129
  # ── History / Trends ──────────────────────────────────────────────────────────
 
103
  top_features: list[str] = [] # up to 3 top features / lda_topic_N label
104
 
105
 
106
# ── LDA Topic Result ──────────────────────────────────────────────────────────
# Dominant LDA topic inferred for a piece of text; attached to the
# VerificationResponse so the UI can show the topic label and word chips.

class LDATopicResult(BaseModel):
    # Human-assigned name of the dominant topic (e.g. "Breaking News"),
    # falling back to a generic "Topic N" label for unmapped topics.
    label: str
    # The highest-weight words that define the dominant topic (top 6).
    top_words: list[str]
    # Probability mass on the dominant topic, expressed as a percentage.
    confidence: float = Field(..., ge=0.0, le=100.0)
112
+
113
+
114
  # ── Main Response ─────────────────────────────────────────────────────────────
115
 
116
  class VerificationResponse(BaseModel):
 
132
  default_factory=list,
133
  description="Per-classifier results from all classical ML models (BoW, TF-IDF, NB, LDA)",
134
  )
135
+ lda_topic: Optional[LDATopicResult] = Field(
136
+ None,
137
+ description="Dominant LDA topic inferred for this text",
138
+ )
139
 
140
 
141
  # ── History / Trends ──────────────────────────────────────────────────────────
frontend/src/pages/VerifyPage.jsx CHANGED
@@ -927,6 +927,32 @@ export default function VerifyPage() {
927
  clf.verdict === 'Likely Fake' ? '#f87171' : 'var(--accent-gold)',
928
  }} />
929
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
930
  {clf.top_features?.length > 0 && (
931
  <div style={{ display: 'flex', flexWrap: 'wrap', gap: 3 }}>
932
  {clf.top_features.map((f, i) => (
 
927
  clf.verdict === 'Likely Fake' ? '#f87171' : 'var(--accent-gold)',
928
  }} />
929
  </div>
930
+ {clf.name === 'LDA' && result.lda_topic && (
931
+ <div style={{ marginBottom: 8 }}>
932
+ <div style={{
933
+ fontSize: '0.72rem', fontWeight: 600,
934
+ color: 'var(--accent-cyan)', marginBottom: 4,
935
+ }}>
936
+ {result.lda_topic.label}
937
+ </div>
938
+ <div style={{ display: 'flex', flexWrap: 'wrap', gap: 3, marginBottom: 4 }}>
939
+ {result.lda_topic.top_words.map((w, i) => (
940
+ <span key={i} style={{
941
+ fontSize: '0.6rem', padding: '1px 5px', borderRadius: 2,
942
+ background: 'rgba(6,182,212,0.08)',
943
+ color: 'var(--accent-cyan)',
944
+ border: '1px solid rgba(6,182,212,0.2)',
945
+ fontFamily: 'var(--font-mono)',
946
+ }}>
947
+ {w}
948
+ </span>
949
+ ))}
950
+ </div>
951
+ <div style={{ fontSize: '0.62rem', color: 'var(--text-muted)' }}>
952
+ {result.lda_topic.confidence.toFixed(1)}% topic match
953
+ </div>
954
+ </div>
955
+ )}
956
  {clf.top_features?.length > 0 && (
957
  <div style={{ display: 'flex', flexWrap: 'wrap', gap: 3 }}>
958
  {clf.top_features.map((f, i) => (
ml/lda_analysis.py CHANGED
@@ -30,6 +30,16 @@ logger = logging.getLogger(__name__)
30
 
31
  _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
32
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  # ── Standalone topic analysis ──────────────────────────────────────────────────
35
 
@@ -166,6 +176,25 @@ class LDAFeatureClassifier:
166
  triggered_features=triggered[:5],
167
  )
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  # ── Direct run ─────────────────────────────────────────────────────────────────
171
 
 
30
 
31
  _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
32
 
33
# Human-readable labels for each LDA topic (1-indexed).
# Assigned by inspecting run_topic_analysis() output on the 100-sample PH dataset.
_TOPIC_NAME_ORDER = (
    "Health & Conspiracy",
    "Breaking News",
    "Crime & Law",
    "Politics & Government",
    "Filipino Current Events",
)
TOPIC_LABELS: dict[int, str] = {
    topic_id: topic_name
    for topic_id, topic_name in enumerate(_TOPIC_NAME_ORDER, start=1)
}
42
+
43
 
44
  # ── Standalone topic analysis ──────────────────────────────────────────────────
45
 
 
176
  triggered_features=triggered[:5],
177
  )
178
 
179
+ def get_topic_info(self, text: str) -> dict:
180
+ """
181
+ Infer the dominant LDA topic for a new text.
182
+ Returns label (human-assigned), top 6 defining words, and confidence
183
+ (the probability mass on the dominant topic, 0–100%).
184
+ """
185
+ processed = self._preprocess(text)
186
+ X_counts = self._count_vec.transform([processed])
187
+ X_lda = self._lda.transform(X_counts) # (1, n_topics)
188
+ topic_idx = int(X_lda[0].argmax())
189
+ confidence = round(float(X_lda[0][topic_idx]) * 100, 1)
190
+
191
+ vocab = self._count_vec.get_feature_names_out()
192
+ topic_vec = self._lda.components_[topic_idx]
193
+ top_words = [vocab[i] for i in topic_vec.argsort()[-6:][::-1]]
194
+
195
+ label = TOPIC_LABELS.get(topic_idx + 1, f"Topic {topic_idx + 1}")
196
+ return {"label": label, "top_words": top_words, "confidence": confidence}
197
+
198
 
199
  # ── Direct run ─────────────────────────────────────────────────────────────────
200
 
scoring/engine.py CHANGED
@@ -14,7 +14,7 @@ from config import get_settings
14
  from api.schemas import (
15
  VerificationResponse, Verdict, Language, DomainTier,
16
  Layer1Result, Layer2Result, EntitiesResult, EvidenceSource, Stance,
17
- ClassifierComparisonEntry,
18
  )
19
 
20
  logger = logging.getLogger(__name__)
@@ -36,8 +36,8 @@ def _get_nlp(key: str, factory):
36
  # Runs all four classical ML classifiers on every request for the demo panel.
37
  # Each classifier trains once on first call and is cached via _get_nlp().
38
 
39
- async def _run_comparison(text: str) -> list[ClassifierComparisonEntry]:
40
- """Run BoW, TF-IDF, Naive Bayes, and LDA classifiers and return comparison entries."""
41
  _COMPARISON_CLASSIFIERS = [
42
  ("BoW", "cmp_bow", lambda: __import__("ml.bow_classifier", fromlist=["BoWClassifier"]).BoWClassifier()),
43
  ("TF-IDF", "cmp_tfidf", lambda: __import__("ml.tfidf_classifier", fromlist=["TFIDFClassifier"]).TFIDFClassifier()),
@@ -47,6 +47,7 @@ async def _run_comparison(text: str) -> list[ClassifierComparisonEntry]:
47
 
48
  def _predict_all():
49
  results = []
 
50
  for name, key, factory in _COMPARISON_CLASSIFIERS:
51
  try:
52
  clf = _get_nlp(key, factory)
@@ -57,9 +58,12 @@ async def _run_comparison(text: str) -> list[ClassifierComparisonEntry]:
57
  confidence=r.confidence,
58
  top_features=r.triggered_features[:3],
59
  ))
 
 
 
60
  except Exception as exc:
61
  logger.warning("Comparison classifier %s failed: %s", name, exc)
62
- return results
63
 
64
  loop = asyncio.get_event_loop()
65
  return await loop.run_in_executor(None, _predict_all)
@@ -315,7 +319,7 @@ async def run_verification(
315
  verdict = _map_verdict(final_score)
316
 
317
  # ── Step 10: Assemble response ────────────────────────────────────────────
318
- comparison = await comparison_task
319
 
320
  result = VerificationResponse(
321
  verdict=verdict,
@@ -335,6 +339,7 @@ async def run_verification(
335
  domain_credibility=get_domain_tier(source_domain) if source_domain else None,
336
  input_type=input_type,
337
  classifier_comparison=comparison,
 
338
  )
339
 
340
  # ── Record to Firestore (falls back to in-memory if Firebase not configured) ─
 
14
  from api.schemas import (
15
  VerificationResponse, Verdict, Language, DomainTier,
16
  Layer1Result, Layer2Result, EntitiesResult, EvidenceSource, Stance,
17
+ ClassifierComparisonEntry, LDATopicResult,
18
  )
19
 
20
  logger = logging.getLogger(__name__)
 
36
  # Runs all four classical ML classifiers on every request for the demo panel.
37
  # Each classifier trains once on first call and is cached via _get_nlp().
38
 
39
+ async def _run_comparison(text: str) -> tuple[list[ClassifierComparisonEntry], LDATopicResult | None]:
40
+ """Run BoW, TF-IDF, Naive Bayes, and LDA classifiers. Also infer LDA topic."""
41
  _COMPARISON_CLASSIFIERS = [
42
  ("BoW", "cmp_bow", lambda: __import__("ml.bow_classifier", fromlist=["BoWClassifier"]).BoWClassifier()),
43
  ("TF-IDF", "cmp_tfidf", lambda: __import__("ml.tfidf_classifier", fromlist=["TFIDFClassifier"]).TFIDFClassifier()),
 
47
 
48
  def _predict_all():
49
  results = []
50
+ lda_topic_result = None
51
  for name, key, factory in _COMPARISON_CLASSIFIERS:
52
  try:
53
  clf = _get_nlp(key, factory)
 
58
  confidence=r.confidence,
59
  top_features=r.triggered_features[:3],
60
  ))
61
+ if name == "LDA" and hasattr(clf, "get_topic_info"):
62
+ info = clf.get_topic_info(text)
63
+ lda_topic_result = LDATopicResult(**info)
64
  except Exception as exc:
65
  logger.warning("Comparison classifier %s failed: %s", name, exc)
66
+ return results, lda_topic_result
67
 
68
  loop = asyncio.get_event_loop()
69
  return await loop.run_in_executor(None, _predict_all)
 
319
  verdict = _map_verdict(final_score)
320
 
321
  # ── Step 10: Assemble response ────────────────────────────────────────────
322
+ comparison, lda_topic = await comparison_task
323
 
324
  result = VerificationResponse(
325
  verdict=verdict,
 
339
  domain_credibility=get_domain_tier(source_domain) if source_domain else None,
340
  input_type=input_type,
341
  classifier_comparison=comparison,
342
+ lda_topic=lda_topic,
343
  )
344
 
345
  # ── Record to Firestore (falls back to in-memory if Firebase not configured) ─