Ryan Christian D. Deniega Claude Sonnet 4.6 committed on
Commit
affe2db
·
1 Parent(s): 8af997f

Add LDA topic inference — show detected topic label + defining words in UI

Browse files

LDAFeatureClassifier gains get_topic_info() which reuses the already-trained
_lda and _count_vec to infer the dominant topic for any new text. Returns
a human-assigned label (Health & Conspiracy, Breaking News, etc.), the top 6
words that define that topic, and the probability confidence.

_run_comparison now returns a (comparison_entries, LDATopicResult) tuple.
VerificationResponse gains lda_topic field. The LDA column in the comparison
panel expands to show the topic label in cyan, top-word chips, and match %.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

api/schemas.py CHANGED
@@ -103,6 +103,14 @@ class ClassifierComparisonEntry(BaseModel):
103
  top_features: list[str] = [] # up to 3 top features / lda_topic_N label
104
 
105
 
 
 
 
 
 
 
 
 
106
  # ── Main Response ─────────────────────────────────────────────────────────────
107
 
108
  class VerificationResponse(BaseModel):
@@ -124,6 +132,10 @@ class VerificationResponse(BaseModel):
124
  default_factory=list,
125
  description="Per-classifier results from all classical ML models (BoW, TF-IDF, NB, LDA)",
126
  )
 
 
 
 
127
 
128
 
129
  # ── History / Trends ──────────────────────────────────────────────────────────
 
103
  top_features: list[str] = [] # up to 3 top features / lda_topic_N label
104
 
105
 
106
# ── LDA Topic Result ──────────────────────────────────────────────────────────
# Dominant LDA topic inferred for a piece of text; attached to the
# VerificationResponse so the UI can show the topic label and word chips.

class LDATopicResult(BaseModel):
    # Human-assigned name of the dominant topic (e.g. "Breaking News"),
    # falling back to a generic "Topic N" label for unmapped topics.
    label: str
    # The highest-weight words that define the dominant topic (top 6).
    top_words: list[str]
    # Probability mass on the dominant topic, expressed as a percentage.
    confidence: float = Field(..., ge=0.0, le=100.0)
112
+
113
+
114
  # ── Main Response ─────────────────────────────────────────────────────────────
115
 
116
  class VerificationResponse(BaseModel):
 
132
  default_factory=list,
133
  description="Per-classifier results from all classical ML models (BoW, TF-IDF, NB, LDA)",
134
  )
135
+ lda_topic: Optional[LDATopicResult] = Field(
136
+ None,
137
+ description="Dominant LDA topic inferred for this text",
138
+ )
139
 
140
 
141
  # ── History / Trends ──────────────────────────────────────────────────────────
frontend/src/pages/VerifyPage.jsx CHANGED
@@ -927,6 +927,32 @@ export default function VerifyPage() {
927
  clf.verdict === 'Likely Fake' ? '#f87171' : 'var(--accent-gold)',
928
  }} />
929
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
930
  {clf.top_features?.length > 0 && (
931
  <div style={{ display: 'flex', flexWrap: 'wrap', gap: 3 }}>
932
  {clf.top_features.map((f, i) => (
 
927
  clf.verdict === 'Likely Fake' ? '#f87171' : 'var(--accent-gold)',
928
  }} />
929
  </div>
930
+ {clf.name === 'LDA' && result.lda_topic && (
931
+ <div style={{ marginBottom: 8 }}>
932
+ <div style={{
933
+ fontSize: '0.72rem', fontWeight: 600,
934
+ color: 'var(--accent-cyan)', marginBottom: 4,
935
+ }}>
936
+ {result.lda_topic.label}
937
+ </div>
938
+ <div style={{ display: 'flex', flexWrap: 'wrap', gap: 3, marginBottom: 4 }}>
939
+ {result.lda_topic.top_words.map((w, i) => (
940
+ <span key={i} style={{
941
+ fontSize: '0.6rem', padding: '1px 5px', borderRadius: 2,
942
+ background: 'rgba(6,182,212,0.08)',
943
+ color: 'var(--accent-cyan)',
944
+ border: '1px solid rgba(6,182,212,0.2)',
945
+ fontFamily: 'var(--font-mono)',
946
+ }}>
947
+ {w}
948
+ </span>
949
+ ))}
950
+ </div>
951
+ <div style={{ fontSize: '0.62rem', color: 'var(--text-muted)' }}>
952
+ {result.lda_topic.confidence.toFixed(1)}% topic match
953
+ </div>
954
+ </div>
955
+ )}
956
  {clf.top_features?.length > 0 && (
957
  <div style={{ display: 'flex', flexWrap: 'wrap', gap: 3 }}>
958
  {clf.top_features.map((f, i) => (
ml/lda_analysis.py CHANGED
@@ -30,6 +30,16 @@ logger = logging.getLogger(__name__)
30
 
31
  _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
32
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  # ── Standalone topic analysis ──────────────────────────────────────────────────
35
 
@@ -166,6 +176,25 @@ class LDAFeatureClassifier:
166
  triggered_features=triggered[:5],
167
  )
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  # ── Direct run ─────────────────────────────────────────────────────────────────
171
 
 
30
 
31
  _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
32
 
33
# Human-readable labels for each LDA topic (1-indexed).
# Assigned by inspecting run_topic_analysis() output on the 100-sample PH dataset.
_TOPIC_NAME_ORDER = (
    "Health & Conspiracy",
    "Breaking News",
    "Crime & Law",
    "Politics & Government",
    "Filipino Current Events",
)
TOPIC_LABELS: dict[int, str] = {
    topic_id: topic_name
    for topic_id, topic_name in enumerate(_TOPIC_NAME_ORDER, start=1)
}
42
+
43
 
44
  # ── Standalone topic analysis ──────────────────────────────────────────────────
45
 
 
176
  triggered_features=triggered[:5],
177
  )
178
 
179
+ def get_topic_info(self, text: str) -> dict:
180
+ """
181
+ Infer the dominant LDA topic for a new text.
182
+ Returns label (human-assigned), top 6 defining words, and confidence
183
+ (the probability mass on the dominant topic, 0–100%).
184
+ """
185
+ processed = self._preprocess(text)
186
+ X_counts = self._count_vec.transform([processed])
187
+ X_lda = self._lda.transform(X_counts) # (1, n_topics)
188
+ topic_idx = int(X_lda[0].argmax())
189
+ confidence = round(float(X_lda[0][topic_idx]) * 100, 1)
190
+
191
+ vocab = self._count_vec.get_feature_names_out()
192
+ topic_vec = self._lda.components_[topic_idx]
193
+ top_words = [vocab[i] for i in topic_vec.argsort()[-6:][::-1]]
194
+
195
+ label = TOPIC_LABELS.get(topic_idx + 1, f"Topic {topic_idx + 1}")
196
+ return {"label": label, "top_words": top_words, "confidence": confidence}
197
+
198
 
199
  # ── Direct run ─────────────────────────────────────────────────────────────────
200
 
scoring/engine.py CHANGED
@@ -14,7 +14,7 @@ from config import get_settings
14
  from api.schemas import (
15
  VerificationResponse, Verdict, Language, DomainTier,
16
  Layer1Result, Layer2Result, EntitiesResult, EvidenceSource, Stance,
17
- ClassifierComparisonEntry,
18
  )
19
 
20
  logger = logging.getLogger(__name__)
@@ -36,8 +36,8 @@ def _get_nlp(key: str, factory):
36
  # Runs all four classical ML classifiers on every request for the demo panel.
37
  # Each classifier trains once on first call and is cached via _get_nlp().
38
 
39
- async def _run_comparison(text: str) -> list[ClassifierComparisonEntry]:
40
- """Run BoW, TF-IDF, Naive Bayes, and LDA classifiers and return comparison entries."""
41
  _COMPARISON_CLASSIFIERS = [
42
  ("BoW", "cmp_bow", lambda: __import__("ml.bow_classifier", fromlist=["BoWClassifier"]).BoWClassifier()),
43
  ("TF-IDF", "cmp_tfidf", lambda: __import__("ml.tfidf_classifier", fromlist=["TFIDFClassifier"]).TFIDFClassifier()),
@@ -47,6 +47,7 @@ async def _run_comparison(text: str) -> list[ClassifierComparisonEntry]:
47
 
48
  def _predict_all():
49
  results = []
 
50
  for name, key, factory in _COMPARISON_CLASSIFIERS:
51
  try:
52
  clf = _get_nlp(key, factory)
@@ -57,9 +58,12 @@ async def _run_comparison(text: str) -> list[ClassifierComparisonEntry]:
57
  confidence=r.confidence,
58
  top_features=r.triggered_features[:3],
59
  ))
 
 
 
60
  except Exception as exc:
61
  logger.warning("Comparison classifier %s failed: %s", name, exc)
62
- return results
63
 
64
  loop = asyncio.get_event_loop()
65
  return await loop.run_in_executor(None, _predict_all)
@@ -315,7 +319,7 @@ async def run_verification(
315
  verdict = _map_verdict(final_score)
316
 
317
  # ── Step 10: Assemble response ────────────────────────────────────────────
318
- comparison = await comparison_task
319
 
320
  result = VerificationResponse(
321
  verdict=verdict,
@@ -335,6 +339,7 @@ async def run_verification(
335
  domain_credibility=get_domain_tier(source_domain) if source_domain else None,
336
  input_type=input_type,
337
  classifier_comparison=comparison,
 
338
  )
339
 
340
  # ── Record to Firestore (falls back to in-memory if Firebase not configured) ─
 
14
  from api.schemas import (
15
  VerificationResponse, Verdict, Language, DomainTier,
16
  Layer1Result, Layer2Result, EntitiesResult, EvidenceSource, Stance,
17
+ ClassifierComparisonEntry, LDATopicResult,
18
  )
19
 
20
  logger = logging.getLogger(__name__)
 
36
  # Runs all four classical ML classifiers on every request for the demo panel.
37
  # Each classifier trains once on first call and is cached via _get_nlp().
38
 
39
+ async def _run_comparison(text: str) -> tuple[list[ClassifierComparisonEntry], LDATopicResult | None]:
40
+ """Run BoW, TF-IDF, Naive Bayes, and LDA classifiers. Also infer LDA topic."""
41
  _COMPARISON_CLASSIFIERS = [
42
  ("BoW", "cmp_bow", lambda: __import__("ml.bow_classifier", fromlist=["BoWClassifier"]).BoWClassifier()),
43
  ("TF-IDF", "cmp_tfidf", lambda: __import__("ml.tfidf_classifier", fromlist=["TFIDFClassifier"]).TFIDFClassifier()),
 
47
 
48
  def _predict_all():
49
  results = []
50
+ lda_topic_result = None
51
  for name, key, factory in _COMPARISON_CLASSIFIERS:
52
  try:
53
  clf = _get_nlp(key, factory)
 
58
  confidence=r.confidence,
59
  top_features=r.triggered_features[:3],
60
  ))
61
+ if name == "LDA" and hasattr(clf, "get_topic_info"):
62
+ info = clf.get_topic_info(text)
63
+ lda_topic_result = LDATopicResult(**info)
64
  except Exception as exc:
65
  logger.warning("Comparison classifier %s failed: %s", name, exc)
66
+ return results, lda_topic_result
67
 
68
  loop = asyncio.get_event_loop()
69
  return await loop.run_in_executor(None, _predict_all)
 
319
  verdict = _map_verdict(final_score)
320
 
321
  # ── Step 10: Assemble response ────────────────────────────────────────────
322
+ comparison, lda_topic = await comparison_task
323
 
324
  result = VerificationResponse(
325
  verdict=verdict,
 
339
  domain_credibility=get_domain_tier(source_domain) if source_domain else None,
340
  input_type=input_type,
341
  classifier_comparison=comparison,
342
+ lda_topic=lda_topic,
343
  )
344
 
345
  # ── Record to Firestore (falls back to in-memory if Firebase not configured) ─