Spaces:
Running
Running
Ryan Christian D. Deniega Claude Sonnet 4.6 committed on
Commit ·
affe2db
1
Parent(s): 8af997f
Add LDA topic inference — show detected topic label + defining words in UI
Browse files
LDAFeatureClassifier gains get_topic_info(), which reuses the already-trained
_lda and _count_vec to infer the dominant topic for any new text. Returns
a human-assigned label (Health & Conspiracy, Breaking News, etc.), the top 6
words that define that topic, and the probability confidence.
_run_comparison now returns (comparison_entries, LDATopicResult) tuple.
VerificationResponse gains lda_topic field. The LDA column in the comparison
panel expands to show the topic label in cyan, top-word chips, and match %.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- api/schemas.py +12 -0
- frontend/src/pages/VerifyPage.jsx +26 -0
- ml/lda_analysis.py +29 -0
- scoring/engine.py +10 -5
api/schemas.py
CHANGED
|
@@ -103,6 +103,14 @@ class ClassifierComparisonEntry(BaseModel):
|
|
| 103 |
top_features: list[str] = [] # up to 3 top features / lda_topic_N label
|
| 104 |
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
# ── Main Response ─────────────────────────────────────────────────────────────
|
| 107 |
|
| 108 |
class VerificationResponse(BaseModel):
|
|
@@ -124,6 +132,10 @@ class VerificationResponse(BaseModel):
|
|
| 124 |
default_factory=list,
|
| 125 |
description="Per-classifier results from all classical ML models (BoW, TF-IDF, NB, LDA)",
|
| 126 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
|
| 129 |
# ── History / Trends ──────────────────────────────────────────────────────────
|
|
|
|
| 103 |
top_features: list[str] = [] # up to 3 top features / lda_topic_N label
|
| 104 |
|
| 105 |
|
| 106 |
+
# ── LDA Topic Result ──────────────────────────────────────────────────────────

class LDATopicResult(BaseModel):
    """Dominant LDA topic inferred for a single verified text.

    Produced by ``LDAFeatureClassifier.get_topic_info`` and surfaced on
    ``VerificationResponse.lda_topic``.
    """

    label: str                # Human-assigned topic name (e.g. "Breaking News")
    top_words: list[str]      # Top 6 words defining this topic
    confidence: float = Field(..., ge=0.0, le=100.0)  # Dominant topic probability (%)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
# ── Main Response ─────────────────────────────────────────────────────────────
|
| 115 |
|
| 116 |
class VerificationResponse(BaseModel):
|
|
|
|
| 132 |
default_factory=list,
|
| 133 |
description="Per-classifier results from all classical ML models (BoW, TF-IDF, NB, LDA)",
|
| 134 |
)
|
| 135 |
+
lda_topic: Optional[LDATopicResult] = Field(
|
| 136 |
+
None,
|
| 137 |
+
description="Dominant LDA topic inferred for this text",
|
| 138 |
+
)
|
| 139 |
|
| 140 |
|
| 141 |
# ── History / Trends ──────────────────────────────────────────────────────────
|
frontend/src/pages/VerifyPage.jsx
CHANGED
|
@@ -927,6 +927,32 @@ export default function VerifyPage() {
|
|
| 927 |
clf.verdict === 'Likely Fake' ? '#f87171' : 'var(--accent-gold)',
|
| 928 |
}} />
|
| 929 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 930 |
{clf.top_features?.length > 0 && (
|
| 931 |
<div style={{ display: 'flex', flexWrap: 'wrap', gap: 3 }}>
|
| 932 |
{clf.top_features.map((f, i) => (
|
|
|
|
| 927 |
clf.verdict === 'Likely Fake' ? '#f87171' : 'var(--accent-gold)',
|
| 928 |
}} />
|
| 929 |
</div>
|
| 930 |
+
{clf.name === 'LDA' && result.lda_topic && (
|
| 931 |
+
<div style={{ marginBottom: 8 }}>
|
| 932 |
+
<div style={{
|
| 933 |
+
fontSize: '0.72rem', fontWeight: 600,
|
| 934 |
+
color: 'var(--accent-cyan)', marginBottom: 4,
|
| 935 |
+
}}>
|
| 936 |
+
{result.lda_topic.label}
|
| 937 |
+
</div>
|
| 938 |
+
<div style={{ display: 'flex', flexWrap: 'wrap', gap: 3, marginBottom: 4 }}>
|
| 939 |
+
{result.lda_topic.top_words.map((w, i) => (
|
| 940 |
+
<span key={i} style={{
|
| 941 |
+
fontSize: '0.6rem', padding: '1px 5px', borderRadius: 2,
|
| 942 |
+
background: 'rgba(6,182,212,0.08)',
|
| 943 |
+
color: 'var(--accent-cyan)',
|
| 944 |
+
border: '1px solid rgba(6,182,212,0.2)',
|
| 945 |
+
fontFamily: 'var(--font-mono)',
|
| 946 |
+
}}>
|
| 947 |
+
{w}
|
| 948 |
+
</span>
|
| 949 |
+
))}
|
| 950 |
+
</div>
|
| 951 |
+
<div style={{ fontSize: '0.62rem', color: 'var(--text-muted)' }}>
|
| 952 |
+
{result.lda_topic.confidence.toFixed(1)}% topic match
|
| 953 |
+
</div>
|
| 954 |
+
</div>
|
| 955 |
+
)}
|
| 956 |
{clf.top_features?.length > 0 && (
|
| 957 |
<div style={{ display: 'flex', flexWrap: 'wrap', gap: 3 }}>
|
| 958 |
{clf.top_features.map((f, i) => (
|
ml/lda_analysis.py
CHANGED
|
@@ -30,6 +30,16 @@ logger = logging.getLogger(__name__)
|
|
| 30 |
|
| 31 |
_LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
# ── Standalone topic analysis ──────────────────────────────────────────────────
|
| 35 |
|
|
@@ -166,6 +176,25 @@ class LDAFeatureClassifier:
|
|
| 166 |
triggered_features=triggered[:5],
|
| 167 |
)
|
| 168 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
# ── Direct run ─────────────────────────────────────────────────────────────────
|
| 171 |
|
|
|
|
| 30 |
|
| 31 |
_LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
|
| 32 |
|
| 33 |
+
# Human-readable labels for each LDA topic (1-indexed).
# Assigned by inspecting run_topic_analysis() output on the 100-sample PH dataset.
# NOTE(review): labels are dataset-specific — re-check them if the LDA model is
# retrained on different data, since topic indices are not stable across fits.
TOPIC_LABELS: dict[int, str] = {
    1: "Health & Conspiracy",
    2: "Breaking News",
    3: "Crime & Law",
    4: "Politics & Government",
    5: "Filipino Current Events",
}
|
| 42 |
+
|
| 43 |
|
| 44 |
# ── Standalone topic analysis ──────────────────────────────────────────────────
|
| 45 |
|
|
|
|
| 176 |
triggered_features=triggered[:5],
|
| 177 |
)
|
| 178 |
|
| 179 |
+
def get_topic_info(self, text: str) -> dict:
    """Infer the dominant LDA topic for a new text.

    Reuses the already-trained ``self._count_vec`` and ``self._lda``;
    nothing is refit here.

    Returns a dict with:
        label      -- human-assigned topic name from ``TOPIC_LABELS``
                      (falls back to a generic "Topic N" if unlabeled)
        top_words  -- the 6 highest-weight vocabulary words of that topic
        confidence -- probability mass on the dominant topic, 0-100 (%)
    """
    cleaned = self._preprocess(text)
    counts = self._count_vec.transform([cleaned])
    topic_dist = self._lda.transform(counts)[0]  # one row: (n_topics,)

    dominant = int(topic_dist.argmax())
    confidence = round(float(topic_dist[dominant]) * 100, 1)

    # Rank the vocabulary by weight within the dominant topic; keep the top 6.
    vocab = self._count_vec.get_feature_names_out()
    weights = self._lda.components_[dominant]
    top_words = [vocab[i] for i in weights.argsort()[-6:][::-1]]

    # TOPIC_LABELS is 1-indexed while sklearn topics are 0-indexed.
    label = TOPIC_LABELS.get(dominant + 1, f"Topic {dominant + 1}")
    return {"label": label, "top_words": top_words, "confidence": confidence}
|
| 197 |
+
|
| 198 |
|
| 199 |
# ── Direct run ─────────────────────────────────────────────────────────────────
|
| 200 |
|
scoring/engine.py
CHANGED
|
@@ -14,7 +14,7 @@ from config import get_settings
|
|
| 14 |
from api.schemas import (
|
| 15 |
VerificationResponse, Verdict, Language, DomainTier,
|
| 16 |
Layer1Result, Layer2Result, EntitiesResult, EvidenceSource, Stance,
|
| 17 |
-
ClassifierComparisonEntry,
|
| 18 |
)
|
| 19 |
|
| 20 |
logger = logging.getLogger(__name__)
|
|
@@ -36,8 +36,8 @@ def _get_nlp(key: str, factory):
|
|
| 36 |
# Runs all four classical ML classifiers on every request for the demo panel.
|
| 37 |
# Each classifier trains once on first call and is cached via _get_nlp().
|
| 38 |
|
| 39 |
-
async def _run_comparison(text: str) -> list[ClassifierComparisonEntry]:
|
| 40 |
-
"""Run BoW, TF-IDF, Naive Bayes, and LDA classifiers
|
| 41 |
_COMPARISON_CLASSIFIERS = [
|
| 42 |
("BoW", "cmp_bow", lambda: __import__("ml.bow_classifier", fromlist=["BoWClassifier"]).BoWClassifier()),
|
| 43 |
("TF-IDF", "cmp_tfidf", lambda: __import__("ml.tfidf_classifier", fromlist=["TFIDFClassifier"]).TFIDFClassifier()),
|
|
@@ -47,6 +47,7 @@ async def _run_comparison(text: str) -> list[ClassifierComparisonEntry]:
|
|
| 47 |
|
| 48 |
def _predict_all():
|
| 49 |
results = []
|
|
|
|
| 50 |
for name, key, factory in _COMPARISON_CLASSIFIERS:
|
| 51 |
try:
|
| 52 |
clf = _get_nlp(key, factory)
|
|
@@ -57,9 +58,12 @@ async def _run_comparison(text: str) -> list[ClassifierComparisonEntry]:
|
|
| 57 |
confidence=r.confidence,
|
| 58 |
top_features=r.triggered_features[:3],
|
| 59 |
))
|
|
|
|
|
|
|
|
|
|
| 60 |
except Exception as exc:
|
| 61 |
logger.warning("Comparison classifier %s failed: %s", name, exc)
|
| 62 |
-
return results
|
| 63 |
|
| 64 |
loop = asyncio.get_event_loop()
|
| 65 |
return await loop.run_in_executor(None, _predict_all)
|
|
@@ -315,7 +319,7 @@ async def run_verification(
|
|
| 315 |
verdict = _map_verdict(final_score)
|
| 316 |
|
| 317 |
# ── Step 10: Assemble response ────────────────────────────────────────────
|
| 318 |
-
comparison = await comparison_task
|
| 319 |
|
| 320 |
result = VerificationResponse(
|
| 321 |
verdict=verdict,
|
|
@@ -335,6 +339,7 @@ async def run_verification(
|
|
| 335 |
domain_credibility=get_domain_tier(source_domain) if source_domain else None,
|
| 336 |
input_type=input_type,
|
| 337 |
classifier_comparison=comparison,
|
|
|
|
| 338 |
)
|
| 339 |
|
| 340 |
# ── Record to Firestore (falls back to in-memory if Firebase not configured) ─
|
|
|
|
| 14 |
from api.schemas import (
|
| 15 |
VerificationResponse, Verdict, Language, DomainTier,
|
| 16 |
Layer1Result, Layer2Result, EntitiesResult, EvidenceSource, Stance,
|
| 17 |
+
ClassifierComparisonEntry, LDATopicResult,
|
| 18 |
)
|
| 19 |
|
| 20 |
logger = logging.getLogger(__name__)
|
|
|
|
| 36 |
# Runs all four classical ML classifiers on every request for the demo panel.
|
| 37 |
# Each classifier trains once on first call and is cached via _get_nlp().
|
| 38 |
|
| 39 |
+
async def _run_comparison(text: str) -> tuple[list[ClassifierComparisonEntry], LDATopicResult | None]:
|
| 40 |
+
"""Run BoW, TF-IDF, Naive Bayes, and LDA classifiers. Also infer LDA topic."""
|
| 41 |
_COMPARISON_CLASSIFIERS = [
|
| 42 |
("BoW", "cmp_bow", lambda: __import__("ml.bow_classifier", fromlist=["BoWClassifier"]).BoWClassifier()),
|
| 43 |
("TF-IDF", "cmp_tfidf", lambda: __import__("ml.tfidf_classifier", fromlist=["TFIDFClassifier"]).TFIDFClassifier()),
|
|
|
|
| 47 |
|
| 48 |
def _predict_all():
|
| 49 |
results = []
|
| 50 |
+
lda_topic_result = None
|
| 51 |
for name, key, factory in _COMPARISON_CLASSIFIERS:
|
| 52 |
try:
|
| 53 |
clf = _get_nlp(key, factory)
|
|
|
|
| 58 |
confidence=r.confidence,
|
| 59 |
top_features=r.triggered_features[:3],
|
| 60 |
))
|
| 61 |
+
if name == "LDA" and hasattr(clf, "get_topic_info"):
|
| 62 |
+
info = clf.get_topic_info(text)
|
| 63 |
+
lda_topic_result = LDATopicResult(**info)
|
| 64 |
except Exception as exc:
|
| 65 |
logger.warning("Comparison classifier %s failed: %s", name, exc)
|
| 66 |
+
return results, lda_topic_result
|
| 67 |
|
| 68 |
loop = asyncio.get_event_loop()
|
| 69 |
return await loop.run_in_executor(None, _predict_all)
|
|
|
|
| 319 |
verdict = _map_verdict(final_score)
|
| 320 |
|
| 321 |
# ── Step 10: Assemble response ────────────────────────────────────────────
|
| 322 |
+
comparison, lda_topic = await comparison_task
|
| 323 |
|
| 324 |
result = VerificationResponse(
|
| 325 |
verdict=verdict,
|
|
|
|
| 339 |
domain_credibility=get_domain_tier(source_domain) if source_domain else None,
|
| 340 |
input_type=input_type,
|
| 341 |
classifier_comparison=comparison,
|
| 342 |
+
lda_topic=lda_topic,
|
| 343 |
)
|
| 344 |
|
| 345 |
# ── Record to Firestore (falls back to in-memory if Firebase not configured) ─
|