Spaces:
Running
Running
Commit ยท
1ebb586
0
Parent(s):
Initial commit: clean code without secrets
Browse files- .DS_Store +0 -0
- .gitignore +4 -0
- README.md +3 -0
- myapp/Procfile +0 -0
- myapp/app.py +353 -0
- myapp/main_pipeline.py +88 -0
- myapp/requirements.txt +13 -0
- myapp/step0_ingestion.py +178 -0
- myapp/step1_lexical.py +109 -0
- myapp/step2_semantic.py +116 -0
- myapp/step3_rag.py +111 -0
- myapp/step4_xai.py +60 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
__pycache__/
|
| 3 |
+
.venv/.env
|
| 4 |
+
.venv/
|
README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FakeAD-Detector
|
| 2 |
+
# FakeAD-Detector
|
| 3 |
+
# FakeAD-Detector
|
myapp/Procfile
ADDED
|
File without changes
|
myapp/app.py
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI
|
| 2 |
+
from fastapi.responses import HTMLResponse
|
| 3 |
+
from fastapi.middleware.cors import CORSMiddleware # ๐ CORS ์ถ๊ฐ
|
| 4 |
+
from pydantic import BaseModel
|
| 5 |
+
import uvicorn
|
| 6 |
+
import math
|
| 7 |
+
|
| 8 |
+
# (๊ธฐ์กด AI ๋ชจ๋ import ๋ถ๋ถ ๋์ผ)
|
| 9 |
+
|
| 10 |
+
app = FastAPI()
|
| 11 |
+
|
| 12 |
+
# ๐ ์ธ๋ถ(Wix)์์ API๋ฅผ ํธ์ถํ ์ ์๋๋ก CORS ํ์ฉ ์ค์
|
| 13 |
+
app.add_middleware(
|
| 14 |
+
CORSMiddleware,
|
| 15 |
+
allow_origins=["*"], # ์ค์ ์๋น์ค ์์๋ ["https://๋ดwix์ฃผ์.com"] ์ผ๋ก ๋ณ๊ฒฝ ๊ถ์ฅ
|
| 16 |
+
allow_credentials=True,
|
| 17 |
+
allow_methods=["*"],
|
| 18 |
+
allow_headers=["*"],
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# (์ดํ ๊ธฐ์กด ์ฝ๋ ๋์ผ...)
|
| 22 |
+
|
| 23 |
+
from flask import Flask
|
| 24 |
+
|
| 25 |
+
app = Flask(__name__)
|
| 26 |
+
|
| 27 |
+
@app.route("/")
|
| 28 |
+
def home():
|
| 29 |
+
return "Hello from Render!"
|
| 30 |
+
|
| 31 |
+
if __name__ == "__main__":
|
| 32 |
+
app.run()
|
| 33 |
+
|
| 34 |
+
from fastapi import FastAPI
|
| 35 |
+
from fastapi.responses import HTMLResponse
|
| 36 |
+
from pydantic import BaseModel
|
| 37 |
+
import uvicorn
|
| 38 |
+
import math
|
| 39 |
+
|
| 40 |
+
# ==========================================
|
| 41 |
+
# ๐ AI ๋ ๊ณ ๋ธ๋ก ๋ถ๋ฌ์ค๊ธฐ
|
| 42 |
+
# ==========================================
|
| 43 |
+
from step0_ingestion import DataIngestionPipeline
|
| 44 |
+
from step1_lexical import LexicalAnalyzer
|
| 45 |
+
from step2_semantic import SemanticAnalyzer
|
| 46 |
+
from step3_rag import FactCheckerRAG
|
| 47 |
+
from step4_xai import XAIScorer
|
| 48 |
+
|
| 49 |
+
app = FastAPI()
|
| 50 |
+
|
| 51 |
+
print("==================================================")
|
| 52 |
+
print(" โณ AI ์์ง ๋ฐ ๋ฅ๋ฌ๋ ๋ชจ๋ธ๋ค์ ๋ฉ๋ชจ๋ฆฌ์ ์ฌ๋ฆฌ๋ ์ค์
๋๋ค...")
|
| 53 |
+
print("==================================================")
|
| 54 |
+
ingestion = DataIngestionPipeline()
|
| 55 |
+
lexical = LexicalAnalyzer()
|
| 56 |
+
semantic = SemanticAnalyzer()
|
| 57 |
+
rag_checker = FactCheckerRAG()
|
| 58 |
+
xai_scorer = XAIScorer()
|
| 59 |
+
print("\nโ
[์๋ฒ ์ค๋น ์๋ฃ] http://127.0.0.1:8000 ์ ์ ์ํ์ธ์!\n")
|
| 60 |
+
|
| 61 |
+
class AdRequest(BaseModel):
|
| 62 |
+
video_url: str
|
| 63 |
+
product_url: str
|
| 64 |
+
|
| 65 |
+
# 1. ๐ ๋คํฌ ๋ชจ๋ ๋์๋ณด๋ ์น ํ๋ก ํธ์๋ (HTML/CSS/JS)
|
| 66 |
+
@app.get("/", response_class=HTMLResponse)
|
| 67 |
+
async def serve_frontend():
|
| 68 |
+
html_content = """
|
| 69 |
+
<!DOCTYPE html>
|
| 70 |
+
<html lang="ko">
|
| 71 |
+
<head>
|
| 72 |
+
<meta charset="UTF-8">
|
| 73 |
+
<title>AI ๊ณผ๋๊ด๊ณ ํ์ง ๋์๋ณด๋</title>
|
| 74 |
+
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
| 75 |
+
<style>
|
| 76 |
+
body { font-family: 'Pretendard', sans-serif; background-color: #121212; color: #ffffff; padding: 20px; margin: 0; }
|
| 77 |
+
.container { max-width: 1200px; margin: auto; }
|
| 78 |
+
h2 { color: #ffffff; text-align: left; border-bottom: 2px solid #333; padding-bottom: 10px; }
|
| 79 |
+
|
| 80 |
+
/* ์
๋ ฅ ํผ ์น์
*/
|
| 81 |
+
.input-section { background: #1e1e1e; padding: 20px; border-radius: 12px; margin-bottom: 20px; box-shadow: 0 4px 6px rgba(0,0,0,0.3); }
|
| 82 |
+
input { width: 100%; padding: 12px; margin: 8px 0; background: #2c2c2e; border: 1px solid #444; border-radius: 8px; color: #fff; box-sizing: border-box; }
|
| 83 |
+
button { width: 100%; padding: 15px; background: #4caf50; color: white; border: none; border-radius: 8px; font-size: 16px; font-weight: bold; cursor: pointer; transition: 0.3s; }
|
| 84 |
+
button:hover { background: #45a049; }
|
| 85 |
+
|
| 86 |
+
/* ๋์๋ณด๋ ๊ทธ๋ฆฌ๋ ๋ ์ด์์ */
|
| 87 |
+
.dashboard { display: none; grid-template-columns: 1fr 2fr; gap: 20px; margin-top: 20px; }
|
| 88 |
+
.card { background: #1e1e1e; padding: 20px; border-radius: 12px; box-shadow: 0 4px 6px rgba(0,0,0,0.3); }
|
| 89 |
+
.card h3 { margin-top: 0; color: #a0a0a0; font-size: 16px; border-bottom: 1px solid #333; padding-bottom: 10px; }
|
| 90 |
+
|
| 91 |
+
/* ๋๋ ์ฐจํธ ์นด๋ */
|
| 92 |
+
.score-card { text-align: center; }
|
| 93 |
+
.score-card canvas { max-height: 250px; margin: auto; }
|
| 94 |
+
|
| 95 |
+
/* ์ฌ๊ณ ํ๋ก ์นด๋ */
|
| 96 |
+
.xai-card { font-size: 15px; line-height: 1.6; color: #e0e0e0; }
|
| 97 |
+
.highlight { color: #4caf50; font-weight: bold; }
|
| 98 |
+
.danger { color: #ff5252; font-weight: bold; }
|
| 99 |
+
|
| 100 |
+
/* ์ธ๋ถ ์์ง ๊ฒฐ๊ณผ ์นด๋ */
|
| 101 |
+
.details-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-top: 20px; }
|
| 102 |
+
.detail-item { background: #2c2c2e; padding: 15px; border-radius: 8px; }
|
| 103 |
+
.detail-item h4 { margin: 0 0 10px 0; color: #4caf50; }
|
| 104 |
+
|
| 105 |
+
/* ๋ก๋ฉ ํ
์คํธ */
|
| 106 |
+
#loading { display: none; text-align: center; color: #4caf50; font-size: 18px; margin-top: 20px; font-weight: bold; }
|
| 107 |
+
</style>
|
| 108 |
+
</head>
|
| 109 |
+
<body>
|
| 110 |
+
<div class="container">
|
| 111 |
+
<h2>๐จ AI ๊ณผ๋๊ด๊ณ ํ์ง ๋์๋ณด๋ (Overview)</h2>
|
| 112 |
+
|
| 113 |
+
<div class="input-section">
|
| 114 |
+
<input type="text" id="video_url" placeholder="์ ํ๋ธ ์์ ๋งํฌ (์ ํ์ฌํญ)">
|
| 115 |
+
<input type="text" id="product_url" placeholder="์ํ ์์ธํ์ด์ง ๋งํฌ (ํ์)" value="https://brand.naver.com/pacsafe/products/9365045491">
|
| 116 |
+
<button onclick="analyzeAd()">๋ถ์ ์์ (๋ฐ์ดํฐ ํฌ๋กค๋ง ๋ฐ AI ๋ถ์)</button>
|
| 117 |
+
</div>
|
| 118 |
+
|
| 119 |
+
<div id="loading">๋ฐ์ดํฐ๋ฅผ ์์งํ๊ณ AI ๋ชจ๋ธ์ด ๋ถ์ ์ค์
๋๋ค. ์ ์๋ง ๊ธฐ๋ค๋ ค์ฃผ์ธ์ โณ...</div>
|
| 120 |
+
|
| 121 |
+
<div class="dashboard" id="dashboard">
|
| 122 |
+
<div class="card score-card">
|
| 123 |
+
<h3>ํตํฉ ์ํ๋ ์ ์ (Final Score)</h3>
|
| 124 |
+
<canvas id="scoreChart"></canvas>
|
| 125 |
+
<h1 id="scoreText" style="margin-top: 15px;">0.00์ </h1>
|
| 126 |
+
<p id="statusText" style="color: #a0a0a0;"></p>
|
| 127 |
+
</div>
|
| 128 |
+
|
| 129 |
+
<div class="card xai-card">
|
| 130 |
+
<h3>๐ค AI ์ต์ข
ํ์ ์ฌ๊ณ ํ๋ก (XAI Reasoning)</h3>
|
| 131 |
+
<div id="xaiReasoning" style="margin-bottom: 20px;"></div>
|
| 132 |
+
|
| 133 |
+
<h3>๐ RAG ํฉํธ์ฒดํฌ 2D ๋ฒกํฐ ๊ณต๊ฐ ๋น๊ต</h3>
|
| 134 |
+
<canvas id="vectorChart" style="max-height: 200px;"></canvas>
|
| 135 |
+
</div>
|
| 136 |
+
</div>
|
| 137 |
+
|
| 138 |
+
<div class="dashboard" id="detailsDashboard" style="grid-template-columns: 1fr; margin-top: 0;">
|
| 139 |
+
<div class="card">
|
| 140 |
+
<h3>โ๏ธ ์ธ๋ถ ์์ง ๋ถ์ ๊ฒฐ๊ณผ (Detailed Engine Results)</h3>
|
| 141 |
+
<div class="details-grid">
|
| 142 |
+
<div class="detail-item">
|
| 143 |
+
<h4>[X1] ํํ์ ๋ฐ ๋จ์ด ํ์ง๊ธฐ</h4>
|
| 144 |
+
<p id="x1Details"></p>
|
| 145 |
+
</div>
|
| 146 |
+
<div class="detail-item">
|
| 147 |
+
<h4>[X2] ์๋ฏธ๋ก ์ ๋ฌธ๋งฅ ์ ์ฌ๋ (KoELECTRA)</h4>
|
| 148 |
+
<p id="x2Details"></p>
|
| 149 |
+
</div>
|
| 150 |
+
<div class="detail-item" style="grid-column: span 2;">
|
| 151 |
+
<h4>[X3] RAG ๊ธฐ๋ฐ ํฉํธ์ฒดํฌ ๊ต์ฐจ ๊ฒ์ฆ</h4>
|
| 152 |
+
<p id="x3Details"></p>
|
| 153 |
+
</div>
|
| 154 |
+
</div>
|
| 155 |
+
</div>
|
| 156 |
+
</div>
|
| 157 |
+
</div>
|
| 158 |
+
|
| 159 |
+
<script>
|
| 160 |
+
let scoreChartInstance = null;
|
| 161 |
+
let vectorChartInstance = null;
|
| 162 |
+
|
| 163 |
+
async function analyzeAd() {
|
| 164 |
+
const videoUrl = document.getElementById('video_url').value;
|
| 165 |
+
const productUrl = document.getElementById('product_url').value;
|
| 166 |
+
|
| 167 |
+
if (!productUrl) return alert("์ํ ๋งํฌ๋ ํ์์
๋๋ค!");
|
| 168 |
+
|
| 169 |
+
document.getElementById('loading').style.display = 'block';
|
| 170 |
+
document.getElementById('dashboard').style.display = 'none';
|
| 171 |
+
document.getElementById('detailsDashboard').style.display = 'none';
|
| 172 |
+
|
| 173 |
+
try {
|
| 174 |
+
const response = await fetch('/api/analyze', {
|
| 175 |
+
method: 'POST',
|
| 176 |
+
headers: { 'Content-Type': 'application/json' },
|
| 177 |
+
body: JSON.stringify({ video_url: videoUrl, product_url: productUrl })
|
| 178 |
+
});
|
| 179 |
+
|
| 180 |
+
const data = await response.json();
|
| 181 |
+
document.getElementById('loading').style.display = 'none';
|
| 182 |
+
|
| 183 |
+
if (data.status === "success") {
|
| 184 |
+
document.getElementById('dashboard').style.display = 'grid';
|
| 185 |
+
document.getElementById('detailsDashboard').style.display = 'grid';
|
| 186 |
+
|
| 187 |
+
// ๋ฐ์ดํฐ ๋ฐ์ธ๋ฉ
|
| 188 |
+
document.getElementById('scoreText').innerText = data.final_score.toFixed(2) + "์ ";
|
| 189 |
+
document.getElementById('statusText').innerText = data.message;
|
| 190 |
+
document.getElementById('statusText').style.color = data.final_score > 70 ? "#ff5252" : (data.final_score > 40 ? "#ffeb3b" : "#4caf50");
|
| 191 |
+
|
| 192 |
+
document.getElementById('xaiReasoning').innerHTML = data.xai_reasoning;
|
| 193 |
+
document.getElementById('x1Details').innerHTML = data.x1_details;
|
| 194 |
+
document.getElementById('x2Details').innerHTML = data.x2_details;
|
| 195 |
+
document.getElementById('x3Details').innerHTML = data.x3_details;
|
| 196 |
+
|
| 197 |
+
// ๐ ๋๋ ์ฐจํธ ๋ ๋๋ง
|
| 198 |
+
renderScoreChart(data.final_score);
|
| 199 |
+
|
| 200 |
+
// ๐ 2D ๋ฒกํฐ ์ฐ์ ๋ ๋ ๋๋ง
|
| 201 |
+
renderVectorChart(data.vector_data);
|
| 202 |
+
|
| 203 |
+
} else {
|
| 204 |
+
alert("๋ถ์ ์คํจ: " + data.error);
|
| 205 |
+
}
|
| 206 |
+
} catch (err) {
|
| 207 |
+
document.getElementById('loading').style.display = 'none';
|
| 208 |
+
alert("์๋ฒ ํต์ ์๋ฌ๊ฐ ๋ฐ์ํ์ต๋๋ค.");
|
| 209 |
+
}
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
function renderScoreChart(score) {
|
| 213 |
+
const ctx = document.getElementById('scoreChart').getContext('2d');
|
| 214 |
+
if(scoreChartInstance) scoreChartInstance.destroy();
|
| 215 |
+
|
| 216 |
+
const color = score > 70 ? '#ff5252' : (score > 40 ? '#ffeb3b' : '#4caf50');
|
| 217 |
+
|
| 218 |
+
scoreChartInstance = new Chart(ctx, {
|
| 219 |
+
type: 'doughnut',
|
| 220 |
+
data: {
|
| 221 |
+
labels: ['์ํ๋', '์์ '],
|
| 222 |
+
datasets: [{
|
| 223 |
+
data: [score, 100 - score],
|
| 224 |
+
backgroundColor: [color, '#2c2c2e'],
|
| 225 |
+
borderWidth: 0
|
| 226 |
+
}]
|
| 227 |
+
},
|
| 228 |
+
options: {
|
| 229 |
+
cutout: '75%',
|
| 230 |
+
plugins: { legend: { display: false } }
|
| 231 |
+
}
|
| 232 |
+
});
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
function renderVectorChart(vectorData) {
|
| 236 |
+
const ctx = document.getElementById('vectorChart').getContext('2d');
|
| 237 |
+
if(vectorChartInstance) vectorChartInstance.destroy();
|
| 238 |
+
|
| 239 |
+
scoreChartInstance = new Chart(ctx, {
|
| 240 |
+
type: 'scatter',
|
| 241 |
+
data: {
|
| 242 |
+
datasets: [
|
| 243 |
+
{
|
| 244 |
+
label: '์์ฝ์ฒ ๊ท์ (Fact)',
|
| 245 |
+
data: [{ x: 0, y: 0 }],
|
| 246 |
+
backgroundColor: '#4caf50',
|
| 247 |
+
pointRadius: 8
|
| 248 |
+
},
|
| 249 |
+
{
|
| 250 |
+
label: '๊ด๊ณ ๋ฌธ๊ตฌ (Claim)',
|
| 251 |
+
data: [{ x: vectorData.x, y: vectorData.y }],
|
| 252 |
+
backgroundColor: '#ff5252',
|
| 253 |
+
pointRadius: 8
|
| 254 |
+
}
|
| 255 |
+
]
|
| 256 |
+
},
|
| 257 |
+
options: {
|
| 258 |
+
responsive: true,
|
| 259 |
+
scales: {
|
| 260 |
+
x: { grid: { color: '#333' }, min: -10, max: 100, title: {display: true, text: '์๋ฏธ๋ก ์ ๊ฑฐ๋ฆฌ (X)', color: '#888'} },
|
| 261 |
+
y: { grid: { color: '#333' }, min: -10, max: 100, title: {display: true, text: '์๋ฏธ๋ก ์ ๊ฑฐ๋ฆฌ (Y)', color: '#888'} }
|
| 262 |
+
},
|
| 263 |
+
plugins: {
|
| 264 |
+
legend: { labels: { color: '#fff' } }
|
| 265 |
+
}
|
| 266 |
+
}
|
| 267 |
+
});
|
| 268 |
+
}
|
| 269 |
+
</script>
|
| 270 |
+
</body>
|
| 271 |
+
</html>
|
| 272 |
+
"""
|
| 273 |
+
return HTMLResponse(content=html_content)
|
| 274 |
+
|
| 275 |
+
# 2. ๋ถ์ API
|
| 276 |
+
@app.post("/api/analyze")
|
| 277 |
+
def api_analyze(req: AdRequest):
|
| 278 |
+
try:
|
| 279 |
+
# Step 0: ๋ฐ์ดํฐ ์์ง
|
| 280 |
+
stt_text = ingestion.run_stt(ingestion.extract_audio_from_video(req.video_url)) if req.video_url.strip() else ""
|
| 281 |
+
ocr_text = ingestion.run_ocr_from_web(req.product_url) if req.product_url.strip() else ""
|
| 282 |
+
combined_text = f"{stt_text}\n{ocr_text}".strip()
|
| 283 |
+
|
| 284 |
+
if len(combined_text) < 5:
|
| 285 |
+
return {"status": "error", "error": "ํ
์คํธ๋ฅผ ์ฐพ์ง ๋ชปํ์ต๋๋ค."}
|
| 286 |
+
|
| 287 |
+
# Step 1, 2, 3: ์ ์ ๋์ถ
|
| 288 |
+
x1_score = lexical.calculate_x1_score(combined_text)
|
| 289 |
+
x2_score = semantic.calculate_x2_score(combined_text)
|
| 290 |
+
x3_score, matched_fact = rag_checker.calculate_x3_score(combined_text)
|
| 291 |
+
|
| 292 |
+
# Step 4: ๋จธ์ ๋ฌ๋ ์ค์ฝ์ด๋ง
|
| 293 |
+
final_score, shap_vals, _ = xai_scorer.calculate_final_score_and_explain(x1_score, x2_score, x3_score)
|
| 294 |
+
|
| 295 |
+
# =====================================================================
|
| 296 |
+
# ๐ UI์ ๋ฟ๋ ค์ค ์์ธ ์ค๋ช
(Detail Text) ์์ฑ ๋ก์ง
|
| 297 |
+
# =====================================================================
|
| 298 |
+
|
| 299 |
+
# [X1 ์ธ๋ถ๊ฒฐ๊ณผ] ๋ฐ๊ฒฌ๋ ๊ธ์น์ด ์ถ์ถ
|
| 300 |
+
detected_words = [word for word in lexical.lexicon.keys() if word in combined_text]
|
| 301 |
+
if detected_words:
|
| 302 |
+
x1_details = f"<span class='danger'>์ ๋ฐ๋ ๋จ์ด: {', '.join(detected_words)}</span><br>์ด ๋จ์ด๋ค์ ์์ฝ์ฒ ๊ฐ์ด๋๋ผ์ธ์ ์ํด ์ฌ์ฉ์ด ๊ฐํ๊ฒ ๊ท์ ๋๋ ํํ์
๋๋ค."
|
| 303 |
+
else:
|
| 304 |
+
x1_details = "<span class='highlight'>๋ฐ๊ฒฌ๋ ๊ธ์น์ด ์์.</span><br>๋ช
์์ ์ธ ํ์ ๊ณผ์ฅ ๋จ์ด๋ ์ฌ์ฉ๋์ง ์์ ํ
์คํธ ํ๋ฉด์ ์ผ๋ก๋ ์์ ํฉ๋๋ค."
|
| 305 |
+
|
| 306 |
+
# [X2 ์ธ๋ถ๊ฒฐ๊ณผ] ๋ฌธ๋งฅ ์ค๋ช
|
| 307 |
+
if x2_score > 60:
|
| 308 |
+
x2_details = f"<span class='danger'>๋ฌธ๋งฅ์ ์ํ๋ {x2_score:.1f}์ </span><br>๋จ์ ๋จ์ด๋ฅผ ๋์ด, ๋ฌธ์ฅ์ ์ ๋ฐ์ ์ธ ๋์์ค๊ฐ ๊ณผ๊ฑฐ ์ ๋ฐ๋ ํ์๊ด๊ณ ๋ฐ์ดํฐ๋ฒ ์ด์ค์ ๊ณผ์ฅ ํจํด(๋จ์ ์ ํํ, ํจ๋ฅ ๋งน์ ๋ฑ)๊ณผ <b>๋งค์ฐ ์ ์ฌํ๊ฒ ๊ฐ์ง</b>๋์์ต๋๋ค."
|
| 309 |
+
else:
|
| 310 |
+
x2_details = f"<span class='highlight'>๋ฌธ๋งฅ์ ์ํ๋ {x2_score:.1f}์ </span><br>๊ณผ๊ฑฐ ์ ๋ฐ๋ ํ์๊ด๊ณ ํน์ ์ ์๊ทน์ ์ด๊ฑฐ๋ ๊ธฐ๋ง์ ์ธ ๋ฌธ๋งฅ ํจํด์ด ํฌ๊ฒ ๋ฐ๊ฒฌ๋์ง ์์์ต๋๋ค."
|
| 311 |
+
|
| 312 |
+
# [X3 ์ธ๋ถ๊ฒฐ๊ณผ] RAG ์ค๋ช
|
| 313 |
+
x3_details = f"<b>[๊ด๋ จ ์์ฝ์ฒ ๊ท์ ๋งค์นญ]</b><br>{matched_fact}<br><br>"
|
| 314 |
+
if x3_score > 50: # GPT๊ฐ ๋์ ์๋ฐ ์ ์๏ฟฝ๏ฟฝ ์ค ๊ฒฝ์ฐ
|
| 315 |
+
x3_details += f"<b>[LLM ์ถ๋ก ๊ฒฐ๊ณผ: <span class='danger'>๋ชจ์ ๋ฐ๊ฒฌ</span>]</b><br>๊ด๊ณ ๋ฌธ๊ตฌ๊ฐ ์ ์์ฝ์ฒ ๊ท์ ์ ๋ช
๋ฐฑํ ์๋ฐํ๊ณ ์๋ ๊ฒ์ผ๋ก ์ถ๋ก ๋์์ต๋๋ค."
|
| 316 |
+
else:
|
| 317 |
+
x3_details += f"<b>[LLM ์ถ๋ก ๊ฒฐ๊ณผ: <span class='highlight'>๊ท์ ์ค์</span>]</b><br>๊ด๊ณ ๋ฌธ๊ตฌ์ ์ ์์ฝ์ฒ ๊ท์ ๊ฐ์ ์ฌ๊ฐํ ๋
ผ๋ฆฌ์ ๋ชจ์์ด๋ ์๋ฐ ์ฌํญ์ด ๋ฐ๊ฒฌ๋์ง ์์์ต๋๋ค."
|
| 318 |
+
|
| 319 |
+
# [XAI ์ฌ๊ณ ํ๋ก] SHAP ๊ธฐ๋ฐ ์์
|
| 320 |
+
xai_reasoning = f"AI๋ ์ด ๊ด๊ณ ํ
์คํธ๋ฅผ ๋ถ์ํ ๋ ๋ค์๊ณผ ๊ฐ์ ์ฌ๊ณ ๊ณผ์ ์ ๊ฑฐ์ณค์ต๋๋ค.<br><ul>"
|
| 321 |
+
if shap_vals[0] > 0: xai_reasoning += f"<li>ํ๋ฉด์ ์ธ ๋จ์ด(X1)์ ๊ณผ์ฅ๋ ํํ์ด ์์ฌ ์์ด ์ํ๋๋ฅผ ๋์์ต๋๋ค. <span class='danger'>(+{shap_vals[0]:.2f})</span></li>"
|
| 322 |
+
else: xai_reasoning += f"<li>๊ธ์น์ด๊ฐ ๊ฒ์ถ๋์ง ์์ ๊ธฐ๋ณธ์ ์ผ๋ก ์์ ํ ๊ธ๋ก ์ธ์ํ์ต๋๋ค. <span class='highlight'>({shap_vals[0]:.2f})</span></li>"
|
| 323 |
+
|
| 324 |
+
if shap_vals[1] > 0: xai_reasoning += f"<li>ํ์ง๋ง ๋ฌธ๋งฅ์ ๋์์ค(X2)๊ฐ ๊ณผ๊ฑฐ ํ์๊ด๊ณ ์ ๋๋ฌด ๋น์ทํด AI๊ฐ ๊ฐํ ์์ฌ์ ํ์์ต๋๋ค. <span class='danger'>(+{shap_vals[1]:.2f})</span></li>"
|
| 325 |
+
else: xai_reasoning += f"<li>์ ์ฒด์ ์ธ ๋ฌธ๋งฅ๊ณผ ๋์์ค(X2) ์ญ์ ์ ์์ ์ธ ์ ํ ์ค๋ช
์ ํํ๋ฅผ ๋ ๊ณ ์์ต๋๋ค. <span class='highlight'>({shap_vals[1]:.2f})</span></li>"
|
| 326 |
+
|
| 327 |
+
if shap_vals[2] > 0: xai_reasoning += f"<li><b>๊ฒฐ์ ์ ์ผ๋ก ์์ฝ์ฒ ํฉํธ์ฒดํฌ(X3)์์ ๋ช
๋ฐฑํ ๊ท์ ์๋ฐ์ด ํ์ธ</b>๋์ด, ์ต์ข
์ํ ํ์ ์ ๋ด๋ ธ์ต๋๋ค. <span class='danger'>(+{shap_vals[2]:.2f})</span></li>"
|
| 328 |
+
else: xai_reasoning += f"<li><b>์์ฝ์ฒ ํฉํธ์ฒดํฌ(X3) ๊ฒฐ๊ณผ ๊ท์ ์๋ฐ ์์ง๊ฐ ๋ฐ๊ฒฌ๋์ง ์์</b> ์ต์ข
์ ์ผ๋ก ์์ ํ๋ค๊ณ ํ๊ฒฐํ์ต๋๋ค. <span class='highlight'>({shap_vals[2]:.2f})</span></li>"
|
| 329 |
+
xai_reasoning += "</ul>"
|
| 330 |
+
|
| 331 |
+
# [๋ฒกํฐ ๊ณต๊ฐ ์๊ฐํ ๋ฐ์ดํฐ] ์ ์ฌ๋๊ฐ ๋ฎ์์๋ก ๋ ์ ์ ๊ฑฐ๋ฆฌ๊ฐ ๋ฉ์ด์ง
|
| 332 |
+
# ์ฝ์ฌ์ธ ์ ์ฌ๋ ์ ์๋ฅผ ๊ธฐ๋ฐ์ผ๋ก ์์์ x, y ์ขํ ์ฐ์ถ
|
| 333 |
+
distance = max(10, 100 - x2_score) # 0์ ๊ฐ๊น์ธ์๋ก ์ํ(๊ฐ๊น์)
|
| 334 |
+
vector_data = {"x": distance * 0.8, "y": distance * 0.9}
|
| 335 |
+
|
| 336 |
+
message = "๐จ ์
์์ ์ธ ํ์/๊ณผ๋๊ด๊ณ ๋ก ์์ฌ๋ฉ๋๋ค." if final_score > 70 else ("โ ๏ธ ์ผ๋ถ ๊ณผ์ฅ๋ ๋ด์ฉ์ด ํฌํจ๋์ด ์์ต๋๋ค." if final_score > 40 else "โ
๊ณผ๋๊ด๊ณ ์์ง๊ฐ ์ ์ ์์ ํ ์ฝํ
์ธ ์
๋๋ค.")
|
| 337 |
+
|
| 338 |
+
return {
|
| 339 |
+
"status": "success",
|
| 340 |
+
"final_score": float(round(final_score, 2)),
|
| 341 |
+
"message": message,
|
| 342 |
+
"x1_details": x1_details,
|
| 343 |
+
"x2_details": x2_details,
|
| 344 |
+
"x3_details": x3_details,
|
| 345 |
+
"xai_reasoning": xai_reasoning,
|
| 346 |
+
"vector_data": vector_data
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
except Exception as e:
|
| 350 |
+
return {"status": "error", "error": str(e)}
|
| 351 |
+
|
| 352 |
+
if __name__ == "__main__":
|
| 353 |
+
uvicorn.run("app:app", host="127.0.0.1", port=8000, reload=True)
|
myapp/main_pipeline.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from step0_ingestion import DataIngestionPipeline
|
| 2 |
+
from step1_lexical import LexicalAnalyzer
|
| 3 |
+
from step2_semantic import SemanticAnalyzer
|
| 4 |
+
from step3_rag import FactCheckerRAG
|
| 5 |
+
from step4_xai import XAIScorer # ๐ Step 4 ์ถ๊ฐ!
|
| 6 |
+
|
| 7 |
+
def run_full_pipeline():
|
| 8 |
+
print("==================================================")
|
| 9 |
+
print(" ๐ [AtoZ ํ์ดํ๋ผ์ธ] ๊ณผ๋๊ด๊ณ ํ์ง AI ์์คํ
๊ฐ๋")
|
| 10 |
+
print("==================================================\n")
|
| 11 |
+
|
| 12 |
+
# 1. ๋ชจ๋ธ ๋ฐ ์์ง ์ด๊ธฐํ
|
| 13 |
+
ingestion = DataIngestionPipeline()
|
| 14 |
+
lexical = LexicalAnalyzer()
|
| 15 |
+
semantic = SemanticAnalyzer()
|
| 16 |
+
rag_checker = FactCheckerRAG()
|
| 17 |
+
xai_scorer = XAIScorer() # ๐ XAI ์์ง ๊ฐ๋!
|
| 18 |
+
|
| 19 |
+
target_video_url = "https://youtu.be/SJxSDRxd8Dc?si=t8dnIQciFulUlbVW"
|
| 20 |
+
target_product_url = "https://brand.naver.com/pacsafe/products/9365045491"
|
| 21 |
+
|
| 22 |
+
# [Step 0] ๋ฐ์ดํฐ ์์ง
|
| 23 |
+
print("\nโถ๏ธ [Step 0] ๋ฉํฐ๋ชจ๋ฌ ๋ฐ์ดํฐ ์์ง ์ค...")
|
| 24 |
+
stt_text = ""
|
| 25 |
+
try:
|
| 26 |
+
audio_file = ingestion.extract_audio_from_video(target_video_url)
|
| 27 |
+
if audio_file:
|
| 28 |
+
stt_text = ingestion.run_stt(audio_file)
|
| 29 |
+
except Exception as e:
|
| 30 |
+
print(f"โ ๏ธ ์ ํ๋ธ ์ถ์ถ ์คํจ (๋ฌด์ํ๊ณ ์งํ): {e}")
|
| 31 |
+
|
| 32 |
+
ocr_text = ""
|
| 33 |
+
try:
|
| 34 |
+
ocr_text = ingestion.run_ocr_from_web(target_product_url)
|
| 35 |
+
except Exception as e:
|
| 36 |
+
print(f"โ ๏ธ OCR ์ถ์ถ ์คํจ: {e}")
|
| 37 |
+
|
| 38 |
+
combined_text = f"{stt_text}\n{ocr_text}"
|
| 39 |
+
if len(combined_text) == 0:
|
| 40 |
+
print("โ ๋ถ์ํ ํ
์คํธ๊ฐ ์์ด ์ข
๋ฃํฉ๋๋ค.")
|
| 41 |
+
return
|
| 42 |
+
|
| 43 |
+
# [Step 1, 2, 3] ํ
์คํธ ์ฌ์ธต ๋ถ์
|
| 44 |
+
print("\nโถ๏ธ [Step 1, 2, 3] ํ
์คํธ ์ฌ์ธต ๋ถ์ ๊ฐ๋...")
|
| 45 |
+
|
| 46 |
+
x1_score = lexical.calculate_x1_score(combined_text)
|
| 47 |
+
x2_score = semantic.calculate_x2_score(combined_text)
|
| 48 |
+
x3_score, matched_fact = rag_checker.calculate_x3_score(combined_text)
|
| 49 |
+
|
| 50 |
+
# ๐ [Step 4] XGBoost ์ค์ฝ์ด๋ง ๋ฐ SHAP ๋ถ์
|
| 51 |
+
print("\nโถ๏ธ [Step 4] XGBoost ๊ธฐ๋ฐ ์ต์ข
์ค์ฝ์ด๋ง ๋ฐ SHAP ์ค๋ช
์์ฑ...")
|
| 52 |
+
final_score, shap_vals, base_value = xai_scorer.calculate_final_score_and_explain(x1_score, x2_score, x3_score)
|
| 53 |
+
|
| 54 |
+
# =====================================================================
|
| 55 |
+
# ๐ ์ต์ข
๋ถ์ ๋ฆฌํฌํธ (XAI ์ค๋ช
ํฌํจ)
|
| 56 |
+
# =====================================================================
|
| 57 |
+
print("\n==================================================")
|
| 58 |
+
print(" ๐ [์ต์ข
๊ณผ๋๊ด๊ณ ํ์ง ๋ฆฌํฌํธ]")
|
| 59 |
+
print("==================================================")
|
| 60 |
+
print(f" ๐น [X1] ํํ์/๋จ์ด ์๋ฐ (Lexical) : {x1_score:5.1f} ์ ")
|
| 61 |
+
print(f" ๐น [X2] ๋ฌธ๋งฅ์ ์ ์ฌ๋ (Semantic) : {x2_score:5.1f} ์ ")
|
| 62 |
+
print(f" ๐น [X3] ์์ฝ์ฒ ํฉํธ์ฒดํฌ (RAG) : {x3_score:5.1f} ์ ")
|
| 63 |
+
print("--------------------------------------------------")
|
| 64 |
+
print(f" ๐ฏ [S] ๋จธ์ ๋ฌ๋ ์ต์ข
์ํ๋ ์ ์ : {final_score:5.1f} / 100.0 ์ ")
|
| 65 |
+
print("==================================================")
|
| 66 |
+
|
| 67 |
+
if final_score > 70:
|
| 68 |
+
print(" ๐จ [ํ์ ] ๋งค์ฐ ์ํ! ์
์์ ์ธ ํ์/๊ณผ๋๊ด๊ณ ๋ก ์์ฌ๋ฉ๋๋ค.")
|
| 69 |
+
elif final_score > 40:
|
| 70 |
+
print(" โ ๏ธ [ํ์ ] ์ฃผ์! ์ผ๋ถ ๊ณผ์ฅ๋ ํํ์ด๋ ์ฌ์ค๊ณผ ๋ค๋ฅธ ๋ด์ฉ์ด ํฌํจ๋์ด ์์ต๋๋ค.")
|
| 71 |
+
else:
|
| 72 |
+
print(" โ
[ํ์ ] ์์ ! ๊ณผ๋๊ด๊ณ ์์ง๊ฐ ์ ์ ์ ์์ ์ธ ์ฝํ
์ธ ์
๋๋ค.")
|
| 73 |
+
|
| 74 |
+
print("\n==================================================")
|
| 75 |
+
print(" ๐ค [XAI] ์ธ๊ณต์ง๋ฅ์ ํ์ ์ฌ์ (SHAP Values)")
|
| 76 |
+
print("==================================================")
|
| 77 |
+
print(" (์ ์๊ฐ ์์(+)๋ฉด ์ํ๋๋ฅผ ๋์๊ณ , ์์(-)๋ฉด ์์ ํ๋ค๊ณ ํ๋จํ ๊ทผ๊ฑฐ์
๋๋ค.)\n")
|
| 78 |
+
|
| 79 |
+
features = ["X1 (๋จ์ด ์๋ฐ)", "X2 (๋ฌธ๋งฅ ์ ์ฌ๋)", "X3 (ํฉํธ์ฒดํฌ ๋ชจ์)"]
|
| 80 |
+
for i, feature_name in enumerate(features):
|
| 81 |
+
impact = shap_vals[i]
|
| 82 |
+
direction = "๐ด ์ํ๋ ์ฆ๊ฐ" if impact > 0 else "๐ข ์ํ๋ ๊ฐ์"
|
| 83 |
+
# ์ง๊ด์ฑ์ ์ํด SHAP Log-odds ๊ฐ์ ์ ์ ์ค์ผ์ผ์ฒ๋ผ ๋น๋กํด์ ๋ณด์ฌ์ค๋๋ค
|
| 84 |
+
print(f" {direction} ๊ธฐ์ฌ: {feature_name:<18} -> ๊ธฐ์ฌ๋: {impact:+.2f}")
|
| 85 |
+
print("==================================================")
|
| 86 |
+
|
| 87 |
+
if __name__ == "__main__":
|
| 88 |
+
run_full_pipeline()
|
myapp/requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
pydantic
|
| 4 |
+
requests
|
| 5 |
+
playwright
|
| 6 |
+
paddleocr
|
| 7 |
+
paddlepaddle
|
| 8 |
+
torch
|
| 9 |
+
transformers
|
| 10 |
+
sentence-transformers
|
| 11 |
+
xgboost
|
| 12 |
+
shap
|
| 13 |
+
openai
|
myapp/step0_ingestion.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import gc
|
| 3 |
+
import requests
|
| 4 |
+
import torch
|
| 5 |
+
import whisper
|
| 6 |
+
from pytubefix import YouTube
|
| 7 |
+
from paddleocr import PaddleOCR
|
| 8 |
+
from playwright.sync_api import sync_playwright
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
class DataIngestionPipeline:
|
| 12 |
+
def __init__(self):
|
| 13 |
+
self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
|
| 14 |
+
print(f"โ
์ฌ์ฉ ์ค์ธ ๋๋ฐ์ด์ค: {self.device}")
|
| 15 |
+
|
| 16 |
+
def clear_memory(self):
|
| 17 |
+
gc.collect()
|
| 18 |
+
if torch.backends.mps.is_available():
|
| 19 |
+
torch.mps.empty_cache()
|
| 20 |
+
print("๐งน ๋ฉ๋ชจ๋ฆฌ ์ ๋ฆฌ ์๋ฃ")
|
| 21 |
+
|
| 22 |
+
def extract_audio_from_video(self, video_url, output_filename="temp_audio"):
|
| 23 |
+
print(f"\n๐ฅ [1] ์ ํ๋ธ ์์ ๋ค์ด๋ก๋ ์์: {video_url}")
|
| 24 |
+
try:
|
| 25 |
+
yt = YouTube(video_url, 'WEB')
|
| 26 |
+
audio_stream = yt.streams.get_audio_only()
|
| 27 |
+
file_path = audio_stream.download(filename=f"{output_filename}.mp4")
|
| 28 |
+
print(f"โ
์ค๋์ค ์ถ์ถ ์๋ฃ: {file_path}")
|
| 29 |
+
return file_path
|
| 30 |
+
except Exception as e:
|
| 31 |
+
print(f"โ ๋ค์ด๋ก๋ ์คํจ: {e}")
|
| 32 |
+
return None
|
| 33 |
+
|
| 34 |
+
def run_stt(self, audio_path):
|
| 35 |
+
print(f"\n๐ฃ๏ธ [2] STT(์์ฑ->ํ
์คํธ) ๋ณํ ์์: {audio_path}")
|
| 36 |
+
model = whisper.load_model("small", device="cpu")
|
| 37 |
+
result = model.transcribe(audio_path, language="ko", fp16=False)
|
| 38 |
+
|
| 39 |
+
if result is None:
|
| 40 |
+
raise ValueError("Whisper๊ฐ ํ
์คํธ๋ฅผ ๋ฐํํ์ง ๋ชปํ์ต๋๋ค.")
|
| 41 |
+
|
| 42 |
+
text_result = result.get("text", "")
|
| 43 |
+
|
| 44 |
+
del model
|
| 45 |
+
self.clear_memory()
|
| 46 |
+
|
| 47 |
+
print("โ
STT ๋ณํ ์๋ฃ")
|
| 48 |
+
return text_result.strip()
|
| 49 |
+
|
| 50 |
+
def run_ocr_from_web(self, product_url):
|
| 51 |
+
print(f"\n๐ผ๏ธ [3] ์นํ์ด์ง ์ ์ ๋ฐ ์ด๋ฏธ์ง OCR ์์: {product_url}")
|
| 52 |
+
|
| 53 |
+
raw_image_urls = []
|
| 54 |
+
with sync_playwright() as p:
|
| 55 |
+
print(" -> ๋ธ๋ผ์ฐ์ ์ฐฝ์ ๋์ฐ๊ณ ํ์ด์ง ๋ก๋ฉ์ ๊ธฐ๋ค๋ฆฝ๋๋ค...")
|
| 56 |
+
browser = p.chromium.launch(headless=False)
|
| 57 |
+
page = browser.new_page()
|
| 58 |
+
page.goto(product_url, wait_until="domcontentloaded", timeout=60000)
|
| 59 |
+
page.wait_for_timeout(3000)
|
| 60 |
+
|
| 61 |
+
print(" -> ๐ฏ ์จ๊ฒจ์ง ์์ธํ์ด์ง๋ฅผ ์ด๊ธฐ ์ํด '๋๋ณด๊ธฐ' ๋ฒํผ์ ์ฐพ์ต๋๋ค...")
|
| 62 |
+
try:
|
| 63 |
+
more_btn = page.locator('button:has-text("์์ธ์ ๋ณด ํผ์ณ๋ณด๊ธฐ"), button:has-text("์์ธ์ค๋ช
๋๋ณด๊ธฐ"), a:has-text("๋๋ณด๊ธฐ")').first
|
| 64 |
+
if more_btn.is_visible(timeout=3000):
|
| 65 |
+
more_btn.click()
|
| 66 |
+
print(" => ์พ
! '๋๋ณด๊ธฐ' ๋ฒํผ์ ์ฑ๊ณต์ ์ผ๋ก ํด๋ฆญํ์ต๋๋ค!")
|
| 67 |
+
page.wait_for_timeout(2000)
|
| 68 |
+
except Exception:
|
| 69 |
+
print(" => '๋๋ณด๊ธฐ' ๋ฒํผ์ด ์๊ฑฐ๋ ์ด๋ฏธ ํผ์ณ์ ธ ์์ต๋๋ค. ๊ทธ๋๋ก ์งํํฉ๋๋ค.")
|
| 70 |
+
|
| 71 |
+
print(" -> ์ง์ฐ ๋ก๋ฉ(Lazy-loading)๋ ์ด๋ฏธ์ง๋ฅผ ๋ถ๋ฌ์ค๊ธฐ ์ํด ์คํฌ๋กค์ ๋ด๋ฆฝ๋๋ค...")
|
| 72 |
+
for _ in range(10):
|
| 73 |
+
page.evaluate("window.scrollBy(0, 1500)")
|
| 74 |
+
page.wait_for_timeout(1000)
|
| 75 |
+
|
| 76 |
+
img_elements = page.query_selector_all('img')
|
| 77 |
+
for img in img_elements:
|
| 78 |
+
src = img.get_attribute('data-src') or img.get_attribute('src')
|
| 79 |
+
if src and ('http' in src or src.startswith('//')):
|
| 80 |
+
if src.startswith('//'):
|
| 81 |
+
src = 'https:' + src
|
| 82 |
+
raw_image_urls.append(src)
|
| 83 |
+
browser.close()
|
| 84 |
+
|
| 85 |
+
valid_urls = []
|
| 86 |
+
for url in raw_image_urls:
|
| 87 |
+
url_lower = url.lower()
|
| 88 |
+
if not any(x in url_lower for x in ['.gif', 'icon', 'logo', 'blank', 'svg', 'thumb']):
|
| 89 |
+
valid_urls.append(url)
|
| 90 |
+
|
| 91 |
+
if not valid_urls:
|
| 92 |
+
print("โ ์ ํจํ ์์ธ ์ด๋ฏธ์ง๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
| 93 |
+
return ""
|
| 94 |
+
|
| 95 |
+
print(f"โ
์ด {len(valid_urls)}๊ฐ์ ์ด๋ฏธ์ง ๋ฐ๊ฒฌ! ์ง์ง ์์ธ ์ด๋ฏธ์ง๋ฅผ ํ์ํฉ๋๋ค...")
|
| 96 |
+
|
| 97 |
+
logging.getLogger('ppocr').setLevel(logging.ERROR)
|
| 98 |
+
|
| 99 |
+
# ๐ ํด์๋ ํ๊ณ์น๋ฅผ ๋ํญ ๋๋ฆฐ ์ต์ ์ธํ
์ ์ฉ
|
| 100 |
+
ocr = PaddleOCR(
|
| 101 |
+
lang='korean',
|
| 102 |
+
text_det_limit_side_len=2048,
|
| 103 |
+
text_det_limit_type='max'
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
all_extracted_text = []
|
| 107 |
+
|
| 108 |
+
headers = {
|
| 109 |
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 110 |
+
'Referer': product_url
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
processed_count = 0
|
| 114 |
+
|
| 115 |
+
for i, img_url in enumerate(valid_urls[2:]):
|
| 116 |
+
if processed_count >= 3:
|
| 117 |
+
break
|
| 118 |
+
|
| 119 |
+
temp_img_path = f"temp_ocr_{i}.jpg"
|
| 120 |
+
try:
|
| 121 |
+
response = requests.get(img_url, headers=headers, timeout=10)
|
| 122 |
+
with open(temp_img_path, 'wb') as f:
|
| 123 |
+
f.write(response.content)
|
| 124 |
+
|
| 125 |
+
if os.path.getsize(temp_img_path) < 30000:
|
| 126 |
+
if os.path.exists(temp_img_path): os.remove(temp_img_path)
|
| 127 |
+
continue
|
| 128 |
+
|
| 129 |
+
processed_count += 1
|
| 130 |
+
file_kb = os.path.getsize(temp_img_path) // 1024
|
| 131 |
+
print(f" -> [์ง์ง ํ
์คํธ ํ์ ์ค...] ๋ฌต์งํ ์์ธ ์ด๋ฏธ์ง ๋ฐ๊ฒฌ! ({file_kb}KB)")
|
| 132 |
+
|
| 133 |
+
result = ocr.ocr(temp_img_path)
|
| 134 |
+
|
| 135 |
+
# ๐ ๋ฐฉ๊ธ ํ์ธํ ์๋ฒฝํ ๋ฐ์ดํฐ ์ถ์ถ ๋ก์ง ์ ์ฉ!
|
| 136 |
+
if result and isinstance(result[0], dict) and 'rec_texts' in result[0]:
|
| 137 |
+
texts = result[0]['rec_texts']
|
| 138 |
+
print(f" => โจ ํ
์คํธ {len(texts)}์ค ์ถ์ถ ์ฑ๊ณต!")
|
| 139 |
+
all_extracted_text.extend(texts)
|
| 140 |
+
if os.path.exists(temp_img_path): os.remove(temp_img_path)
|
| 141 |
+
else:
|
| 142 |
+
print(f" => โ ๏ธ ๊ธ์๊ฐ ์์ต๋๋ค! (์ด๋ฏธ์ง ํ์ธ: {temp_img_path})")
|
| 143 |
+
|
| 144 |
+
except Exception as e:
|
| 145 |
+
print(f"โ ๏ธ ์ด๋ฏธ์ง ์ฒ๋ฆฌ ์ค ์ค๋ฅ ๋ฐ์: {e}")
|
| 146 |
+
if os.path.exists(temp_img_path): os.remove(temp_img_path)
|
| 147 |
+
|
| 148 |
+
del ocr
|
| 149 |
+
self.clear_memory()
|
| 150 |
+
|
| 151 |
+
final_text = "\n".join(all_extracted_text) # ์ค๋ฐ๊ฟ์ผ๋ก ๊น๋ํ๊ฒ ํฉ์น๊ธฐ
|
| 152 |
+
print("\nโ
์นํ์ด์ง ์ด๋ฏธ์ง OCR ๋ณํ ์๋ฃ!")
|
| 153 |
+
return final_text
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# ==========================================
|
| 157 |
+
# ์ค์ ์คํ ํ
์คํธ ์ฝ๋
|
| 158 |
+
# ==========================================
|
| 159 |
+
if __name__ == "__main__":
|
| 160 |
+
pipeline = DataIngestionPipeline()
|
| 161 |
+
|
| 162 |
+
# 1. ์ ํ๋ธ STT ํ
์คํธ
|
| 163 |
+
test_video_url = "https://youtu.be/SJxSDRxd8Dc?si=t8dnIQciFulUlbVW"
|
| 164 |
+
try:
|
| 165 |
+
audio_file = pipeline.extract_audio_from_video(test_video_url)
|
| 166 |
+
if audio_file:
|
| 167 |
+
stt_text = pipeline.run_stt(audio_file)
|
| 168 |
+
print(f"\n[STT ๊ฒฐ๊ณผ (์์ฑ -> ํ
์คํธ)]\n{stt_text[:500]}...\n")
|
| 169 |
+
except Exception as e:
|
| 170 |
+
print(f"์ ํ๋ธ/STT ์ฒ๋ฆฌ ์ค ์๋ฌ ๋ฐ์: {e}")
|
| 171 |
+
|
| 172 |
+
# 2. ์นํ์ด์ง OCR ํ
์คํธ
|
| 173 |
+
test_product_url = "https://brand.naver.com/pacsafe/products/9365045491"
|
| 174 |
+
try:
|
| 175 |
+
ocr_text = pipeline.run_ocr_from_web(test_product_url)
|
| 176 |
+
print(f"\n[์นํ์ด์ง OCR ๊ฒฐ๊ณผ (์์ธ ์ด๋ฏธ์ง -> ํ
์คํธ)]\n===========================\n{ocr_text}\n===========================")
|
| 177 |
+
except Exception as e:
|
| 178 |
+
print(f"OCR ์ฒ๋ฆฌ ์ค ์๋ฌ ๋ฐ์: {e}")
|
myapp/step1_lexical.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from mecab import MeCab
|
| 3 |
+
|
| 4 |
+
class LexicalAnalyzer:
    """Lexicon-driven exaggeration scorer (X1) for Korean ad copy.

    Tokenizes text with Mecab, matches weighted red-flag keywords, and
    acquits keywords that appear next to a negation cue ("no side effects").

    NOTE(review): multi-morpheme lexicon keys (e.g. "๊ฐ์ฅ ์ข์", "100%") may
    never equal a single Mecab morpheme and thus never match — confirm
    against real tokenizer output.
    """

    def __init__(self):
        print("โ Mecab ํํ์ ๋ถ์๊ธฐ๋ฅผ ๋ก๋ํฉ๋๋ค...")
        self.mecab = MeCab()

        # Weighted red-flag lexicon (IDF-like emphasis): the higher the
        # weight, the more damning a single occurrence is.
        self.lexicon = {
            "์น๋ฃ": 2.0, "์๋ฐฉ": 2.0, "์์น": 2.0, "ํญ์": 2.0, "ํนํจ": 2.0,
            "100%": 1.5, "๋ง๋ณํต์น": 2.0, "๊ธฐ์ ": 1.5, "๋จ์จ์": 1.5,
            "์ฃผ๋ฌธ์๋": 1.0, "๋จ์ฒด์ถ์ฒ": 1.0, "ํน์์ ๋ฒ": 1.0,  # deceptive-ad cues
            "์ต๊ณ ": 1.0, "๊ฐ์ฅ ์ข์": 1.0, "๋์": 1.0, "๋ถ์์ฉ": 1.5,
            "์ฒดํ๊ธฐ": 1.5, "์ฒดํ์ฌ๋ก": 1.5,  # testimonial-marketing cues
        }

        # Negation vocabulary: presence near a keyword acquits the sentence.
        self.negation_words = {"์", "์", "์๋", "๋ฌด", "์", "๋ชป"}

    def split_into_sentences(self, text):
        """Split *text* into sentences to avoid length bias in scoring."""
        # Sentence boundaries: runs of period / ! / ? / newline.
        pieces = re.split(r'[.!?\n]+', text)
        return [piece.strip() for piece in pieces if len(piece.strip()) > 2]

    def check_negation_context(self, tokens, target_index, window_size=3):
        """Return True when a negation cue follows the flagged morpheme.

        Inspects up to *window_size* morphemes after *target_index*; a known
        negation word, or an adjective/auxiliary/adverb POS tag (VA/VX/MAG,
        e.g. '์๋ค'), counts as negation.
        """
        stop = min(target_index + window_size + 1, len(tokens))
        return any(
            word in self.negation_words or pos in ('VA', 'VX', 'MAG')
            for word, pos in tokens[target_index + 1 : stop]
        )

    def calculate_x1_score(self, text):
        """Compute the lexical exaggeration score X1 on a 0-100 scale."""
        if not text:
            return 0.0

        print("\n๐ [Step 1] ํ์คํธ ๊ณผ์ฅ๋(Lexical Score) ๋ถ์ ์์...")

        sentences = self.split_into_sentences(text)
        total_sentences = len(sentences)
        if total_sentences == 0:
            return 0.0

        total_penalty = 0.0
        detected_issues = []

        # Per-sentence scoring so long texts are not penalised for length alone.
        for sentence in sentences:
            morphs = self.mecab.pos(sentence)
            hit_in_sentence = False

            for idx, (word, _pos) in enumerate(morphs):
                if word not in self.lexicon:
                    continue
                # Negation context ("๋ถ์์ฉ์ด ์์ต๋๋ค") must not be punished.
                if self.check_negation_context(morphs, idx):
                    detected_issues.append(f"๐ก๏ธ ๋ฌด์ฃ(๋ถ์ ์ด ๋๋ฐ): '{word}' (๋ฌธ์ฅ: {sentence})")
                else:
                    weight = self.lexicon[word]
                    total_penalty += weight
                    hit_in_sentence = True
                    detected_issues.append(f"๐จ ์ ๋ฐ: '{word}' (๊ฐ์ค์น: +{weight})")

            # Flat per-sentence surcharge so many hits in one sentence
            # do not explode the total.
            if hit_in_sentence:
                total_penalty += 0.5

        # Normalise by sentence count, scale toward 0-100 and clamp.
        raw_score = (total_penalty / total_sentences) * 50
        x1_score = min(raw_score, 100.0)

        # Human-readable report.
        print(f"\n[๋ถ์ ๋ฆฌํฌํธ]")
        print(f" - ์ ์ฒด ๋ฌธ์ฅ ์: {total_sentences}๋ฌธ์ฅ")
        for issue in detected_issues:
            print(f" {issue}")

        print(f"๐ ์ต์ข X1 ์ ์: {x1_score:.2f} / 100.0 ์ ")
        return x1_score
|
| 94 |
+
|
| 95 |
+
# ==========================================
# Manual smoke-test entry point
# ==========================================
if __name__ == "__main__":
    analyzer = LexicalAnalyzer()

    # (leading-newline, banner title, sample text) per demo case;
    # the second case prefixes a blank line exactly like the original output.
    demo_cases = [
        ("", "ํ์คํธ 1: ๋ถ์ ์ด๊ฐ ํฌํจ๋ ์ ์์ ์ธ ๊ด๊ณ (์คํ ๋ฐฉ์ง ํ์คํธ)",
         "์ด ์ ํ์ ์์ฝ์ฒ ์ธ์ฆ์ ๋ฐ์์ต๋๋ค. ํผ๋ถ ํธ๋ฌ๋ธ์ด๋ ๋ถ์์ฉ์ด ์ ํ ์์ต๋๋ค. ์์ฌํ๊ณ ์ฌ์ฉํ์ธ์."),
        ("\n", "ํ์คํธ 2: ๊ทน๋จ์ ์ธ ๊ณผ๋๊ด๊ณ (ํ์ ํ์ง ํ์คํธ)",
         "๋จ ์ผ์ฃผ์ผ ๋ง์ ์ง๋ฐฉ์ด 100% ๋ถํด๋๋ ๊ธฐ์ ์ ๊ฒฝํํ์ธ์! ์ด๊ฒ์ ์๋ ์์นํ๋ ๋ง๋ณํต์น ์ฝ์๋๋ค. ๋ฌด์กฐ๊ฑด ๊ตฌ๋งคํ์ธ์. ์ต๊ณ ์ ์ ํ์๋๋ค."),
    ]
    for lead, title, body in demo_cases:
        print(lead + "==================================================")
        print(title)
        analyzer.calculate_x1_score(body)
|
myapp/step2_semantic.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn.functional as F
|
| 3 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 4 |
+
import gc
|
| 5 |
+
|
| 6 |
+
class SemanticAnalyzer:
    """Deep-learning semantic scorer (X2) built on KoELECTRA.

    Ensembles (a) the classifier's softmax probability of the "exaggerated"
    label with (b) the maximum cosine similarity against a small corpus of
    known-bad reference ad sentences.
    """

    def __init__(self):
        print("๐ง [Step 2] ์๋ฏธ๋ก ์ ๋ฅ๋ฌ๋ ๋ถ์๊ธฐ(KoELECTRA)๋ฅผ ๋ก๋ํฉ๋๋ค...")

        # Prefer the Apple-Silicon GPU (MPS) when present, else CPU.
        self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

        # Lightweight, strong Korean encoder.
        self.model_name = "monologg/koelectra-base-v3-discriminator"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # output_hidden_states=True lets a single forward pass yield BOTH the
        # classification logits and the [CLS] vector used for similarity.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=2,  # 0: normal, 1: exaggerated ad
            output_hidden_states=True
        ).to(self.device)
        self.model.eval()  # inference mode

        # [Vector A] Reference sentences of previously caught bad ads (samples).
        self.reference_bad_texts = [
            "๋จ ์ผ์ฃผ์ผ ๋ง์ ์ง๋ฐฉ์ด 100% ๋ถํด๋๋ ๊ธฐ์ ์ ํฌ๋ฆผ",
            "์์ฝ์ฒ์์ ์ธ์ฆํ ๋ง๋ณํต์น์ฝ, ์์ธํฌ ์๋ฒฝ ์ ๊ฑฐ",
            "์ด๊ฒ๋ง ๋จน์ผ๋ฉด ๋์๊ฐ ๋ฐฐ์ถ๋๊ณ ์ธํฌ๊ฐ ์ฆ๊ฐ ์ฌ์๋ฉ๋๋ค",
            "์์ฌ๋ค์ด ๋ฌด์กฐ๊ฑด ์ถ์ฒํ๋ ๋ถ์์ฉ ์๋ ์์น์ "
        ]
        print(" -> ๋ ํผ๋ฐ์ค ํ์๊ด๊ณ ๋ฌธ์ฅ๋ค์ ๋ฒกํฐ(Vector) ๊ณต๊ฐ์ ๋ฐฐ์น ์ค...")
        self.reference_embeddings = self._get_embeddings(self.reference_bad_texts)

    def clear_memory(self):
        """Garbage-collect and drop the MPS cache to limit memory growth."""
        gc.collect()
        if torch.backends.mps.is_available():
            torch.mps.empty_cache()

    def _get_embeddings(self, texts):
        """Encode *texts* and return their [CLS] sentence-embedding vectors."""
        batch = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(self.device)
        with torch.no_grad():
            model_out = self.model(**batch)
        # [CLS] token (index 0) of the last hidden layer = sentence vector.
        return model_out.hidden_states[-1][:, 0, :]

    def calculate_x2_score(self, text):
        """Return the semantic exaggeration score X2 (0-100) for *text*."""
        # Guard: nothing meaningful to score.
        if not text or len(text.strip()) < 5:
            return 0.0

        print("\n๐ง [Step 2] ๋ฅ๋ฌ๋ ๋ฌธ๋งฅ ๋ฐ ์ ์ฌ๋(Semantic) ๋ถ์ ์์...")

        # Truncate to the PLM's 512-token limit.
        enc = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)
        with torch.no_grad():  # no backprop -> lighter on memory
            out = self.model(**enc)

        # (1) Classification probability via softmax over the two logits.
        raw_logits = out.logits
        softmaxed = F.softmax(raw_logits, dim=-1)
        prob_fake = softmaxed[0][1].item()  # P(label == 1, exaggerated)

        # (2) Cosine similarity of this text's [CLS] vector against the
        #     reference bad-ad vectors; keep the best match.
        query_vec = out.hidden_states[-1][:, 0, :]
        sims = F.cosine_similarity(query_vec, self.reference_embeddings)
        max_sim = torch.max(sims).item()

        # Release tensors promptly, then flush caches.
        del enc, out, raw_logits, softmaxed, query_vec, sims
        self.clear_memory()

        # Ensemble: similarity dominates (80%) because the classification
        # head is not fine-tuned for this task and its softmax is near-random.
        classification_score = prob_fake * 100
        similarity_score = max(max_sim, 0) * 100  # clamp negative similarity to 0
        x2_score = (similarity_score * 0.8) + (classification_score * 0.2)

        print(f" -> ๐ ๋ชจ๋ธ ๋ถ๋ฅ ํ๋ฅ (Softmax): {prob_fake*100:.1f}%")
        print(f" -> ๐ ์ต๋ ๋ฌธ๋งฅ ์ ์ฌ๋ (Cosine Sim): {max_sim*100:.1f}%")
        print(f"๐ ์ต์ข X2 ์ ์ (0~100): {x2_score:.2f}์ ")

        return x2_score
|
| 107 |
+
|
| 108 |
+
# ==========================================
# Standalone smoke test
# ==========================================
if __name__ == "__main__":
    scorer = SemanticAnalyzer()

    # A sentence semantically close to the reference corpus ("๋จ ์ผ์ฃผ์ผ ๋ง์...").
    sample_text = "๋จ 7์ผ๋ง ํฌ์ํ์ธ์! ์ง๋ฐฉ์ด ์์ ํ ํ๊ดด๋๋ ๋๋ผ์ด ๋ง๋ฒ์ ๊ฒช๊ฒ ๋ฉ๋๋ค."
    scorer.calculate_x2_score(sample_text)
|
myapp/step3_rag.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import torch
|
| 4 |
+
import numpy as np
|
| 5 |
+
from openai import OpenAI
|
| 6 |
+
from sentence_transformers import SentenceTransformer, util
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
# Load OPENAI_API_KEY (and any other variables) from the local .env file
# into the process environment before the client below reads it.
load_dotenv()
|
| 11 |
+
|
| 12 |
+
class FactCheckerRAG:
    """RAG-style fact checker (X3).

    Retrieves the regulation closest to the ad text from a small fact DB,
    then asks an LLM to judge the text against that regulation and parses
    a 0-100 violation score out of the reply.
    """

    def __init__(self):
        print("๐ [Step 3] RAG + LLM ํฉํธ์ฒด์ปค๋ฅผ ๋ก๋ํฉ๋๋ค...")

        # Prefer Apple-Silicon GPU (MPS) when available, else CPU.
        self.device = "mps" if torch.backends.mps.is_available() else "cpu"

        # Korean sentence-embedding model used as the retriever.
        self.retriever = SentenceTransformer('jhgan/ko-sroberta-multitask', device=self.device)

        # OpenAI client; the key comes only from the environment (.env).
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            print("โ ๏ธ ๊ฒฝ๊ณ : OPENAI_API_KEY๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค. .env ํ์ผ์ ํ์ธํ์ธ์.")
        self.client = OpenAI(api_key=api_key)

        # Mini fact DB (regulator rules and precedents the LLM will cite).
        self.fact_db = [
            # 1. Absolute prohibitions
            "์ง๋ณ์ ์๋ฐฉ ๋ฐ ์น๋ฃ์ ํจ๋ฅยทํจ๊ณผ๊ฐ ์๊ฑฐ๋ ์์ฝํ ๋๋ ๊ฑด๊ฐ๊ธฐ๋ฅ์ํ์ผ๋ก ์ค์ธยทํผ๋ํ ์ฐ๋ ค๊ฐ ์๋ ํ์ยท๊ด๊ณ ๋ ๊ธ์ง๋ฉ๋๋ค.",
            "์ฒดํ๊ธฐ ๋ฑ์ ์ด์ฉํ๊ฑฐ๋ '์ฃผ๋ฌธ์๋', '๋จ์ฒด์ถ์ฒ' ๋ฑ ์๋น์๋ฅผ ๊ธฐ๋งํ๋ ๊ด๊ณ ๋ ์ฒ๋ฒ ๋์์๋๋ค.",
            "์ํ์ ๊ฐ์ข ์์ฅ, ์ธ์ฆ, ๋ณด์ฆ์ ๋ฐ์๋ค๋ ๋ด์ฉ์ ์ฌ์ฉํ๋ ๊ฒ์ ํ์ยท๊ณผ๋๊ด๊ณ ์ ํด๋นํ ์ ์์ต๋๋ค.",

            # 2. Permitted phrasing
            "์ธ์ฒด์ ๊ฑด์ ํ ์ฑ์ฅ ๋ฐ ๋ฐ๋ฌ๊ณผ ๊ฑด๊ฐ ์ ์ง์ ๋์์ ์ค๋ค๋ ํํ์ ํน์ ์ง๋ณ์ ์ธ๊ธํ์ง ์๋ ํ ํ์ฉ๋ฉ๋๋ค.",
            "๊ฑด๊ฐ์ฆ์ง, ์ฒด์ง๊ฐ์ , ์์ด์๋ฒ, ์์๋ณด๊ธ ๋ฑ์ ๋์์ ์ค๋ค๋ ํํ์ ๊ณผ๋๊ด๊ณ ๊ฐ ์๋๋๋ค.",
            "ํด๋น ์ ํ์ด ์ ์์, ํ์์ ๋ฑ ํน์์ฉ๋์ํ์ด๋ผ๋ ํํ์ ํ์ฉ๋ฉ๋๋ค.",

            # 3. Violation examples and precedents
            "์ผ๋ฐ ์ํ์ ๋น๋จ, ๊ณ ํ์, ํญ์ ๋ฑ ํน์ ์ง๋ณ ์น๋ฃ ํจ๊ณผ๊ฐ ์๋ค๊ณ ๊ธฐ์ฌํ๋ ๊ฒ์ ๋ช๋ฐฑํ ๋ถ๋ฒ์๋๋ค.",
            "๋ธ๋ก๊ทธ๋ ์ผํ๋ชฐ์ ์ง๋ณ ์น๋ฃ ์ ํ ๋น๊ต ์ฌ์ง์ด๋ ๊ฐ์ธ์ ์ธ ์ฒดํ๊ธฐ๋ฅผ ์ฌ๋ฆฌ๋ ํ์๋ ๋ถ๋ฒ ๊ณผ๋๊ด๊ณ ์๋๋ค."
        ]

        # Pre-compute DB embeddings once at startup.
        self.db_embeddings = self.retriever.encode(self.fact_db, convert_to_tensor=True)

    def calculate_x3_score(self, text):
        """Return (violation score 0-100, retrieved regulation) for *text*."""
        # Guard: too little text to audit.
        if not text or len(text.strip()) < 5:
            return 0.0, "๊ฒ์ฌํ ํ์คํธ๊ฐ ๋ถ์กฑํฉ๋๋ค."

        try:
            # 1. Retrieval: nearest regulation by cosine similarity.
            query_vec = self.retriever.encode(text, convert_to_tensor=True)
            sims = util.cos_sim(query_vec, self.db_embeddings)[0]
            retrieved_fact = self.fact_db[torch.argmax(sims).item()]

            # 2. Generation: ask GPT to judge using that regulation.
            prompt = f"""
        ๋น์ ์ ๋ํ๋ฏผ๊ตญ ์์ฝ์ฒ ๋ฐ ๊ณต์ ์์ ๊ณผ๋๊ด๊ณ ์ฌ์ฌ๊ด์๋๋ค.
        ์๋ [๊ด๋ จ ๊ท์ ]์ ๋ฐํ์ผ๋ก [๊ด๊ณ ํ์คํธ]์ ์๋ฐ ์ฌ๋ถ๋ฅผ ํ๋จํ์ธ์.

        [๊ด๋ จ ๊ท์ ]: {retrieved_fact}
        [๊ด๊ณ ํ์คํธ]: {text}

        ๋ฐ๋์ ์๋ ํ์์ผ๋ก๋ง ์๋ตํ์ธ์:
        ์ ์: [0~100 ์ฌ์ด ์ซ์]
        ์ฌ์ : [์๋ฐ์ธ ๊ฒฝ์ฐ ๊ตฌ์ฒด์ ๊ทผ๊ฑฐ, ์๋๋ฉด ํ์ฉ ๊ทผ๊ฑฐ๋ฅผ 1~2์ค๋ก ์ค๋ช]
        """

            print(" -> ๐ค GPT ์ฌ์ฌ๊ด์ด ๋ถ์ ์ค...")
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0  # deterministic judging
            )

            result_text = response.choices[0].message.content
            print(f" [๊ฒฐ๊ณผ] {result_text}")

            # Parse "์ ์: NN" out of the reply; default to 0 when absent.
            score_match = re.search(r"์ ์:\s*(\d+)", result_text)
            x3_score = float(score_match.group(1)) if score_match else 0.0
            return x3_score, retrieved_fact

        except Exception as e:
            # Best-effort: never crash the pipeline on API/model errors.
            print(f"โ ๏ธ ์๋ฌ ๋ฐ์: {e}")
            return 0.0, "๋ถ์ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค."
|
| 93 |
+
|
| 94 |
+
# Ensemble score calculator
def calculate_final_score(x1, x2, x3):
    """Weighted ensemble of the three sub-scores.

    x1 -- keyword-matching (lexical) score
    x2 -- deep-learning contextual score
    x3 -- RAG fact-check score
    """
    weight_lexical, weight_semantic, weight_rag = 0.2, 0.4, 0.4
    return (weight_lexical * x1) + (weight_semantic * x2) + (weight_rag * x3)
|
| 103 |
+
|
| 104 |
+
if __name__ == "__main__":
    checker = FactCheckerRAG()

    # NOTE(review): this sample literal was corrupted in transit (U+FFFD
    # characters); reconstructed as closely as the source allows.
    sample_ad = "์ด ์ฐจ๋ฅผ ๋ง์๋ฉด ์ ๋ฐฉ์ ๋ฌผ๋ก ๋น๋จ ์์น๊ฐ ์ฆ๊ฐ ๋จ์ด์ง๋๋ค!"
    score, fact = checker.calculate_x3_score(sample_ad)

    print("-" * 30)
    print(f"์ต์ข ์๋ฐ ์ ์: {score}")
    print(f"์ฐธ์กฐ ๊ท์ : {fact}")
|
myapp/step4_xai.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import xgboost as xgb
|
| 3 |
+
import shap
|
| 4 |
+
|
| 5 |
+
class XAIScorer:
    """Final ensemble scorer: XGBoost over (X1, X2, X3) plus SHAP explanations."""

    def __init__(self):
        print("๐ [Step 4] XGBoost ์์๋ธ ๋ชจ๋ธ ๋ฐ SHAP ์ค๋ช๊ธฐ(XAI) ๋ก๋ ์ค...")

        # 1. PoC: synthesize labelled training data (a real deployment would
        #    load a labelled DB). Features are the three 0-100 sub-scores
        #    X1 (lexical), X2 (contextual), X3 (fact-check).
        np.random.seed(42)
        X_train = np.random.rand(1000, 3) * 100

        # Synthetic ground truth: weighted sum above 50 => exaggerated (1),
        # otherwise normal (0).
        y_train = ((X_train[:, 0]*0.2 + X_train[:, 1]*0.4 + X_train[:, 2]*0.4) > 50).astype(int)

        # 2. Train the XGBoost classifier (logistic transform built in).
        self.model = xgb.XGBClassifier(
            n_estimators=50,
            max_depth=3,
            learning_rate=0.1,
            eval_metric='logloss',
            random_state=42
        )
        self.model.fit(X_train, y_train)

        # 3. SHAP TreeExplainer for per-feature attributions (XAI).
        self.explainer = shap.TreeExplainer(self.model)
        print(" -> ๐ง ๋จธ์ ๋ฌ๋ ์ค์ฝ์ด๋ง ์์ง ์ธํ ์๋ฃ!")

    def calculate_final_score_and_explain(self, x1, x2, x3):
        """Return (final score 0-100, SHAP values, SHAP base value) for one sample."""
        features = np.array([[x1, x2, x3]])

        # 1. Final score: predict_proba applies the sigmoid internally;
        #    take P(class 1 = exaggerated) on a 0-100 scale.
        final_score = self.model.predict_proba(features)[0][1] * 100

        # 2. XAI: per-feature SHAP attributions for this sample.
        shap_values = self.explainer.shap_values(features)

        # Some SHAP versions return a per-class list for binary models —
        # extract class 1's row either way.
        if isinstance(shap_values, list):
            shap_vals = shap_values[1][0]
        else:
            shap_vals = shap_values[0]

        # Base value (the model's average prediction); may likewise be
        # scalar or a per-class sequence depending on the SHAP version.
        expected = self.explainer.expected_value
        if isinstance(expected, (list, np.ndarray)):
            base_value = expected[1]
        else:
            base_value = expected

        return final_score, shap_vals, base_value
|