Spaces:
Running
Running
Commit ยท
1ebb586
0
Parent(s):
Initial commit: clean code without secrets
Browse files- .DS_Store +0 -0
- .gitignore +4 -0
- README.md +3 -0
- myapp/Procfile +0 -0
- myapp/app.py +353 -0
- myapp/main_pipeline.py +88 -0
- myapp/requirements.txt +13 -0
- myapp/step0_ingestion.py +178 -0
- myapp/step1_lexical.py +109 -0
- myapp/step2_semantic.py +116 -0
- myapp/step3_rag.py +111 -0
- myapp/step4_xai.py +60 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
__pycache__/
|
| 3 |
+
.venv/.env
|
| 4 |
+
.venv/
|
README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FakeAD-Detector
|
| 2 |
+
# FakeAD-Detector
|
| 3 |
+
# FakeAD-Detector
|
myapp/Procfile
ADDED
|
File without changes
|
myapp/app.py
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI
|
| 2 |
+
from fastapi.responses import HTMLResponse
|
| 3 |
+
from fastapi.middleware.cors import CORSMiddleware # ๐ CORS ์ถ๊ฐ
|
| 4 |
+
from pydantic import BaseModel
|
| 5 |
+
import uvicorn
|
| 6 |
+
import math
|
| 7 |
+
|
| 8 |
+
# (๊ธฐ์กด AI ๋ชจ๋ import ๋ถ๋ถ ๋์ผ)
|
| 9 |
+
|
| 10 |
+
app = FastAPI()
|
| 11 |
+
|
| 12 |
+
# ๐ ์ธ๋ถ(Wix)์์ API๋ฅผ ํธ์ถํ ์ ์๋๋ก CORS ํ์ฉ ์ค์
|
| 13 |
+
app.add_middleware(
|
| 14 |
+
CORSMiddleware,
|
| 15 |
+
allow_origins=["*"], # ์ค์ ์๋น์ค ์์๋ ["https://๋ดwix์ฃผ์.com"] ์ผ๋ก ๋ณ๊ฒฝ ๊ถ์ฅ
|
| 16 |
+
allow_credentials=True,
|
| 17 |
+
allow_methods=["*"],
|
| 18 |
+
allow_headers=["*"],
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# (์ดํ ๊ธฐ์กด ์ฝ๋ ๋์ผ...)
|
| 22 |
+
|
| 23 |
+
from flask import Flask
|
| 24 |
+
|
| 25 |
+
app = Flask(__name__)
|
| 26 |
+
|
| 27 |
+
@app.route("/")
|
| 28 |
+
def home():
|
| 29 |
+
return "Hello from Render!"
|
| 30 |
+
|
| 31 |
+
if __name__ == "__main__":
|
| 32 |
+
app.run()
|
| 33 |
+
|
| 34 |
+
from fastapi import FastAPI
|
| 35 |
+
from fastapi.responses import HTMLResponse
|
| 36 |
+
from pydantic import BaseModel
|
| 37 |
+
import uvicorn
|
| 38 |
+
import math
|
| 39 |
+
|
| 40 |
+
# ==========================================
|
| 41 |
+
# ๐ AI ๋ ๊ณ ๋ธ๋ก ๋ถ๋ฌ์ค๊ธฐ
|
| 42 |
+
# ==========================================
|
| 43 |
+
from step0_ingestion import DataIngestionPipeline
|
| 44 |
+
from step1_lexical import LexicalAnalyzer
|
| 45 |
+
from step2_semantic import SemanticAnalyzer
|
| 46 |
+
from step3_rag import FactCheckerRAG
|
| 47 |
+
from step4_xai import XAIScorer
|
| 48 |
+
|
| 49 |
+
app = FastAPI()
|
| 50 |
+
|
| 51 |
+
print("==================================================")
|
| 52 |
+
print(" โณ AI ์์ง ๋ฐ ๋ฅ๋ฌ๋ ๋ชจ๋ธ๋ค์ ๋ฉ๋ชจ๋ฆฌ์ ์ฌ๋ฆฌ๋ ์ค์
๋๋ค...")
|
| 53 |
+
print("==================================================")
|
| 54 |
+
ingestion = DataIngestionPipeline()
|
| 55 |
+
lexical = LexicalAnalyzer()
|
| 56 |
+
semantic = SemanticAnalyzer()
|
| 57 |
+
rag_checker = FactCheckerRAG()
|
| 58 |
+
xai_scorer = XAIScorer()
|
| 59 |
+
print("\nโ
[์๋ฒ ์ค๋น ์๋ฃ] http://127.0.0.1:8000 ์ ์ ์ํ์ธ์!\n")
|
| 60 |
+
|
| 61 |
+
class AdRequest(BaseModel):
|
| 62 |
+
video_url: str
|
| 63 |
+
product_url: str
|
| 64 |
+
|
| 65 |
+
# 1. ๐ ๋คํฌ ๋ชจ๋ ๋์๋ณด๋ ์น ํ๋ก ํธ์๋ (HTML/CSS/JS)
|
| 66 |
+
@app.get("/", response_class=HTMLResponse)
|
| 67 |
+
async def serve_frontend():
|
| 68 |
+
html_content = """
|
| 69 |
+
<!DOCTYPE html>
|
| 70 |
+
<html lang="ko">
|
| 71 |
+
<head>
|
| 72 |
+
<meta charset="UTF-8">
|
| 73 |
+
<title>AI ๊ณผ๋๊ด๊ณ ํ์ง ๋์๋ณด๋</title>
|
| 74 |
+
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
| 75 |
+
<style>
|
| 76 |
+
body { font-family: 'Pretendard', sans-serif; background-color: #121212; color: #ffffff; padding: 20px; margin: 0; }
|
| 77 |
+
.container { max-width: 1200px; margin: auto; }
|
| 78 |
+
h2 { color: #ffffff; text-align: left; border-bottom: 2px solid #333; padding-bottom: 10px; }
|
| 79 |
+
|
| 80 |
+
/* ์
๋ ฅ ํผ ์น์
*/
|
| 81 |
+
.input-section { background: #1e1e1e; padding: 20px; border-radius: 12px; margin-bottom: 20px; box-shadow: 0 4px 6px rgba(0,0,0,0.3); }
|
| 82 |
+
input { width: 100%; padding: 12px; margin: 8px 0; background: #2c2c2e; border: 1px solid #444; border-radius: 8px; color: #fff; box-sizing: border-box; }
|
| 83 |
+
button { width: 100%; padding: 15px; background: #4caf50; color: white; border: none; border-radius: 8px; font-size: 16px; font-weight: bold; cursor: pointer; transition: 0.3s; }
|
| 84 |
+
button:hover { background: #45a049; }
|
| 85 |
+
|
| 86 |
+
/* ๋์๋ณด๋ ๊ทธ๋ฆฌ๋ ๋ ์ด์์ */
|
| 87 |
+
.dashboard { display: none; grid-template-columns: 1fr 2fr; gap: 20px; margin-top: 20px; }
|
| 88 |
+
.card { background: #1e1e1e; padding: 20px; border-radius: 12px; box-shadow: 0 4px 6px rgba(0,0,0,0.3); }
|
| 89 |
+
.card h3 { margin-top: 0; color: #a0a0a0; font-size: 16px; border-bottom: 1px solid #333; padding-bottom: 10px; }
|
| 90 |
+
|
| 91 |
+
/* ๋๋ ์ฐจํธ ์นด๋ */
|
| 92 |
+
.score-card { text-align: center; }
|
| 93 |
+
.score-card canvas { max-height: 250px; margin: auto; }
|
| 94 |
+
|
| 95 |
+
/* ์ฌ๊ณ ํ๋ก ์นด๋ */
|
| 96 |
+
.xai-card { font-size: 15px; line-height: 1.6; color: #e0e0e0; }
|
| 97 |
+
.highlight { color: #4caf50; font-weight: bold; }
|
| 98 |
+
.danger { color: #ff5252; font-weight: bold; }
|
| 99 |
+
|
| 100 |
+
/* ์ธ๋ถ ์์ง ๊ฒฐ๊ณผ ์นด๋ */
|
| 101 |
+
.details-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-top: 20px; }
|
| 102 |
+
.detail-item { background: #2c2c2e; padding: 15px; border-radius: 8px; }
|
| 103 |
+
.detail-item h4 { margin: 0 0 10px 0; color: #4caf50; }
|
| 104 |
+
|
| 105 |
+
/* ๋ก๋ฉ ํ
์คํธ */
|
| 106 |
+
#loading { display: none; text-align: center; color: #4caf50; font-size: 18px; margin-top: 20px; font-weight: bold; }
|
| 107 |
+
</style>
|
| 108 |
+
</head>
|
| 109 |
+
<body>
|
| 110 |
+
<div class="container">
|
| 111 |
+
<h2>๐จ AI ๊ณผ๋๊ด๊ณ ํ์ง ๋์๋ณด๋ (Overview)</h2>
|
| 112 |
+
|
| 113 |
+
<div class="input-section">
|
| 114 |
+
<input type="text" id="video_url" placeholder="์ ํ๋ธ ์์ ๋งํฌ (์ ํ์ฌํญ)">
|
| 115 |
+
<input type="text" id="product_url" placeholder="์ํ ์์ธํ์ด์ง ๋งํฌ (ํ์)" value="https://brand.naver.com/pacsafe/products/9365045491">
|
| 116 |
+
<button onclick="analyzeAd()">๋ถ์ ์์ (๋ฐ์ดํฐ ํฌ๋กค๋ง ๋ฐ AI ๋ถ์)</button>
|
| 117 |
+
</div>
|
| 118 |
+
|
| 119 |
+
<div id="loading">๋ฐ์ดํฐ๋ฅผ ์์งํ๊ณ AI ๋ชจ๋ธ์ด ๋ถ์ ์ค์
๋๋ค. ์ ์๋ง ๊ธฐ๋ค๋ ค์ฃผ์ธ์ โณ...</div>
|
| 120 |
+
|
| 121 |
+
<div class="dashboard" id="dashboard">
|
| 122 |
+
<div class="card score-card">
|
| 123 |
+
<h3>ํตํฉ ์ํ๋ ์ ์ (Final Score)</h3>
|
| 124 |
+
<canvas id="scoreChart"></canvas>
|
| 125 |
+
<h1 id="scoreText" style="margin-top: 15px;">0.00์ </h1>
|
| 126 |
+
<p id="statusText" style="color: #a0a0a0;"></p>
|
| 127 |
+
</div>
|
| 128 |
+
|
| 129 |
+
<div class="card xai-card">
|
| 130 |
+
<h3>๐ค AI ์ต์ข
ํ์ ์ฌ๊ณ ํ๋ก (XAI Reasoning)</h3>
|
| 131 |
+
<div id="xaiReasoning" style="margin-bottom: 20px;"></div>
|
| 132 |
+
|
| 133 |
+
<h3>๐ RAG ํฉํธ์ฒดํฌ 2D ๋ฒกํฐ ๊ณต๊ฐ ๋น๊ต</h3>
|
| 134 |
+
<canvas id="vectorChart" style="max-height: 200px;"></canvas>
|
| 135 |
+
</div>
|
| 136 |
+
</div>
|
| 137 |
+
|
| 138 |
+
<div class="dashboard" id="detailsDashboard" style="grid-template-columns: 1fr; margin-top: 0;">
|
| 139 |
+
<div class="card">
|
| 140 |
+
<h3>โ๏ธ ์ธ๋ถ ์์ง ๋ถ์ ๊ฒฐ๊ณผ (Detailed Engine Results)</h3>
|
| 141 |
+
<div class="details-grid">
|
| 142 |
+
<div class="detail-item">
|
| 143 |
+
<h4>[X1] ํํ์ ๋ฐ ๋จ์ด ํ์ง๊ธฐ</h4>
|
| 144 |
+
<p id="x1Details"></p>
|
| 145 |
+
</div>
|
| 146 |
+
<div class="detail-item">
|
| 147 |
+
<h4>[X2] ์๋ฏธ๋ก ์ ๋ฌธ๋งฅ ์ ์ฌ๋ (KoELECTRA)</h4>
|
| 148 |
+
<p id="x2Details"></p>
|
| 149 |
+
</div>
|
| 150 |
+
<div class="detail-item" style="grid-column: span 2;">
|
| 151 |
+
<h4>[X3] RAG ๊ธฐ๋ฐ ํฉํธ์ฒดํฌ ๊ต์ฐจ ๊ฒ์ฆ</h4>
|
| 152 |
+
<p id="x3Details"></p>
|
| 153 |
+
</div>
|
| 154 |
+
</div>
|
| 155 |
+
</div>
|
| 156 |
+
</div>
|
| 157 |
+
</div>
|
| 158 |
+
|
| 159 |
+
<script>
|
| 160 |
+
let scoreChartInstance = null;
|
| 161 |
+
let vectorChartInstance = null;
|
| 162 |
+
|
| 163 |
+
async function analyzeAd() {
|
| 164 |
+
const videoUrl = document.getElementById('video_url').value;
|
| 165 |
+
const productUrl = document.getElementById('product_url').value;
|
| 166 |
+
|
| 167 |
+
if (!productUrl) return alert("์ํ ๋งํฌ๋ ํ์์
๋๋ค!");
|
| 168 |
+
|
| 169 |
+
document.getElementById('loading').style.display = 'block';
|
| 170 |
+
document.getElementById('dashboard').style.display = 'none';
|
| 171 |
+
document.getElementById('detailsDashboard').style.display = 'none';
|
| 172 |
+
|
| 173 |
+
try {
|
| 174 |
+
const response = await fetch('/api/analyze', {
|
| 175 |
+
method: 'POST',
|
| 176 |
+
headers: { 'Content-Type': 'application/json' },
|
| 177 |
+
body: JSON.stringify({ video_url: videoUrl, product_url: productUrl })
|
| 178 |
+
});
|
| 179 |
+
|
| 180 |
+
const data = await response.json();
|
| 181 |
+
document.getElementById('loading').style.display = 'none';
|
| 182 |
+
|
| 183 |
+
if (data.status === "success") {
|
| 184 |
+
document.getElementById('dashboard').style.display = 'grid';
|
| 185 |
+
document.getElementById('detailsDashboard').style.display = 'grid';
|
| 186 |
+
|
| 187 |
+
// ๋ฐ์ดํฐ ๋ฐ์ธ๋ฉ
|
| 188 |
+
document.getElementById('scoreText').innerText = data.final_score.toFixed(2) + "์ ";
|
| 189 |
+
document.getElementById('statusText').innerText = data.message;
|
| 190 |
+
document.getElementById('statusText').style.color = data.final_score > 70 ? "#ff5252" : (data.final_score > 40 ? "#ffeb3b" : "#4caf50");
|
| 191 |
+
|
| 192 |
+
document.getElementById('xaiReasoning').innerHTML = data.xai_reasoning;
|
| 193 |
+
document.getElementById('x1Details').innerHTML = data.x1_details;
|
| 194 |
+
document.getElementById('x2Details').innerHTML = data.x2_details;
|
| 195 |
+
document.getElementById('x3Details').innerHTML = data.x3_details;
|
| 196 |
+
|
| 197 |
+
// ๐ ๋๋ ์ฐจํธ ๋ ๋๋ง
|
| 198 |
+
renderScoreChart(data.final_score);
|
| 199 |
+
|
| 200 |
+
// ๐ 2D ๋ฒกํฐ ์ฐ์ ๋ ๋ ๋๋ง
|
| 201 |
+
renderVectorChart(data.vector_data);
|
| 202 |
+
|
| 203 |
+
} else {
|
| 204 |
+
alert("๋ถ์ ์คํจ: " + data.error);
|
| 205 |
+
}
|
| 206 |
+
} catch (err) {
|
| 207 |
+
document.getElementById('loading').style.display = 'none';
|
| 208 |
+
alert("์๋ฒ ํต์ ์๋ฌ๊ฐ ๋ฐ์ํ์ต๋๋ค.");
|
| 209 |
+
}
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
function renderScoreChart(score) {
|
| 213 |
+
const ctx = document.getElementById('scoreChart').getContext('2d');
|
| 214 |
+
if(scoreChartInstance) scoreChartInstance.destroy();
|
| 215 |
+
|
| 216 |
+
const color = score > 70 ? '#ff5252' : (score > 40 ? '#ffeb3b' : '#4caf50');
|
| 217 |
+
|
| 218 |
+
scoreChartInstance = new Chart(ctx, {
|
| 219 |
+
type: 'doughnut',
|
| 220 |
+
data: {
|
| 221 |
+
labels: ['์ํ๋', '์์ '],
|
| 222 |
+
datasets: [{
|
| 223 |
+
data: [score, 100 - score],
|
| 224 |
+
backgroundColor: [color, '#2c2c2e'],
|
| 225 |
+
borderWidth: 0
|
| 226 |
+
}]
|
| 227 |
+
},
|
| 228 |
+
options: {
|
| 229 |
+
cutout: '75%',
|
| 230 |
+
plugins: { legend: { display: false } }
|
| 231 |
+
}
|
| 232 |
+
});
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
function renderVectorChart(vectorData) {
|
| 236 |
+
const ctx = document.getElementById('vectorChart').getContext('2d');
|
| 237 |
+
if(vectorChartInstance) vectorChartInstance.destroy();
|
| 238 |
+
|
| 239 |
+
scoreChartInstance = new Chart(ctx, {
|
| 240 |
+
type: 'scatter',
|
| 241 |
+
data: {
|
| 242 |
+
datasets: [
|
| 243 |
+
{
|
| 244 |
+
label: '์์ฝ์ฒ ๊ท์ (Fact)',
|
| 245 |
+
data: [{ x: 0, y: 0 }],
|
| 246 |
+
backgroundColor: '#4caf50',
|
| 247 |
+
pointRadius: 8
|
| 248 |
+
},
|
| 249 |
+
{
|
| 250 |
+
label: '๊ด๊ณ ๋ฌธ๊ตฌ (Claim)',
|
| 251 |
+
data: [{ x: vectorData.x, y: vectorData.y }],
|
| 252 |
+
backgroundColor: '#ff5252',
|
| 253 |
+
pointRadius: 8
|
| 254 |
+
}
|
| 255 |
+
]
|
| 256 |
+
},
|
| 257 |
+
options: {
|
| 258 |
+
responsive: true,
|
| 259 |
+
scales: {
|
| 260 |
+
x: { grid: { color: '#333' }, min: -10, max: 100, title: {display: true, text: '์๋ฏธ๋ก ์ ๊ฑฐ๋ฆฌ (X)', color: '#888'} },
|
| 261 |
+
y: { grid: { color: '#333' }, min: -10, max: 100, title: {display: true, text: '์๋ฏธ๋ก ์ ๊ฑฐ๋ฆฌ (Y)', color: '#888'} }
|
| 262 |
+
},
|
| 263 |
+
plugins: {
|
| 264 |
+
legend: { labels: { color: '#fff' } }
|
| 265 |
+
}
|
| 266 |
+
}
|
| 267 |
+
});
|
| 268 |
+
}
|
| 269 |
+
</script>
|
| 270 |
+
</body>
|
| 271 |
+
</html>
|
| 272 |
+
"""
|
| 273 |
+
return HTMLResponse(content=html_content)
|
| 274 |
+
|
| 275 |
+
# 2. ๋ถ์ API
|
| 276 |
+
@app.post("/api/analyze")
|
| 277 |
+
def api_analyze(req: AdRequest):
|
| 278 |
+
try:
|
| 279 |
+
# Step 0: ๋ฐ์ดํฐ ์์ง
|
| 280 |
+
stt_text = ingestion.run_stt(ingestion.extract_audio_from_video(req.video_url)) if req.video_url.strip() else ""
|
| 281 |
+
ocr_text = ingestion.run_ocr_from_web(req.product_url) if req.product_url.strip() else ""
|
| 282 |
+
combined_text = f"{stt_text}\n{ocr_text}".strip()
|
| 283 |
+
|
| 284 |
+
if len(combined_text) < 5:
|
| 285 |
+
return {"status": "error", "error": "ํ
์คํธ๋ฅผ ์ฐพ์ง ๋ชปํ์ต๋๋ค."}
|
| 286 |
+
|
| 287 |
+
# Step 1, 2, 3: ์ ์ ๋์ถ
|
| 288 |
+
x1_score = lexical.calculate_x1_score(combined_text)
|
| 289 |
+
x2_score = semantic.calculate_x2_score(combined_text)
|
| 290 |
+
x3_score, matched_fact = rag_checker.calculate_x3_score(combined_text)
|
| 291 |
+
|
| 292 |
+
# Step 4: ๋จธ์ ๋ฌ๋ ์ค์ฝ์ด๋ง
|
| 293 |
+
final_score, shap_vals, _ = xai_scorer.calculate_final_score_and_explain(x1_score, x2_score, x3_score)
|
| 294 |
+
|
| 295 |
+
# =====================================================================
|
| 296 |
+
# ๐ UI์ ๋ฟ๋ ค์ค ์์ธ ์ค๋ช
(Detail Text) ์์ฑ ๋ก์ง
|
| 297 |
+
# =====================================================================
|
| 298 |
+
|
| 299 |
+
# [X1 ์ธ๋ถ๊ฒฐ๊ณผ] ๋ฐ๊ฒฌ๋ ๊ธ์น์ด ์ถ์ถ
|
| 300 |
+
detected_words = [word for word in lexical.lexicon.keys() if word in combined_text]
|
| 301 |
+
if detected_words:
|
| 302 |
+
x1_details = f"<span class='danger'>์ ๋ฐ๋ ๋จ์ด: {', '.join(detected_words)}</span><br>์ด ๋จ์ด๋ค์ ์์ฝ์ฒ ๊ฐ์ด๋๋ผ์ธ์ ์ํด ์ฌ์ฉ์ด ๊ฐํ๊ฒ ๊ท์ ๋๋ ํํ์
๋๋ค."
|
| 303 |
+
else:
|
| 304 |
+
x1_details = "<span class='highlight'>๋ฐ๊ฒฌ๋ ๊ธ์น์ด ์์.</span><br>๋ช
์์ ์ธ ํ์ ๊ณผ์ฅ ๋จ์ด๋ ์ฌ์ฉ๋์ง ์์ ํ
์คํธ ํ๋ฉด์ ์ผ๋ก๋ ์์ ํฉ๋๋ค."
|
| 305 |
+
|
| 306 |
+
# [X2 ์ธ๋ถ๊ฒฐ๊ณผ] ๋ฌธ๋งฅ ์ค๋ช
|
| 307 |
+
if x2_score > 60:
|
| 308 |
+
x2_details = f"<span class='danger'>๋ฌธ๋งฅ์ ์ํ๋ {x2_score:.1f}์ </span><br>๋จ์ ๋จ์ด๋ฅผ ๋์ด, ๋ฌธ์ฅ์ ์ ๋ฐ์ ์ธ ๋์์ค๊ฐ ๊ณผ๊ฑฐ ์ ๋ฐ๋ ํ์๊ด๊ณ ๋ฐ์ดํฐ๋ฒ ์ด์ค์ ๊ณผ์ฅ ํจํด(๋จ์ ์ ํํ, ํจ๋ฅ ๋งน์ ๋ฑ)๊ณผ <b>๋งค์ฐ ์ ์ฌํ๊ฒ ๊ฐ์ง</b>๋์์ต๋๋ค."
|
| 309 |
+
else:
|
| 310 |
+
x2_details = f"<span class='highlight'>๋ฌธ๋งฅ์ ์ํ๋ {x2_score:.1f}์ </span><br>๊ณผ๊ฑฐ ์ ๋ฐ๋ ํ์๊ด๊ณ ํน์ ์ ์๊ทน์ ์ด๊ฑฐ๋ ๊ธฐ๋ง์ ์ธ ๋ฌธ๋งฅ ํจํด์ด ํฌ๊ฒ ๋ฐ๊ฒฌ๋์ง ์์์ต๋๋ค."
|
| 311 |
+
|
| 312 |
+
# [X3 ์ธ๋ถ๊ฒฐ๊ณผ] RAG ์ค๋ช
|
| 313 |
+
x3_details = f"<b>[๊ด๋ จ ์์ฝ์ฒ ๊ท์ ๋งค์นญ]</b><br>{matched_fact}<br><br>"
|
| 314 |
+
if x3_score > 50: # GPT๊ฐ ๋์ ์๋ฐ ์ ์๏ฟฝ๏ฟฝ ์ค ๊ฒฝ์ฐ
|
| 315 |
+
x3_details += f"<b>[LLM ์ถ๋ก ๊ฒฐ๊ณผ: <span class='danger'>๋ชจ์ ๋ฐ๊ฒฌ</span>]</b><br>๊ด๊ณ ๋ฌธ๊ตฌ๊ฐ ์ ์์ฝ์ฒ ๊ท์ ์ ๋ช
๋ฐฑํ ์๋ฐํ๊ณ ์๋ ๊ฒ์ผ๋ก ์ถ๋ก ๋์์ต๋๋ค."
|
| 316 |
+
else:
|
| 317 |
+
x3_details += f"<b>[LLM ์ถ๋ก ๊ฒฐ๊ณผ: <span class='highlight'>๊ท์ ์ค์</span>]</b><br>๊ด๊ณ ๋ฌธ๊ตฌ์ ์ ์์ฝ์ฒ ๊ท์ ๊ฐ์ ์ฌ๊ฐํ ๋
ผ๋ฆฌ์ ๋ชจ์์ด๋ ์๋ฐ ์ฌํญ์ด ๋ฐ๊ฒฌ๋์ง ์์์ต๋๋ค."
|
| 318 |
+
|
| 319 |
+
# [XAI ์ฌ๊ณ ํ๋ก] SHAP ๊ธฐ๋ฐ ์์
|
| 320 |
+
xai_reasoning = f"AI๋ ์ด ๊ด๊ณ ํ
์คํธ๋ฅผ ๋ถ์ํ ๋ ๋ค์๊ณผ ๊ฐ์ ์ฌ๊ณ ๊ณผ์ ์ ๊ฑฐ์ณค์ต๋๋ค.<br><ul>"
|
| 321 |
+
if shap_vals[0] > 0: xai_reasoning += f"<li>ํ๋ฉด์ ์ธ ๋จ์ด(X1)์ ๊ณผ์ฅ๋ ํํ์ด ์์ฌ ์์ด ์ํ๋๋ฅผ ๋์์ต๋๋ค. <span class='danger'>(+{shap_vals[0]:.2f})</span></li>"
|
| 322 |
+
else: xai_reasoning += f"<li>๊ธ์น์ด๊ฐ ๊ฒ์ถ๋์ง ์์ ๊ธฐ๋ณธ์ ์ผ๋ก ์์ ํ ๊ธ๋ก ์ธ์ํ์ต๋๋ค. <span class='highlight'>({shap_vals[0]:.2f})</span></li>"
|
| 323 |
+
|
| 324 |
+
if shap_vals[1] > 0: xai_reasoning += f"<li>ํ์ง๋ง ๋ฌธ๋งฅ์ ๋์์ค(X2)๊ฐ ๊ณผ๊ฑฐ ํ์๊ด๊ณ ์ ๋๋ฌด ๋น์ทํด AI๊ฐ ๊ฐํ ์์ฌ์ ํ์์ต๋๋ค. <span class='danger'>(+{shap_vals[1]:.2f})</span></li>"
|
| 325 |
+
else: xai_reasoning += f"<li>์ ์ฒด์ ์ธ ๋ฌธ๋งฅ๊ณผ ๋์์ค(X2) ์ญ์ ์ ์์ ์ธ ์ ํ ์ค๋ช
์ ํํ๋ฅผ ๋ ๊ณ ์์ต๋๋ค. <span class='highlight'>({shap_vals[1]:.2f})</span></li>"
|
| 326 |
+
|
| 327 |
+
if shap_vals[2] > 0: xai_reasoning += f"<li><b>๊ฒฐ์ ์ ์ผ๋ก ์์ฝ์ฒ ํฉํธ์ฒดํฌ(X3)์์ ๋ช
๋ฐฑํ ๊ท์ ์๋ฐ์ด ํ์ธ</b>๋์ด, ์ต์ข
์ํ ํ์ ์ ๋ด๋ ธ์ต๋๋ค. <span class='danger'>(+{shap_vals[2]:.2f})</span></li>"
|
| 328 |
+
else: xai_reasoning += f"<li><b>์์ฝ์ฒ ํฉํธ์ฒดํฌ(X3) ๊ฒฐ๊ณผ ๊ท์ ์๋ฐ ์์ง๊ฐ ๋ฐ๊ฒฌ๋์ง ์์</b> ์ต์ข
์ ์ผ๋ก ์์ ํ๋ค๊ณ ํ๊ฒฐํ์ต๋๋ค. <span class='highlight'>({shap_vals[2]:.2f})</span></li>"
|
| 329 |
+
xai_reasoning += "</ul>"
|
| 330 |
+
|
| 331 |
+
# [๋ฒกํฐ ๊ณต๊ฐ ์๊ฐํ ๋ฐ์ดํฐ] ์ ์ฌ๋๊ฐ ๋ฎ์์๋ก ๋ ์ ์ ๊ฑฐ๋ฆฌ๊ฐ ๋ฉ์ด์ง
|
| 332 |
+
# ์ฝ์ฌ์ธ ์ ์ฌ๋ ์ ์๋ฅผ ๊ธฐ๋ฐ์ผ๋ก ์์์ x, y ์ขํ ์ฐ์ถ
|
| 333 |
+
distance = max(10, 100 - x2_score) # 0์ ๊ฐ๊น์ธ์๋ก ์ํ(๊ฐ๊น์)
|
| 334 |
+
vector_data = {"x": distance * 0.8, "y": distance * 0.9}
|
| 335 |
+
|
| 336 |
+
message = "๐จ ์
์์ ์ธ ํ์/๊ณผ๋๊ด๊ณ ๋ก ์์ฌ๋ฉ๋๋ค." if final_score > 70 else ("โ ๏ธ ์ผ๋ถ ๊ณผ์ฅ๋ ๋ด์ฉ์ด ํฌํจ๋์ด ์์ต๋๋ค." if final_score > 40 else "โ
๊ณผ๋๊ด๊ณ ์์ง๊ฐ ์ ์ ์์ ํ ์ฝํ
์ธ ์
๋๋ค.")
|
| 337 |
+
|
| 338 |
+
return {
|
| 339 |
+
"status": "success",
|
| 340 |
+
"final_score": float(round(final_score, 2)),
|
| 341 |
+
"message": message,
|
| 342 |
+
"x1_details": x1_details,
|
| 343 |
+
"x2_details": x2_details,
|
| 344 |
+
"x3_details": x3_details,
|
| 345 |
+
"xai_reasoning": xai_reasoning,
|
| 346 |
+
"vector_data": vector_data
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
except Exception as e:
|
| 350 |
+
return {"status": "error", "error": str(e)}
|
| 351 |
+
|
| 352 |
+
if __name__ == "__main__":
|
| 353 |
+
uvicorn.run("app:app", host="127.0.0.1", port=8000, reload=True)
|
myapp/main_pipeline.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from step0_ingestion import DataIngestionPipeline
|
| 2 |
+
from step1_lexical import LexicalAnalyzer
|
| 3 |
+
from step2_semantic import SemanticAnalyzer
|
| 4 |
+
from step3_rag import FactCheckerRAG
|
| 5 |
+
from step4_xai import XAIScorer # ๐ Step 4 ์ถ๊ฐ!
|
| 6 |
+
|
| 7 |
+
def run_full_pipeline():
|
| 8 |
+
print("==================================================")
|
| 9 |
+
print(" ๐ [AtoZ ํ์ดํ๋ผ์ธ] ๊ณผ๋๊ด๊ณ ํ์ง AI ์์คํ
๊ฐ๋")
|
| 10 |
+
print("==================================================\n")
|
| 11 |
+
|
| 12 |
+
# 1. ๋ชจ๋ธ ๋ฐ ์์ง ์ด๊ธฐํ
|
| 13 |
+
ingestion = DataIngestionPipeline()
|
| 14 |
+
lexical = LexicalAnalyzer()
|
| 15 |
+
semantic = SemanticAnalyzer()
|
| 16 |
+
rag_checker = FactCheckerRAG()
|
| 17 |
+
xai_scorer = XAIScorer() # ๐ XAI ์์ง ๊ฐ๋!
|
| 18 |
+
|
| 19 |
+
target_video_url = "https://youtu.be/SJxSDRxd8Dc?si=t8dnIQciFulUlbVW"
|
| 20 |
+
target_product_url = "https://brand.naver.com/pacsafe/products/9365045491"
|
| 21 |
+
|
| 22 |
+
# [Step 0] ๋ฐ์ดํฐ ์์ง
|
| 23 |
+
print("\nโถ๏ธ [Step 0] ๋ฉํฐ๋ชจ๋ฌ ๋ฐ์ดํฐ ์์ง ์ค...")
|
| 24 |
+
stt_text = ""
|
| 25 |
+
try:
|
| 26 |
+
audio_file = ingestion.extract_audio_from_video(target_video_url)
|
| 27 |
+
if audio_file:
|
| 28 |
+
stt_text = ingestion.run_stt(audio_file)
|
| 29 |
+
except Exception as e:
|
| 30 |
+
print(f"โ ๏ธ ์ ํ๋ธ ์ถ์ถ ์คํจ (๋ฌด์ํ๊ณ ์งํ): {e}")
|
| 31 |
+
|
| 32 |
+
ocr_text = ""
|
| 33 |
+
try:
|
| 34 |
+
ocr_text = ingestion.run_ocr_from_web(target_product_url)
|
| 35 |
+
except Exception as e:
|
| 36 |
+
print(f"โ ๏ธ OCR ์ถ์ถ ์คํจ: {e}")
|
| 37 |
+
|
| 38 |
+
combined_text = f"{stt_text}\n{ocr_text}"
|
| 39 |
+
if len(combined_text) == 0:
|
| 40 |
+
print("โ ๋ถ์ํ ํ
์คํธ๊ฐ ์์ด ์ข
๋ฃํฉ๋๋ค.")
|
| 41 |
+
return
|
| 42 |
+
|
| 43 |
+
# [Step 1, 2, 3] ํ
์คํธ ์ฌ์ธต ๋ถ์
|
| 44 |
+
print("\nโถ๏ธ [Step 1, 2, 3] ํ
์คํธ ์ฌ์ธต ๋ถ์ ๊ฐ๋...")
|
| 45 |
+
|
| 46 |
+
x1_score = lexical.calculate_x1_score(combined_text)
|
| 47 |
+
x2_score = semantic.calculate_x2_score(combined_text)
|
| 48 |
+
x3_score, matched_fact = rag_checker.calculate_x3_score(combined_text)
|
| 49 |
+
|
| 50 |
+
# ๐ [Step 4] XGBoost ์ค์ฝ์ด๋ง ๋ฐ SHAP ๋ถ์
|
| 51 |
+
print("\nโถ๏ธ [Step 4] XGBoost ๊ธฐ๋ฐ ์ต์ข
์ค์ฝ์ด๋ง ๋ฐ SHAP ์ค๋ช
์์ฑ...")
|
| 52 |
+
final_score, shap_vals, base_value = xai_scorer.calculate_final_score_and_explain(x1_score, x2_score, x3_score)
|
| 53 |
+
|
| 54 |
+
# =====================================================================
|
| 55 |
+
# ๐ ์ต์ข
๋ถ์ ๋ฆฌํฌํธ (XAI ์ค๋ช
ํฌํจ)
|
| 56 |
+
# =====================================================================
|
| 57 |
+
print("\n==================================================")
|
| 58 |
+
print(" ๐ [์ต์ข
๊ณผ๋๊ด๊ณ ํ์ง ๋ฆฌํฌํธ]")
|
| 59 |
+
print("==================================================")
|
| 60 |
+
print(f" ๐น [X1] ํํ์/๋จ์ด ์๋ฐ (Lexical) : {x1_score:5.1f} ์ ")
|
| 61 |
+
print(f" ๐น [X2] ๋ฌธ๋งฅ์ ์ ์ฌ๋ (Semantic) : {x2_score:5.1f} ์ ")
|
| 62 |
+
print(f" ๐น [X3] ์์ฝ์ฒ ํฉํธ์ฒดํฌ (RAG) : {x3_score:5.1f} ์ ")
|
| 63 |
+
print("--------------------------------------------------")
|
| 64 |
+
print(f" ๐ฏ [S] ๋จธ์ ๋ฌ๋ ์ต์ข
์ํ๋ ์ ์ : {final_score:5.1f} / 100.0 ์ ")
|
| 65 |
+
print("==================================================")
|
| 66 |
+
|
| 67 |
+
if final_score > 70:
|
| 68 |
+
print(" ๐จ [ํ์ ] ๋งค์ฐ ์ํ! ์
์์ ์ธ ํ์/๊ณผ๋๊ด๊ณ ๋ก ์์ฌ๋ฉ๋๋ค.")
|
| 69 |
+
elif final_score > 40:
|
| 70 |
+
print(" โ ๏ธ [ํ์ ] ์ฃผ์! ์ผ๋ถ ๊ณผ์ฅ๋ ํํ์ด๋ ์ฌ์ค๊ณผ ๋ค๋ฅธ ๋ด์ฉ์ด ํฌํจ๋์ด ์์ต๋๋ค.")
|
| 71 |
+
else:
|
| 72 |
+
print(" โ
[ํ์ ] ์์ ! ๊ณผ๋๊ด๊ณ ์์ง๊ฐ ์ ์ ์ ์์ ์ธ ์ฝํ
์ธ ์
๋๋ค.")
|
| 73 |
+
|
| 74 |
+
print("\n==================================================")
|
| 75 |
+
print(" ๐ค [XAI] ์ธ๊ณต์ง๋ฅ์ ํ์ ์ฌ์ (SHAP Values)")
|
| 76 |
+
print("==================================================")
|
| 77 |
+
print(" (์ ์๊ฐ ์์(+)๋ฉด ์ํ๋๋ฅผ ๋์๊ณ , ์์(-)๋ฉด ์์ ํ๋ค๊ณ ํ๋จํ ๊ทผ๊ฑฐ์
๋๋ค.)\n")
|
| 78 |
+
|
| 79 |
+
features = ["X1 (๋จ์ด ์๋ฐ)", "X2 (๋ฌธ๋งฅ ์ ์ฌ๋)", "X3 (ํฉํธ์ฒดํฌ ๋ชจ์)"]
|
| 80 |
+
for i, feature_name in enumerate(features):
|
| 81 |
+
impact = shap_vals[i]
|
| 82 |
+
direction = "๐ด ์ํ๋ ์ฆ๊ฐ" if impact > 0 else "๐ข ์ํ๋ ๊ฐ์"
|
| 83 |
+
# ์ง๊ด์ฑ์ ์ํด SHAP Log-odds ๊ฐ์ ์ ์ ์ค์ผ์ผ์ฒ๋ผ ๋น๋กํด์ ๋ณด์ฌ์ค๋๋ค
|
| 84 |
+
print(f" {direction} ๊ธฐ์ฌ: {feature_name:<18} -> ๊ธฐ์ฌ๋: {impact:+.2f}")
|
| 85 |
+
print("==================================================")
|
| 86 |
+
|
| 87 |
+
if __name__ == "__main__":
|
| 88 |
+
run_full_pipeline()
|
myapp/requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
pydantic
|
| 4 |
+
requests
|
| 5 |
+
playwright
|
| 6 |
+
paddleocr
|
| 7 |
+
paddlepaddle
|
| 8 |
+
torch
|
| 9 |
+
transformers
|
| 10 |
+
sentence-transformers
|
| 11 |
+
xgboost
|
| 12 |
+
shap
|
| 13 |
+
openai
|
myapp/step0_ingestion.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import gc
|
| 3 |
+
import requests
|
| 4 |
+
import torch
|
| 5 |
+
import whisper
|
| 6 |
+
from pytubefix import YouTube
|
| 7 |
+
from paddleocr import PaddleOCR
|
| 8 |
+
from playwright.sync_api import sync_playwright
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
class DataIngestionPipeline:
|
| 12 |
+
def __init__(self):
|
| 13 |
+
self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
|
| 14 |
+
print(f"โ
์ฌ์ฉ ์ค์ธ ๋๋ฐ์ด์ค: {self.device}")
|
| 15 |
+
|
| 16 |
+
def clear_memory(self):
|
| 17 |
+
gc.collect()
|
| 18 |
+
if torch.backends.mps.is_available():
|
| 19 |
+
torch.mps.empty_cache()
|
| 20 |
+
print("๐งน ๋ฉ๋ชจ๋ฆฌ ์ ๋ฆฌ ์๋ฃ")
|
| 21 |
+
|
| 22 |
+
def extract_audio_from_video(self, video_url, output_filename="temp_audio"):
|
| 23 |
+
print(f"\n๐ฅ [1] ์ ํ๋ธ ์์ ๋ค์ด๋ก๋ ์์: {video_url}")
|
| 24 |
+
try:
|
| 25 |
+
yt = YouTube(video_url, 'WEB')
|
| 26 |
+
audio_stream = yt.streams.get_audio_only()
|
| 27 |
+
file_path = audio_stream.download(filename=f"{output_filename}.mp4")
|
| 28 |
+
print(f"โ
์ค๋์ค ์ถ์ถ ์๋ฃ: {file_path}")
|
| 29 |
+
return file_path
|
| 30 |
+
except Exception as e:
|
| 31 |
+
print(f"โ ๋ค์ด๋ก๋ ์คํจ: {e}")
|
| 32 |
+
return None
|
| 33 |
+
|
| 34 |
+
def run_stt(self, audio_path):
|
| 35 |
+
print(f"\n๐ฃ๏ธ [2] STT(์์ฑ->ํ
์คํธ) ๋ณํ ์์: {audio_path}")
|
| 36 |
+
model = whisper.load_model("small", device="cpu")
|
| 37 |
+
result = model.transcribe(audio_path, language="ko", fp16=False)
|
| 38 |
+
|
| 39 |
+
if result is None:
|
| 40 |
+
raise ValueError("Whisper๊ฐ ํ
์คํธ๋ฅผ ๋ฐํํ์ง ๋ชปํ์ต๋๋ค.")
|
| 41 |
+
|
| 42 |
+
text_result = result.get("text", "")
|
| 43 |
+
|
| 44 |
+
del model
|
| 45 |
+
self.clear_memory()
|
| 46 |
+
|
| 47 |
+
print("โ
STT ๋ณํ ์๋ฃ")
|
| 48 |
+
return text_result.strip()
|
| 49 |
+
|
| 50 |
+
def run_ocr_from_web(self, product_url):
|
| 51 |
+
print(f"\n๐ผ๏ธ [3] ์นํ์ด์ง ์ ์ ๋ฐ ์ด๋ฏธ์ง OCR ์์: {product_url}")
|
| 52 |
+
|
| 53 |
+
raw_image_urls = []
|
| 54 |
+
with sync_playwright() as p:
|
| 55 |
+
print(" -> ๋ธ๋ผ์ฐ์ ์ฐฝ์ ๋์ฐ๊ณ ํ์ด์ง ๋ก๋ฉ์ ๊ธฐ๋ค๋ฆฝ๋๋ค...")
|
| 56 |
+
browser = p.chromium.launch(headless=False)
|
| 57 |
+
page = browser.new_page()
|
| 58 |
+
page.goto(product_url, wait_until="domcontentloaded", timeout=60000)
|
| 59 |
+
page.wait_for_timeout(3000)
|
| 60 |
+
|
| 61 |
+
print(" -> ๐ฏ ์จ๊ฒจ์ง ์์ธํ์ด์ง๋ฅผ ์ด๊ธฐ ์ํด '๋๋ณด๊ธฐ' ๋ฒํผ์ ์ฐพ์ต๋๋ค...")
|
| 62 |
+
try:
|
| 63 |
+
more_btn = page.locator('button:has-text("์์ธ์ ๋ณด ํผ์ณ๋ณด๊ธฐ"), button:has-text("์์ธ์ค๋ช
๋๋ณด๊ธฐ"), a:has-text("๋๋ณด๊ธฐ")').first
|
| 64 |
+
if more_btn.is_visible(timeout=3000):
|
| 65 |
+
more_btn.click()
|
| 66 |
+
print(" => ์พ
! '๋๋ณด๊ธฐ' ๋ฒํผ์ ์ฑ๊ณต์ ์ผ๋ก ํด๋ฆญํ์ต๋๋ค!")
|
| 67 |
+
page.wait_for_timeout(2000)
|
| 68 |
+
except Exception:
|
| 69 |
+
print(" => '๋๋ณด๊ธฐ' ๋ฒํผ์ด ์๊ฑฐ๋ ์ด๋ฏธ ํผ์ณ์ ธ ์์ต๋๋ค. ๊ทธ๋๋ก ์งํํฉ๋๋ค.")
|
| 70 |
+
|
| 71 |
+
print(" -> ์ง์ฐ ๋ก๋ฉ(Lazy-loading)๋ ์ด๋ฏธ์ง๋ฅผ ๋ถ๋ฌ์ค๊ธฐ ์ํด ์คํฌ๋กค์ ๋ด๋ฆฝ๋๋ค...")
|
| 72 |
+
for _ in range(10):
|
| 73 |
+
page.evaluate("window.scrollBy(0, 1500)")
|
| 74 |
+
page.wait_for_timeout(1000)
|
| 75 |
+
|
| 76 |
+
img_elements = page.query_selector_all('img')
|
| 77 |
+
for img in img_elements:
|
| 78 |
+
src = img.get_attribute('data-src') or img.get_attribute('src')
|
| 79 |
+
if src and ('http' in src or src.startswith('//')):
|
| 80 |
+
if src.startswith('//'):
|
| 81 |
+
src = 'https:' + src
|
| 82 |
+
raw_image_urls.append(src)
|
| 83 |
+
browser.close()
|
| 84 |
+
|
| 85 |
+
valid_urls = []
|
| 86 |
+
for url in raw_image_urls:
|
| 87 |
+
url_lower = url.lower()
|
| 88 |
+
if not any(x in url_lower for x in ['.gif', 'icon', 'logo', 'blank', 'svg', 'thumb']):
|
| 89 |
+
valid_urls.append(url)
|
| 90 |
+
|
| 91 |
+
if not valid_urls:
|
| 92 |
+
print("โ ์ ํจํ ์์ธ ์ด๋ฏธ์ง๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
| 93 |
+
return ""
|
| 94 |
+
|
| 95 |
+
print(f"โ
์ด {len(valid_urls)}๊ฐ์ ์ด๋ฏธ์ง ๋ฐ๊ฒฌ! ์ง์ง ์์ธ ์ด๋ฏธ์ง๋ฅผ ํ์ํฉ๋๋ค...")
|
| 96 |
+
|
| 97 |
+
logging.getLogger('ppocr').setLevel(logging.ERROR)
|
| 98 |
+
|
| 99 |
+
# ๐ ํด์๋ ํ๊ณ์น๋ฅผ ๋ํญ ๋๋ฆฐ ์ต์ ์ธํ
์ ์ฉ
|
| 100 |
+
ocr = PaddleOCR(
|
| 101 |
+
lang='korean',
|
| 102 |
+
text_det_limit_side_len=2048,
|
| 103 |
+
text_det_limit_type='max'
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
all_extracted_text = []
|
| 107 |
+
|
| 108 |
+
headers = {
|
| 109 |
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 110 |
+
'Referer': product_url
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
processed_count = 0
|
| 114 |
+
|
| 115 |
+
for i, img_url in enumerate(valid_urls[2:]):
|
| 116 |
+
if processed_count >= 3:
|
| 117 |
+
break
|
| 118 |
+
|
| 119 |
+
temp_img_path = f"temp_ocr_{i}.jpg"
|
| 120 |
+
try:
|
| 121 |
+
response = requests.get(img_url, headers=headers, timeout=10)
|
| 122 |
+
with open(temp_img_path, 'wb') as f:
|
| 123 |
+
f.write(response.content)
|
| 124 |
+
|
| 125 |
+
if os.path.getsize(temp_img_path) < 30000:
|
| 126 |
+
if os.path.exists(temp_img_path): os.remove(temp_img_path)
|
| 127 |
+
continue
|
| 128 |
+
|
| 129 |
+
processed_count += 1
|
| 130 |
+
file_kb = os.path.getsize(temp_img_path) // 1024
|
| 131 |
+
print(f" -> [์ง์ง ํ
์คํธ ํ์ ์ค...] ๋ฌต์งํ ์์ธ ์ด๋ฏธ์ง ๋ฐ๊ฒฌ! ({file_kb}KB)")
|
| 132 |
+
|
| 133 |
+
result = ocr.ocr(temp_img_path)
|
| 134 |
+
|
| 135 |
+
# ๐ ๋ฐฉ๊ธ ํ์ธํ ์๋ฒฝํ ๋ฐ์ดํฐ ์ถ์ถ ๋ก์ง ์ ์ฉ!
|
| 136 |
+
if result and isinstance(result[0], dict) and 'rec_texts' in result[0]:
|
| 137 |
+
texts = result[0]['rec_texts']
|
| 138 |
+
print(f" => โจ ํ
์คํธ {len(texts)}์ค ์ถ์ถ ์ฑ๊ณต!")
|
| 139 |
+
all_extracted_text.extend(texts)
|
| 140 |
+
if os.path.exists(temp_img_path): os.remove(temp_img_path)
|
| 141 |
+
else:
|
| 142 |
+
print(f" => โ ๏ธ ๊ธ์๊ฐ ์์ต๋๋ค! (์ด๋ฏธ์ง ํ์ธ: {temp_img_path})")
|
| 143 |
+
|
| 144 |
+
except Exception as e:
|
| 145 |
+
print(f"โ ๏ธ ์ด๋ฏธ์ง ์ฒ๋ฆฌ ์ค ์ค๋ฅ ๋ฐ์: {e}")
|
| 146 |
+
if os.path.exists(temp_img_path): os.remove(temp_img_path)
|
| 147 |
+
|
| 148 |
+
del ocr
|
| 149 |
+
self.clear_memory()
|
| 150 |
+
|
| 151 |
+
final_text = "\n".join(all_extracted_text) # ์ค๋ฐ๊ฟ์ผ๋ก ๊น๋ํ๊ฒ ํฉ์น๊ธฐ
|
| 152 |
+
print("\nโ
์นํ์ด์ง ์ด๋ฏธ์ง OCR ๋ณํ ์๋ฃ!")
|
| 153 |
+
return final_text
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# ==========================================
|
| 157 |
+
# ์ค์ ์คํ ํ
์คํธ ์ฝ๋
|
| 158 |
+
# ==========================================
|
| 159 |
+
if __name__ == "__main__":
|
| 160 |
+
pipeline = DataIngestionPipeline()
|
| 161 |
+
|
| 162 |
+
# 1. ์ ํ๋ธ STT ํ
์คํธ
|
| 163 |
+
test_video_url = "https://youtu.be/SJxSDRxd8Dc?si=t8dnIQciFulUlbVW"
|
| 164 |
+
try:
|
| 165 |
+
audio_file = pipeline.extract_audio_from_video(test_video_url)
|
| 166 |
+
if audio_file:
|
| 167 |
+
stt_text = pipeline.run_stt(audio_file)
|
| 168 |
+
print(f"\n[STT ๊ฒฐ๊ณผ (์์ฑ -> ํ
์คํธ)]\n{stt_text[:500]}...\n")
|
| 169 |
+
except Exception as e:
|
| 170 |
+
print(f"์ ํ๋ธ/STT ์ฒ๋ฆฌ ์ค ์๋ฌ ๋ฐ์: {e}")
|
| 171 |
+
|
| 172 |
+
# 2. ์นํ์ด์ง OCR ํ
์คํธ
|
| 173 |
+
test_product_url = "https://brand.naver.com/pacsafe/products/9365045491"
|
| 174 |
+
try:
|
| 175 |
+
ocr_text = pipeline.run_ocr_from_web(test_product_url)
|
| 176 |
+
print(f"\n[์นํ์ด์ง OCR ๊ฒฐ๊ณผ (์์ธ ์ด๋ฏธ์ง -> ํ
์คํธ)]\n===========================\n{ocr_text}\n===========================")
|
| 177 |
+
except Exception as e:
|
| 178 |
+
print(f"OCR ์ฒ๋ฆฌ ์ค ์๋ฌ ๋ฐ์: {e}")
|
myapp/step1_lexical.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from mecab import MeCab
|
| 3 |
+
|
| 4 |
+
class LexicalAnalyzer:
    """Lexicon-driven exaggeration scorer (X1) for Korean ad copy.

    Tokenizes text with Mecab, matches weighted red-flag keywords, and
    acquits keywords that appear next to a negation cue ("no side effects").

    NOTE(review): multi-morpheme lexicon keys (e.g. "๊ฐ์ฅ ์ข์", "100%") may
    never equal a single Mecab morpheme and thus never match — confirm
    against real tokenizer output.
    """

    def __init__(self):
        print("โ Mecab ํํ์ ๋ถ์๊ธฐ๋ฅผ ๋ก๋ํฉ๋๋ค...")
        self.mecab = MeCab()

        # Weighted red-flag lexicon (IDF-like emphasis): the higher the
        # weight, the more damning a single occurrence is.
        self.lexicon = {
            "์น๋ฃ": 2.0, "์๋ฐฉ": 2.0, "์์น": 2.0, "ํญ์": 2.0, "ํนํจ": 2.0,
            "100%": 1.5, "๋ง๋ณํต์น": 2.0, "๊ธฐ์ ": 1.5, "๋จ์จ์": 1.5,
            "์ฃผ๋ฌธ์๋": 1.0, "๋จ์ฒด์ถ์ฒ": 1.0, "ํน์์ ๋ฒ": 1.0,  # deceptive-ad cues
            "์ต๊ณ ": 1.0, "๊ฐ์ฅ ์ข์": 1.0, "๋์": 1.0, "๋ถ์์ฉ": 1.5,
            "์ฒดํ๊ธฐ": 1.5, "์ฒดํ์ฌ๋ก": 1.5,  # testimonial-marketing cues
        }

        # Negation vocabulary: presence near a keyword acquits the sentence.
        self.negation_words = {"์", "์", "์๋", "๋ฌด", "์", "๋ชป"}

    def split_into_sentences(self, text):
        """Split *text* into sentences to avoid length bias in scoring."""
        # Sentence boundaries: runs of period / ! / ? / newline.
        pieces = re.split(r'[.!?\n]+', text)
        return [piece.strip() for piece in pieces if len(piece.strip()) > 2]

    def check_negation_context(self, tokens, target_index, window_size=3):
        """Return True when a negation cue follows the flagged morpheme.

        Inspects up to *window_size* morphemes after *target_index*; a known
        negation word, or an adjective/auxiliary/adverb POS tag (VA/VX/MAG,
        e.g. '์๋ค'), counts as negation.
        """
        stop = min(target_index + window_size + 1, len(tokens))
        return any(
            word in self.negation_words or pos in ('VA', 'VX', 'MAG')
            for word, pos in tokens[target_index + 1 : stop]
        )

    def calculate_x1_score(self, text):
        """Compute the lexical exaggeration score X1 on a 0-100 scale."""
        if not text:
            return 0.0

        print("\n๐ [Step 1] ํ์คํธ ๊ณผ์ฅ๋(Lexical Score) ๋ถ์ ์์...")

        sentences = self.split_into_sentences(text)
        total_sentences = len(sentences)
        if total_sentences == 0:
            return 0.0

        total_penalty = 0.0
        detected_issues = []

        # Per-sentence scoring so long texts are not penalised for length alone.
        for sentence in sentences:
            morphs = self.mecab.pos(sentence)
            hit_in_sentence = False

            for idx, (word, _pos) in enumerate(morphs):
                if word not in self.lexicon:
                    continue
                # Negation context ("๋ถ์์ฉ์ด ์์ต๋๋ค") must not be punished.
                if self.check_negation_context(morphs, idx):
                    detected_issues.append(f"๐ก๏ธ ๋ฌด์ฃ(๋ถ์ ์ด ๋๋ฐ): '{word}' (๋ฌธ์ฅ: {sentence})")
                else:
                    weight = self.lexicon[word]
                    total_penalty += weight
                    hit_in_sentence = True
                    detected_issues.append(f"๐จ ์ ๋ฐ: '{word}' (๊ฐ์ค์น: +{weight})")

            # Flat per-sentence surcharge so many hits in one sentence
            # do not explode the total.
            if hit_in_sentence:
                total_penalty += 0.5

        # Normalise by sentence count, scale toward 0-100 and clamp.
        raw_score = (total_penalty / total_sentences) * 50
        x1_score = min(raw_score, 100.0)

        # Human-readable report.
        print(f"\n[๋ถ์ ๋ฆฌํฌํธ]")
        print(f" - ์ ์ฒด ๋ฌธ์ฅ ์: {total_sentences}๋ฌธ์ฅ")
        for issue in detected_issues:
            print(f" {issue}")

        print(f"๐ ์ต์ข X1 ์ ์: {x1_score:.2f} / 100.0 ์ ")
        return x1_score
|
| 94 |
+
|
| 95 |
+
# ==========================================
# Manual smoke-test entry point
# ==========================================
if __name__ == "__main__":
    analyzer = LexicalAnalyzer()

    # (leading-newline, banner title, sample text) per demo case;
    # the second case prefixes a blank line exactly like the original output.
    demo_cases = [
        ("", "ํ์คํธ 1: ๋ถ์ ์ด๊ฐ ํฌํจ๋ ์ ์์ ์ธ ๊ด๊ณ (์คํ ๋ฐฉ์ง ํ์คํธ)",
         "์ด ์ ํ์ ์์ฝ์ฒ ์ธ์ฆ์ ๋ฐ์์ต๋๋ค. ํผ๋ถ ํธ๋ฌ๋ธ์ด๋ ๋ถ์์ฉ์ด ์ ํ ์์ต๋๋ค. ์์ฌํ๊ณ ์ฌ์ฉํ์ธ์."),
        ("\n", "ํ์คํธ 2: ๊ทน๋จ์ ์ธ ๊ณผ๋๊ด๊ณ (ํ์ ํ์ง ํ์คํธ)",
         "๋จ ์ผ์ฃผ์ผ ๋ง์ ์ง๋ฐฉ์ด 100% ๋ถํด๋๋ ๊ธฐ์ ์ ๊ฒฝํํ์ธ์! ์ด๊ฒ์ ์๋ ์์นํ๋ ๋ง๋ณํต์น ์ฝ์๋๋ค. ๋ฌด์กฐ๊ฑด ๊ตฌ๋งคํ์ธ์. ์ต๊ณ ์ ์ ํ์๋๋ค."),
    ]
    for lead, title, body in demo_cases:
        print(lead + "==================================================")
        print(title)
        analyzer.calculate_x1_score(body)
|
myapp/step2_semantic.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn.functional as F
|
| 3 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 4 |
+
import gc
|
| 5 |
+
|
| 6 |
+
class SemanticAnalyzer:
    """Deep-learning semantic scorer (X2) built on KoELECTRA.

    Ensembles (a) the classifier's softmax probability of the "exaggerated"
    label with (b) the maximum cosine similarity against a small corpus of
    known-bad reference ad sentences.
    """

    def __init__(self):
        print("๐ง [Step 2] ์๋ฏธ๋ก ์ ๋ฅ๋ฌ๋ ๋ถ์๊ธฐ(KoELECTRA)๋ฅผ ๋ก๋ํฉ๋๋ค...")

        # Prefer the Apple-Silicon GPU (MPS) when present, else CPU.
        self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

        # Lightweight, strong Korean encoder.
        self.model_name = "monologg/koelectra-base-v3-discriminator"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # output_hidden_states=True lets a single forward pass yield BOTH the
        # classification logits and the [CLS] vector used for similarity.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=2,  # 0: normal, 1: exaggerated ad
            output_hidden_states=True
        ).to(self.device)
        self.model.eval()  # inference mode

        # [Vector A] Reference sentences of previously caught bad ads (samples).
        self.reference_bad_texts = [
            "๋จ ์ผ์ฃผ์ผ ๋ง์ ์ง๋ฐฉ์ด 100% ๋ถํด๋๋ ๊ธฐ์ ์ ํฌ๋ฆผ",
            "์์ฝ์ฒ์์ ์ธ์ฆํ ๋ง๋ณํต์น์ฝ, ์์ธํฌ ์๋ฒฝ ์ ๊ฑฐ",
            "์ด๊ฒ๋ง ๋จน์ผ๋ฉด ๋์๊ฐ ๋ฐฐ์ถ๋๊ณ ์ธํฌ๊ฐ ์ฆ๊ฐ ์ฌ์๋ฉ๋๋ค",
            "์์ฌ๋ค์ด ๋ฌด์กฐ๊ฑด ์ถ์ฒํ๋ ๋ถ์์ฉ ์๋ ์์น์ "
        ]
        print(" -> ๋ ํผ๋ฐ์ค ํ์๊ด๊ณ ๋ฌธ์ฅ๋ค์ ๋ฒกํฐ(Vector) ๊ณต๊ฐ์ ๋ฐฐ์น ์ค...")
        self.reference_embeddings = self._get_embeddings(self.reference_bad_texts)

    def clear_memory(self):
        """Garbage-collect and drop the MPS cache to limit memory growth."""
        gc.collect()
        if torch.backends.mps.is_available():
            torch.mps.empty_cache()

    def _get_embeddings(self, texts):
        """Encode *texts* and return their [CLS] sentence-embedding vectors."""
        batch = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(self.device)
        with torch.no_grad():
            model_out = self.model(**batch)
        # [CLS] token (index 0) of the last hidden layer = sentence vector.
        return model_out.hidden_states[-1][:, 0, :]

    def calculate_x2_score(self, text):
        """Return the semantic exaggeration score X2 (0-100) for *text*."""
        # Guard: nothing meaningful to score.
        if not text or len(text.strip()) < 5:
            return 0.0

        print("\n๐ง [Step 2] ๋ฅ๋ฌ๋ ๋ฌธ๋งฅ ๋ฐ ์ ์ฌ๋(Semantic) ๋ถ์ ์์...")

        # Truncate to the PLM's 512-token limit.
        enc = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)
        with torch.no_grad():  # no backprop -> lighter on memory
            out = self.model(**enc)

        # (1) Classification probability via softmax over the two logits.
        raw_logits = out.logits
        softmaxed = F.softmax(raw_logits, dim=-1)
        prob_fake = softmaxed[0][1].item()  # P(label == 1, exaggerated)

        # (2) Cosine similarity of this text's [CLS] vector against the
        #     reference bad-ad vectors; keep the best match.
        query_vec = out.hidden_states[-1][:, 0, :]
        sims = F.cosine_similarity(query_vec, self.reference_embeddings)
        max_sim = torch.max(sims).item()

        # Release tensors promptly, then flush caches.
        del enc, out, raw_logits, softmaxed, query_vec, sims
        self.clear_memory()

        # Ensemble: similarity dominates (80%) because the classification
        # head is not fine-tuned for this task and its softmax is near-random.
        classification_score = prob_fake * 100
        similarity_score = max(max_sim, 0) * 100  # clamp negative similarity to 0
        x2_score = (similarity_score * 0.8) + (classification_score * 0.2)

        print(f" -> ๐ ๋ชจ๋ธ ๋ถ๋ฅ ํ๋ฅ (Softmax): {prob_fake*100:.1f}%")
        print(f" -> ๐ ์ต๋ ๋ฌธ๋งฅ ์ ์ฌ๋ (Cosine Sim): {max_sim*100:.1f}%")
        print(f"๐ ์ต์ข X2 ์ ์ (0~100): {x2_score:.2f}์ ")

        return x2_score
|
| 107 |
+
|
| 108 |
+
# ==========================================
# Standalone smoke test
# ==========================================
if __name__ == "__main__":
    scorer = SemanticAnalyzer()

    # A sentence semantically close to the reference corpus ("๋จ ์ผ์ฃผ์ผ ๋ง์...").
    sample_text = "๋จ 7์ผ๋ง ํฌ์ํ์ธ์! ์ง๋ฐฉ์ด ์์ ํ ํ๊ดด๋๋ ๋๋ผ์ด ๋ง๋ฒ์ ๊ฒช๊ฒ ๋ฉ๋๋ค."
    scorer.calculate_x2_score(sample_text)
|
myapp/step3_rag.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import torch
|
| 4 |
+
import numpy as np
|
| 5 |
+
from openai import OpenAI
|
| 6 |
+
from sentence_transformers import SentenceTransformer, util
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
# Load OPENAI_API_KEY (and any other variables) from the local .env file
# into the process environment before the client below reads it.
load_dotenv()
|
| 11 |
+
|
| 12 |
+
class FactCheckerRAG:
    """RAG-style fact checker (X3).

    Retrieves the regulation closest to the ad text from a small fact DB,
    then asks an LLM to judge the text against that regulation and parses
    a 0-100 violation score out of the reply.
    """

    def __init__(self):
        print("๐ [Step 3] RAG + LLM ํฉํธ์ฒด์ปค๋ฅผ ๋ก๋ํฉ๋๋ค...")

        # Prefer Apple-Silicon GPU (MPS) when available, else CPU.
        self.device = "mps" if torch.backends.mps.is_available() else "cpu"

        # Korean sentence-embedding model used as the retriever.
        self.retriever = SentenceTransformer('jhgan/ko-sroberta-multitask', device=self.device)

        # OpenAI client; the key comes only from the environment (.env).
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            print("โ ๏ธ ๊ฒฝ๊ณ : OPENAI_API_KEY๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค. .env ํ์ผ์ ํ์ธํ์ธ์.")
        self.client = OpenAI(api_key=api_key)

        # Mini fact DB (regulator rules and precedents the LLM will cite).
        self.fact_db = [
            # 1. Absolute prohibitions
            "์ง๋ณ์ ์๋ฐฉ ๋ฐ ์น๋ฃ์ ํจ๋ฅยทํจ๊ณผ๊ฐ ์๊ฑฐ๋ ์์ฝํ ๋๋ ๊ฑด๊ฐ๊ธฐ๋ฅ์ํ์ผ๋ก ์ค์ธยทํผ๋ํ ์ฐ๋ ค๊ฐ ์๋ ํ์ยท๊ด๊ณ ๋ ๊ธ์ง๋ฉ๋๋ค.",
            "์ฒดํ๊ธฐ ๋ฑ์ ์ด์ฉํ๊ฑฐ๋ '์ฃผ๋ฌธ์๋', '๋จ์ฒด์ถ์ฒ' ๋ฑ ์๋น์๋ฅผ ๊ธฐ๋งํ๋ ๊ด๊ณ ๋ ์ฒ๋ฒ ๋์์๋๋ค.",
            "์ํ์ ๊ฐ์ข ์์ฅ, ์ธ์ฆ, ๋ณด์ฆ์ ๋ฐ์๋ค๋ ๋ด์ฉ์ ์ฌ์ฉํ๋ ๊ฒ์ ํ์ยท๊ณผ๋๊ด๊ณ ์ ํด๋นํ ์ ์์ต๋๋ค.",

            # 2. Permitted phrasing
            "์ธ์ฒด์ ๊ฑด์ ํ ์ฑ์ฅ ๋ฐ ๋ฐ๋ฌ๊ณผ ๊ฑด๊ฐ ์ ์ง์ ๋์์ ์ค๋ค๋ ํํ์ ํน์ ์ง๋ณ์ ์ธ๊ธํ์ง ์๋ ํ ํ์ฉ๋ฉ๋๋ค.",
            "๊ฑด๊ฐ์ฆ์ง, ์ฒด์ง๊ฐ์ , ์์ด์๋ฒ, ์์๋ณด๊ธ ๋ฑ์ ๋์์ ์ค๋ค๋ ํํ์ ๊ณผ๋๊ด๊ณ ๊ฐ ์๋๋๋ค.",
            "ํด๋น ์ ํ์ด ์ ์์, ํ์์ ๋ฑ ํน์์ฉ๋์ํ์ด๋ผ๋ ํํ์ ํ์ฉ๋ฉ๋๋ค.",

            # 3. Violation examples and precedents
            "์ผ๋ฐ ์ํ์ ๋น๋จ, ๊ณ ํ์, ํญ์ ๋ฑ ํน์ ์ง๋ณ ์น๋ฃ ํจ๊ณผ๊ฐ ์๋ค๊ณ ๊ธฐ์ฌํ๋ ๊ฒ์ ๋ช๋ฐฑํ ๋ถ๋ฒ์๋๋ค.",
            "๋ธ๋ก๊ทธ๋ ์ผํ๋ชฐ์ ์ง๋ณ ์น๋ฃ ์ ํ ๋น๊ต ์ฌ์ง์ด๋ ๊ฐ์ธ์ ์ธ ์ฒดํ๊ธฐ๋ฅผ ์ฌ๋ฆฌ๋ ํ์๋ ๋ถ๋ฒ ๊ณผ๋๊ด๊ณ ์๋๋ค."
        ]

        # Pre-compute DB embeddings once at startup.
        self.db_embeddings = self.retriever.encode(self.fact_db, convert_to_tensor=True)

    def calculate_x3_score(self, text):
        """Return (violation score 0-100, retrieved regulation) for *text*."""
        # Guard: too little text to audit.
        if not text or len(text.strip()) < 5:
            return 0.0, "๊ฒ์ฌํ ํ์คํธ๊ฐ ๋ถ์กฑํฉ๋๋ค."

        try:
            # 1. Retrieval: nearest regulation by cosine similarity.
            query_vec = self.retriever.encode(text, convert_to_tensor=True)
            sims = util.cos_sim(query_vec, self.db_embeddings)[0]
            retrieved_fact = self.fact_db[torch.argmax(sims).item()]

            # 2. Generation: ask GPT to judge using that regulation.
            prompt = f"""
        ๋น์ ์ ๋ํ๋ฏผ๊ตญ ์์ฝ์ฒ ๋ฐ ๊ณต์ ์์ ๊ณผ๋๊ด๊ณ ์ฌ์ฌ๊ด์๋๋ค.
        ์๋ [๊ด๋ จ ๊ท์ ]์ ๋ฐํ์ผ๋ก [๊ด๊ณ ํ์คํธ]์ ์๋ฐ ์ฌ๋ถ๋ฅผ ํ๋จํ์ธ์.

        [๊ด๋ จ ๊ท์ ]: {retrieved_fact}
        [๊ด๊ณ ํ์คํธ]: {text}

        ๋ฐ๋์ ์๋ ํ์์ผ๋ก๋ง ์๋ตํ์ธ์:
        ์ ์: [0~100 ์ฌ์ด ์ซ์]
        ์ฌ์ : [์๋ฐ์ธ ๊ฒฝ์ฐ ๊ตฌ์ฒด์ ๊ทผ๊ฑฐ, ์๋๋ฉด ํ์ฉ ๊ทผ๊ฑฐ๋ฅผ 1~2์ค๋ก ์ค๋ช]
        """

            print(" -> ๐ค GPT ์ฌ์ฌ๊ด์ด ๋ถ์ ์ค...")
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0  # deterministic judging
            )

            result_text = response.choices[0].message.content
            print(f" [๊ฒฐ๊ณผ] {result_text}")

            # Parse "์ ์: NN" out of the reply; default to 0 when absent.
            score_match = re.search(r"์ ์:\s*(\d+)", result_text)
            x3_score = float(score_match.group(1)) if score_match else 0.0
            return x3_score, retrieved_fact

        except Exception as e:
            # Best-effort: never crash the pipeline on API/model errors.
            print(f"โ ๏ธ ์๋ฌ ๋ฐ์: {e}")
            return 0.0, "๋ถ์ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค."
|
| 93 |
+
|
| 94 |
+
# Ensemble score calculator
def calculate_final_score(x1, x2, x3):
    """Weighted ensemble of the three sub-scores.

    x1 -- keyword-matching (lexical) score
    x2 -- deep-learning contextual score
    x3 -- RAG fact-check score
    """
    weight_lexical, weight_semantic, weight_rag = 0.2, 0.4, 0.4
    return (weight_lexical * x1) + (weight_semantic * x2) + (weight_rag * x3)
|
| 103 |
+
|
| 104 |
+
if __name__ == "__main__":
    checker = FactCheckerRAG()

    # NOTE(review): this sample literal was corrupted in transit (U+FFFD
    # characters); reconstructed as closely as the source allows.
    sample_ad = "์ด ์ฐจ๋ฅผ ๋ง์๋ฉด ์ ๋ฐฉ์ ๋ฌผ๋ก ๋น๋จ ์์น๊ฐ ์ฆ๊ฐ ๋จ์ด์ง๋๋ค!"
    score, fact = checker.calculate_x3_score(sample_ad)

    print("-" * 30)
    print(f"์ต์ข ์๋ฐ ์ ์: {score}")
    print(f"์ฐธ์กฐ ๊ท์ : {fact}")
|
myapp/step4_xai.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import xgboost as xgb
|
| 3 |
+
import shap
|
| 4 |
+
|
| 5 |
+
class XAIScorer:
    """Final ensemble scorer: XGBoost over (X1, X2, X3) plus SHAP explanations."""

    def __init__(self):
        print("๐ [Step 4] XGBoost ์์๋ธ ๋ชจ๋ธ ๋ฐ SHAP ์ค๋ช๊ธฐ(XAI) ๋ก๋ ์ค...")

        # 1. PoC: synthesize labelled training data (a real deployment would
        #    load a labelled DB). Features are the three 0-100 sub-scores
        #    X1 (lexical), X2 (contextual), X3 (fact-check).
        np.random.seed(42)
        X_train = np.random.rand(1000, 3) * 100

        # Synthetic ground truth: weighted sum above 50 => exaggerated (1),
        # otherwise normal (0).
        y_train = ((X_train[:, 0]*0.2 + X_train[:, 1]*0.4 + X_train[:, 2]*0.4) > 50).astype(int)

        # 2. Train the XGBoost classifier (logistic transform built in).
        self.model = xgb.XGBClassifier(
            n_estimators=50,
            max_depth=3,
            learning_rate=0.1,
            eval_metric='logloss',
            random_state=42
        )
        self.model.fit(X_train, y_train)

        # 3. SHAP TreeExplainer for per-feature attributions (XAI).
        self.explainer = shap.TreeExplainer(self.model)
        print(" -> ๐ง ๋จธ์ ๋ฌ๋ ์ค์ฝ์ด๋ง ์์ง ์ธํ ์๋ฃ!")

    def calculate_final_score_and_explain(self, x1, x2, x3):
        """Return (final score 0-100, SHAP values, SHAP base value) for one sample."""
        features = np.array([[x1, x2, x3]])

        # 1. Final score: predict_proba applies the sigmoid internally;
        #    take P(class 1 = exaggerated) on a 0-100 scale.
        final_score = self.model.predict_proba(features)[0][1] * 100

        # 2. XAI: per-feature SHAP attributions for this sample.
        shap_values = self.explainer.shap_values(features)

        # Some SHAP versions return a per-class list for binary models —
        # extract class 1's row either way.
        if isinstance(shap_values, list):
            shap_vals = shap_values[1][0]
        else:
            shap_vals = shap_values[0]

        # Base value (the model's average prediction); may likewise be
        # scalar or a per-class sequence depending on the SHAP version.
        expected = self.explainer.expected_value
        if isinstance(expected, (list, np.ndarray)):
            base_value = expected[1]
        else:
            base_value = expected

        return final_score, shap_vals, base_value
|