sungho7373 commited on
Commit
1ebb586
ยท
0 Parent(s):

Initial commit: clean code without secrets

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ .env
2
+ __pycache__/
3
+ .venv/.env
4
+ .venv/
README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # FakeAD-Detector
2
+ # FakeAD-Detector
3
+ # FakeAD-Detector
myapp/Procfile ADDED
File without changes
myapp/app.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware  # allow cross-origin calls (e.g. from Wix)
from pydantic import BaseModel
import uvicorn
import math

# AI pipeline building blocks (one engine per analysis step)
from step0_ingestion import DataIngestionPipeline
from step1_lexical import LexicalAnalyzer
from step2_semantic import SemanticAnalyzer
from step3_rag import FactCheckerRAG
from step4_xai import XAIScorer

# BUG FIX: the original module created three apps in a row — a FastAPI app
# configured with CORS, then a Flask app (whose mid-file
# `if __name__ == "__main__": app.run()` would start the blocking Flask dev
# server before the real endpoints below were ever defined), then a bare
# `FastAPI()` again.  The later assignments shadowed the configured app, so
# the CORS middleware was silently lost.  There is now exactly one FastAPI
# application, with CORS enabled, and the dead Flask code is removed.
app = FastAPI()

# Allow external frontends (e.g. a Wix site) to call this API.
# NOTE(review): with `allow_credentials=True` a wildcard origin is rejected
# by browsers for credentialed requests — in production replace "*" with the
# concrete origin, e.g. ["https://my-wix-site.com"].
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load all engines once at import time so each request reuses the warm models.
print("==================================================")
print(" โณ AI ์—”์ง„ ๋ฐ ๋”ฅ๋Ÿฌ๋‹ ๋ชจ๋ธ๋“ค์„ ๋ฉ”๋ชจ๋ฆฌ์— ์˜ฌ๋ฆฌ๋Š” ์ค‘์ž…๋‹ˆ๋‹ค...")
print("==================================================")
ingestion = DataIngestionPipeline()
lexical = LexicalAnalyzer()
semantic = SemanticAnalyzer()
rag_checker = FactCheckerRAG()
xai_scorer = XAIScorer()
print("\nโœ… [์„œ๋ฒ„ ์ค€๋น„ ์™„๋ฃŒ] http://127.0.0.1:8000 ์— ์ ‘์†ํ•˜์„ธ์š”!\n")


class AdRequest(BaseModel):
    """Request payload for POST /api/analyze."""

    # YouTube video link; may be an empty string (STT step is then skipped).
    video_url: str
    # Product detail-page link; the frontend treats this as required.
    product_url: str
65
+ # 1. ๐ŸŒŸ ๋‹คํฌ ๋ชจ๋“œ ๋Œ€์‹œ๋ณด๋“œ ์›น ํ”„๋ก ํŠธ์—”๋“œ (HTML/CSS/JS)
66
+ @app.get("/", response_class=HTMLResponse)
67
+ async def serve_frontend():
68
+ html_content = """
69
+ <!DOCTYPE html>
70
+ <html lang="ko">
71
+ <head>
72
+ <meta charset="UTF-8">
73
+ <title>AI ๊ณผ๋Œ€๊ด‘๊ณ  ํƒ์ง€ ๋Œ€์‹œ๋ณด๋“œ</title>
74
+ <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
75
+ <style>
76
+ body { font-family: 'Pretendard', sans-serif; background-color: #121212; color: #ffffff; padding: 20px; margin: 0; }
77
+ .container { max-width: 1200px; margin: auto; }
78
+ h2 { color: #ffffff; text-align: left; border-bottom: 2px solid #333; padding-bottom: 10px; }
79
+
80
+ /* ์ž…๋ ฅ ํผ ์„น์…˜ */
81
+ .input-section { background: #1e1e1e; padding: 20px; border-radius: 12px; margin-bottom: 20px; box-shadow: 0 4px 6px rgba(0,0,0,0.3); }
82
+ input { width: 100%; padding: 12px; margin: 8px 0; background: #2c2c2e; border: 1px solid #444; border-radius: 8px; color: #fff; box-sizing: border-box; }
83
+ button { width: 100%; padding: 15px; background: #4caf50; color: white; border: none; border-radius: 8px; font-size: 16px; font-weight: bold; cursor: pointer; transition: 0.3s; }
84
+ button:hover { background: #45a049; }
85
+
86
+ /* ๋Œ€์‹œ๋ณด๋“œ ๊ทธ๋ฆฌ๋“œ ๋ ˆ์ด์•„์›ƒ */
87
+ .dashboard { display: none; grid-template-columns: 1fr 2fr; gap: 20px; margin-top: 20px; }
88
+ .card { background: #1e1e1e; padding: 20px; border-radius: 12px; box-shadow: 0 4px 6px rgba(0,0,0,0.3); }
89
+ .card h3 { margin-top: 0; color: #a0a0a0; font-size: 16px; border-bottom: 1px solid #333; padding-bottom: 10px; }
90
+
91
+ /* ๋„๋„› ์ฐจํŠธ ์นด๋“œ */
92
+ .score-card { text-align: center; }
93
+ .score-card canvas { max-height: 250px; margin: auto; }
94
+
95
+ /* ์‚ฌ๊ณ ํšŒ๋กœ ์นด๋“œ */
96
+ .xai-card { font-size: 15px; line-height: 1.6; color: #e0e0e0; }
97
+ .highlight { color: #4caf50; font-weight: bold; }
98
+ .danger { color: #ff5252; font-weight: bold; }
99
+
100
+ /* ์„ธ๋ถ€ ์—”์ง„ ๊ฒฐ๊ณผ ์นด๋“œ */
101
+ .details-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-top: 20px; }
102
+ .detail-item { background: #2c2c2e; padding: 15px; border-radius: 8px; }
103
+ .detail-item h4 { margin: 0 0 10px 0; color: #4caf50; }
104
+
105
+ /* ๋กœ๋”ฉ ํ…์ŠคํŠธ */
106
+ #loading { display: none; text-align: center; color: #4caf50; font-size: 18px; margin-top: 20px; font-weight: bold; }
107
+ </style>
108
+ </head>
109
+ <body>
110
+ <div class="container">
111
+ <h2>๐Ÿšจ AI ๊ณผ๋Œ€๊ด‘๊ณ  ํƒ์ง€ ๋Œ€์‹œ๋ณด๋“œ (Overview)</h2>
112
+
113
+ <div class="input-section">
114
+ <input type="text" id="video_url" placeholder="์œ ํŠœ๋ธŒ ์˜์ƒ ๋งํฌ (์„ ํƒ์‚ฌํ•ญ)">
115
+ <input type="text" id="product_url" placeholder="์ƒํ’ˆ ์ƒ์„ธํŽ˜์ด์ง€ ๋งํฌ (ํ•„์ˆ˜)" value="https://brand.naver.com/pacsafe/products/9365045491">
116
+ <button onclick="analyzeAd()">๋ถ„์„ ์‹œ์ž‘ (๋ฐ์ดํ„ฐ ํฌ๋กค๋ง ๋ฐ AI ๋ถ„์„)</button>
117
+ </div>
118
+
119
+ <div id="loading">๋ฐ์ดํ„ฐ๋ฅผ ์ˆ˜์ง‘ํ•˜๊ณ  AI ๋ชจ๋ธ์ด ๋ถ„์„ ์ค‘์ž…๋‹ˆ๋‹ค. ์ž ์‹œ๋งŒ ๊ธฐ๋‹ค๋ ค์ฃผ์„ธ์š” โณ...</div>
120
+
121
+ <div class="dashboard" id="dashboard">
122
+ <div class="card score-card">
123
+ <h3>ํ†ตํ•ฉ ์œ„ํ—˜๋„ ์ ์ˆ˜ (Final Score)</h3>
124
+ <canvas id="scoreChart"></canvas>
125
+ <h1 id="scoreText" style="margin-top: 15px;">0.00์ </h1>
126
+ <p id="statusText" style="color: #a0a0a0;"></p>
127
+ </div>
128
+
129
+ <div class="card xai-card">
130
+ <h3>๐Ÿค– AI ์ตœ์ข… ํŒ์ • ์‚ฌ๊ณ ํšŒ๋กœ (XAI Reasoning)</h3>
131
+ <div id="xaiReasoning" style="margin-bottom: 20px;"></div>
132
+
133
+ <h3>๐ŸŒ RAG ํŒฉํŠธ์ฒดํฌ 2D ๋ฒกํ„ฐ ๊ณต๊ฐ„ ๋น„๊ต</h3>
134
+ <canvas id="vectorChart" style="max-height: 200px;"></canvas>
135
+ </div>
136
+ </div>
137
+
138
+ <div class="dashboard" id="detailsDashboard" style="grid-template-columns: 1fr; margin-top: 0;">
139
+ <div class="card">
140
+ <h3>โš™๏ธ ์„ธ๋ถ€ ์—”์ง„ ๋ถ„์„ ๊ฒฐ๊ณผ (Detailed Engine Results)</h3>
141
+ <div class="details-grid">
142
+ <div class="detail-item">
143
+ <h4>[X1] ํ˜•ํƒœ์†Œ ๋ฐ ๋‹จ์–ด ํƒ์ง€๊ธฐ</h4>
144
+ <p id="x1Details"></p>
145
+ </div>
146
+ <div class="detail-item">
147
+ <h4>[X2] ์˜๋ฏธ๋ก ์  ๋ฌธ๋งฅ ์œ ์‚ฌ๋„ (KoELECTRA)</h4>
148
+ <p id="x2Details"></p>
149
+ </div>
150
+ <div class="detail-item" style="grid-column: span 2;">
151
+ <h4>[X3] RAG ๊ธฐ๋ฐ˜ ํŒฉํŠธ์ฒดํฌ ๊ต์ฐจ ๊ฒ€์ฆ</h4>
152
+ <p id="x3Details"></p>
153
+ </div>
154
+ </div>
155
+ </div>
156
+ </div>
157
+ </div>
158
+
159
+ <script>
160
+ let scoreChartInstance = null;
161
+ let vectorChartInstance = null;
162
+
163
+ async function analyzeAd() {
164
+ const videoUrl = document.getElementById('video_url').value;
165
+ const productUrl = document.getElementById('product_url').value;
166
+
167
+ if (!productUrl) return alert("์ƒํ’ˆ ๋งํฌ๋Š” ํ•„์ˆ˜์ž…๋‹ˆ๋‹ค!");
168
+
169
+ document.getElementById('loading').style.display = 'block';
170
+ document.getElementById('dashboard').style.display = 'none';
171
+ document.getElementById('detailsDashboard').style.display = 'none';
172
+
173
+ try {
174
+ const response = await fetch('/api/analyze', {
175
+ method: 'POST',
176
+ headers: { 'Content-Type': 'application/json' },
177
+ body: JSON.stringify({ video_url: videoUrl, product_url: productUrl })
178
+ });
179
+
180
+ const data = await response.json();
181
+ document.getElementById('loading').style.display = 'none';
182
+
183
+ if (data.status === "success") {
184
+ document.getElementById('dashboard').style.display = 'grid';
185
+ document.getElementById('detailsDashboard').style.display = 'grid';
186
+
187
+ // ๋ฐ์ดํ„ฐ ๋ฐ”์ธ๋”ฉ
188
+ document.getElementById('scoreText').innerText = data.final_score.toFixed(2) + "์ ";
189
+ document.getElementById('statusText').innerText = data.message;
190
+ document.getElementById('statusText').style.color = data.final_score > 70 ? "#ff5252" : (data.final_score > 40 ? "#ffeb3b" : "#4caf50");
191
+
192
+ document.getElementById('xaiReasoning').innerHTML = data.xai_reasoning;
193
+ document.getElementById('x1Details').innerHTML = data.x1_details;
194
+ document.getElementById('x2Details').innerHTML = data.x2_details;
195
+ document.getElementById('x3Details').innerHTML = data.x3_details;
196
+
197
+ // ๐Ÿ“Š ๋„๋„› ์ฐจํŠธ ๋ Œ๋”๋ง
198
+ renderScoreChart(data.final_score);
199
+
200
+ // ๐ŸŒ 2D ๋ฒกํ„ฐ ์‚ฐ์ ๋„ ๋ Œ๋”๋ง
201
+ renderVectorChart(data.vector_data);
202
+
203
+ } else {
204
+ alert("๋ถ„์„ ์‹คํŒจ: " + data.error);
205
+ }
206
+ } catch (err) {
207
+ document.getElementById('loading').style.display = 'none';
208
+ alert("์„œ๋ฒ„ ํ†ต์‹  ์—๋Ÿฌ๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค.");
209
+ }
210
+ }
211
+
212
+ function renderScoreChart(score) {
213
+ const ctx = document.getElementById('scoreChart').getContext('2d');
214
+ if(scoreChartInstance) scoreChartInstance.destroy();
215
+
216
+ const color = score > 70 ? '#ff5252' : (score > 40 ? '#ffeb3b' : '#4caf50');
217
+
218
+ scoreChartInstance = new Chart(ctx, {
219
+ type: 'doughnut',
220
+ data: {
221
+ labels: ['์œ„ํ—˜๋„', '์•ˆ์ „'],
222
+ datasets: [{
223
+ data: [score, 100 - score],
224
+ backgroundColor: [color, '#2c2c2e'],
225
+ borderWidth: 0
226
+ }]
227
+ },
228
+ options: {
229
+ cutout: '75%',
230
+ plugins: { legend: { display: false } }
231
+ }
232
+ });
233
+ }
234
+
235
+ function renderVectorChart(vectorData) {
236
+ const ctx = document.getElementById('vectorChart').getContext('2d');
237
+ if(vectorChartInstance) vectorChartInstance.destroy();
238
+
239
+ vectorChartInstance = new Chart(ctx, {
240
+ type: 'scatter',
241
+ data: {
242
+ datasets: [
243
+ {
244
+ label: '์‹์•ฝ์ฒ˜ ๊ทœ์ • (Fact)',
245
+ data: [{ x: 0, y: 0 }],
246
+ backgroundColor: '#4caf50',
247
+ pointRadius: 8
248
+ },
249
+ {
250
+ label: '๊ด‘๊ณ  ๋ฌธ๊ตฌ (Claim)',
251
+ data: [{ x: vectorData.x, y: vectorData.y }],
252
+ backgroundColor: '#ff5252',
253
+ pointRadius: 8
254
+ }
255
+ ]
256
+ },
257
+ options: {
258
+ responsive: true,
259
+ scales: {
260
+ x: { grid: { color: '#333' }, min: -10, max: 100, title: {display: true, text: '์˜๋ฏธ๋ก ์  ๊ฑฐ๋ฆฌ (X)', color: '#888'} },
261
+ y: { grid: { color: '#333' }, min: -10, max: 100, title: {display: true, text: '์˜๋ฏธ๋ก ์  ๊ฑฐ๋ฆฌ (Y)', color: '#888'} }
262
+ },
263
+ plugins: {
264
+ legend: { labels: { color: '#fff' } }
265
+ }
266
+ }
267
+ });
268
+ }
269
+ </script>
270
+ </body>
271
+ </html>
272
+ """
273
+ return HTMLResponse(content=html_content)
274
+
275
+ # 2. ๋ถ„์„ API
276
+ @app.post("/api/analyze")
277
+ def api_analyze(req: AdRequest):
278
+ try:
279
+ # Step 0: ๋ฐ์ดํ„ฐ ์ˆ˜์ง‘
280
+ stt_text = ingestion.run_stt(ingestion.extract_audio_from_video(req.video_url)) if req.video_url.strip() else ""
281
+ ocr_text = ingestion.run_ocr_from_web(req.product_url) if req.product_url.strip() else ""
282
+ combined_text = f"{stt_text}\n{ocr_text}".strip()
283
+
284
+ if len(combined_text) < 5:
285
+ return {"status": "error", "error": "ํ…์ŠคํŠธ๋ฅผ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค."}
286
+
287
+ # Step 1, 2, 3: ์ ์ˆ˜ ๋„์ถœ
288
+ x1_score = lexical.calculate_x1_score(combined_text)
289
+ x2_score = semantic.calculate_x2_score(combined_text)
290
+ x3_score, matched_fact = rag_checker.calculate_x3_score(combined_text)
291
+
292
+ # Step 4: ๋จธ์‹ ๋Ÿฌ๋‹ ์Šค์ฝ”์–ด๋ง
293
+ final_score, shap_vals, _ = xai_scorer.calculate_final_score_and_explain(x1_score, x2_score, x3_score)
294
+
295
+ # =====================================================================
296
+ # ๐ŸŒŸ UI์— ๋ฟŒ๋ ค์ค„ ์ƒ์„ธ ์„ค๋ช…(Detail Text) ์ƒ์„ฑ ๋กœ์ง
297
+ # =====================================================================
298
+
299
+ # [X1 ์„ธ๋ถ€๊ฒฐ๊ณผ] ๋ฐœ๊ฒฌ๋œ ๊ธˆ์น™์–ด ์ถ”์ถœ
300
+ detected_words = [word for word in lexical.lexicon.keys() if word in combined_text]
301
+ if detected_words:
302
+ x1_details = f"<span class='danger'>์ ๋ฐœ๋œ ๋‹จ์–ด: {', '.join(detected_words)}</span><br>์ด ๋‹จ์–ด๋“ค์€ ์‹์•ฝ์ฒ˜ ๊ฐ€์ด๋“œ๋ผ์ธ์— ์˜ํ•ด ์‚ฌ์šฉ์ด ๊ฐ•ํ•˜๊ฒŒ ๊ทœ์ œ๋˜๋Š” ํ‘œํ˜„์ž…๋‹ˆ๋‹ค."
303
+ else:
304
+ x1_details = "<span class='highlight'>๋ฐœ๊ฒฌ๋œ ๊ธˆ์น™์–ด ์—†์Œ.</span><br>๋ช…์‹œ์ ์ธ ํ—ˆ์œ„ ๊ณผ์žฅ ๋‹จ์–ด๋Š” ์‚ฌ์šฉ๋˜์ง€ ์•Š์•„ ํ…์ŠคํŠธ ํ‘œ๋ฉด์ ์œผ๋กœ๋Š” ์•ˆ์ „ํ•ฉ๋‹ˆ๋‹ค."
305
+
306
+ # [X2 ์„ธ๋ถ€๊ฒฐ๊ณผ] ๋ฌธ๋งฅ ์„ค๋ช…
307
+ if x2_score > 60:
308
+ x2_details = f"<span class='danger'>๋ฌธ๋งฅ์  ์œ„ํ—˜๋„ {x2_score:.1f}์ </span><br>๋‹จ์ˆœ ๋‹จ์–ด๋ฅผ ๋„˜์–ด, ๋ฌธ์žฅ์˜ ์ „๋ฐ˜์ ์ธ ๋‰˜์•™์Šค๊ฐ€ ๊ณผ๊ฑฐ ์ ๋ฐœ๋œ ํ—ˆ์œ„๊ด‘๊ณ  ๋ฐ์ดํ„ฐ๋ฒ ์ด์Šค์˜ ๊ณผ์žฅ ํŒจํ„ด(๋‹จ์ •์  ํ‘œํ˜„, ํšจ๋Šฅ ๋งน์‹  ๋“ฑ)๊ณผ <b>๋งค์šฐ ์œ ์‚ฌํ•˜๊ฒŒ ๊ฐ์ง€</b>๋˜์—ˆ์Šต๋‹ˆ๋‹ค."
309
+ else:
310
+ x2_details = f"<span class='highlight'>๋ฌธ๋งฅ์  ์œ„ํ—˜๋„ {x2_score:.1f}์ </span><br>๊ณผ๊ฑฐ ์ ๋ฐœ๋œ ํ—ˆ์œ„๊ด‘๊ณ  ํŠน์œ ์˜ ์ž๊ทน์ ์ด๊ฑฐ๋‚˜ ๊ธฐ๋งŒ์ ์ธ ๋ฌธ๋งฅ ํŒจํ„ด์ด ํฌ๊ฒŒ ๋ฐœ๊ฒฌ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค."
311
+
312
+ # [X3 ์„ธ๋ถ€๊ฒฐ๊ณผ] RAG ์„ค๋ช…
313
+ x3_details = f"<b>[๊ด€๋ จ ์‹์•ฝ์ฒ˜ ๊ทœ์ • ๋งค์นญ]</b><br>{matched_fact}<br><br>"
314
+ if x3_score > 50: # GPT๊ฐ€ ๋†’์€ ์œ„๋ฐ˜ ์ ์ˆ˜๏ฟฝ๏ฟฝ ์ค€ ๊ฒฝ์šฐ
315
+ x3_details += f"<b>[LLM ์ถ”๋ก  ๊ฒฐ๊ณผ: <span class='danger'>๋ชจ์ˆœ ๋ฐœ๊ฒฌ</span>]</b><br>๊ด‘๊ณ  ๋ฌธ๊ตฌ๊ฐ€ ์œ„ ์‹์•ฝ์ฒ˜ ๊ทœ์ •์„ ๋ช…๋ฐฑํžˆ ์œ„๋ฐ˜ํ•˜๊ณ  ์žˆ๋Š” ๊ฒƒ์œผ๋กœ ์ถ”๋ก ๋˜์—ˆ์Šต๋‹ˆ๋‹ค."
316
+ else:
317
+ x3_details += f"<b>[LLM ์ถ”๋ก  ๊ฒฐ๊ณผ: <span class='highlight'>๊ทœ์ • ์ค€์ˆ˜</span>]</b><br>๊ด‘๊ณ  ๋ฌธ๊ตฌ์™€ ์œ„ ์‹์•ฝ์ฒ˜ ๊ทœ์ • ๊ฐ„์— ์‹ฌ๊ฐํ•œ ๋…ผ๋ฆฌ์  ๋ชจ์ˆœ์ด๋‚˜ ์œ„๋ฐ˜ ์‚ฌํ•ญ์ด ๋ฐœ๊ฒฌ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค."
318
+
319
+ # [XAI ์‚ฌ๊ณ ํšŒ๋กœ] SHAP ๊ธฐ๋ฐ˜ ์„œ์ˆ 
320
+ xai_reasoning = f"AI๋Š” ์ด ๊ด‘๊ณ  ํ…์ŠคํŠธ๋ฅผ ๋ถ„์„ํ•  ๋•Œ ๋‹ค์Œ๊ณผ ๊ฐ™์€ ์‚ฌ๊ณ  ๊ณผ์ •์„ ๊ฑฐ์ณค์Šต๋‹ˆ๋‹ค.<br><ul>"
321
+ if shap_vals[0] > 0: xai_reasoning += f"<li>ํ‘œ๋ฉด์ ์ธ ๋‹จ์–ด(X1)์— ๊ณผ์žฅ๋œ ํ‘œํ˜„์ด ์„ž์—ฌ ์žˆ์–ด ์œ„ํ—˜๋„๋ฅผ ๋†’์˜€์Šต๋‹ˆ๋‹ค. <span class='danger'>(+{shap_vals[0]:.2f})</span></li>"
322
+ else: xai_reasoning += f"<li>๊ธˆ์น™์–ด๊ฐ€ ๊ฒ€์ถœ๋˜์ง€ ์•Š์•„ ๊ธฐ๋ณธ์ ์œผ๋กœ ์•ˆ์ „ํ•œ ๊ธ€๋กœ ์ธ์‹ํ–ˆ์Šต๋‹ˆ๋‹ค. <span class='highlight'>({shap_vals[0]:.2f})</span></li>"
323
+
324
+ if shap_vals[1] > 0: xai_reasoning += f"<li>ํ•˜์ง€๋งŒ ๋ฌธ๋งฅ์˜ ๋‰˜์•™์Šค(X2)๊ฐ€ ๊ณผ๊ฑฐ ํ—ˆ์œ„๊ด‘๊ณ ์™€ ๋„ˆ๋ฌด ๋น„์Šทํ•ด AI๊ฐ€ ๊ฐ•ํ•œ ์˜์‹ฌ์„ ํ’ˆ์—ˆ์Šต๋‹ˆ๋‹ค. <span class='danger'>(+{shap_vals[1]:.2f})</span></li>"
325
+ else: xai_reasoning += f"<li>์ „์ฒด์ ์ธ ๋ฌธ๋งฅ๊ณผ ๋‰˜์•™์Šค(X2) ์—ญ์‹œ ์ •์ƒ์ ์ธ ์ œํ’ˆ ์„ค๋ช…์˜ ํ˜•ํƒœ๋ฅผ ๋ ๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค. <span class='highlight'>({shap_vals[1]:.2f})</span></li>"
326
+
327
+ if shap_vals[2] > 0: xai_reasoning += f"<li><b>๊ฒฐ์ •์ ์œผ๋กœ ์‹์•ฝ์ฒ˜ ํŒฉํŠธ์ฒดํฌ(X3)์—์„œ ๋ช…๋ฐฑํ•œ ๊ทœ์ • ์œ„๋ฐ˜์ด ํ™•์ธ</b>๋˜์–ด, ์ตœ์ข… ์œ„ํ—˜ ํŒ์ •์„ ๋‚ด๋ ธ์Šต๋‹ˆ๋‹ค. <span class='danger'>(+{shap_vals[2]:.2f})</span></li>"
328
+ else: xai_reasoning += f"<li><b>์‹์•ฝ์ฒ˜ ํŒฉํŠธ์ฒดํฌ(X3) ๊ฒฐ๊ณผ ๊ทœ์ • ์œ„๋ฐ˜ ์†Œ์ง€๊ฐ€ ๋ฐœ๊ฒฌ๋˜์ง€ ์•Š์•„</b> ์ตœ์ข…์ ์œผ๋กœ ์•ˆ์ „ํ•˜๋‹ค๊ณ  ํŒ๊ฒฐํ–ˆ์Šต๋‹ˆ๋‹ค. <span class='highlight'>({shap_vals[2]:.2f})</span></li>"
329
+ xai_reasoning += "</ul>"
330
+
331
+ # [๋ฒกํ„ฐ ๊ณต๊ฐ„ ์‹œ๊ฐํ™” ๋ฐ์ดํ„ฐ] ์œ ์‚ฌ๋„๊ฐ€ ๋‚ฎ์„์ˆ˜๋ก ๋‘ ์ ์˜ ๊ฑฐ๋ฆฌ๊ฐ€ ๋ฉ€์–ด์ง
332
+ # ์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„ ์ ์ˆ˜๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ์ž„์˜์˜ x, y ์ขŒํ‘œ ์‚ฐ์ถœ
333
+ distance = max(10, 100 - x2_score) # 0์— ๊ฐ€๊นŒ์šธ์ˆ˜๋ก ์œ„ํ—˜(๊ฐ€๊นŒ์›€)
334
+ vector_data = {"x": distance * 0.8, "y": distance * 0.9}
335
+
336
+ message = "๐Ÿšจ ์•…์˜์ ์ธ ํ—ˆ์œ„/๊ณผ๋Œ€๊ด‘๊ณ ๋กœ ์˜์‹ฌ๋ฉ๋‹ˆ๋‹ค." if final_score > 70 else ("โš ๏ธ ์ผ๋ถ€ ๊ณผ์žฅ๋œ ๋‚ด์šฉ์ด ํฌํ•จ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค." if final_score > 40 else "โœ… ๊ณผ๋Œ€๊ด‘๊ณ  ์†Œ์ง€๊ฐ€ ์ ์€ ์•ˆ์ „ํ•œ ์ฝ˜ํ…์ธ ์ž…๋‹ˆ๋‹ค.")
337
+
338
+ return {
339
+ "status": "success",
340
+ "final_score": float(round(final_score, 2)),
341
+ "message": message,
342
+ "x1_details": x1_details,
343
+ "x2_details": x2_details,
344
+ "x3_details": x3_details,
345
+ "xai_reasoning": xai_reasoning,
346
+ "vector_data": vector_data
347
+ }
348
+
349
+ except Exception as e:
350
+ return {"status": "error", "error": str(e)}
351
+
352
+ if __name__ == "__main__":
353
+ uvicorn.run("app:app", host="127.0.0.1", port=8000, reload=True)
myapp/main_pipeline.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from step0_ingestion import DataIngestionPipeline
from step1_lexical import LexicalAnalyzer
from step2_semantic import SemanticAnalyzer
from step3_rag import FactCheckerRAG
from step4_xai import XAIScorer  # Step 4: XGBoost scoring + SHAP explanations


def run_full_pipeline():
    """Run the end-to-end fake-ad detection demo.

    Collects ad text from a hard-coded YouTube video (STT) and product
    page (OCR), scores the combined text with the three analysis engines
    (lexical, semantic, RAG fact-check), fuses the scores with the XGBoost
    scorer, and prints a human-readable report with SHAP attributions.
    Returns nothing; all output goes to stdout.
    """
    print("==================================================")
    print(" ๐Ÿš€ [AtoZ ํŒŒ์ดํ”„๋ผ์ธ] ๊ณผ๋Œ€๊ด‘๊ณ  ํƒ์ง€ AI ์‹œ์Šคํ…œ ๊ฐ€๋™")
    print("==================================================\n")

    # 1. Initialise models and engines (may load heavy weights).
    ingestion = DataIngestionPipeline()
    lexical = LexicalAnalyzer()
    semantic = SemanticAnalyzer()
    rag_checker = FactCheckerRAG()
    xai_scorer = XAIScorer()

    target_video_url = "https://youtu.be/SJxSDRxd8Dc?si=t8dnIQciFulUlbVW"
    target_product_url = "https://brand.naver.com/pacsafe/products/9365045491"

    # [Step 0] Data collection — best-effort: each source failure is
    # reported and skipped rather than aborting the whole pipeline.
    print("\nโ–ถ๏ธ [Step 0] ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ๋ฐ์ดํ„ฐ ์ˆ˜์ง‘ ์ค‘...")
    stt_text = ""
    try:
        audio_file = ingestion.extract_audio_from_video(target_video_url)
        if audio_file:
            stt_text = ingestion.run_stt(audio_file)
    except Exception as e:
        print(f"โš ๏ธ ์œ ํŠœ๋ธŒ ์ถ”์ถœ ์‹คํŒจ (๋ฌด์‹œํ•˜๊ณ  ์ง„ํ–‰): {e}")

    ocr_text = ""
    try:
        ocr_text = ingestion.run_ocr_from_web(target_product_url)
    except Exception as e:
        print(f"โš ๏ธ OCR ์ถ”์ถœ ์‹คํŒจ: {e}")

    # BUG FIX: the f-string always contains "\n", so the original
    # `len(combined_text) == 0` check could never fire.  Strip first
    # (matching app.py) so an empty harvest actually aborts.
    combined_text = f"{stt_text}\n{ocr_text}".strip()
    if not combined_text:
        print("โŒ ๋ถ„์„ํ•  ํ…์ŠคํŠธ๊ฐ€ ์—†์–ด ์ข…๋ฃŒํ•ฉ๋‹ˆ๋‹ค.")
        return

    # [Step 1, 2, 3] deep text analysis — three independent scores.
    print("\nโ–ถ๏ธ [Step 1, 2, 3] ํ…์ŠคํŠธ ์‹ฌ์ธต ๋ถ„์„ ๊ฐ€๋™...")

    x1_score = lexical.calculate_x1_score(combined_text)
    x2_score = semantic.calculate_x2_score(combined_text)
    x3_score, matched_fact = rag_checker.calculate_x3_score(combined_text)

    # [Step 4] XGBoost scoring and SHAP explanation.
    print("\nโ–ถ๏ธ [Step 4] XGBoost ๊ธฐ๋ฐ˜ ์ตœ์ข… ์Šค์ฝ”์–ด๋ง ๋ฐ SHAP ์„ค๋ช… ์ƒ์„ฑ...")
    final_score, shap_vals, base_value = xai_scorer.calculate_final_score_and_explain(x1_score, x2_score, x3_score)

    # =====================================================================
    # Final report (including the XAI explanation).
    # =====================================================================
    print("\n==================================================")
    print(" ๐Ÿ“œ [์ตœ์ข… ๊ณผ๋Œ€๊ด‘๊ณ  ํƒ์ง€ ๋ฆฌํฌํŠธ]")
    print("==================================================")
    print(f" ๐Ÿ”น [X1] ํ˜•ํƒœ์†Œ/๋‹จ์–ด ์œ„๋ฐ˜ (Lexical) : {x1_score:5.1f} ์ ")
    print(f" ๐Ÿ”น [X2] ๋ฌธ๋งฅ์  ์œ ์‚ฌ๋„ (Semantic) : {x2_score:5.1f} ์ ")
    print(f" ๐Ÿ”น [X3] ์‹์•ฝ์ฒ˜ ํŒฉํŠธ์ฒดํฌ (RAG) : {x3_score:5.1f} ์ ")
    print("--------------------------------------------------")
    print(f" ๐ŸŽฏ [S] ๋จธ์‹ ๋Ÿฌ๋‹ ์ตœ์ข… ์œ„ํ—˜๋„ ์ ์ˆ˜ : {final_score:5.1f} / 100.0 ์ ")
    print("==================================================")

    # Verdict thresholds mirror app.py: >70 dangerous, >40 caution, else safe.
    if final_score > 70:
        print(" ๐Ÿšจ [ํŒ์ •] ๋งค์šฐ ์œ„ํ—˜! ์•…์˜์ ์ธ ํ—ˆ์œ„/๊ณผ๋Œ€๊ด‘๊ณ ๋กœ ์˜์‹ฌ๋ฉ๋‹ˆ๋‹ค.")
    elif final_score > 40:
        print(" โš ๏ธ [ํŒ์ •] ์ฃผ์˜! ์ผ๋ถ€ ๊ณผ์žฅ๋œ ํ‘œํ˜„์ด๋‚˜ ์‚ฌ์‹ค๊ณผ ๋‹ค๋ฅธ ๋‚ด์šฉ์ด ํฌํ•จ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค.")
    else:
        print(" โœ… [ํŒ์ •] ์•ˆ์ „! ๊ณผ๋Œ€๊ด‘๊ณ  ์†Œ์ง€๊ฐ€ ์ ์€ ์ •์ƒ์ ์ธ ์ฝ˜ํ…์ธ ์ž…๋‹ˆ๋‹ค.")

    print("\n==================================================")
    print(" ๐Ÿค– [XAI] ์ธ๊ณต์ง€๋Šฅ์˜ ํŒ์ • ์‚ฌ์œ  (SHAP Values)")
    print("==================================================")
    print(" (์ ์ˆ˜๊ฐ€ ์–‘์ˆ˜(+)๋ฉด ์œ„ํ—˜๋„๋ฅผ ๋†’์˜€๊ณ , ์Œ์ˆ˜(-)๋ฉด ์•ˆ์ „ํ•˜๋‹ค๊ณ  ํŒ๋‹จํ•œ ๊ทผ๊ฑฐ์ž…๋‹ˆ๋‹ค.)\n")

    features = ["X1 (๋‹จ์–ด ์œ„๋ฐ˜)", "X2 (๋ฌธ๋งฅ ์œ ์‚ฌ๋„)", "X3 (ํŒฉํŠธ์ฒดํฌ ๋ชจ์ˆœ)"]
    for i, feature_name in enumerate(features):
        impact = shap_vals[i]
        direction = "๐Ÿ”ด ์œ„ํ—˜๋„ ์ฆ๊ฐ€" if impact > 0 else "๐ŸŸข ์œ„ํ—˜๋„ ๊ฐ์†Œ"
        # SHAP log-odds values shown proportionally, like a score scale.
        print(f" {direction} ๊ธฐ์—ฌ: {feature_name:<18} -> ๊ธฐ์—ฌ๋„: {impact:+.2f}")
    print("==================================================")


if __name__ == "__main__":
    run_full_pipeline()
myapp/requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ pydantic
4
+ requests
5
+ playwright
6
+ paddleocr
7
+ paddlepaddle
8
+ torch
9
+ transformers
10
+ sentence-transformers
11
+ xgboost
12
+ shap
13
+ openai
myapp/step0_ingestion.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gc
3
+ import requests
4
+ import torch
5
+ import whisper
6
+ from pytubefix import YouTube
7
+ from paddleocr import PaddleOCR
8
+ from playwright.sync_api import sync_playwright
9
+ import logging
10
+
11
+ class DataIngestionPipeline:
12
+ def __init__(self):
13
+ self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
14
+ print(f"โœ… ์‚ฌ์šฉ ์ค‘์ธ ๋””๋ฐ”์ด์Šค: {self.device}")
15
+
16
+ def clear_memory(self):
17
+ gc.collect()
18
+ if torch.backends.mps.is_available():
19
+ torch.mps.empty_cache()
20
+ print("๐Ÿงน ๋ฉ”๋ชจ๋ฆฌ ์ •๋ฆฌ ์™„๋ฃŒ")
21
+
22
+ def extract_audio_from_video(self, video_url, output_filename="temp_audio"):
23
+ print(f"\n๐ŸŽฅ [1] ์œ ํŠœ๋ธŒ ์˜์ƒ ๋‹ค์šด๋กœ๋“œ ์‹œ์ž‘: {video_url}")
24
+ try:
25
+ yt = YouTube(video_url, 'WEB')
26
+ audio_stream = yt.streams.get_audio_only()
27
+ file_path = audio_stream.download(filename=f"{output_filename}.mp4")
28
+ print(f"โœ… ์˜ค๋””์˜ค ์ถ”์ถœ ์™„๋ฃŒ: {file_path}")
29
+ return file_path
30
+ except Exception as e:
31
+ print(f"โŒ ๋‹ค์šด๋กœ๋“œ ์‹คํŒจ: {e}")
32
+ return None
33
+
34
+ def run_stt(self, audio_path):
35
+ print(f"\n๐Ÿ—ฃ๏ธ [2] STT(์Œ์„ฑ->ํ…์ŠคํŠธ) ๋ณ€ํ™˜ ์‹œ์ž‘: {audio_path}")
36
+ model = whisper.load_model("small", device="cpu")
37
+ result = model.transcribe(audio_path, language="ko", fp16=False)
38
+
39
+ if result is None:
40
+ raise ValueError("Whisper๊ฐ€ ํ…์ŠคํŠธ๋ฅผ ๋ฐ˜ํ™˜ํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.")
41
+
42
+ text_result = result.get("text", "")
43
+
44
+ del model
45
+ self.clear_memory()
46
+
47
+ print("โœ… STT ๋ณ€ํ™˜ ์™„๋ฃŒ")
48
+ return text_result.strip()
49
+
50
+ def run_ocr_from_web(self, product_url):
51
+ print(f"\n๐Ÿ–ผ๏ธ [3] ์›นํŽ˜์ด์ง€ ์ ‘์† ๋ฐ ์ด๋ฏธ์ง€ OCR ์‹œ์ž‘: {product_url}")
52
+
53
+ raw_image_urls = []
54
+ with sync_playwright() as p:
55
+ print(" -> ๋ธŒ๋ผ์šฐ์ € ์ฐฝ์„ ๋„์šฐ๊ณ  ํŽ˜์ด์ง€ ๋กœ๋”ฉ์„ ๊ธฐ๋‹ค๋ฆฝ๋‹ˆ๋‹ค...")
56
+ browser = p.chromium.launch(headless=False)
57
+ page = browser.new_page()
58
+ page.goto(product_url, wait_until="domcontentloaded", timeout=60000)
59
+ page.wait_for_timeout(3000)
60
+
61
+ print(" -> ๐ŸŽฏ ์ˆจ๊ฒจ์ง„ ์ƒ์„ธํŽ˜์ด์ง€๋ฅผ ์—ด๊ธฐ ์œ„ํ•ด '๋”๋ณด๊ธฐ' ๋ฒ„ํŠผ์„ ์ฐพ์Šต๋‹ˆ๋‹ค...")
62
+ try:
63
+ more_btn = page.locator('button:has-text("์ƒ์„ธ์ •๋ณด ํŽผ์ณ๋ณด๊ธฐ"), button:has-text("์ƒ์„ธ์„ค๋ช… ๋”๋ณด๊ธฐ"), a:has-text("๋”๋ณด๊ธฐ")').first
64
+ if more_btn.is_visible(timeout=3000):
65
+ more_btn.click()
66
+ print(" => ์พ…! '๋”๋ณด๊ธฐ' ๋ฒ„ํŠผ์„ ์„ฑ๊ณต์ ์œผ๋กœ ํด๋ฆญํ–ˆ์Šต๋‹ˆ๋‹ค!")
67
+ page.wait_for_timeout(2000)
68
+ except Exception:
69
+ print(" => '๋”๋ณด๊ธฐ' ๋ฒ„ํŠผ์ด ์—†๊ฑฐ๋‚˜ ์ด๋ฏธ ํŽผ์ณ์ ธ ์žˆ์Šต๋‹ˆ๋‹ค. ๊ทธ๋Œ€๋กœ ์ง„ํ–‰ํ•ฉ๋‹ˆ๋‹ค.")
70
+
71
+ print(" -> ์ง€์—ฐ ๋กœ๋”ฉ(Lazy-loading)๋œ ์ด๋ฏธ์ง€๋ฅผ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ ์œ„ํ•ด ์Šคํฌ๋กค์„ ๋‚ด๋ฆฝ๋‹ˆ๋‹ค...")
72
+ for _ in range(10):
73
+ page.evaluate("window.scrollBy(0, 1500)")
74
+ page.wait_for_timeout(1000)
75
+
76
+ img_elements = page.query_selector_all('img')
77
+ for img in img_elements:
78
+ src = img.get_attribute('data-src') or img.get_attribute('src')
79
+ if src and ('http' in src or src.startswith('//')):
80
+ if src.startswith('//'):
81
+ src = 'https:' + src
82
+ raw_image_urls.append(src)
83
+ browser.close()
84
+
85
+ valid_urls = []
86
+ for url in raw_image_urls:
87
+ url_lower = url.lower()
88
+ if not any(x in url_lower for x in ['.gif', 'icon', 'logo', 'blank', 'svg', 'thumb']):
89
+ valid_urls.append(url)
90
+
91
+ if not valid_urls:
92
+ print("โŒ ์œ ํšจํ•œ ์ƒ์„ธ ์ด๋ฏธ์ง€๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
93
+ return ""
94
+
95
+ print(f"โœ… ์ด {len(valid_urls)}๊ฐœ์˜ ์ด๋ฏธ์ง€ ๋ฐœ๊ฒฌ! ์ง„์งœ ์ƒ์„ธ ์ด๋ฏธ์ง€๋ฅผ ํƒ์ƒ‰ํ•ฉ๋‹ˆ๋‹ค...")
96
+
97
+ logging.getLogger('ppocr').setLevel(logging.ERROR)
98
+
99
+ # ๐ŸŒŸ ํ•ด์ƒ๋„ ํ•œ๊ณ„์น˜๋ฅผ ๋Œ€ํญ ๋Š˜๋ฆฐ ์ตœ์‹  ์„ธํŒ… ์ ์šฉ
100
+ ocr = PaddleOCR(
101
+ lang='korean',
102
+ text_det_limit_side_len=2048,
103
+ text_det_limit_type='max'
104
+ )
105
+
106
+ all_extracted_text = []
107
+
108
+ headers = {
109
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
110
+ 'Referer': product_url
111
+ }
112
+
113
+ processed_count = 0
114
+
115
+ for i, img_url in enumerate(valid_urls[2:]):
116
+ if processed_count >= 3:
117
+ break
118
+
119
+ temp_img_path = f"temp_ocr_{i}.jpg"
120
+ try:
121
+ response = requests.get(img_url, headers=headers, timeout=10)
122
+ with open(temp_img_path, 'wb') as f:
123
+ f.write(response.content)
124
+
125
+ if os.path.getsize(temp_img_path) < 30000:
126
+ if os.path.exists(temp_img_path): os.remove(temp_img_path)
127
+ continue
128
+
129
+ processed_count += 1
130
+ file_kb = os.path.getsize(temp_img_path) // 1024
131
+ print(f" -> [์ง„์งœ ํ…์ŠคํŠธ ํƒ์ƒ‰ ์ค‘...] ๋ฌต์งํ•œ ์ƒ์„ธ ์ด๋ฏธ์ง€ ๋ฐœ๊ฒฌ! ({file_kb}KB)")
132
+
133
+ result = ocr.ocr(temp_img_path)
134
+
135
+ # ๐ŸŒŸ ๋ฐฉ๊ธˆ ํ™•์ธํ•œ ์™„๋ฒฝํ•œ ๋ฐ์ดํ„ฐ ์ถ”์ถœ ๋กœ์ง ์ ์šฉ!
136
+ if result and isinstance(result[0], dict) and 'rec_texts' in result[0]:
137
+ texts = result[0]['rec_texts']
138
+ print(f" => โœจ ํ…์ŠคํŠธ {len(texts)}์ค„ ์ถ”์ถœ ์„ฑ๊ณต!")
139
+ all_extracted_text.extend(texts)
140
+ if os.path.exists(temp_img_path): os.remove(temp_img_path)
141
+ else:
142
+ print(f" => โš ๏ธ ๊ธ€์ž๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค! (์ด๋ฏธ์ง€ ํ™•์ธ: {temp_img_path})")
143
+
144
+ except Exception as e:
145
+ print(f"โš ๏ธ ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
146
+ if os.path.exists(temp_img_path): os.remove(temp_img_path)
147
+
148
+ del ocr
149
+ self.clear_memory()
150
+
151
+ final_text = "\n".join(all_extracted_text) # ์ค„๋ฐ”๊ฟˆ์œผ๋กœ ๊น”๋”ํ•˜๊ฒŒ ํ•ฉ์น˜๊ธฐ
152
+ print("\nโœ… ์›นํŽ˜์ด์ง€ ์ด๋ฏธ์ง€ OCR ๋ณ€ํ™˜ ์™„๋ฃŒ!")
153
+ return final_text
154
+
155
+
156
+ # ==========================================
157
+ # ์‹ค์ œ ์‹คํ–‰ ํ…Œ์ŠคํŠธ ์ฝ”๋“œ
158
+ # ==========================================
159
+ if __name__ == "__main__":
160
+ pipeline = DataIngestionPipeline()
161
+
162
+ # 1. ์œ ํŠœ๋ธŒ STT ํ…Œ์ŠคํŠธ
163
+ test_video_url = "https://youtu.be/SJxSDRxd8Dc?si=t8dnIQciFulUlbVW"
164
+ try:
165
+ audio_file = pipeline.extract_audio_from_video(test_video_url)
166
+ if audio_file:
167
+ stt_text = pipeline.run_stt(audio_file)
168
+ print(f"\n[STT ๊ฒฐ๊ณผ (์Œ์„ฑ -> ํ…์ŠคํŠธ)]\n{stt_text[:500]}...\n")
169
+ except Exception as e:
170
+ print(f"์œ ํŠœ๋ธŒ/STT ์ฒ˜๋ฆฌ ์ค‘ ์—๋Ÿฌ ๋ฐœ์ƒ: {e}")
171
+
172
+ # 2. ์›นํŽ˜์ด์ง€ OCR ํ…Œ์ŠคํŠธ
173
+ test_product_url = "https://brand.naver.com/pacsafe/products/9365045491"
174
+ try:
175
+ ocr_text = pipeline.run_ocr_from_web(test_product_url)
176
+ print(f"\n[์›นํŽ˜์ด์ง€ OCR ๊ฒฐ๊ณผ (์ƒ์„ธ ์ด๋ฏธ์ง€ -> ํ…์ŠคํŠธ)]\n===========================\n{ocr_text}\n===========================")
177
+ except Exception as e:
178
+ print(f"OCR ์ฒ˜๋ฆฌ ์ค‘ ์—๋Ÿฌ ๋ฐœ์ƒ: {e}")
myapp/step1_lexical.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from mecab import MeCab
3
+
4
+ class LexicalAnalyzer:
5
+ def __init__(self):
6
+ print("โœ… Mecab ํ˜•ํƒœ์†Œ ๋ถ„์„๊ธฐ๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค...")
7
+ self.mecab = MeCab()
8
+
9
+ # ๐Ÿšจ ๊ฐ€์ค‘์น˜(TF-IDF์˜ IDF ๊ฐœ๋… ์ฐจ์šฉ)๊ฐ€ ๋ถ€์—ฌ๋œ ๊ณผ๋Œ€๊ด‘๊ณ  ์‚ฌ์ „
10
+ # ์ ์ˆ˜๊ฐ€ ๋†’์„์ˆ˜๋ก ํ•œ ๋ฒˆ๋งŒ ๋“ฑ์žฅํ•ด๋„ ์น˜๋ช…์ ์ธ ๋‹จ์–ด์ž…๋‹ˆ๋‹ค.
11
+ self.lexicon = {
12
+ "์น˜๋ฃŒ": 2.0, "์˜ˆ๋ฐฉ": 2.0, "์™„์น˜": 2.0, "ํ•ญ์•”": 2.0, "ํŠนํšจ": 2.0,
13
+ "100%": 1.5, "๋งŒ๋ณ‘ํ†ต์น˜": 2.0, "๊ธฐ์ ": 1.5, "๋‹จ์ˆจ์—": 1.5,
14
+ "์ฃผ๋ฌธ์‡„๋„": 1.0, "๋‹จ์ฒด์ถ”์ฒœ": 1.0, "ํŠน์ˆ˜์ œ๋ฒ•": 1.0, # ์‹์•ฝ์ฒ˜ ๊ธฐ๋งŒ๊ด‘๊ณ  ์ ๋ฐœ ํ‚ค์›Œ๋“œ
15
+ "์ตœ๊ณ ": 1.0, "๊ฐ€์žฅ ์ข‹์€": 1.0, "๋…์†Œ": 1.0, "๋ถ€์ž‘์šฉ": 1.5,
16
+ "์ฒดํ—˜๊ธฐ": 1.5, "์ฒดํ—˜์‚ฌ๋ก€": 1.5 # ์‹์•ฝ์ฒ˜๊ฐ€ ๊ธˆ์ง€ํ•˜๋Š” ํ›„๊ธฐ ๋งˆ์ผ€ํŒ… ํ‚ค์›Œ๋“œ
17
+ }
18
+
19
+ # ๐Ÿ›ก๏ธ ๋ถ€์ •์–ด ์‚ฌ์ „ (์ด ๋‹จ์–ด๋“ค์ด ์ฃผ๋ณ€์— ์žˆ์œผ๋ฉด ๋ฌด์ฃ„ ํŒ๊ฒฐ)
20
+ self.negation_words = {"์—†", "์•Š", "์•„๋‹ˆ", "๋ฌด", "์•ˆ", "๋ชป"}
21
+
22
+ def split_into_sentences(self, text):
23
+ """ํ…์ŠคํŠธ๋ฅผ ๋ฌธ์žฅ ๋‹จ์œ„๋กœ ๋ถ„๋ฆฌํ•˜์—ฌ ๊ธธ์ด ํŽธํ–ฅ(Length Bias)์„ ๋ฐฉ์ง€ํ•ฉ๋‹ˆ๋‹ค."""
24
+ # ๋งˆ์นจํ‘œ, ๋А๋‚Œํ‘œ, ๋ฌผ์Œํ‘œ ๋˜๋Š” ์ค„๋ฐ”๊ฟˆ์„ ๊ธฐ์ค€์œผ๋กœ ๋ถ„๋ฆฌ
25
+ sentences = re.split(r'[.!?\n]+', text)
26
+ return [s.strip() for s in sentences if len(s.strip()) > 2]
27
+
28
+ def check_negation_context(self, tokens, target_index, window_size=3):
29
+ """
30
+ [ํ•ต์‹ฌ ๋กœ์ง] ํƒ€๊ฒŸ ๋‹จ์–ด ๋’ค์— ๋ถ€์ •์–ด๊ฐ€ ์˜ค๋Š”์ง€ ๋ฌธ๋งฅ์„ ๊ฒ€์‚ฌํ•ฉ๋‹ˆ๋‹ค.
31
+ ์˜ˆ: '๋ถ€์ž‘์šฉ'(๋ช…์‚ฌ) + '์ด'(์กฐ์‚ฌ) + '์—†'(ํ˜•์šฉ์‚ฌ) + '์Šต๋‹ˆ๋‹ค'(์–ด๋ฏธ)
32
+ """
33
+ # ํƒ€๊ฒŸ ๋‹จ์–ด ๋’ค์˜ window_size ๋งŒํผ์˜ ํ˜•ํƒœ์†Œ๋ฅผ ์‚ดํŽด๋ด…๋‹ˆ๋‹ค.
34
+ end_index = min(target_index + window_size + 1, len(tokens))
35
+ context_tokens = tokens[target_index + 1 : end_index]
36
+
37
+ for word, pos in context_tokens:
38
+ # VA(ํ˜•์šฉ์‚ฌ-์—†๋‹ค), VX(๋ณด์กฐ์šฉ์–ธ-์•Š๋‹ค), MAG(๋ถ€์‚ฌ-์•ˆ,๋ชป) ๋“ฑ์„ ์ฒดํฌ
39
+ if word in self.negation_words or pos in ['VA', 'VX', 'MAG']:
40
+ return True # ๋ถ€์ •์–ด๊ฐ€ ์กด์žฌํ•จ!
41
+ return False
42
+
43
+ def calculate_x1_score(self, text):
44
+ if not text:
45
+ return 0.0
46
+
47
+ print("\n๐Ÿ” [Step 1] ํ…์ŠคํŠธ ๊ณผ์žฅ๋„(Lexical Score) ๋ถ„์„ ์‹œ์ž‘...")
48
+
49
+ sentences = self.split_into_sentences(text)
50
+ total_sentences = len(sentences)
51
+
52
+ if total_sentences == 0:
53
+ return 0.0
54
+
55
+ total_penalty = 0.0
56
+ detected_issues = []
57
+
58
+ # ๋ฌธ์žฅ ๋‹จ์œ„๋กœ ๊ฒ€์‚ฌํ•˜์—ฌ ํ…์ŠคํŠธ๊ฐ€ ๊ธธ์–ด์ ธ๋„ ์ ์ˆ˜๊ฐ€ ํฌ์„๋˜์ง€ ์•Š๊ฒŒ ๋ฐฉ์–ด
59
+ for sentence in sentences:
60
+ tokens = self.mecab.pos(sentence)
61
+ sentence_flagged = False
62
+
63
+ for i, (word, pos) in enumerate(tokens):
64
+ # ์‚ฌ์ „์— ์žˆ๋Š” ๊ธˆ์น™์–ด์ธ์ง€ ํ™•์ธ
65
+ if word in self.lexicon:
66
+ # ๋ถ€์ •์–ด ๋ฌธ๋งฅ ์ฒดํฌ ("๋ถ€์ž‘์šฉ์ด ์—†์Šต๋‹ˆ๋‹ค" ํ•„ํ„ฐ๋ง)
67
+ is_negated = self.check_negation_context(tokens, i)
68
+
69
+ if is_negated:
70
+ detected_issues.append(f"๐Ÿ›ก๏ธ ๋ฌด์ฃ„(๋ถ€์ •์–ด ๋™๋ฐ˜): '{word}' (๋ฌธ์žฅ: {sentence})")
71
+ else:
72
+ weight = self.lexicon[word]
73
+ total_penalty += weight
74
+ sentence_flagged = True
75
+ detected_issues.append(f"๐Ÿšจ ์ ๋ฐœ: '{word}' (๊ฐ€์ค‘์น˜: +{weight})")
76
+
77
+ # ํ•œ ๋ฌธ์žฅ์— ๊ธˆ์น™์–ด๊ฐ€ ์—ฌ๋Ÿฌ ๋ฒˆ ๋‚˜์™€๋„ 1์ฐจ์›์ ์œผ๋กœ ํญ๋ฐœํ•˜์ง€ ์•Š๋„๋ก ํŒจ๋„ํ‹ฐ ์ƒํ•œ์„  ๋ถ€์—ฌ
78
+ if sentence_flagged:
79
+ total_penalty += 0.5 # ๋ฌธ์žฅ ์ž์ฒด์˜ ๋ถˆ๋Ÿ‰๋„ ์ถ”๊ฐ€ ์ ์ˆ˜
80
+
81
+ # ๐Ÿงฎ X1 ์Šค์ฝ”์–ด ๊ณ„์‚ฐ (0 ~ 100์  ์Šค์ผ€์ผ๋ง)
82
+ # ๊ณต์‹: (์ด ํŒจ๋„ํ‹ฐ / ์ „์ฒด ๋ฌธ์žฅ ์ˆ˜)๋ฅผ ๊ธฐ์ค€์œผ๋กœ ์ ์ˆ˜ํ™”ํ•˜๋˜, ๋กœ๊ทธ ์Šค์ผ€์ผ ๋“ฑ์„ ์จ์„œ 100์  ์ƒํ•œ์„  ์ ์šฉ
83
+ raw_score = (total_penalty / total_sentences) * 50
84
+ x1_score = min(raw_score, 100.0)
85
+
86
+ # ๊ฒฐ๊ณผ ๋ฆฌํฌํŠธ ์ถœ๋ ฅ
87
+ print(f"\n[๋ถ„์„ ๋ฆฌํฌํŠธ]")
88
+ print(f" - ์ „์ฒด ๋ฌธ์žฅ ์ˆ˜: {total_sentences}๋ฌธ์žฅ")
89
+ for issue in detected_issues:
90
+ print(f" {issue}")
91
+
92
+ print(f"๐Ÿ“ˆ ์ตœ์ข… X1 ์ ์ˆ˜: {x1_score:.2f} / 100.0 ์ ")
93
+ return x1_score
94
+
95
+ # ==========================================
96
+ # ์‹ค์ œ ์‹คํ–‰ ํ…Œ์ŠคํŠธ ์ฝ”๋“œ
97
+ # ==========================================
98
+ if __name__ == "__main__":
99
+ analyzer = LexicalAnalyzer()
100
+
101
+ print("==================================================")
102
+ print("ํ…Œ์ŠคํŠธ 1: ๋ถ€์ •์–ด๊ฐ€ ํฌํ•จ๋œ ์ •์ƒ์ ์ธ ๊ด‘๊ณ  (์˜คํƒ ๋ฐฉ์ง€ ํ…Œ์ŠคํŠธ)")
103
+ test_text_1 = "์ด ์ œํ’ˆ์€ ์‹์•ฝ์ฒ˜ ์ธ์ฆ์„ ๋ฐ›์•˜์Šต๋‹ˆ๋‹ค. ํ”ผ๋ถ€ ํŠธ๋Ÿฌ๋ธ”์ด๋‚˜ ๋ถ€์ž‘์šฉ์ด ์ „ํ˜€ ์—†์Šต๋‹ˆ๋‹ค. ์•ˆ์‹ฌํ•˜๊ณ  ์‚ฌ์šฉํ•˜์„ธ์š”."
104
+ analyzer.calculate_x1_score(test_text_1)
105
+
106
+ print("\n==================================================")
107
+ print("ํ…Œ์ŠคํŠธ 2: ๊ทน๋‹จ์ ์ธ ๊ณผ๋Œ€๊ด‘๊ณ  (ํ—ˆ์œ„ ํƒ์ง€ ํ…Œ์ŠคํŠธ)")
108
+ test_text_2 = "๋‹จ ์ผ์ฃผ์ผ ๋งŒ์— ์ง€๋ฐฉ์ด 100% ๋ถ„ํ•ด๋˜๋Š” ๊ธฐ์ ์„ ๊ฒฝํ—˜ํ•˜์„ธ์š”! ์ด๊ฒƒ์€ ์•”๋„ ์™„์น˜ํ•˜๋Š” ๋งŒ๋ณ‘ํ†ต์น˜ ์•ฝ์ž…๋‹ˆ๋‹ค. ๋ฌด์กฐ๊ฑด ๊ตฌ๋งคํ•˜์„ธ์š”. ์ตœ๊ณ ์˜ ์„ ํƒ์ž…๋‹ˆ๋‹ค."
109
+ analyzer.calculate_x1_score(test_text_2)
myapp/step2_semantic.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
+ import gc
5
+
6
class SemanticAnalyzer:
    """Step 2: semantic analyzer built on KoELECTRA.

    Produces an X2 score in [0, 100] for an ad text by ensembling
    (a) the classification head's softmax probability for the
    "exaggerated ad" label and (b) the maximum cosine similarity
    against a small bank of reference fraudulent-ad sentences.
    """

    def __init__(self):
        print("๐Ÿง  [Step 2] ์˜๋ฏธ๋ก ์  ๋”ฅ๋Ÿฌ๋‹ ๋ถ„์„๊ธฐ(KoELECTRA)๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค...")

        # Prefer the Apple-silicon GPU (MPS) when available, else CPU.
        self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

        # Lightweight Korean encoder with strong downstream performance.
        self.model_name = "monologg/koelectra-base-v3-discriminator"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # output_hidden_states=True lets a single forward pass yield BOTH
        # the classification logits and the [CLS] vectors used for the
        # cosine-similarity comparison below.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=2,  # 0: normal, 1: exaggerated ad
            output_hidden_states=True,
        ).to(self.device)
        self.model.eval()  # inference mode only; no training here

        # [Vector bank A] Reference sentences from previously flagged
        # fraudulent ads (illustrative examples).
        self.reference_bad_texts = [
            "๋‹จ ์ผ์ฃผ์ผ ๋งŒ์— ์ง€๋ฐฉ์ด 100% ๋ถ„ํ•ด๋˜๋Š” ๊ธฐ์ ์˜ ํฌ๋ฆผ",
            "์‹์•ฝ์ฒ˜์—์„œ ์ธ์ฆํ•œ ๋งŒ๋ณ‘ํ†ต์น˜์•ฝ, ์•”์„ธํฌ ์™„๋ฒฝ ์ œ๊ฑฐ",
            "์ด๊ฒƒ๋งŒ ๋จน์œผ๋ฉด ๋…์†Œ๊ฐ€ ๋ฐฐ์ถœ๋˜๊ณ  ์„ธํฌ๊ฐ€ ์ฆ‰๊ฐ ์žฌ์ƒ๋ฉ๋‹ˆ๋‹ค",
            "์˜์‚ฌ๋“ค์ด ๋ฌด์กฐ๊ฑด ์ถ”์ฒœํ•˜๋Š” ๋ถ€์ž‘์šฉ ์—†๋Š” ์™„์น˜์ œ"
        ]
        print(" -> ๋ ˆํผ๋Ÿฐ์Šค ํ—ˆ์œ„๊ด‘๊ณ  ๋ฌธ์žฅ๋“ค์„ ๋ฒกํ„ฐ(Vector) ๊ณต๊ฐ„์— ๋ฐฐ์น˜ ์ค‘...")
        self.reference_embeddings = self._get_embeddings(self.reference_bad_texts)

    def clear_memory(self):
        """Garbage-collect and release cached MPS memory to avoid leaks."""
        gc.collect()
        if torch.backends.mps.is_available():
            torch.mps.empty_cache()

    def _get_embeddings(self, texts):
        """Encode a list of texts into sentence-level embedding vectors."""
        batch = self.tokenizer(
            texts, padding=True, truncation=True, max_length=512, return_tensors="pt"
        ).to(self.device)
        with torch.no_grad():
            model_out = self.model(**batch)
        # The [CLS] token (index 0) of the last hidden layer serves as the
        # sentence-level representation.
        return model_out.hidden_states[-1][:, 0, :]

    def calculate_x2_score(self, text):
        """Return the 0-100 semantic risk score (X2) for *text*.

        Texts shorter than 5 characters (after stripping) score 0.0.
        """
        if not text or len(text.strip()) < 5:
            return 0.0

        print("\n๐Ÿง  [Step 2] ๋”ฅ๋Ÿฌ๋‹ ๋ฌธ๋งฅ ๋ฐ ์œ ์‚ฌ๋„(Semantic) ๋ถ„์„ ์‹œ์ž‘...")

        # PLMs cap out at 512 tokens; truncate anything longer.
        encoded = self.tokenizer(
            text, return_tensors="pt", truncation=True, max_length=512
        ).to(self.device)

        with torch.no_grad():  # no backprop: keeps memory use down on M1
            model_out = self.model(**encoded)

        # --- Signal 1: classifier probability via softmax over the logits ---
        raw_logits = model_out.logits
        label_probs = F.softmax(raw_logits, dim=-1)
        prob_fake = label_probs[0][1].item()  # P(label 1 = exaggerated ad)

        # --- Signal 2: cosine similarity against the reference bank ---
        query_vec = model_out.hidden_states[-1][:, 0, :]
        sims = F.cosine_similarity(query_vec, self.reference_embeddings)
        max_sim = torch.max(sims).item()  # closest-matching reference sentence

        # Drop tensors promptly, then reclaim VRAM.
        del encoded, model_out, raw_logits, label_probs, query_vec, sims
        self.clear_memory()

        # --- Ensemble: 80% similarity + 20% classifier ---
        # NOTE(review): this KoELECTRA head has NOT been fine-tuned for ad
        # detection, so its softmax output is close to random; the cosine
        # similarity is therefore weighted much higher.
        similarity_score = max(max_sim, 0) * 100  # clamp negative similarity to 0
        classification_score = prob_fake * 100
        x2_score = (similarity_score * 0.8) + (classification_score * 0.2)

        print(f" -> ๐Ÿ“Š ๋ชจ๋ธ ๋ถ„๋ฅ˜ ํ™•๋ฅ  (Softmax): {prob_fake*100:.1f}%")
        print(f" -> ๐Ÿ”— ์ตœ๋Œ€ ๋ฌธ๋งฅ ์œ ์‚ฌ๋„ (Cosine Sim): {max_sim*100:.1f}%")
        print(f"๐Ÿ“ˆ ์ตœ์ข… X2 ์ ์ˆ˜ (0~100): {x2_score:.2f}์ ")

        return x2_score
107
+
108
# ==========================================
# Standalone smoke test
# ==========================================
if __name__ == "__main__":
    semantic = SemanticAnalyzer()

    # A sentence semantically close to the reference "๋‹จ ์ผ์ฃผ์ผ ๋งŒ์—..." ad.
    sample_ad = "๋‹จ 7์ผ๋งŒ ํˆฌ์žํ•˜์„ธ์š”! ์ง€๋ฐฉ์ด ์™„์ „ํžˆ ํŒŒ๊ดด๋˜๋Š” ๋†€๋ผ์šด ๋งˆ๋ฒ•์„ ๊ฒช๊ฒŒ ๋ฉ๋‹ˆ๋‹ค."
    semantic.calculate_x2_score(sample_ad)
myapp/step3_rag.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import torch
4
+ import numpy as np
5
+ from openai import OpenAI
6
+ from sentence_transformers import SentenceTransformer, util
7
+ from dotenv import load_dotenv
8
+
9
# Pull OPENAI_API_KEY (and anything else declared in .env) into the
# process environment before the OpenAI client below looks it up.
load_dotenv()
11
+
12
class FactCheckerRAG:
    """Step 3: RAG-backed fact checker.

    Retrieves the single most relevant regulation from a small in-memory
    fact DB (sentence-embedding cosine similarity) and asks an OpenAI model
    to grade the ad text against it, returning a violation score in
    [0, 100] together with the regulation that was consulted.
    """

    def __init__(self):
        print("๐Ÿ“š [Step 3] RAG + LLM ํŒฉํŠธ์ฒด์ปค๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค...")

        # Device selection (Apple-silicon MPS when available, else CPU).
        self.device = "mps" if torch.backends.mps.is_available() else "cpu"

        # Korean sentence-embedding model used as the retriever.
        self.retriever = SentenceTransformer('jhgan/ko-sroberta-multitask', device=self.device)

        # OpenAI client; the key comes from the environment (.env), never hard-coded.
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            print("โš ๏ธ ๊ฒฝ๊ณ : OPENAI_API_KEY๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. .env ํŒŒ์ผ์„ ํ™•์ธํ•˜์„ธ์š”.")

        self.client = OpenAI(api_key=api_key)

        # Fact database (MFDS regulations and enforcement precedents).
        self.fact_db = [
            # 1. Absolute prohibitions
            "์งˆ๋ณ‘์˜ ์˜ˆ๋ฐฉ ๋ฐ ์น˜๋ฃŒ์— ํšจ๋Šฅยทํšจ๊ณผ๊ฐ€ ์žˆ๊ฑฐ๋‚˜ ์˜์•ฝํ’ˆ ๋˜๋Š” ๊ฑด๊ฐ•๊ธฐ๋Šฅ์‹ํ’ˆ์œผ๋กœ ์˜ค์ธยทํ˜ผ๋™ํ•  ์šฐ๋ ค๊ฐ€ ์žˆ๋Š” ํ‘œ์‹œยท๊ด‘๊ณ ๋Š” ๊ธˆ์ง€๋ฉ๋‹ˆ๋‹ค.",
            "์ฒดํ—˜๊ธฐ ๋“ฑ์„ ์ด์šฉํ•˜๊ฑฐ๋‚˜ '์ฃผ๋ฌธ์‡„๋„', '๋‹จ์ฒด์ถ”์ฒœ' ๋“ฑ ์†Œ๋น„์ž๋ฅผ ๊ธฐ๋งŒํ•˜๋Š” ๊ด‘๊ณ ๋Š” ์ฒ˜๋ฒŒ ๋Œ€์ƒ์ž…๋‹ˆ๋‹ค.",
            "์‹ํ’ˆ์— ๊ฐ์ข… ์ƒ์žฅ, ์ธ์ฆ, ๋ณด์ฆ์„ ๋ฐ›์•˜๋‹ค๋Š” ๋‚ด์šฉ์„ ์‚ฌ์šฉํ•˜๋Š” ๊ฒƒ์€ ํ—ˆ์œ„ยท๊ณผ๋Œ€๊ด‘๊ณ ์— ํ•ด๋‹นํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.",

            # 2. Permitted expressions
            "์ธ์ฒด์˜ ๊ฑด์ „ํ•œ ์„ฑ์žฅ ๋ฐ ๋ฐœ๋‹ฌ๊ณผ ๊ฑด๊ฐ• ์œ ์ง€์— ๋„์›€์„ ์ค€๋‹ค๋Š” ํ‘œํ˜„์€ ํŠน์ • ์งˆ๋ณ‘์„ ์–ธ๊ธ‰ํ•˜์ง€ ์•Š๋Š” ํ•œ ํ—ˆ์šฉ๋ฉ๋‹ˆ๋‹ค.",
            "๊ฑด๊ฐ•์ฆ์ง„, ์ฒด์งˆ๊ฐœ์„ , ์‹์ด์š”๋ฒ•, ์˜์–‘๋ณด๊ธ‰ ๋“ฑ์— ๋„์›€์„ ์ค€๋‹ค๋Š” ํ‘œํ˜„์€ ๊ณผ๋Œ€๊ด‘๊ณ ๊ฐ€ ์•„๋‹™๋‹ˆ๋‹ค.",
            "ํ•ด๋‹น ์ œํ’ˆ์ด ์œ ์•„์‹, ํ™˜์ž์‹ ๋“ฑ ํŠน์ˆ˜์šฉ๋„์‹ํ’ˆ์ด๋ผ๋Š” ํ‘œํ˜„์€ ํ—ˆ์šฉ๋ฉ๋‹ˆ๋‹ค.",

            # 3. Real enforcement cases
            "์ผ๋ฐ˜ ์‹ํ’ˆ์— ๋‹น๋‡จ, ๊ณ ํ˜ˆ์••, ํ•ญ์•” ๋“ฑ ํŠน์ • ์งˆ๋ณ‘ ์น˜๋ฃŒ ํšจ๊ณผ๊ฐ€ ์žˆ๋‹ค๊ณ  ๊ธฐ์žฌํ•˜๋Š” ๊ฒƒ์€ ๋ช…๋ฐฑํ•œ ๋ถˆ๋ฒ•์ž…๋‹ˆ๋‹ค.",
            "๋ธ”๋กœ๊ทธ๋‚˜ ์‡ผํ•‘๋ชฐ์— ์งˆ๋ณ‘ ์น˜๋ฃŒ ์ „ํ›„ ๋น„๊ต ์‚ฌ์ง„์ด๋‚˜ ๊ฐœ์ธ์ ์ธ ์ฒดํ—˜๊ธฐ๋ฅผ ์˜ฌ๋ฆฌ๋Š” ํ–‰์œ„๋Š” ๋ถˆ๋ฒ• ๊ณผ๋Œ€๊ด‘๊ณ ์ž…๋‹ˆ๋‹ค."
        ]

        # Pre-compute the DB embeddings once at startup.
        self.db_embeddings = self.retriever.encode(self.fact_db, convert_to_tensor=True)

    def calculate_x3_score(self, text):
        """Grade *text* against the fact DB via retrieval + LLM judgment.

        Returns:
            tuple[float, str]: (score, fact) — score is in [0, 100];
            fact is the retrieved regulation, or an error message when the
            text is too short or an exception occurs (best-effort: any
            retrieval/LLM failure degrades to a 0.0 score, never raises).
        """
        if not text or len(text.strip()) < 5:
            return 0.0, "๊ฒ€์‚ฌํ•  ํ…์ŠคํŠธ๊ฐ€ ๋ถ€์กฑํ•ฉ๋‹ˆ๋‹ค."

        try:
            # 1. Retrieval: find the most relevant regulation by cosine similarity.
            query_embedding = self.retriever.encode(text, convert_to_tensor=True)
            cosine_scores = util.cos_sim(query_embedding, self.db_embeddings)[0]
            best_idx = torch.argmax(cosine_scores).item()
            retrieved_fact = self.fact_db[best_idx]

            # 2. Generation: ask the LLM to judge the ad against that regulation.
            prompt = f"""
            ๋‹น์‹ ์€ ๋Œ€ํ•œ๋ฏผ๊ตญ ์‹์•ฝ์ฒ˜ ๋ฐ ๊ณต์ •์œ„์˜ ๊ณผ๋Œ€๊ด‘๊ณ  ์‹ฌ์‚ฌ๊ด€์ž…๋‹ˆ๋‹ค.
            ์•„๋ž˜ [๊ด€๋ จ ๊ทœ์ •]์„ ๋ฐ”ํƒ•์œผ๋กœ [๊ด‘๊ณ  ํ…์ŠคํŠธ]์˜ ์œ„๋ฐ˜ ์—ฌ๋ถ€๋ฅผ ํŒ๋‹จํ•˜์„ธ์š”.

            [๊ด€๋ จ ๊ทœ์ •]: {retrieved_fact}
            [๊ด‘๊ณ  ํ…์ŠคํŠธ]: {text}

            ๋ฐ˜๋“œ์‹œ ์•„๋ž˜ ํ˜•์‹์œผ๋กœ๋งŒ ์‘๋‹ตํ•˜์„ธ์š”:
            ์ ์ˆ˜: [0~100 ์‚ฌ์ด ์ˆซ์ž]
            ์‚ฌ์œ : [์œ„๋ฐ˜์ธ ๊ฒฝ์šฐ ๊ตฌ์ฒด์  ๊ทผ๊ฑฐ, ์•„๋‹ˆ๋ฉด ํ—ˆ์šฉ ๊ทผ๊ฑฐ๋ฅผ 1~2์ค„๋กœ ์„ค๋ช…]
            """

            print(" -> ๐Ÿค– GPT ์‹ฌ์‚ฌ๊ด€์ด ๋ถ„์„ ์ค‘...")
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0  # deterministic grading
            )

            result_text = response.choices[0].message.content
            print(f" [๊ฒฐ๊ณผ] {result_text}")

            # Parse the score. BUGFIX: the old pattern (\d+) captured only the
            # integer part of decimal answers such as "์ ์ˆ˜: 87.5" (-> 87.0);
            # also clamp to the documented 0-100 range in case the LLM strays.
            score_match = re.search(r"์ ์ˆ˜:\s*(\d+(?:\.\d+)?)", result_text)
            x3_score = float(score_match.group(1)) if score_match else 0.0
            x3_score = min(max(x3_score, 0.0), 100.0)

            return x3_score, retrieved_fact

        except Exception as e:
            # Deliberate best-effort: report and degrade rather than crash the pipeline.
            print(f"โš ๏ธ ์—๋Ÿฌ ๋ฐœ์ƒ: {e}")
            return 0.0, "๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค."
93
+
94
+ # ์•™์ƒ๋ธ” ์ ์ˆ˜ ๊ณ„์‚ฐ๊ธฐ
95
+ def calculate_final_score(x1, x2, x3):
96
+ """
97
+ x1: ํ‚ค์›Œ๋“œ ๋งค์นญ ์ ์ˆ˜
98
+ x2: ๋”ฅ๋Ÿฌ๋‹ ๋ฌธ๋งฅ ์ ์ˆ˜
99
+ x3: RAG ํŒฉํŠธ์ฒดํฌ ์ ์ˆ˜
100
+ """
101
+ w1, w2, w3 = 0.2, 0.4, 0.4
102
+ return (w1 * x1) + (w2 * x2) + (w3 * x3)
103
+
104
if __name__ == "__main__":
    checker = FactCheckerRAG()

    # BUGFIX: the original literal was corrupted by an encoding mishap
    # ("์•” ๏ฟฝ๏ฟฝ๋ฐฉ์€" contained U+FFFD replacement characters); restored the
    # intended phrase "์•” ์˜ˆ๋ฐฉ์€" (cancer prevention).
    test_ad = "์ด ์ฐจ๋ฅผ ๋งˆ์‹œ๋ฉด ์•” ์˜ˆ๋ฐฉ์€ ๋ฌผ๋ก  ๋‹น๋‡จ ์ˆ˜์น˜๊ฐ€ ์ฆ‰๊ฐ ๋–จ์–ด์ง‘๋‹ˆ๋‹ค!"

    score, fact = checker.calculate_x3_score(test_ad)
    print("-" * 30)
    print(f"์ตœ์ข… ์œ„๋ฐ˜ ์ ์ˆ˜: {score}")
    print(f"์ฐธ์กฐ ๊ทœ์ •: {fact}")
myapp/step4_xai.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import xgboost as xgb
3
+ import shap
4
+
5
class XAIScorer:
    """Step 4: final scorer — an XGBoost ensemble with SHAP explanations."""

    def __init__(self):
        print("๐Ÿ“Š [Step 4] XGBoost ์•™์ƒ๋ธ” ๋ชจ๋ธ ๋ฐ SHAP ์„ค๋ช…๊ธฐ(XAI) ๋กœ๋“œ ์ค‘...")

        # 1. Synthetic PoC training data (a real labelled DB would be loaded
        #    in production). Columns: X1 (lexical), X2 (context),
        #    X3 (fact-check) scores, each in [0, 100).
        np.random.seed(42)
        X_train = np.random.rand(1000, 3) * 100

        # Synthetic labels: the higher the weighted blend of X2/X3 (and X1),
        # the likelier the sample is an exaggerated ad (1) vs normal (0).
        y_train = (
            (X_train[:, 0] * 0.2 + X_train[:, 1] * 0.4 + X_train[:, 2] * 0.4) > 50
        ).astype(int)

        # 2. Fit the XGBoost classifier (logistic link is built in).
        self.model = xgb.XGBClassifier(
            n_estimators=50,
            max_depth=3,
            learning_rate=0.1,
            eval_metric='logloss',
            random_state=42,
        )
        self.model.fit(X_train, y_train)

        # 3. SHAP tree explainer for per-feature attributions (XAI).
        self.explainer = shap.TreeExplainer(self.model)
        print(" -> ๐Ÿง  ๋จธ์‹ ๋Ÿฌ๋‹ ์Šค์ฝ”์–ด๋ง ์—”์ง„ ์„ธํŒ… ์™„๋ฃŒ!")

    def calculate_final_score_and_explain(self, x1, x2, x3):
        """Score one (x1, x2, x3) sample and explain the prediction.

        Returns:
            tuple: (final_score, shap_vals, base_value) — a 0-100
            probability-style score for class 1 (exaggerated ad), the
            per-feature SHAP values, and the explainer's expected value.
        """
        features = np.array([[x1, x2, x3]])

        # 1. Final score: predict_proba applies the sigmoid internally,
        #    yielding P(class 1); rescale to a 0-100 score.
        final_score = self.model.predict_proba(features)[0][1] * 100

        # 2. SHAP attributions. Depending on the shap version/configuration,
        #    binary classifiers may return a per-class list — extract safely.
        raw_shap = self.explainer.shap_values(features)
        shap_vals = raw_shap[1][0] if isinstance(raw_shap, list) else raw_shap[0]

        # Base value (the model's average prediction / intercept), which may
        # likewise be per-class.
        expected = self.explainer.expected_value
        if isinstance(expected, (list, np.ndarray)):
            base_value = expected[1]
        else:
            base_value = expected

        return final_score, shap_vals, base_value