KrishnaCosmic committed on
Commit
6337b71
·
1 Parent(s): 859e00d

checking changes

Browse files
services/sentiment_analysis_service.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sentiment Analysis Service for OpenTriage
3
+
4
+ Uses local Hugging Face DistilBERT model for fast, offline sentiment analysis
5
+ of PR comments. Detects sentiment scores and prominent language patterns.
6
+
7
+ Features:
8
+ - DistilBERT sentiment classification (local, no API calls)
9
+ - Keyword-based prominent language detection
10
+ - In-memory result caching (10-minute TTL)
11
+ - Stage 3 RAG prompt integration-ready
12
+ """
13
+
14
+ import logging
15
+ import time
16
+ from typing import Dict, Any, Optional, List, Tuple
17
+ from datetime import datetime, timezone
18
+
19
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Lazy-load transformers (only when needed): this stays None until
# _get_sentiment_pipeline() builds the pipeline on first use.
_sentiment_pipeline = None
# In-memory result cache keyed by comment_id. Each entry holds at least
# "result" (the response dict) and "timestamp" (time.time() at insertion).
_cache = {}
CACHE_TTL = 600  # 10 minutes, in seconds (compared against time.time() deltas)

# Keyword patterns for prominent language detection.
# Maps a category name to the lowercase keywords that vote for it; the
# category with the most keyword hits wins. Insertion order matters: when two
# categories tie, the one listed first here is returned.
LANGUAGE_PATTERNS = {
    "technical": ["bug", "error", "crash", "fix", "optimize", "refactor", "api", "database", "performance", "memory", "cpu"],
    "positive": ["great", "excellent", "amazing", "love", "perfect", "awesome", "wonderful", "fantastic", "brilliant"],
    "negative": ["bad", "horrible", "terrible", "hate", "useless", "broken", "awful", "pathetic", "worst"],
    "urgent": ["critical", "urgent", "asap", "immediately", "emergency", "blocker", "must", "breaking"],
    "discussion": ["thought", "idea", "suggestion", "question", "wondering", "propose", "consider", "discuss"],
    "documentation": ["doc", "readme", "guide", "tutorial", "example", "comment", "explain"],
    "testing": ["test", "coverage", "regression", "edge case", "unit test", "integration test", "quality"]
}
36
+
37
+
38
def _get_sentiment_pipeline():
    """
    Return the shared sentiment pipeline, building it on the first call.

    The transformers import is deferred until the pipeline is actually
    needed. A load failure is logged and the exception is re-raised.
    """
    global _sentiment_pipeline

    # Fast path: the pipeline was already built by an earlier call.
    if _sentiment_pipeline is not None:
        return _sentiment_pipeline

    try:
        from transformers import pipeline as build_pipeline

        logger.info("[Sentiment] Loading DistilBERT sentiment-analysis model...")
        _sentiment_pipeline = build_pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=-1,  # CPU mode (set to 0 for GPU if available)
        )
        logger.info("[Sentiment] ✅ DistilBERT model loaded successfully")
    except Exception as exc:
        logger.error(f"[Sentiment] Failed to load DistilBERT: {exc}")
        raise

    return _sentiment_pipeline
57
+
58
+
59
def _detect_prominent_language(text: str) -> str:
    """
    Detect the prominent language category of a comment.

    Each category in LANGUAGE_PATTERNS is scored by how many of its keywords
    occur in the lower-cased text. A keyword only counts when it begins at a
    word boundary, so inflected forms such as "fixed" or "bugs" still match
    "fix" / "bug", while unrelated words that merely contain a keyword
    ("prefix", "debug", "latest", "rapid") no longer produce a hit.

    Args:
        text: Comment body; may be empty.

    Returns:
        The category with the most keyword hits (the category listed first in
        LANGUAGE_PATTERNS wins a tie), or "neutral" when the text is empty or
        no keyword matches.
    """
    # Imported locally so this function does not rely on a file-level import.
    import re

    if not text:
        return "neutral"

    text_lower = text.lower()
    best_category = "neutral"
    best_hits = 0

    for category, keywords in LANGUAGE_PATTERNS.items():
        # Count the distinct keywords of this category present in the text.
        hits = sum(
            1
            for keyword in keywords
            if re.search(r"\b" + re.escape(keyword), text_lower)
        )
        # Strict ">" keeps the earliest category on a tie.
        if hits > best_hits:
            best_category, best_hits = category, hits

    return best_category
81
+
82
+
83
def _is_cache_valid(timestamp: float) -> bool:
    """Return True while an entry stamped at `timestamp` is younger than CACHE_TTL seconds."""
    age_seconds = time.time() - timestamp
    return age_seconds < CACHE_TTL
86
+
87
+
88
def _neutral_sentiment_result(error: str) -> Dict[str, Any]:
    """Build the NEUTRAL fallback response used when analysis cannot complete."""
    return {
        "sentiment_label": "NEUTRAL",
        "sentiment_score": 0.5,
        "prominent_language": "neutral",
        "raw_scores": [],
        "cached": False,
        "analyzed_at": datetime.now(timezone.utc).isoformat(),
        "error": error,
    }


def analyze_comment_sentiment(
    comment_id: str,
    comment_text: str,
    author: str = "unknown",
    force_recalc: bool = False
) -> Dict[str, Any]:
    """
    Analyze the sentiment of a PR comment using DistilBERT.

    Results are cached in memory per comment_id for CACHE_TTL seconds. A
    cached result is only reused when it was computed for the same text, so
    an edited comment (or a reused id) is re-analyzed instead of returning a
    stale answer.

    Args:
        comment_id: Unique comment identifier (used as the cache key)
        comment_text: The comment body text
        author: Comment author (for logging)
        force_recalc: Skip the cache lookup and recompute; the fresh result
            still replaces the cached one

    Returns:
        Dict with:
        - sentiment_label: label reported by the model ("POSITIVE" or
          "NEGATIVE"), or "NEUTRAL" when analysis failed
        - sentiment_score: Model confidence for that label, rounded to 3
          decimals (0.5 on failure)
        - prominent_language: Detected language category
        - raw_scores: Every label/score pair the pipeline returned
          (NOTE(review): with the default pipeline settings this is
          presumably just the top label -- confirm)
        - cached: Whether result came from cache
        - analyzed_at: ISO timestamp (UTC)
        - error: Only present on failure; describes what went wrong

    Failures are logged and reported through the "error" key; this function
    does not raise for model or input errors.
    """
    # Imported locally so this function does not rely on a file-level import.
    import copy

    # Check cache first
    if not force_recalc and comment_id in _cache:
        cache_entry = _cache[comment_id]
        is_fresh = _is_cache_valid(cache_entry["timestamp"])
        same_text = cache_entry.get("text") == comment_text
        if is_fresh and same_text:
            logger.info(f"[Sentiment] Cache HIT for comment {comment_id} by {author}")
            # Deep copy so callers mutating the nested raw_scores list cannot
            # corrupt the cached entry.
            result = copy.deepcopy(cache_entry["result"])
            result["cached"] = True
            return result

        # Entry is unusable (expired, or computed for different text): drop it.
        del _cache[comment_id]
        if is_fresh:
            logger.info(f"[Sentiment] Cache stale for comment {comment_id} (text changed)")
        else:
            logger.info(f"[Sentiment] Cache expired for comment {comment_id}")

    logger.info(f"[Sentiment] Analyzing comment {comment_id} by {author}")

    try:
        classifier = _get_sentiment_pipeline()

        # Keep only the first 512 characters to bound the work, and let the
        # tokenizer enforce the model's real token limit: a 512-character
        # slice alone does not guarantee the input fits in 512 tokens.
        truncated_text = comment_text[:512]

        # Run sentiment analysis
        results = classifier(truncated_text, truncation=True)

        if not results:
            logger.warning(f"[Sentiment] No results from model for comment {comment_id}")
            return _neutral_sentiment_result("Model returned no results")

        # Extract sentiment info from the first (primary) prediction
        primary_result = results[0]
        sentiment_label = primary_result["label"]
        sentiment_score = primary_result["score"]

        # Language detection looks at the full text, not the truncated slice
        prominent_language = _detect_prominent_language(comment_text)

        response = {
            "sentiment_label": sentiment_label,
            "sentiment_score": round(sentiment_score, 3),
            "prominent_language": prominent_language,
            "raw_scores": [
                {"label": r["label"], "score": round(r["score"], 3)}
                for r in results
            ],
            "cached": False,
            "analyzed_at": datetime.now(timezone.utc).isoformat(),
        }

        # Cache an independent copy together with the text it was computed
        # for, so later lookups can detect edited or mismatched comments.
        _cache[comment_id] = {
            "result": copy.deepcopy(response),
            "timestamp": time.time(),
            "text": comment_text,
        }

        logger.info(
            f"[Sentiment] ✅ Comment {comment_id}: {sentiment_label} "
            f"(score: {sentiment_score:.3f}, language: {prominent_language})"
        )

        return response

    except Exception as e:
        # Best-effort API: report the failure in-band, but keep the traceback
        # in the log so the root cause is not lost.
        logger.exception(f"[Sentiment] Error analyzing comment {comment_id}: {e}")
        return _neutral_sentiment_result(str(e))
196
+
197
+
198
def analyze_batch_comments(comments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Analyze sentiment for multiple comments at once.

    Comments with an empty or missing body are skipped (with a warning), so
    the returned list can be shorter than the input.

    Comments without an "id" receive a positional placeholder id
    ("comment_<n>"). Placeholder ids repeat from one batch to the next, so
    the cache is not consulted for them; otherwise a later batch could be
    handed the cached sentiment of an unrelated comment.

    Args:
        comments: List of dicts with keys: id, body, author (optional)

    Returns:
        List of sentiment analysis results, each extended with the
        "comment_id" and "author" it belongs to
    """
    results = []

    for comment in comments:
        comment_id = comment.get("id")
        has_real_id = comment_id is not None
        if not has_real_id:
            comment_id = f"comment_{len(results)}"

        comment_text = comment.get("body", "")
        author = comment.get("author", "unknown")

        if not comment_text:
            logger.warning(f"Skipping comment {comment_id} with empty body")
            continue

        result = analyze_comment_sentiment(
            comment_id=comment_id,
            comment_text=comment_text,
            author=author,
            # Placeholder ids are not unique across batches: never trust a
            # cache hit for them.
            force_recalc=not has_real_id,
        )

        result["comment_id"] = comment_id
        result["author"] = author
        results.append(result)

    return results
230
+
231
+
232
def get_sentiment_summary(comments: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Get aggregate sentiment summary from multiple comments.

    Useful for Stage 3 prompt: "What's the overall mood of reviewers?"

    Args:
        comments: List of sentiment analysis results (dicts carrying
            "sentiment_label", "sentiment_score" and "prominent_language";
            missing keys fall back to neutral defaults)

    Returns:
        Summary dict with the same keys for every input, including an empty one:
        - overall_sentiment: "POSITIVE" / "NEGATIVE" when one side outnumbers
          the other by more than 1.5x, "NEUTRAL" when there are no comments
          or none is labelled POSITIVE/NEGATIVE, otherwise "MIXED"
        - average_score: Mean of the per-comment sentiment_score values,
          rounded to 3 decimals. These are averaged regardless of label, so
          the value reflects model confidence rather than polarity.
        - positive_count: Number of positive comments
        - negative_count: Number of negative comments
        - neutral_count: Comments that are neither POSITIVE nor NEGATIVE
        - prominent_languages: Up to three most frequent language categories
        - mood_description: Human-readable description
        - total_comments: Number of comments summarized
    """
    if not comments:
        return {
            "overall_sentiment": "NEUTRAL",
            "average_score": 0.5,
            "positive_count": 0,
            "negative_count": 0,
            "neutral_count": 0,
            "prominent_languages": [],
            "mood_description": "No comments to analyze",
            "total_comments": 0,
        }

    total = len(comments)
    positive_count = sum(1 for c in comments if c.get("sentiment_label") == "POSITIVE")
    negative_count = sum(1 for c in comments if c.get("sentiment_label") == "NEGATIVE")

    # Calculate average sentiment score
    scores = [c.get("sentiment_score", 0.5) for c in comments]
    average_score = sum(scores) / total

    # Count prominent languages
    language_counts: Dict[str, int] = {}
    for comment in comments:
        lang = comment.get("prominent_language", "neutral")
        language_counts[lang] = language_counts.get(lang, 0) + 1

    # Stable sort: categories with equal counts keep first-seen order.
    top_languages = sorted(language_counts.items(), key=lambda x: x[1], reverse=True)[:3]

    # Determine overall sentiment
    if positive_count == 0 and negative_count == 0:
        # Nothing carried a polarity (e.g. every analysis fell back to
        # NEUTRAL); calling that "mixed feedback" would be misleading.
        overall = "NEUTRAL"
        mood = "No clear sentiment detected in comments"
    elif positive_count > negative_count * 1.5:
        overall = "POSITIVE"
        mood = "Reviewers are enthusiastic and supportive"
    elif negative_count > positive_count * 1.5:
        overall = "NEGATIVE"
        mood = "Reviewers have concerns or objections"
    else:
        overall = "MIXED"
        mood = "Reviewers have mixed feedback with discussion"

    return {
        "overall_sentiment": overall,
        "average_score": round(average_score, 3),
        "positive_count": positive_count,
        "negative_count": negative_count,
        "neutral_count": total - positive_count - negative_count,
        "prominent_languages": [lang for lang, _ in top_languages],
        "mood_description": mood,
        "total_comments": total,
    }
296
+
297
+
298
def clear_cache():
    """Drop every entry from the in-memory sentiment cache."""
    # The dict is emptied in place, so no rebinding of the module global is needed.
    _cache.clear()
    logger.info("[Sentiment] Cache cleared")
303
+
304
+
305
def get_cache_stats() -> Dict[str, Any]:
    """
    Report the current state of the sentiment cache.

    Returns:
        Dict with the number of stored entries, how many of them are still
        within CACHE_TTL, how many have expired, the TTL itself, and whether
        the model pipeline has been loaded yet.
    """
    total = len(_cache)

    # Expired entries stay in the dict until they are looked up again, so
    # both kinds are counted here.
    live = 0
    for entry in _cache.values():
        if _is_cache_valid(entry["timestamp"]):
            live += 1

    stats = {
        "total_entries": total,
        "valid_entries": live,
        "expired_entries": total - live,
        "cache_ttl_seconds": CACHE_TTL,
        "model_loaded": _sentiment_pipeline is not None,
    }
    return stats
316
+
317
+
318
# Service instance (singleton)
class SentimentAnalysisService:
    """
    Thin facade exposing the module-level functions as attributes.

    Each function is wrapped in staticmethod: a plain function stored on a
    class becomes a bound method when accessed through an instance, which
    would pass the service object as the first argument (shifting
    comment_id/comment_text, and making the zero-argument helpers raise
    TypeError).
    """

    analyze_comment = staticmethod(analyze_comment_sentiment)
    analyze_batch = staticmethod(analyze_batch_comments)
    get_summary = staticmethod(get_sentiment_summary)
    clear_cache = staticmethod(clear_cache)
    get_cache_stats = staticmethod(get_cache_stats)


sentiment_analysis_service = SentimentAnalysisService()