| """ |
| Multilingual Sentiment Analysis Tool |
| Supports Turkish, Persian, and English using lexicon-based and machine learning approaches |
| """ |
|
|
| import re |
| import json |
| import os |
| from typing import Dict, List, Tuple, Optional |
| from collections import Counter |
| import math |
|
|
|
|
class SentimentLexicon:
    """Sentiment lexicon for one language.

    Loads positive/negative word lists, intensifiers, diminishers,
    negation words, contrast words, and sentiment idioms from
    ``lexicons/<language>_lexicon.json``. When that file is missing,
    a built-in default English lexicon is used instead (even for
    non-English languages).
    """

    def __init__(self, language: str):
        self.language = language
        self.positive_words = set()   # words that contribute positive sentiment
        self.negative_words = set()   # words that contribute negative sentiment
        self.intensifiers = {}        # word -> multiplier > 1.0 (strengthens)
        self.negation_words = set()   # words that flip nearby polarity
        self.diminishers = {}         # word -> multiplier < 1.0 (weakens)
        self.contrast_words = set()   # contrast markers (e.g. "but")
        self.idioms_positive = []     # multi-word positive phrases
        self.idioms_negative = []     # multi-word negative phrases
        self._load_lexicon()

    def _load_lexicon(self):
        """Load the language-specific lexicon file, or fall back to defaults.

        Missing keys in the JSON file default to empty collections.
        """
        lexicon_file = f"lexicons/{self.language}_lexicon.json"
        if os.path.exists(lexicon_file):
            with open(lexicon_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self.positive_words = set(data.get('positive', []))
            self.negative_words = set(data.get('negative', []))
            self.intensifiers = data.get('intensifiers', {})
            self.negation_words = set(data.get('negation', []))
            self.diminishers = data.get('diminishers', {})
            self.contrast_words = set(data.get('contrast_words', []))
            self.idioms_positive = data.get('idioms_positive', [])
            self.idioms_negative = data.get('idioms_negative', [])
        else:
            # No lexicon file on disk for this language: use English defaults.
            self._load_default_english()

    def _load_default_english(self):
        """Load the built-in default English sentiment words."""
        self.positive_words = {
            'good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic',
            'love', 'like', 'best', 'perfect', 'beautiful', 'nice', 'happy',
            'pleased', 'satisfied', 'awesome', 'brilliant', 'outstanding'
        }
        # FIX: removed duplicated 'awful' literal (a dead entry in a set).
        self.negative_words = {
            'bad', 'terrible', 'awful', 'horrible', 'worst', 'hate', 'dislike',
            'poor', 'disappointed', 'sad', 'angry', 'frustrated', 'annoying',
            'boring', 'ugly', 'disgusting', 'pathetic'
        }
        self.intensifiers = {
            'very': 1.5, 'extremely': 2.0, 'really': 1.3, 'quite': 1.2,
            'too': 1.4, 'so': 1.3, 'absolutely': 1.8, 'completely': 1.5
        }
        self.negation_words = {
            'not', 'no', 'never', 'none', 'nobody', 'nothing', 'nowhere',
            'neither', 'cannot', "can't", "won't", "don't", "doesn't"
        }
        self.diminishers = {}
        self.contrast_words = set()
        self.idioms_positive = []
        self.idioms_negative = []
|
|
|
|
class TextPreprocessor:
    """Text preprocessing for different languages."""

    # Single-pass translation table folding Turkish-specific characters
    # to their ASCII counterparts (same mapping as the old per-character
    # replace chain, but one scan instead of twelve).
    _TURKISH_TABLE = str.maketrans({
        'ı': 'i', 'İ': 'I',
        'ğ': 'g', 'Ğ': 'G',
        'ü': 'u', 'Ü': 'U',
        'ş': 's', 'Ş': 'S',
        'ö': 'o', 'Ö': 'O',
        'ç': 'c', 'Ç': 'C',
    })

    def __init__(self, language: str):
        self.language = language

    def preprocess(self, text: str) -> List[str]:
        """Lower-case, strip URLs/emails and stray symbols, and tokenize.

        Returns word tokens plus standalone punctuation tokens
        (``. , ! ? ; : ( )``), in original order.
        """
        text = text.lower()

        # Strip URLs.
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        # Strip email addresses (greedy: removes the whole \S+@\S+ run).
        text = re.sub(r'\S+@\S+', '', text)

        # Keep word chars, whitespace, basic punctuation, hyphen, apostrophe.
        text = re.sub(r'[^\w\s\.,!?;:()\-\']', '', text)

        # Words and individual punctuation marks become separate tokens.
        tokens = re.findall(r'\b\w+\b|[.,!?;:()]', text)

        return tokens

    def normalize_turkish(self, text: str) -> str:
        """Normalize Turkish text by ASCII-folding Turkish-specific characters."""
        return text.translate(self._TURKISH_TABLE)

    def normalize_persian(self, text: str) -> str:
        """Normalize Persian text (handle different character forms).

        TODO: currently a no-op placeholder — Persian character-form
        normalization (e.g. Arabic vs Persian yeh/kaf) is not implemented.
        """
        return text
|
|
|
|
class LexiconBasedAnalyzer:
    """Lexicon-based sentiment analysis with enhanced features.

    Scores a text by summing lexicon word hits, adjusted for nearby
    negation, intensifiers, diminishers, and whole-phrase idioms.
    """

    def __init__(self, language: str):
        self.language = language
        self.lexicon = SentimentLexicon(language)
        self.preprocessor = TextPreprocessor(language)

    def _check_idioms(self, text: str) -> Tuple[float, float]:
        """Return (positive, negative) idiom scores for the raw text.

        Matching is a case-insensitive substring test; each matched idiom
        contributes a fixed 2.0 to its polarity.
        """
        pos_score = 0.0
        neg_score = 0.0
        text_lower = text.lower()

        for idiom in self.lexicon.idioms_positive:
            if idiom.lower() in text_lower:
                pos_score += 2.0

        for idiom in self.lexicon.idioms_negative:
            if idiom.lower() in text_lower:
                neg_score += 2.0

        return pos_score, neg_score

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment using the lexicon-based approach.

        Returns a dict with 'polarity' ('positive'/'negative'/'neutral'),
        'confidence' (share of the winning polarity in the total score),
        raw 'positive_score'/'negative_score', up to ten matched
        'sentiment_words' as (polarity, token, negated) tuples, and
        'method'.
        """
        tokens = self.preprocessor.preprocess(text)
        # NOTE(review): text_lower is computed but never used below.
        text_lower = text.lower()

        positive_score = 0
        negative_score = 0
        sentiment_words = []
        # Counted for bookkeeping only; not included in the returned dict.
        negation_count = 0

        # Idioms are matched on the raw text, before tokenization.
        idiom_pos, idiom_neg = self._check_idioms(text)
        positive_score += idiom_pos
        negative_score += idiom_neg

        # How many tokens around a sentiment word to scan for negation cues.
        window_size = 4
        i = 0
        while i < len(tokens):
            token = tokens[i]
            is_negated = False
            intensifier_strength = 1.0
            diminisher_strength = 1.0

            # Scan the window around token i for the first negation word.
            # Only a negation BEFORE the token (j < i) can negate it, and
            # intervening sentence punctuation cancels the negation effect.
            for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1)):
                if tokens[j] in self.lexicon.negation_words:
                    if j < i:
                        has_punctuation = any(
                            tokens[k] in ['.', '!', '?', ';', ',']
                            for k in range(j + 1, i)
                        )
                        if not has_punctuation:
                            is_negated = True
                            negation_count += 1
                        # Stop at the first preceding negation word either way.
                        break

            # Strongest intensifier among the two immediately preceding tokens.
            for k in range(max(0, i-2), i):
                if k >= 0 and tokens[k] in self.lexicon.intensifiers:
                    intensifier_strength = max(intensifier_strength, self.lexicon.intensifiers[tokens[k]])

            # Weakest (most reducing) diminisher among the two preceding tokens.
            for k in range(max(0, i-2), i):
                if k >= 0 and tokens[k] in self.lexicon.diminishers:
                    diminisher_strength = min(diminisher_strength, self.lexicon.diminishers[tokens[k]])

            # A negated sentiment word contributes to the OPPOSITE polarity
            # with the same (intensified/diminished) weight.
            if token in self.lexicon.positive_words:
                score = 1.0 * intensifier_strength * diminisher_strength
                if is_negated:
                    negative_score += score
                    sentiment_words.append(('negative', token, is_negated))
                else:
                    positive_score += score
                    sentiment_words.append(('positive', token, is_negated))
            elif token in self.lexicon.negative_words:
                score = 1.0 * intensifier_strength * diminisher_strength
                if is_negated:
                    positive_score += score
                    sentiment_words.append(('positive', token, is_negated))
                else:
                    negative_score += score
                    sentiment_words.append(('negative', token, is_negated))

            i += 1

        # Normalize each polarity to its share of the combined score.
        total_raw = positive_score + negative_score
        if total_raw > 0:
            pos_normalized = positive_score / total_raw
            neg_normalized = negative_score / total_raw
        else:
            pos_normalized = 0.0
            neg_normalized = 0.0

        # Ties (equal nonzero scores) resolve to 'negative' via the else arm.
        if total_raw == 0:
            polarity = 'neutral'
            confidence = 0.0
        elif positive_score > negative_score:
            polarity = 'positive'
            confidence = pos_normalized
        else:
            polarity = 'negative'
            confidence = neg_normalized

        return {
            'polarity': polarity,
            'confidence': round(confidence, 3),
            'positive_score': round(positive_score, 3),
            'negative_score': round(negative_score, 3),
            'sentiment_words': sentiment_words[:10],
            'method': 'lexicon-based'
        }
|
|
|
|
class RuleBasedAnalyzer:
    """Rule-based sentiment analysis with advanced linguistic rules.

    Runs the lexicon-based analysis first, then applies a sequence of
    surface-level adjustments: punctuation emphasis, caps, emoticons,
    contrast words, comparatives/superlatives, repetition, sentiment
    shifters, hashtags, and text-length confidence scaling.
    """

    def __init__(self, language: str):
        self.language = language
        self.lexicon = SentimentLexicon(language)
        self.preprocessor = TextPreprocessor(language)

    def _detect_emoticons(self, text: str) -> Tuple[float, float]:
        """Return (positive, negative) emoticon/emoji scores.

        Each occurrence contributes 1.5 to its polarity.
        """
        pos_score = 0.0
        neg_score = 0.0

        positive_emoticons = [
            ':)', ':-)', '=)', ';)', ';-)', '=D', ':D', ':-D',
            '😊', '😀', '😁', '😂', '🤣', '😃', '😄', '😆', '😍', '🥰',
            '😎', '🤗', '👍', '👏', '🎉', '❤️', '💕', '💖', '💗', '💓'
        ]

        # FIX: '>:(' was listed twice, which made text.count() charge every
        # occurrence double (3.0 instead of 1.5).
        # NOTE(review): ':/' also matches inside URLs ('http://...') since
        # this method scores the raw text — confirm whether URLs should be
        # stripped before emoticon scoring.
        negative_emoticons = [
            ':(', ':-(', '=(', ':/', ':-/', ':|', ':-|', '>:(',
            '😢', '😞', '😠', '😡', '😤', '😭', '😰', '😨', '😱', '😖',
            '😣', '😫', '😩', '👎', '💔', '😒', '😔', '😕', '🙁'
        ]

        for emoji in positive_emoticons:
            count = text.count(emoji)
            pos_score += count * 1.5

        for emoji in negative_emoticons:
            count = text.count(emoji)
            neg_score += count * 1.5

        return pos_score, neg_score

    def _handle_contrast_words(self, text: str, tokens: List[str],
                               pos_score: float, neg_score: float) -> Tuple[float, float]:
        """Dampen both scores by 30% if any contrast word is present."""
        contrast_positions = []
        for i, token in enumerate(tokens):
            if token.lower() in self.lexicon.contrast_words:
                contrast_positions.append(i)

        if contrast_positions:
            # Contrast weakens overall certainty of both polarities.
            reduction_factor = 0.7
            return pos_score * reduction_factor, neg_score * reduction_factor

        return pos_score, neg_score

    def _detect_comparatives_superlatives(self, tokens: List[str]) -> float:
        """Return a multiplier (1.0-1.4) for comparative/superlative forms."""
        multiplier = 1.0

        # Superlatives intensify most strongly.
        superlative_indicators = ['most', 'best', 'worst', 'least', 'greatest']
        for token in tokens:
            if token.lower() in superlative_indicators:
                multiplier = max(multiplier, 1.4)

        # Comparatives intensify mildly (superlative wins via max()).
        comparative_patterns = ['more', 'less', 'better', 'worse', 'greater', 'smaller']
        for token in tokens:
            if token.lower() in comparative_patterns:
                multiplier = max(multiplier, 1.2)

        return multiplier

    def _detect_repetition(self, text: str) -> float:
        """Return an emphasis multiplier (capped at 1.5) for repetition.

        Repeated characters ('sooooo') add 0.1 per run; a word repeated
        three times in a row adds a one-time 0.2.
        """
        multiplier = 1.0

        # Runs of 3+ identical word characters signal emphasis.
        repeated_chars = re.findall(r'(\w)\1{2,}', text.lower())
        if repeated_chars:
            multiplier += len(repeated_chars) * 0.1

        # A word repeated three times consecutively ('no no no').
        words = text.lower().split()
        if len(words) > 2:
            for i in range(len(words) - 2):
                if words[i] == words[i+1] == words[i+2]:
                    multiplier += 0.2
                    break

        return min(multiplier, 1.5)

    def _detect_sentiment_shifters(self, text: str) -> float:
        """Return a damping factor (<1.0) if a shifter word is present.

        Only the first shifter found (in dict order) applies.
        """
        shifters = {
            'but': 0.6, 'however': 0.6, 'although': 0.7, 'though': 0.7,
            'yet': 0.6, 'still': 0.7, 'nevertheless': 0.6, 'nonetheless': 0.6
        }

        # NOTE(review): substring test, so 'but' also matches 'butter'.
        text_lower = text.lower()
        for shifter, factor in shifters.items():
            if shifter in text_lower:
                return factor

        return 1.0

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment using the rule-based approach.

        Returns the same dict shape as LexiconBasedAnalyzer.analyze with
        'method' set to 'rule-based'; polarity and confidence are
        recomputed from the adjusted scores at the end.
        """
        # Start from the plain lexicon analysis.
        base_analyzer = LexiconBasedAnalyzer(self.language)
        result = base_analyzer.analyze(text)

        tokens = self.preprocessor.preprocess(text)
        text_lower = text.lower()

        # Exclamation marks amplify both scores (up to +50%).
        exclamation_count = text.count('!')
        if exclamation_count > 0:
            multiplier = 1 + min(exclamation_count * 0.15, 0.5)
            result['positive_score'] *= multiplier
            result['negative_score'] *= multiplier

        # Multiple question marks suggest uncertainty: lower confidence.
        question_count = text.count('?')
        if question_count > 1:
            uncertainty_factor = max(0.7, 1 - (question_count * 0.1))
            result['confidence'] *= uncertainty_factor

        # ALL-CAPS words (len > 2) amplify both scores.
        caps_words = [w for w in text.split() if w.isupper() and len(w) > 2]
        if len(caps_words) > 0:
            caps_multiplier = 1 + (len(caps_words) * 0.1)
            result['positive_score'] *= caps_multiplier
            result['negative_score'] *= caps_multiplier

        # Emoticons/emojis add directly to the scores.
        emoji_pos, emoji_neg = self._detect_emoticons(text)
        result['positive_score'] += emoji_pos
        result['negative_score'] += emoji_neg

        # Contrast words dampen both scores.
        result['positive_score'], result['negative_score'] = self._handle_contrast_words(
            text, tokens, result['positive_score'], result['negative_score']
        )

        # Comparative/superlative forms amplify both scores.
        comp_super_mult = self._detect_comparatives_superlatives(tokens)
        result['positive_score'] *= comp_super_mult
        result['negative_score'] *= comp_super_mult

        # Character/word repetition amplifies both scores.
        rep_mult = self._detect_repetition(text)
        result['positive_score'] *= rep_mult
        result['negative_score'] *= rep_mult

        # Sentiment shifters ('but', 'however', ...) dampen both scores.
        shifter_factor = self._detect_sentiment_shifters(text)
        if shifter_factor < 1.0:
            result['positive_score'] *= shifter_factor
            result['negative_score'] *= shifter_factor

        # Ellipsis hints hesitation: slightly lower confidence.
        if '...' in text or '…' in text:
            result['confidence'] *= 0.9

        # Runs of '!?'-style punctuation amplify both scores.
        multi_punct = re.findall(r'[!?]{2,}', text)
        if multi_punct:
            punct_mult = 1 + (len(multi_punct) * 0.1)
            result['positive_score'] *= punct_mult
            result['negative_score'] *= punct_mult

        # Hashtags containing lexicon words add a small flat bonus.
        hashtags = re.findall(r'#\w+', text)
        if hashtags:
            for tag in hashtags:
                tag_lower = tag.lower()
                if any(word in tag_lower for word in self.lexicon.positive_words):
                    result['positive_score'] += 0.5
                if any(word in tag_lower for word in self.lexicon.negative_words):
                    result['negative_score'] += 0.5

        # Presence of a URL slightly lowers confidence.
        if re.search(r'http[s]?://', text):
            result['confidence'] *= 0.95

        # Very short or very long texts are scored less confidently.
        word_count = len(text.split())
        if word_count < 3:
            result['confidence'] *= 0.8
        elif word_count > 100:
            result['confidence'] *= 0.95

        # Recompute polarity/confidence from the adjusted scores
        # (this overwrites the confidence adjustments above when total > 0).
        total = result['positive_score'] + result['negative_score']
        if total > 0:
            if result['positive_score'] > result['negative_score']:
                result['polarity'] = 'positive'
                result['confidence'] = result['positive_score'] / total
            else:
                result['polarity'] = 'negative'
                result['confidence'] = result['negative_score'] / total
        else:
            result['polarity'] = 'neutral'
            result['confidence'] = 0.0

        result['method'] = 'rule-based'
        return result
|
|
|
|
class HybridAnalyzer:
    """Hybrid approach combining lexicon, rules, and simple ML features.

    Blends the scores of the lexicon-based and rule-based analyzers with
    fixed weights (0.4 lexicon, 0.6 rules) and derives the final polarity
    and confidence from the combined scores.
    """

    def __init__(self, language: str):
        self.language = language
        self.lexicon_analyzer = LexiconBasedAnalyzer(language)
        self.rule_analyzer = RuleBasedAnalyzer(language)

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment using the hybrid approach."""
        lex_res = self.lexicon_analyzer.analyze(text)
        rule_res = self.rule_analyzer.analyze(text)

        # Weighted blend of the two analyzers' raw scores.
        lex_w, rule_w = 0.4, 0.6

        def blend(key: str) -> float:
            return lex_res[key] * lex_w + rule_res[key] * rule_w

        pos = blend('positive_score')
        neg = blend('negative_score')
        total = pos + neg

        # Confidence is the winning polarity's share of the combined score;
        # a zero total means no sentiment signal at all.
        if total == 0:
            polarity, confidence = 'neutral', 0.0
        elif pos > neg:
            polarity, confidence = 'positive', pos / total
        else:
            polarity, confidence = 'negative', neg / total

        return {
            'polarity': polarity,
            'confidence': round(confidence, 3),
            'positive_score': round(pos, 3),
            'negative_score': round(neg, 3),
            'lexicon_result': lex_res,
            'rule_result': rule_res,
            'method': 'hybrid',
        }
|
|
|
|
class MultilingualSentimentAnalyzer:
    """Main sentiment analyzer supporting multiple languages and methods."""

    def __init__(self, language: str = 'english', method: str = 'hybrid'):
        """
        Initialize sentiment analyzer

        Args:
            language: 'english', 'turkish', or 'persian'
            method: 'lexicon', 'rule', or 'hybrid' (case-insensitive;
                any other value falls back to the hybrid analyzer)
        """
        self.language = language.lower()
        self.method = method.lower()

        # FIX: dispatch on the lower-cased self.method, not the raw
        # argument — previously method='Lexicon' or 'RULE' silently fell
        # through to the hybrid analyzer.
        if self.method == 'lexicon':
            self.analyzer = LexiconBasedAnalyzer(self.language)
        elif self.method == 'rule':
            self.analyzer = RuleBasedAnalyzer(self.language)
        else:
            self.analyzer = HybridAnalyzer(self.language)

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment of input text.

        Empty/whitespace-only input and analyzer exceptions both produce a
        neutral result with an 'error' key instead of raising.
        """
        if not text or not text.strip():
            return {
                'polarity': 'neutral',
                'confidence': 0.0,
                'error': 'Empty text provided'
            }

        try:
            result = self.analyzer.analyze(text)
            result['language'] = self.language
            result['text_length'] = len(text)
            result['word_count'] = len(text.split())
            return result
        except Exception as e:
            # Fail soft: batch callers keep going past a single bad text.
            return {
                'polarity': 'neutral',
                'confidence': 0.0,
                'error': str(e)
            }

    def analyze_batch(self, texts: List[str]) -> List[Dict]:
        """Analyze multiple texts, returning one result dict per text."""
        return [self.analyze(text) for text in texts]

    def get_statistics(self, texts: List[str]) -> Dict:
        """Return aggregate polarity distribution and average confidence.

        Safe for an empty input list (all aggregates come out empty/zero).
        """
        results = self.analyze_batch(texts)

        polarity_counts = Counter([r['polarity'] for r in results])
        total = len(results)

        avg_confidence = sum([r.get('confidence', 0) for r in results]) / total if total > 0 else 0

        return {
            'total_texts': total,
            'polarity_distribution': dict(polarity_counts),
            'polarity_percentages': {
                k: round(v / total * 100, 2)
                for k, v in polarity_counts.items()
            },
            'average_confidence': round(avg_confidence, 3)
        }
|
|
|
|