| """ |
| Multilingual Sentiment Analysis Tool |
| Supports Turkish, Persian, and English using lexicon-based and machine learning approaches |
| """ |
|
|
| import re |
| import json |
| import os |
| from typing import Dict, List, Tuple, Optional |
| from collections import Counter |
| import math |
|
|
|
|
class SentimentLexicon:
    """Sentiment lexicon for one language.

    Loads positive/negative word lists, intensifiers, diminishers,
    negation words, contrast words, and sentiment idioms from
    ``lexicons/<language>_lexicon.json``. When that file is missing,
    a built-in default English lexicon is used instead (even for
    non-English languages).
    """

    def __init__(self, language: str):
        self.language = language
        self.positive_words = set()   # words that contribute positive sentiment
        self.negative_words = set()   # words that contribute negative sentiment
        self.intensifiers = {}        # word -> multiplier > 1.0 (strengthens)
        self.negation_words = set()   # words that flip nearby polarity
        self.diminishers = {}         # word -> multiplier < 1.0 (weakens)
        self.contrast_words = set()   # contrast markers (e.g. "but")
        self.idioms_positive = []     # multi-word positive phrases
        self.idioms_negative = []     # multi-word negative phrases
        self._load_lexicon()

    def _load_lexicon(self):
        """Load the language-specific lexicon file, or fall back to defaults.

        Missing keys in the JSON file default to empty collections.
        """
        lexicon_file = f"lexicons/{self.language}_lexicon.json"
        if os.path.exists(lexicon_file):
            with open(lexicon_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self.positive_words = set(data.get('positive', []))
            self.negative_words = set(data.get('negative', []))
            self.intensifiers = data.get('intensifiers', {})
            self.negation_words = set(data.get('negation', []))
            self.diminishers = data.get('diminishers', {})
            self.contrast_words = set(data.get('contrast_words', []))
            self.idioms_positive = data.get('idioms_positive', [])
            self.idioms_negative = data.get('idioms_negative', [])
        else:
            # No lexicon file on disk for this language: use English defaults.
            self._load_default_english()

    def _load_default_english(self):
        """Load the built-in default English sentiment words."""
        self.positive_words = {
            'good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic',
            'love', 'like', 'best', 'perfect', 'beautiful', 'nice', 'happy',
            'pleased', 'satisfied', 'awesome', 'brilliant', 'outstanding'
        }
        # FIX: removed duplicated 'awful' literal (a dead entry in a set).
        self.negative_words = {
            'bad', 'terrible', 'awful', 'horrible', 'worst', 'hate', 'dislike',
            'poor', 'disappointed', 'sad', 'angry', 'frustrated', 'annoying',
            'boring', 'ugly', 'disgusting', 'pathetic'
        }
        self.intensifiers = {
            'very': 1.5, 'extremely': 2.0, 'really': 1.3, 'quite': 1.2,
            'too': 1.4, 'so': 1.3, 'absolutely': 1.8, 'completely': 1.5
        }
        self.negation_words = {
            'not', 'no', 'never', 'none', 'nobody', 'nothing', 'nowhere',
            'neither', 'cannot', "can't", "won't", "don't", "doesn't"
        }
        self.diminishers = {}
        self.contrast_words = set()
        self.idioms_positive = []
        self.idioms_negative = []
|
|
|
|
class TextPreprocessor:
    """Text preprocessing for different languages."""

    # Single-pass translation table folding Turkish-specific characters
    # to their ASCII counterparts (same mapping as the old per-character
    # replace chain, but one scan instead of twelve).
    _TURKISH_TABLE = str.maketrans({
        'ı': 'i', 'İ': 'I',
        'ğ': 'g', 'Ğ': 'G',
        'ü': 'u', 'Ü': 'U',
        'ş': 's', 'Ş': 'S',
        'ö': 'o', 'Ö': 'O',
        'ç': 'c', 'Ç': 'C',
    })

    def __init__(self, language: str):
        self.language = language

    def preprocess(self, text: str) -> List[str]:
        """Lower-case, strip URLs/emails and stray symbols, and tokenize.

        Returns word tokens plus standalone punctuation tokens
        (``. , ! ? ; : ( )``), in original order.
        """
        text = text.lower()

        # Strip URLs.
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        # Strip email addresses (greedy: removes the whole \S+@\S+ run).
        text = re.sub(r'\S+@\S+', '', text)

        # Keep word chars, whitespace, basic punctuation, hyphen, apostrophe.
        text = re.sub(r'[^\w\s\.,!?;:()\-\']', '', text)

        # Words and individual punctuation marks become separate tokens.
        tokens = re.findall(r'\b\w+\b|[.,!?;:()]', text)

        return tokens

    def normalize_turkish(self, text: str) -> str:
        """Normalize Turkish text by ASCII-folding Turkish-specific characters."""
        return text.translate(self._TURKISH_TABLE)

    def normalize_persian(self, text: str) -> str:
        """Normalize Persian text (handle different character forms).

        TODO: currently a no-op placeholder — Persian character-form
        normalization (e.g. Arabic vs Persian yeh/kaf) is not implemented.
        """
        return text
|
|
|
|
class LexiconBasedAnalyzer:
    """Lexicon-based sentiment analysis with enhanced features.

    Scores a text by summing lexicon word hits, adjusted for nearby
    negation, intensifiers, diminishers, and whole-phrase idioms.
    """

    def __init__(self, language: str):
        self.language = language
        self.lexicon = SentimentLexicon(language)
        self.preprocessor = TextPreprocessor(language)

    def _check_idioms(self, text: str) -> Tuple[float, float]:
        """Return (positive, negative) idiom scores for the raw text.

        Matching is a case-insensitive substring test; each matched idiom
        contributes a fixed 2.0 to its polarity.
        """
        pos_score = 0.0
        neg_score = 0.0
        text_lower = text.lower()

        for idiom in self.lexicon.idioms_positive:
            if idiom.lower() in text_lower:
                pos_score += 2.0

        for idiom in self.lexicon.idioms_negative:
            if idiom.lower() in text_lower:
                neg_score += 2.0

        return pos_score, neg_score

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment using the lexicon-based approach.

        Returns a dict with 'polarity' ('positive'/'negative'/'neutral'),
        'confidence' (share of the winning polarity in the total score),
        raw 'positive_score'/'negative_score', up to ten matched
        'sentiment_words' as (polarity, token, negated) tuples, and
        'method'.
        """
        tokens = self.preprocessor.preprocess(text)
        # NOTE(review): text_lower is computed but never used below.
        text_lower = text.lower()

        positive_score = 0
        negative_score = 0
        sentiment_words = []
        # Counted for bookkeeping only; not included in the returned dict.
        negation_count = 0

        # Idioms are matched on the raw text, before tokenization.
        idiom_pos, idiom_neg = self._check_idioms(text)
        positive_score += idiom_pos
        negative_score += idiom_neg

        # How many tokens around a sentiment word to scan for negation cues.
        window_size = 4
        i = 0
        while i < len(tokens):
            token = tokens[i]
            is_negated = False
            intensifier_strength = 1.0
            diminisher_strength = 1.0

            # Scan the window around token i for the first negation word.
            # Only a negation BEFORE the token (j < i) can negate it, and
            # intervening sentence punctuation cancels the negation effect.
            for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1)):
                if tokens[j] in self.lexicon.negation_words:
                    if j < i:
                        has_punctuation = any(
                            tokens[k] in ['.', '!', '?', ';', ',']
                            for k in range(j + 1, i)
                        )
                        if not has_punctuation:
                            is_negated = True
                            negation_count += 1
                        # Stop at the first preceding negation word either way.
                        break

            # Strongest intensifier among the two immediately preceding tokens.
            for k in range(max(0, i-2), i):
                if k >= 0 and tokens[k] in self.lexicon.intensifiers:
                    intensifier_strength = max(intensifier_strength, self.lexicon.intensifiers[tokens[k]])

            # Weakest (most reducing) diminisher among the two preceding tokens.
            for k in range(max(0, i-2), i):
                if k >= 0 and tokens[k] in self.lexicon.diminishers:
                    diminisher_strength = min(diminisher_strength, self.lexicon.diminishers[tokens[k]])

            # A negated sentiment word contributes to the OPPOSITE polarity
            # with the same (intensified/diminished) weight.
            if token in self.lexicon.positive_words:
                score = 1.0 * intensifier_strength * diminisher_strength
                if is_negated:
                    negative_score += score
                    sentiment_words.append(('negative', token, is_negated))
                else:
                    positive_score += score
                    sentiment_words.append(('positive', token, is_negated))
            elif token in self.lexicon.negative_words:
                score = 1.0 * intensifier_strength * diminisher_strength
                if is_negated:
                    positive_score += score
                    sentiment_words.append(('positive', token, is_negated))
                else:
                    negative_score += score
                    sentiment_words.append(('negative', token, is_negated))

            i += 1

        # Normalize each polarity to its share of the combined score.
        total_raw = positive_score + negative_score
        if total_raw > 0:
            pos_normalized = positive_score / total_raw
            neg_normalized = negative_score / total_raw
        else:
            pos_normalized = 0.0
            neg_normalized = 0.0

        # Ties (equal nonzero scores) resolve to 'negative' via the else arm.
        if total_raw == 0:
            polarity = 'neutral'
            confidence = 0.0
        elif positive_score > negative_score:
            polarity = 'positive'
            confidence = pos_normalized
        else:
            polarity = 'negative'
            confidence = neg_normalized

        return {
            'polarity': polarity,
            'confidence': round(confidence, 3),
            'positive_score': round(positive_score, 3),
            'negative_score': round(negative_score, 3),
            'sentiment_words': sentiment_words[:10],
            'method': 'lexicon-based'
        }
|
|
|
|
class RuleBasedAnalyzer:
    """Rule-based sentiment analysis with advanced linguistic rules.

    Runs the lexicon-based analysis first, then applies a sequence of
    surface-level adjustments: punctuation emphasis, caps, emoticons,
    contrast words, comparatives/superlatives, repetition, sentiment
    shifters, hashtags, and text-length confidence scaling.
    """

    def __init__(self, language: str):
        self.language = language
        self.lexicon = SentimentLexicon(language)
        self.preprocessor = TextPreprocessor(language)

    def _detect_emoticons(self, text: str) -> Tuple[float, float]:
        """Return (positive, negative) emoticon/emoji scores.

        Each occurrence contributes 1.5 to its polarity.
        """
        pos_score = 0.0
        neg_score = 0.0

        positive_emoticons = [
            ':)', ':-)', '=)', ';)', ';-)', '=D', ':D', ':-D',
            '😊', '😀', '😁', '😂', '🤣', '😃', '😄', '😆', '😍', '🥰',
            '😎', '🤗', '👍', '👏', '🎉', '❤️', '💕', '💖', '💗', '💓'
        ]

        # FIX: '>:(' was listed twice, which made text.count() charge every
        # occurrence double (3.0 instead of 1.5).
        # NOTE(review): ':/' also matches inside URLs ('http://...') since
        # this method scores the raw text — confirm whether URLs should be
        # stripped before emoticon scoring.
        negative_emoticons = [
            ':(', ':-(', '=(', ':/', ':-/', ':|', ':-|', '>:(',
            '😢', '😞', '😠', '😡', '😤', '😭', '😰', '😨', '😱', '😖',
            '😣', '😫', '😩', '👎', '💔', '😒', '😔', '😕', '🙁'
        ]

        for emoji in positive_emoticons:
            count = text.count(emoji)
            pos_score += count * 1.5

        for emoji in negative_emoticons:
            count = text.count(emoji)
            neg_score += count * 1.5

        return pos_score, neg_score

    def _handle_contrast_words(self, text: str, tokens: List[str],
                               pos_score: float, neg_score: float) -> Tuple[float, float]:
        """Dampen both scores by 30% if any contrast word is present."""
        contrast_positions = []
        for i, token in enumerate(tokens):
            if token.lower() in self.lexicon.contrast_words:
                contrast_positions.append(i)

        if contrast_positions:
            # Contrast weakens overall certainty of both polarities.
            reduction_factor = 0.7
            return pos_score * reduction_factor, neg_score * reduction_factor

        return pos_score, neg_score

    def _detect_comparatives_superlatives(self, tokens: List[str]) -> float:
        """Return a multiplier (1.0-1.4) for comparative/superlative forms."""
        multiplier = 1.0

        # Superlatives intensify most strongly.
        superlative_indicators = ['most', 'best', 'worst', 'least', 'greatest']
        for token in tokens:
            if token.lower() in superlative_indicators:
                multiplier = max(multiplier, 1.4)

        # Comparatives intensify mildly (superlative wins via max()).
        comparative_patterns = ['more', 'less', 'better', 'worse', 'greater', 'smaller']
        for token in tokens:
            if token.lower() in comparative_patterns:
                multiplier = max(multiplier, 1.2)

        return multiplier

    def _detect_repetition(self, text: str) -> float:
        """Return an emphasis multiplier (capped at 1.5) for repetition.

        Repeated characters ('sooooo') add 0.1 per run; a word repeated
        three times in a row adds a one-time 0.2.
        """
        multiplier = 1.0

        # Runs of 3+ identical word characters signal emphasis.
        repeated_chars = re.findall(r'(\w)\1{2,}', text.lower())
        if repeated_chars:
            multiplier += len(repeated_chars) * 0.1

        # A word repeated three times consecutively ('no no no').
        words = text.lower().split()
        if len(words) > 2:
            for i in range(len(words) - 2):
                if words[i] == words[i+1] == words[i+2]:
                    multiplier += 0.2
                    break

        return min(multiplier, 1.5)

    def _detect_sentiment_shifters(self, text: str) -> float:
        """Return a damping factor (<1.0) if a shifter word is present.

        Only the first shifter found (in dict order) applies.
        """
        shifters = {
            'but': 0.6, 'however': 0.6, 'although': 0.7, 'though': 0.7,
            'yet': 0.6, 'still': 0.7, 'nevertheless': 0.6, 'nonetheless': 0.6
        }

        # NOTE(review): substring test, so 'but' also matches 'butter'.
        text_lower = text.lower()
        for shifter, factor in shifters.items():
            if shifter in text_lower:
                return factor

        return 1.0

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment using the rule-based approach.

        Returns the same dict shape as LexiconBasedAnalyzer.analyze with
        'method' set to 'rule-based'; polarity and confidence are
        recomputed from the adjusted scores at the end.
        """
        # Start from the plain lexicon analysis.
        base_analyzer = LexiconBasedAnalyzer(self.language)
        result = base_analyzer.analyze(text)

        tokens = self.preprocessor.preprocess(text)
        text_lower = text.lower()

        # Exclamation marks amplify both scores (up to +50%).
        exclamation_count = text.count('!')
        if exclamation_count > 0:
            multiplier = 1 + min(exclamation_count * 0.15, 0.5)
            result['positive_score'] *= multiplier
            result['negative_score'] *= multiplier

        # Multiple question marks suggest uncertainty: lower confidence.
        question_count = text.count('?')
        if question_count > 1:
            uncertainty_factor = max(0.7, 1 - (question_count * 0.1))
            result['confidence'] *= uncertainty_factor

        # ALL-CAPS words (len > 2) amplify both scores.
        caps_words = [w for w in text.split() if w.isupper() and len(w) > 2]
        if len(caps_words) > 0:
            caps_multiplier = 1 + (len(caps_words) * 0.1)
            result['positive_score'] *= caps_multiplier
            result['negative_score'] *= caps_multiplier

        # Emoticons/emojis add directly to the scores.
        emoji_pos, emoji_neg = self._detect_emoticons(text)
        result['positive_score'] += emoji_pos
        result['negative_score'] += emoji_neg

        # Contrast words dampen both scores.
        result['positive_score'], result['negative_score'] = self._handle_contrast_words(
            text, tokens, result['positive_score'], result['negative_score']
        )

        # Comparative/superlative forms amplify both scores.
        comp_super_mult = self._detect_comparatives_superlatives(tokens)
        result['positive_score'] *= comp_super_mult
        result['negative_score'] *= comp_super_mult

        # Character/word repetition amplifies both scores.
        rep_mult = self._detect_repetition(text)
        result['positive_score'] *= rep_mult
        result['negative_score'] *= rep_mult

        # Sentiment shifters ('but', 'however', ...) dampen both scores.
        shifter_factor = self._detect_sentiment_shifters(text)
        if shifter_factor < 1.0:
            result['positive_score'] *= shifter_factor
            result['negative_score'] *= shifter_factor

        # Ellipsis hints hesitation: slightly lower confidence.
        if '...' in text or '…' in text:
            result['confidence'] *= 0.9

        # Runs of '!?'-style punctuation amplify both scores.
        multi_punct = re.findall(r'[!?]{2,}', text)
        if multi_punct:
            punct_mult = 1 + (len(multi_punct) * 0.1)
            result['positive_score'] *= punct_mult
            result['negative_score'] *= punct_mult

        # Hashtags containing lexicon words add a small flat bonus.
        hashtags = re.findall(r'#\w+', text)
        if hashtags:
            for tag in hashtags:
                tag_lower = tag.lower()
                if any(word in tag_lower for word in self.lexicon.positive_words):
                    result['positive_score'] += 0.5
                if any(word in tag_lower for word in self.lexicon.negative_words):
                    result['negative_score'] += 0.5

        # Presence of a URL slightly lowers confidence.
        if re.search(r'http[s]?://', text):
            result['confidence'] *= 0.95

        # Very short or very long texts are scored less confidently.
        word_count = len(text.split())
        if word_count < 3:
            result['confidence'] *= 0.8
        elif word_count > 100:
            result['confidence'] *= 0.95

        # Recompute polarity/confidence from the adjusted scores
        # (this overwrites the confidence adjustments above when total > 0).
        total = result['positive_score'] + result['negative_score']
        if total > 0:
            if result['positive_score'] > result['negative_score']:
                result['polarity'] = 'positive'
                result['confidence'] = result['positive_score'] / total
            else:
                result['polarity'] = 'negative'
                result['confidence'] = result['negative_score'] / total
        else:
            result['polarity'] = 'neutral'
            result['confidence'] = 0.0

        result['method'] = 'rule-based'
        return result
|
|
|
|
class HybridAnalyzer:
    """Hybrid approach combining lexicon, rules, and simple ML features.

    Blends the scores of the lexicon-based and rule-based analyzers with
    fixed weights (0.4 lexicon, 0.6 rules) and derives the final polarity
    and confidence from the combined scores.
    """

    def __init__(self, language: str):
        self.language = language
        self.lexicon_analyzer = LexiconBasedAnalyzer(language)
        self.rule_analyzer = RuleBasedAnalyzer(language)

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment using the hybrid approach."""
        lex_res = self.lexicon_analyzer.analyze(text)
        rule_res = self.rule_analyzer.analyze(text)

        # Weighted blend of the two analyzers' raw scores.
        lex_w, rule_w = 0.4, 0.6

        def blend(key: str) -> float:
            return lex_res[key] * lex_w + rule_res[key] * rule_w

        pos = blend('positive_score')
        neg = blend('negative_score')
        total = pos + neg

        # Confidence is the winning polarity's share of the combined score;
        # a zero total means no sentiment signal at all.
        if total == 0:
            polarity, confidence = 'neutral', 0.0
        elif pos > neg:
            polarity, confidence = 'positive', pos / total
        else:
            polarity, confidence = 'negative', neg / total

        return {
            'polarity': polarity,
            'confidence': round(confidence, 3),
            'positive_score': round(pos, 3),
            'negative_score': round(neg, 3),
            'lexicon_result': lex_res,
            'rule_result': rule_res,
            'method': 'hybrid',
        }
|
|
|
|
class MultilingualSentimentAnalyzer:
    """Main sentiment analyzer supporting multiple languages and methods."""

    def __init__(self, language: str = 'english', method: str = 'hybrid'):
        """
        Initialize sentiment analyzer

        Args:
            language: 'english', 'turkish', or 'persian'
            method: 'lexicon', 'rule', or 'hybrid' (case-insensitive;
                any other value falls back to the hybrid analyzer)
        """
        self.language = language.lower()
        self.method = method.lower()

        # FIX: dispatch on the lower-cased self.method, not the raw
        # argument — previously method='Lexicon' or 'RULE' silently fell
        # through to the hybrid analyzer.
        if self.method == 'lexicon':
            self.analyzer = LexiconBasedAnalyzer(self.language)
        elif self.method == 'rule':
            self.analyzer = RuleBasedAnalyzer(self.language)
        else:
            self.analyzer = HybridAnalyzer(self.language)

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment of input text.

        Empty/whitespace-only input and analyzer exceptions both produce a
        neutral result with an 'error' key instead of raising.
        """
        if not text or not text.strip():
            return {
                'polarity': 'neutral',
                'confidence': 0.0,
                'error': 'Empty text provided'
            }

        try:
            result = self.analyzer.analyze(text)
            result['language'] = self.language
            result['text_length'] = len(text)
            result['word_count'] = len(text.split())
            return result
        except Exception as e:
            # Fail soft: batch callers keep going past a single bad text.
            return {
                'polarity': 'neutral',
                'confidence': 0.0,
                'error': str(e)
            }

    def analyze_batch(self, texts: List[str]) -> List[Dict]:
        """Analyze multiple texts, returning one result dict per text."""
        return [self.analyze(text) for text in texts]

    def get_statistics(self, texts: List[str]) -> Dict:
        """Return aggregate polarity distribution and average confidence.

        Safe for an empty input list (all aggregates come out empty/zero).
        """
        results = self.analyze_batch(texts)

        polarity_counts = Counter([r['polarity'] for r in results])
        total = len(results)

        avg_confidence = sum([r.get('confidence', 0) for r in results]) / total if total > 0 else 0

        return {
            'total_texts': total,
            'polarity_distribution': dict(polarity_counts),
            'polarity_percentages': {
                k: round(v / total * 100, 2)
                for k, v in polarity_counts.items()
            },
            'average_confidence': round(avg_confidence, 3)
        }
|
|
|
|