"""
Google Gemini Sign Language Classifier

This module provides sign language classification using Google's Gemini AI API,
with a pattern-based fallback classifier that is used when a Gemini request fails.
"""
|
|
import json
import os
import re
import time
from typing import List, Dict, Any, Optional

import google.generativeai as genai
from dotenv import load_dotenv

from .fallback_classifier import FallbackSignLanguageClassifier
|
|
| |
# Runs at import time, before any classifier is constructed.
# Presumably loads variables from a local .env file so that
# os.getenv('GEMINI_API_KEY') can find the key — confirm against python-dotenv.
load_dotenv()
|
|
|
|
class GeminiSignLanguageClassifier:
    """
    Sign language classifier using Google Gemini AI.

    Every public ``classify_*`` call is throttled by ``_rate_limit``, sent to
    the configured Gemini model and parsed into a plain dictionary. When the
    Gemini request or the parsing raises, the same input is handed to the
    pattern-based ``FallbackSignLanguageClassifier`` instead.
    """

    # Words looked for in the JSON "description" field when the reply left
    # both "letter" and "word" empty. Order is the match priority.
    _DESCRIPTION_WORDS = ('good', 'hello', 'i', 'you', 'love', 'yes', 'no',
                          'please', 'thank you')

    # Words looked for when the reply contains no usable JSON at all.
    # Order is the match priority.
    _TEXT_WORDS = ('hello', 'hungry', 'thank you', 'please', 'sorry', 'yes', 'no',
                   'i', 'you', 'love', 'help', 'more', 'water', 'eat', 'drink',
                   'good', 'bad', 'happy', 'sad', 'stop', 'go', 'come', 'home')

    # "number 1", "number '1'" or 'number "1"' in lower-cased text; the
    # look-ahead rejects "number 10" and similar.
    _NUMBER_MENTION = re.compile(r"\bnumber\s+['\"]?([0-9])(?!\w)")
    # "letter a", "letter 'a'" or 'letter "a"' in lower-cased text; the
    # look-ahead rejects "letter and", "letter shape", ...
    _LETTER_MENTION = re.compile(r"\bletter\s+['\"]?([a-z])(?!\w)")
    # A stand-alone capital letter / digit / percentage in free text.
    _SINGLE_LETTER = re.compile(r'\b([A-Z])\b')
    _SINGLE_DIGIT = re.compile(r'\b([0-9])\b')
    _PERCENTAGE = re.compile(r'(\d+(?:\.\d+)?)\s*%')

    def __init__(self, api_key: Optional[str] = None, model: str = "gemini-1.5-flash",
                 *, debug: bool = True, min_request_interval: float = 5.0,
                 max_requests_per_minute: int = 10):
        """
        Initialize the Gemini classifier.

        Args:
            api_key: Gemini API key (if None, the GEMINI_API_KEY environment
                variable is used)
            model: Gemini model to use for classification
            debug: When True, progress and error details are printed to stdout
            min_request_interval: Minimum number of seconds between two requests
            max_requests_per_minute: Maximum number of requests per 60-second window

        Raises:
            ValueError: If no API key was passed and GEMINI_API_KEY is not set.
        """
        self.api_key = api_key or os.getenv('GEMINI_API_KEY')
        self.model_name = model

        if not self.api_key:
            raise ValueError("Gemini API key not provided. Set GEMINI_API_KEY environment variable or pass api_key parameter.")

        # Configure the Gemini client and bind the requested model.
        genai.configure(api_key=self.api_key)
        self.model = genai.GenerativeModel(self.model_name)

        # Rate-limiting state, consumed and updated by _rate_limit().
        self.last_request_time = 0
        self.min_request_interval = min_request_interval
        self.request_count = 0
        self.request_window_start = time.time()
        self.max_requests_per_minute = max_requests_per_minute

        # Used whenever the Gemini request or the parsing of its reply raises.
        self.fallback_classifier = FallbackSignLanguageClassifier()

        self.debug = debug

        print("Gemini classifier initialized with fallback support")

    def classify_gesture(self, gesture_description: str,
                         sign_language: str = "ASL",
                         context: Optional[str] = None) -> Dict[str, Any]:
        """
        Classify a single gesture using Gemini AI.

        Args:
            gesture_description: Description of the hand gesture
            sign_language: Sign language type (default: ASL)
            context: Additional context (optional)

        Returns:
            Classification result dictionary. On success it holds the parsed
            'letter', 'word', 'confidence' and 'description' plus
            'raw_response', 'success' (True) and 'method' ('gemini_ai').
            If Gemini fails, the fallback classifier's result is returned with
            'fallback_used' and 'gemini_error' added. If the fallback fails
            too, a dictionary with 'success' False, 'error' and
            'fallback_error' is returned; this method does not raise for
            those failures.
        """
        self._rate_limit()

        prompt = self._create_classification_prompt(gesture_description, sign_language, context)

        self._debug_print("\n=== Gemini Classification Debug ===")
        self._debug_print(f"Input gesture description: {gesture_description}")
        self._debug_print(f"Prompt sent to Gemini: {prompt[:200]}...")

        try:
            response = self.model.generate_content(prompt)
            response_content = response.text
            self._debug_print(f"Gemini response: {response_content}")

            result = self._parse_response(response_content)
            result['raw_response'] = response_content
            result['success'] = True
            result['method'] = 'gemini_ai'

            self._debug_print(f"Parsed result: {result}")
            self._debug_print("=== End Gemini Debug ===\n")
            return result

        except Exception as e:
            # Deliberately broad: any request or parsing failure degrades to
            # the fallback classifier instead of reaching the caller.
            error_msg = str(e)
            self._debug_print(f"Gemini API Error: {error_msg}")
            self._debug_print("Falling back to pattern-based classification...")

            try:
                fallback_result = self.fallback_classifier.classify_gesture(
                    gesture_description, sign_language, context
                )
                fallback_result['fallback_used'] = True
                fallback_result['gemini_error'] = error_msg

                self._debug_print(f"Fallback result: {fallback_result}")
                self._debug_print("=== End Gemini Debug ===\n")
                return fallback_result

            except Exception as fallback_error:
                self._debug_print(f"Fallback also failed: {str(fallback_error)}")
                self._debug_print("=== End Gemini Debug ===\n")

                return {
                    'success': False,
                    'error': error_msg,
                    'fallback_error': str(fallback_error),
                    'letter': None,
                    'word': None,
                    'confidence': 0.0,
                    'description': None,
                    'method': 'gemini_ai'
                }

    def classify_sequence(self, gesture_descriptions: List[str],
                          sign_language: str = "ASL") -> Dict[str, Any]:
        """
        Classify a sequence of gestures using Gemini AI.

        Args:
            gesture_descriptions: List of gesture descriptions
            sign_language: Sign language type

        Returns:
            Sequence classification result. On success it holds 'word',
            'sentence', 'confidence' and 'individual_letters' plus
            'raw_response', 'success' (True) and 'method' ('gemini_ai').
            If Gemini fails, the fallback classifier's result is returned with
            'fallback_used' and 'gemini_error' added. If the fallback fails
            too, a dictionary with 'success' False, 'error' and
            'fallback_error' is returned.
        """
        self._rate_limit()

        prompt = self._create_sequence_prompt(gesture_descriptions, sign_language)

        try:
            response = self.model.generate_content(prompt)
            response_content = response.text

            result = self._parse_sequence_response(response_content)
            result['raw_response'] = response_content
            result['success'] = True
            result['method'] = 'gemini_ai'
            return result

        except Exception as e:
            # Deliberately broad, see classify_gesture: the Gemini error is
            # reported through the returned dictionary instead of raised.
            try:
                fallback_result = self.fallback_classifier.classify_sequence(
                    gesture_descriptions, sign_language
                )
                fallback_result['fallback_used'] = True
                fallback_result['gemini_error'] = str(e)
                return fallback_result

            except Exception as fallback_error:
                return {
                    'success': False,
                    'error': str(e),
                    'fallback_error': str(fallback_error),
                    'word': None,
                    'sentence': None,
                    'confidence': 0.0,
                    'method': 'gemini_ai'
                }

    def _debug_print(self, message: str) -> None:
        """Print ``message`` only when debug output is enabled."""
        if self.debug:
            print(message)

    def _rate_limit(self):
        """
        Enhanced rate limiting for Gemini free tier.

        Blocks (via time.sleep) until both limits allow another request: at
        most ``max_requests_per_minute`` requests per 60-second window and at
        least ``min_request_interval`` seconds since the previous request.
        Then records the request in the rate-limiting state.
        """
        now = time.time()

        # Start a fresh window once the current one is at least a minute old.
        if now - self.request_window_start >= 60:
            self.request_count = 0
            self.request_window_start = now

        # Quota for this window used up: wait out the remainder plus a
        # one-second safety margin, then open a new window.
        if self.request_count >= self.max_requests_per_minute:
            sleep_time = max(0.0, 60 - (now - self.request_window_start) + 1)
            self._debug_print(f"⏳ Rate limit reached, sleeping for {sleep_time:.1f} seconds...")
            time.sleep(sleep_time)
            self.request_count = 0
            # Re-read the clock: the interval check below must account for
            # the time just spent sleeping, otherwise it may sleep again.
            now = time.time()
            self.request_window_start = now

        # Enforce the minimum spacing between two consecutive requests.
        remaining = self.min_request_interval - (now - self.last_request_time)
        if remaining > 0:
            self._debug_print(f"⏳ Waiting {remaining:.1f} seconds between requests...")
            time.sleep(remaining)

        self.last_request_time = time.time()
        self.request_count += 1

    def _create_classification_prompt(self, gesture_description: str,
                                      sign_language: str, context: Optional[str]) -> str:
        """
        Create enhanced prompt for single gesture classification.

        For ``sign_language`` "ASL" (case-insensitive, also used when the
        value is empty) the prompt includes a list of common ASL patterns.
        For any other value the language name is substituted into the
        instructions and the ASL-specific pattern list is left out.

        Args:
            gesture_description: Description of the hand gesture
            sign_language: Sign language the gesture belongs to
            context: Optional extra context, appended as a "Context:" paragraph

        Returns:
            The complete prompt text.
        """
        language = (sign_language or "ASL").strip() or "ASL"
        is_asl = language.upper() == "ASL"
        short_name = "ASL" if is_asl else language
        full_name = "ASL (American Sign Language)" if is_asl else language

        lines = [
            f"You are an expert {full_name} interpreter. Analyze this hand gesture and provide ONE CLEAR PREDICTION.",
            "",
            "GESTURE DATA:",
            f"{gesture_description}",
            "",
        ]

        if is_asl:
            # These hand shapes are ASL-specific, so they are only offered
            # as hints when ASL is the requested language.
            lines += [
                "COMMON ASL PATTERNS TO RECOGNIZE:",
                '• Index finger pointing = Number "1"',
                '• Pinky finger only = Pronoun "I"',
                '• Thumb up = "GOOD" or "YES"',
                '• All fingers extended = Number "5" or "HELLO"',
                '• Closed fist = Letter "A" or "S"',
                '• Index + middle = Number "2"',
                '• Three fingers = Number "3"',
                '• Four fingers = Number "4"',
                '• Index + pinky = "I LOVE YOU"',
                '• Thumb + index = Letter "L"',
                "",
            ]

        lines += [
            "TASK: Based on the finger positions described, identify what this gesture most likely represents:",
            "- A single letter (A-Z)",
            "- A single number (0-9)",
            "- A complete word (HELLO, GOOD, I, YOU, LOVE, etc.)",
            "",
            f"Even if not a perfect match, provide your best interpretation based on {short_name} knowledge.",
            "",
            "",
        ]
        prompt = "\n".join(lines)

        if context:
            prompt += f"Context: {context}\n\n"

        prompt += "\n".join([
            "Respond in this EXACT JSON format (choose ONE prediction):",
            "{",
            '    "letter": "1",',
            '    "word": null,',
            '    "confidence": 0.85,',
            '    "description": "Index finger pointing = Number 1"',
            "}",
            "",
            "OR for a word:",
            "{",
            '    "letter": null,',
            '    "word": "GOOD",',
            '    "confidence": 0.85,',
            '    "description": "Thumb up = GOOD"',
            "}",
            "",
            f"IMPORTANT: Always provide either a letter OR a word, never both null. Make your best guess based on {short_name} knowledge.",
        ])

        return prompt

    def _create_sequence_prompt(self, gesture_descriptions: List[str],
                                sign_language: str) -> str:
        """
        Create prompt for gesture sequence classification.

        Args:
            gesture_descriptions: Gesture descriptions, listed in the prompt
                as "Gesture 1", "Gesture 2", ...
            sign_language: Sign language name inserted into the instructions

        Returns:
            The complete prompt text.
        """
        lines = [f"Analyze this sequence of {sign_language} hand gestures:", ""]
        lines += [
            f"Gesture {i}: {description}"
            for i, description in enumerate(gesture_descriptions, 1)
        ]
        lines += [
            "",
            f"What word or sentence do these {sign_language} gestures spell out when combined?",
            "Consider the sequence and flow of the gestures.",
            "",
            "Respond in JSON format:",
            "{",
            '    "word": "HELLO" or null,',
            '    "sentence": "HELLO WORLD" or null,',
            '    "confidence": 0.85,',
            '    "individual_letters": ["H", "E", "L", "L", "O"]',
            "}",
        ]
        return "\n".join(lines)

    @staticmethod
    def _extract_json_object(response_text: str) -> Optional[Dict[str, Any]]:
        """
        Return the JSON object embedded in ``response_text``, or None.

        The candidate is everything from the first '{' to the last '}', so
        surrounding prose or code fences are ignored. None is returned when
        there is no such span or it is not a valid JSON object.
        """
        start = response_text.find('{')
        end = response_text.rfind('}')
        if start == -1 or end == -1:
            return None
        try:
            parsed = json.loads(response_text[start:end + 1])
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return None
        return parsed if isinstance(parsed, dict) else None

    @staticmethod
    def _coerce_confidence(value: Any, default: float = 0.0) -> float:
        """Convert ``value`` to float; return ``default`` for None or non-numeric input."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _find_known_word(text_lower: str, vocabulary) -> Optional[str]:
        """
        Return the first ``vocabulary`` entry found in ``text_lower``, upper-cased.

        Entries are tried in the order given and must occur as whole words,
        so 'i' does not match inside "finger" and 'no' does not match "not".
        Returns None when no entry matches.
        """
        for candidate in vocabulary:
            if re.search(rf"\b{re.escape(candidate)}\b", text_lower):
                return candidate.upper()
        return None

    def _recover_from_description(self, description: str):
        """
        Guess ``(letter, word)`` from a free-text description.

        Tries, in this order: a "number N" mention, a "letter X" mention,
        then a known word. At most one element of the returned pair is set.
        Within each step, the first mention in the text wins.
        """
        desc_lower = description.lower()

        number = self._NUMBER_MENTION.search(desc_lower)
        if number:
            return number.group(1), None

        letter = self._LETTER_MENTION.search(desc_lower)
        if letter:
            return letter.group(1).upper(), None

        return None, self._find_known_word(desc_lower, self._DESCRIPTION_WORDS)

    def _parse_response(self, response_text: str) -> Dict[str, Any]:
        """
        Parse Gemini response for single gesture classification.

        If the reply contains a JSON object, its 'letter', 'word',
        'confidence' and 'description' fields are used. JSON null or a
        non-numeric 'confidence' becomes 0.0 and a null 'description' becomes
        ''. When both 'letter' and 'word' are empty, a value is recovered
        from the description text. Without a JSON object the whole reply is
        handed to ``_parse_text_response``.

        Returns:
            Dictionary with 'letter', 'word', 'confidence' and 'description'.
        """
        payload = self._extract_json_object(response_text)
        if payload is None:
            return self._parse_text_response(response_text)

        letter = payload.get('letter')
        word = payload.get('word')
        confidence = self._coerce_confidence(payload.get('confidence'))

        # The key may be present with an explicit null; normalise to text.
        description = payload.get('description') or ''
        if not isinstance(description, str):
            description = str(description)

        if not letter and not word:
            self._debug_print("⚠️ Gemini returned null values, trying to extract from description...")
            letter, word = self._recover_from_description(description)

        return {
            'letter': letter,
            'word': word,
            'confidence': confidence,
            'description': description
        }

    def _parse_sequence_response(self, response_text: str) -> Dict[str, Any]:
        """
        Parse Gemini response for sequence classification.

        Uses the JSON object embedded in the reply when there is one; a null
        or non-numeric 'confidence' becomes 0.0 and a null
        'individual_letters' becomes []. Otherwise falls back to
        ``_parse_sequence_text_response``.

        Returns:
            Dictionary with 'word', 'sentence', 'confidence' and
            'individual_letters'.
        """
        payload = self._extract_json_object(response_text)
        if payload is None:
            return self._parse_sequence_text_response(response_text)

        return {
            'word': payload.get('word'),
            'sentence': payload.get('sentence'),
            'confidence': self._coerce_confidence(payload.get('confidence')),
            'individual_letters': payload.get('individual_letters') or []
        }

    def _parse_text_response(self, response_text: str) -> Dict[str, Any]:
        """
        Enhanced fallback text parsing for single gesture.

        Heuristics, in order: a known word appearing as a whole word; failing
        that, a stand-alone capital letter, which a stand-alone digit
        overrides when both are present. Confidence is taken from the first
        "NN%" in the text and defaults to 0.5.

        Returns:
            Dictionary with 'letter', 'word', 'confidence' and 'description'.
        """
        word = self._find_known_word(response_text.lower(), self._TEXT_WORDS)

        letter = None
        if not word:
            letter_match = self._SINGLE_LETTER.search(response_text.upper())
            if letter_match:
                letter = letter_match.group(1)

            # A stand-alone digit takes precedence over a stand-alone letter.
            number_match = self._SINGLE_DIGIT.search(response_text)
            if number_match:
                letter = number_match.group(1)

        confidence = 0.5
        conf_match = self._PERCENTAGE.search(response_text)
        if conf_match:
            confidence = float(conf_match.group(1)) / 100

        return {
            'letter': letter,
            'word': word,
            'confidence': confidence,
            'description': f"Parsed from text: {response_text[:100]}..."
        }

    def _parse_sequence_text_response(self, response_text: str) -> Dict[str, Any]:
        """
        Fallback text parsing for sequence.

        No extraction is attempted: the result carries no word or sentence, a
        fixed confidence of 0.3, and the first 100 characters of the reply in
        'description'.
        """
        return {
            'word': None,
            'sentence': None,
            'confidence': 0.3,
            'individual_letters': [],
            'description': f"Text parsing fallback: {response_text[:100]}..."
        }
|
|