Spaces:
Build error
Build error
"""
OpenAI-based resume data extraction.
Uses GPT models to extract structured information from resume text.
"""
| import json | |
| import re | |
| import logging | |
| from typing import Dict, Any, List, Optional | |
| import openai | |
| from openai import OpenAI | |
# Module-level logger, named after this module via __name__; the methods of
# OpenAIResumeExtractor below write all of their log output through it.
logger = logging.getLogger(__name__)
class OpenAIResumeExtractor:
    """Resume data extractor using OpenAI's GPT models.

    The public entry point is :meth:`extract_sections_openai`. It asks a chat
    model for a JSON description of the resume, normalizes that JSON, and adds
    regex-derived contact details. If the API call or the JSON parsing fails,
    it falls back to a purely regex-based extraction that returns the same
    top-level keys.
    """

    # Keys every result dict must contain, in the order they are filled in.
    _REQUIRED_KEYS = (
        "Name", "Summary", "Skills", "StructuredExperiences",
        "Education", "Training", "Address",
    )
    # Subset of the required keys whose value is a list (the rest are strings).
    _LIST_KEYS = ("Skills", "StructuredExperiences", "Education", "Training")

    # Words that mark a "skill" as really being a company name. Matched as
    # whole words so that e.g. "Microservices" or "Principal" are not flagged
    # just because they contain "services" / "inc".
    _COMPANY_INDICATORS = (
        "inc", "incorporated", "llc", "corp", "corporation", "ltd", "company",
        "solutions", "services", "systems", "technologies", "financial",
        "insurance",
    )
    _COMPANY_WORD_RE = re.compile(
        r"\b(?:" + "|".join(_COMPANY_INDICATORS) + r")\b", re.IGNORECASE
    )

    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    # Single pattern with an optional "+1" prefix: the leftmost phone number in
    # the text wins, and a country prefix is kept when present.
    _PHONE_RE = re.compile(r'(?:\+1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}')
    _LINKEDIN_RE = re.compile(r'linkedin\.com/in/[A-Za-z0-9-]+')

    # Name of two or three capitalized words, optionally followed by up to two
    # comma-separated credentials such as "PhD", "MBA" or "M.S.".
    _NAME_RE = re.compile(
        r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?(?:,\s*[A-Z][A-Za-z.]*){0,2})'
    )

    # Section bodies end at the next known heading OR at the end of the text
    # (\Z), so a section that is last in the resume is still captured even
    # when the text has no trailing newline.
    _EXPERIENCE_SECTION_RE = re.compile(
        r'(?:work\s+)?experience[:\s]*\n(.*?)'
        r'(?=\n\s*(?:education|projects?|certifications?)|\s*\Z)',
        re.IGNORECASE | re.DOTALL,
    )
    _EDUCATION_SECTION_RE = re.compile(
        r'education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?)|\s*\Z)',
        re.IGNORECASE | re.DOTALL,
    )
    # "Title | Company | Dates" header line of one job entry.
    _JOB_HEADER_RE = re.compile(r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)')
    # A bullet is a line that *starts* with "-" or "•"; hyphens inside the
    # line (e.g. "full-stack") are part of the bullet text.
    _BULLET_RE = re.compile(r'^[ \t]*[-•][ \t]*(.+)$', re.MULTILINE)

    def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o"):
        """Create the OpenAI client and remember the model name.

        Args:
            api_key: OpenAI API key. When omitted or empty the client is
                constructed without arguments and resolves its credentials
                itself (see the OpenAI SDK documentation).
            model: Model name sent with every chat completion request.
        """
        self.client = OpenAI(api_key=api_key) if api_key else OpenAI()
        self.model = model
        logger.info("OpenAI extractor initialized with model: %s", model)

    def extract_sections_openai(self, text: str) -> Dict[str, Any]:
        """Extract resume sections using the OpenAI API.

        Args:
            text: Raw resume text.

        Returns:
            Dict with the keys "Name", "Summary", "Skills",
            "StructuredExperiences", "Education", "Training", "Address" and
            "ContactInfo". Any failure (API error, invalid JSON, unexpected
            JSON shape) is logged and answered with the regex fallback
            instead of being raised.
        """
        logger.info("Starting OpenAI extraction...")
        content = ""
        try:
            prompt = self._create_extraction_prompt(text)
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are an expert resume parser. Extract information and return ONLY valid JSON."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,
                max_tokens=2000,
            )
            # A missing message body becomes "" so it is reported as a JSON
            # parsing problem rather than an opaque AttributeError.
            content = (response.choices[0].message.content or "").strip()
            logger.debug("OpenAI response: %s...", content[:200])

            content = self._clean_json_response(content)
            result = json.loads(content)
            if not isinstance(result, dict):
                # Valid JSON but not an object: nothing to validate.
                raise ValueError(
                    f"expected a JSON object, got {type(result).__name__}"
                )

            result = self._validate_and_clean_result(result)
            result["ContactInfo"] = self._extract_contact_info(text)
            logger.info("✅ OpenAI extraction completed successfully")
            return result
        except json.JSONDecodeError as e:
            logger.error("JSON parsing error: %s", e)
            logger.debug("Response content: %s", content)
            return self._fallback_extraction(text)
        except Exception as e:
            # Deliberate best-effort boundary: any failure falls back to regex.
            logger.error("OpenAI extraction failed: %s", e)
            return self._fallback_extraction(text)

    def _clean_json_response(self, content: str) -> str:
        """Strip markdown fences and surrounding chatter from a JSON reply.

        Removes "```json" markers and a trailing "```", then cuts everything
        before the first "{" and after the last "}".
        """
        content = re.sub(r'```json\s*', '', content)
        content = re.sub(r'```\s*$', '', content)
        start = content.find('{')
        if start > 0:
            content = content[start:]
        end = content.rfind('}')
        if 0 < end < len(content) - 1:
            content = content[:end + 1]
        return content.strip()

    def _create_extraction_prompt(self, text: str) -> str:
        """Build the user prompt: JSON template, the resume text, instructions."""
        return f"""
Extract information from this resume and return ONLY valid JSON in this exact format:
{{
"Name": "Full Name with credentials (PhD, MBA, etc.)",
"Summary": "Professional summary or objective",
"Skills": ["skill1", "skill2", "skill3"],
"StructuredExperiences": [
{{
"title": "Job Title",
"company": "Company Name",
"date_range": "Start Date - End Date",
"responsibilities": ["responsibility1", "responsibility2"]
}}
],
"Education": ["degree info", "school info"],
"Training": ["certification1", "training1"],
"Address": "Full address if available"
}}
Resume text:
{text}
CRITICAL INSTRUCTIONS:
- For NAME: Include ALL credentials (PhD, MBA, M.S., B.S., etc.) - example: "John Doe, PhD, MBA"
- Read the ENTIRE resume text carefully, don't miss content
- Extract ALL work experiences with full details
- Return ONLY valid JSON, no explanations
- If a section is not found, use empty string or empty array
- Extract actual technical skills, not company names
"""

    def _extract_contact_info(self, text: str) -> Dict[str, str]:
        """Find the first email, phone number and LinkedIn URL in ``text``.

        Returns:
            Dict with any of the keys "email", "phone", "linkedin"; a key is
            absent when nothing matched.
        """
        contact_info: Dict[str, str] = {}

        email_match = self._EMAIL_RE.search(text)
        if email_match:
            contact_info['email'] = email_match.group()

        phone_match = self._PHONE_RE.search(text)
        if phone_match:
            contact_info['phone'] = phone_match.group().strip()

        linkedin_match = self._LINKEDIN_RE.search(text)
        if linkedin_match:
            contact_info['linkedin'] = linkedin_match.group()

        # NOTE(review): this writes personal contact details to the log at
        # INFO level - confirm that is acceptable for the deployment.
        logger.info("OPENAI: Extracted ContactInfo as dict: %s", contact_info)
        return contact_info

    def _validate_and_clean_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Normalize the model's JSON object in place and return it.

        * Missing or ``null`` required keys get "" / [] defaults.
        * Skills: non-strings, one-character entries and company-looking
          entries are dropped; duplicates are removed keeping first-seen order.
        * Experiences: only dicts with a non-empty "title" and "company" are
          kept, and "responsibilities" is forced to be a list.
        """
        for key in self._REQUIRED_KEYS:
            if result.get(key) is None:
                result[key] = [] if key in self._LIST_KEYS else ""

        if result.get("Skills"):
            cleaned_skills = []
            for skill in result["Skills"]:
                if not isinstance(skill, str):
                    # One malformed entry must not discard the whole result.
                    continue
                skill = skill.strip()
                if len(skill) > 1 and not self._is_company_name(skill):
                    cleaned_skills.append(skill)
            # dict.fromkeys de-duplicates while preserving order.
            result["Skills"] = list(dict.fromkeys(cleaned_skills))

        if result.get("StructuredExperiences"):
            cleaned_experiences = []
            for exp in result["StructuredExperiences"]:
                if isinstance(exp, dict) and exp.get("title") and exp.get("company"):
                    if not isinstance(exp.get("responsibilities"), list):
                        exp["responsibilities"] = []
                    cleaned_experiences.append(exp)
            result["StructuredExperiences"] = cleaned_experiences

        return result

    def _is_company_name(self, text: str) -> bool:
        """Return True if ``text`` contains a company-indicator word.

        Indicators ("inc", "llc", "corp", ...) are matched case-insensitively
        as whole words.
        """
        return bool(self._COMPANY_WORD_RE.search(text))

    def _fallback_extraction(self, text: str) -> Dict[str, Any]:
        """Regex-only extraction used when the OpenAI path fails.

        Returns the same top-level keys as the OpenAI path; "Training" is
        always an empty list here.
        """
        logger.info("Using regex fallback extraction...")
        return {
            "Name": self._extract_name_regex(text),
            "Summary": self._extract_summary_regex(text),
            "Skills": self._extract_skills_regex(text),
            "StructuredExperiences": self._extract_experiences_regex(text),
            "Education": self._extract_education_regex(text),
            "Training": [],
            "Address": self._extract_address_regex(text),
            "ContactInfo": self._extract_contact_info(text),
        }

    def _extract_name_regex(self, text: str) -> str:
        """Return the first name-like line among the first five lines.

        Lines mentioning "@", phone, email, linkedin or github are skipped.
        Credentials after the name ("John Doe, PhD, MBA") are included.
        Returns "" when no line matches.
        """
        for line in text.split('\n')[:5]:
            line = line.strip()
            if re.search(r'@|phone|email|linkedin|github', line.lower()):
                continue
            name_match = self._NAME_RE.match(line)
            if name_match:
                return name_match.group(1)
        return ""

    def _extract_summary_regex(self, text: str) -> str:
        """Return the "Summary" section as one whitespace-collapsed line.

        The section runs from a "summary" / "professional summary" heading to
        the next skills, experience or education heading. Returns "" when no
        such section is found.
        """
        summary_pattern = r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))'
        match = re.search(summary_pattern, text, re.DOTALL)
        if not match:
            return ""
        # Newlines are whitespace, so a single pass collapses everything.
        return re.sub(r'\s+', ' ', match.group(1).strip())

    def _extract_skills_regex(self, text: str) -> List[str]:
        """Return sorted, de-duplicated skills from a "Technical Skills" section.

        Items are split on commas and semicolons; only items of 2 to 29
        characters are kept.
        """
        skills_pattern = r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:experience|education|projects?))'
        match = re.search(skills_pattern, text, re.DOTALL)
        if not match:
            return []
        skills_text = match.group(1).replace('\n', ' ')
        candidates = (item.strip() for item in re.split(r'[,;]\s*', skills_text))
        return sorted({item for item in candidates if 1 < len(item) < 30})

    def _extract_experiences_regex(self, text: str) -> List[Dict[str, Any]]:
        """Parse "Title | Company | Dates" job entries from the experience section.

        Each job's responsibilities are the bullet lines ("-" or "•") found
        between its header and the next job header (or the end of the
        section); bullets of 10 characters or fewer are ignored.

        Returns:
            List of dicts with "title", "company", "date_range" and
            "responsibilities"; empty when no section or no entries are found.
        """
        section = self._EXPERIENCE_SECTION_RE.search(text)
        if not section:
            return []
        exp_text = section.group(1)

        headers = list(self._JOB_HEADER_RE.finditer(exp_text))
        experiences = []
        for index, header in enumerate(headers):
            title, company, dates = header.groups()
            # Slice by match position rather than re-searching for a rebuilt
            # header string, and stop at the next header so bullets are not
            # attributed to more than one job.
            if index + 1 < len(headers):
                body_end = headers[index + 1].start()
            else:
                body_end = len(exp_text)
            body = exp_text[header.end():body_end]

            responsibilities = [
                bullet.strip()
                for bullet in self._BULLET_RE.findall(body)
                if len(bullet.strip()) > 10
            ]
            experiences.append({
                "title": title.strip(),
                "company": company.strip(),
                "date_range": dates.strip(),
                "responsibilities": responsibilities,
            })
        return experiences

    def _extract_education_regex(self, text: str) -> List[str]:
        """Return the non-trivial lines of the "Education" section.

        The section ends at a certifications/projects heading or at the end
        of the text. Lines of 10 characters or fewer are dropped.
        """
        match = self._EDUCATION_SECTION_RE.search(text)
        if not match:
            return []
        stripped_lines = (line.strip() for line in match.group(1).split('\n'))
        return [line for line in stripped_lines if len(line) > 10]

    def _extract_address_regex(self, text: str) -> str:
        """Return the first US-style street address found, or "".

        Patterns are tried from most to least specific. A comma between the
        state and the ZIP code is optional, so both "Dublin, OH 43016" and
        "6001 Tain Dr. Suite 203, Dublin, OH, 43016" are recognized.
        """
        address_patterns = [
            r'(\d+\s+[A-Za-z\s\.]+(?:Suite|Apt|Unit)\s+\d+,?\s*[A-Za-z\s]+,\s*[A-Z]{2},?\s*\d{5})',
            r'(\d+\s+[A-Za-z\s\.]+,?\s*[A-Za-z\s]+,\s*[A-Z]{2},?\s*\d{5})',
            r'([A-Za-z\s\d\.]+,\s*[A-Za-z\s]+,\s*[A-Z]{2},?\s*\d{5})',
        ]
        for pattern in address_patterns:
            match = re.search(pattern, text)
            if match:
                return match.group(1).strip()
        return ""
| # Main extraction function for compatibility | |
def extract_sections_openai(text: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """Module-level convenience wrapper around ``OpenAIResumeExtractor``.

    Builds a new extractor for this call (passing ``api_key`` through and
    leaving the model at the class default) and returns whatever its
    ``extract_sections_openai`` method produces for ``text``.
    """
    return OpenAIResumeExtractor(api_key=api_key).extract_sections_openai(text)