Spaces:

gauravbox
/

TalentLensAI

Build error

TalentLensAI / utils /openai_extractor.py

Johnny

feat: Update resume builder with LFS-tracked assets

79b5c9c 9 months ago

13.3 kB

	"""
	OpenAI-based resume data extraction.
	Uses GPT models to extract structured information from resume text.
	"""

	import json
	import re
	import logging
	from typing import Dict, Any, List, Optional

	import openai
	from openai import OpenAI

	# Set up logging
	logger = logging.getLogger(__name__)


	class OpenAIResumeExtractor:
	    """
	    Resume data extractor using OpenAI's GPT models.

	    The chat model is asked to return the resume as a JSON object; the
	    reply is cleaned, validated and combined with regex-extracted contact
	    details. When the API call or the JSON parsing fails, a regex-only
	    fallback builds a result with the same keys.
	    """

	    def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o"):
	        """
	        Initialize with OpenAI API key and model.

	        Args:
	            api_key: Key passed to the OpenAI client. When omitted (or
	                empty) the client is constructed without arguments.
	            model: Name of the chat model used for extraction.
	        """
	        self.client = OpenAI(api_key=api_key) if api_key else OpenAI()
	        self.model = model
	        logger.info("OpenAI extractor initialized with model: %s", model)

	    def extract_sections_openai(self, text: str) -> Dict[str, Any]:
	        """
	        Extract resume sections using OpenAI API.

	        Any failure (API error, empty reply, invalid or non-object JSON)
	        is logged and answered with the regex fallback instead of being
	        raised to the caller.

	        Args:
	            text: Raw resume text

	        Returns:
	            Dict with the keys Name, Summary, Skills,
	            StructuredExperiences, Education, Training, Address and
	            ContactInfo.
	        """
	        logger.info("Starting OpenAI extraction...")

	        # Bound before the try so the JSON error handler can always log it
	        content = ""
	        try:
	            # Create extraction prompt
	            prompt = self._create_extraction_prompt(text)

	            # Call OpenAI API
	            response = self.client.chat.completions.create(
	                model=self.model,
	                messages=[
	                    {"role": "system", "content": "You are an expert resume parser. Extract information and return ONLY valid JSON."},
	                    {"role": "user", "content": prompt}
	                ],
	                temperature=0.1,
	                max_tokens=2000
	            )

	            # The message content can be None; treat that as an empty
	            # reply so it surfaces as a JSON error rather than an
	            # AttributeError.
	            content = (response.choices[0].message.content or "").strip()
	            logger.debug("OpenAI response: %s...", content[:200])

	            # Clean and parse JSON
	            content = self._clean_json_response(content)
	            result = json.loads(content)
	            if not isinstance(result, dict):
	                raise ValueError("model reply is not a JSON object")

	            # Validate and enhance result
	            result = self._validate_and_clean_result(result)

	            # Contact details come from the raw text, not from the model
	            result["ContactInfo"] = self._extract_contact_info(text)

	            logger.info("✅ OpenAI extraction completed successfully")
	            return result

	        except json.JSONDecodeError as e:
	            logger.error("JSON parsing error: %s", e)
	            logger.debug("Response content: %s", content)
	            return self._fallback_extraction(text)

	        except Exception as e:
	            # Boundary handler: extraction must degrade, never crash
	            logger.error("OpenAI extraction failed: %s", e)
	            return self._fallback_extraction(text)

	    def _clean_json_response(self, content: str) -> str:
	        """
	        Clean JSON response from OpenAI.

	        Strips markdown code fences and any prose before the first ``{``
	        or after the last ``}``.

	        Args:
	            content: Raw reply text.

	        Returns:
	            The trimmed text, expected (but not guaranteed) to be JSON.
	        """
	        # Remove markdown code blocks
	        content = re.sub(r'```json\s*', '', content)
	        content = re.sub(r'```\s*$', '', content)

	        # Remove any text before first {
	        start = content.find('{')
	        if start > 0:
	            content = content[start:]

	        # Remove any text after last }
	        end = content.rfind('}')
	        if 0 < end < len(content) - 1:
	            content = content[:end + 1]

	        return content.strip()

	    def _create_extraction_prompt(self, text: str) -> str:
	        """
	        Create prompt for OpenAI extraction.

	        Args:
	            text: Raw resume text, embedded verbatim in the prompt.

	        Returns:
	            The user prompt describing the expected JSON format.
	        """
	        return f"""
	Extract information from this resume and return ONLY valid JSON in this exact format:

	{{
	"Name": "Full Name with credentials (PhD, MBA, etc.)",
	"Summary": "Professional summary or objective",
	"Skills": ["skill1", "skill2", "skill3"],
	"StructuredExperiences": [
	{{
	"title": "Job Title",
	"company": "Company Name",
	"date_range": "Start Date - End Date",
	"responsibilities": ["responsibility1", "responsibility2"]
	}}
	],
	"Education": ["degree info", "school info"],
	"Training": ["certification1", "training1"],
	"Address": "Full address if available"
	}}

	Resume text:
	{text}

	CRITICAL INSTRUCTIONS:
	- For NAME: Include ALL credentials (PhD, MBA, M.S., B.S., etc.) - example: "John Doe, PhD, MBA"
	- Read the ENTIRE resume text carefully, don't miss content
	- Extract ALL work experiences with full details
	- Return ONLY valid JSON, no explanations
	- If a section is not found, use empty string or empty array
	- Extract actual technical skills, not company names
	"""

	    def _extract_contact_info(self, text: str) -> Dict[str, str]:
	        """
	        Extract contact information from resume text.

	        Args:
	            text: Raw resume text.

	        Returns:
	            Dict with any of the keys ``email``, ``phone`` and
	            ``linkedin`` that were found; missing items are omitted.
	        """
	        contact_info = {}

	        # Extract email (TLD is letters only)
	        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
	        email_match = re.search(email_pattern, text)
	        if email_match:
	            contact_info['email'] = email_match.group()

	        # Extract phone number: optional "+1" prefix, optional parentheses
	        # around the area code, and digit guards on both sides so a slice
	        # of a longer digit run is never reported as a phone number.
	        phone_pattern = (
	            r'(?<!\d)(?:\+1[-.\s]?)?'
	            r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)'
	        )
	        phone_match = re.search(phone_pattern, text)
	        if phone_match:
	            contact_info['phone'] = phone_match.group().strip()

	        # Extract LinkedIn (host is often written "LinkedIn.com")
	        linkedin_pattern = r'linkedin\.com/in/[A-Za-z0-9-]+'
	        linkedin_match = re.search(linkedin_pattern, text, re.IGNORECASE)
	        if linkedin_match:
	            contact_info['linkedin'] = linkedin_match.group()

	        logger.info("OPENAI: Extracted ContactInfo as dict: %s", contact_info)
	        return contact_info

	    def _validate_and_clean_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
	        """
	        Validate and clean the extraction result.

	        Fills in missing or null keys, normalizes ``Skills`` to a
	        de-duplicated list of strings in first-seen order, and keeps only
	        experience entries that are dicts with a title and a company.

	        Args:
	            result: Parsed model reply; modified in place.

	        Returns:
	            The same dict, cleaned.
	        """
	        # Ensure all required keys exist (a JSON null counts as missing)
	        required_keys = ["Name", "Summary", "Skills", "StructuredExperiences", "Education", "Training", "Address"]
	        list_keys = {"Skills", "StructuredExperiences", "Education", "Training"}
	        for key in required_keys:
	            if result.get(key) is None:
	                result[key] = [] if key in list_keys else ""

	        # Clean skills - remove company names and duplicates
	        raw_skills = result["Skills"]
	        if isinstance(raw_skills, str):
	            # The model sometimes returns one delimited string
	            raw_skills = re.split(r'[,;]', raw_skills)
	        elif not isinstance(raw_skills, list):
	            raw_skills = []
	        cleaned_skills = []
	        for skill in raw_skills:
	            if not isinstance(skill, str):
	                continue  # ignore numbers / nested objects
	            skill = skill.strip()
	            # Skip if it looks like a company name or is too short
	            if len(skill) > 1 and not self._is_company_name(skill):
	                cleaned_skills.append(skill)
	        # dict.fromkeys de-duplicates while keeping a stable order
	        result["Skills"] = list(dict.fromkeys(cleaned_skills))

	        # Validate experience structure
	        raw_experiences = result["StructuredExperiences"]
	        if not isinstance(raw_experiences, list):
	            raw_experiences = []
	        cleaned_experiences = []
	        for exp in raw_experiences:
	            if isinstance(exp, dict) and exp.get("title") and exp.get("company"):
	                # Ensure responsibilities is a list
	                if not isinstance(exp.get("responsibilities"), list):
	                    exp["responsibilities"] = []
	                cleaned_experiences.append(exp)
	        result["StructuredExperiences"] = cleaned_experiences

	        return result

	    def _is_company_name(self, text: str) -> bool:
	        """
	        Check if text looks like a company name rather than a skill.

	        Indicators are compared against whole words, so "Acme Inc." is
	        flagged while "Microservices" (which merely contains "services")
	        is not.
	        """
	        company_indicators = {
	            "inc", "llc", "corp", "ltd", "company", "solutions", "services",
	            "systems", "technologies", "financial", "insurance"
	        }
	        words = re.findall(r'[a-z]+', text.lower())
	        return any(word in company_indicators for word in words)

	    def _fallback_extraction(self, text: str) -> Dict[str, Any]:
	        """
	        Fallback to regex-based extraction if OpenAI fails.

	        Returns:
	            Dict with the same keys as the OpenAI path; ``Training`` is
	            always an empty list here.
	        """
	        logger.info("Using regex fallback extraction...")

	        return {
	            "Name": self._extract_name_regex(text),
	            "Summary": self._extract_summary_regex(text),
	            "Skills": self._extract_skills_regex(text),
	            "StructuredExperiences": self._extract_experiences_regex(text),
	            "Education": self._extract_education_regex(text),
	            "Training": [],
	            "Address": self._extract_address_regex(text),
	            "ContactInfo": self._extract_contact_info(text)
	        }

	    def _extract_name_regex(self, text: str) -> str:
	        """
	        Regex fallback for name extraction.

	        Looks at the first five lines only and skips lines that look like
	        contact details.

	        Returns:
	            The name, including trailing credentials such as
	            ", PhD, MBA", or "" when no line matches.
	        """
	        name_pattern = re.compile(
	            r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?'
	            # Credentials are short tokens (PhD, MBA, M.S.); the length
	            # cap keeps ", Columbus" from being read as a credential.
	            r'(?:,\s*[A-Z][A-Za-z.]{1,5}(?![A-Za-z]))*)'
	        )
	        for line in text.split('\n')[:5]:
	            line = line.strip()
	            if re.search(r'@|phone|email|linkedin|github', line.lower()):
	                continue
	            name_match = name_pattern.match(line)
	            if name_match:
	                return name_match.group(1)
	        return ""

	    def _extract_summary_regex(self, text: str) -> str:
	        """
	        Regex fallback for summary extraction.

	        Returns:
	            The text between a "Summary" / "Professional Summary" header
	            and the next skills, experience or education header, with
	            whitespace collapsed; "" when no such span exists.
	        """
	        summary_pattern = r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))'
	        match = re.search(summary_pattern, text, re.DOTALL)
	        if match:
	            summary = match.group(1).strip()
	            summary = re.sub(r'\n+', ' ', summary)
	            summary = re.sub(r'\s+', ' ', summary)
	            return summary
	        return ""

	    def _extract_skills_regex(self, text: str) -> List[str]:
	        """
	        Regex fallback for skills extraction.

	        Returns:
	            Sorted, de-duplicated items from the "Technical Skills"
	            section, split on commas and semicolons; [] when the section
	            is not found.
	        """
	        skills = set()

	        # Look for technical skills section
	        skills_pattern = r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:experience|education|projects?))'
	        match = re.search(skills_pattern, text, re.DOTALL)

	        if match:
	            skills_text = match.group(1)
	            # Split by common separators
	            skill_items = re.split(r'[,;]\s*', skills_text.replace('\n', ' '))
	            for item in skill_items:
	                item = item.strip()
	                if 1 < len(item) < 30:
	                    skills.add(item)

	        return sorted(skills)

	    def _extract_experiences_regex(self, text: str) -> List[Dict[str, Any]]:
	        """
	        Regex fallback for experience extraction.

	        Recognizes job headers of the form "Title | Company | Dates" and
	        assigns to each job the bullet lines that follow its header, up
	        to the next header.

	        Returns:
	            List of dicts with ``title``, ``company``, ``date_range`` and
	            ``responsibilities``; [] when nothing is recognized.
	        """
	        experiences = []

	        # Look for work experience section; it ends at the next known
	        # header or at the end of the text (no trailing newline needed).
	        exp_pattern = r'(?i)(?:work\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?)|\s*\Z)'
	        section = re.search(exp_pattern, text, re.DOTALL)
	        if not section:
	            return experiences
	        exp_text = section.group(1)

	        # Job entries use | separators; bullet lines are excluded so a
	        # bullet that happens to contain pipes is not read as a header.
	        job_pattern = re.compile(r'(?m)^(?![ \t]*[-•*])([^|\n]+)\|([^|\n]+)\|([^|\n]+)')
	        bullet_pattern = re.compile(r'(?m)^[ \t]*[-•*][ \t]*(.+)$')
	        headers = list(job_pattern.finditer(exp_text))

	        for index, header in enumerate(headers):
	            title, company, dates = header.groups()

	            # This job's body runs from its header to the next header
	            if index + 1 < len(headers):
	                body_end = headers[index + 1].start()
	            else:
	                body_end = len(exp_text)
	            job_section = exp_text[header.end():body_end]

	            bullets = bullet_pattern.findall(job_section)
	            responsibilities = [bullet.strip() for bullet in bullets if len(bullet.strip()) > 10]

	            experiences.append({
	                "title": title.strip(),
	                "company": company.strip(),
	                "date_range": dates.strip(),
	                "responsibilities": responsibilities
	            })

	        return experiences

	    def _extract_education_regex(self, text: str) -> List[str]:
	        """
	        Regex fallback for education extraction.

	        Returns:
	            Lines longer than 10 characters from the "Education" section,
	            which ends at a certifications/projects header or at the end
	            of the text; [] when the section is not found.
	        """
	        education = []

	        edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?)|\s*\Z)'
	        match = re.search(edu_pattern, text, re.DOTALL)

	        if match:
	            edu_text = match.group(1)
	            edu_lines = [line.strip() for line in edu_text.split('\n') if line.strip()]
	            # Filter out short lines
	            education = [line for line in edu_lines if len(line) > 10]

	        return education

	    def _extract_address_regex(self, text: str) -> str:
	        """
	        Regex fallback for address extraction.

	        Patterns are tried from most to least specific.

	        Returns:
	            The first matching address, or "" when none matches.
	        """
	        # Look for address patterns like "6001 Tain Dr. Suite 203, Dublin, OH, 43016"
	        # (the comma between state and ZIP is optional)
	        address_patterns = [
	            r'(\d+\s+[A-Za-z\s\.]+(?:Suite|Apt|Unit)\s+\d+,?\s*[A-Za-z\s]+,\s*[A-Z]{2},?\s*\d{5})',
	            r'(\d+\s+[A-Za-z\s\.]+,?\s*[A-Za-z\s]+,\s*[A-Z]{2},?\s*\d{5})',
	            r'([A-Za-z\s\d\.]+,\s*[A-Za-z\s]+,\s*[A-Z]{2},?\s*\d{5})'
	        ]

	        for pattern in address_patterns:
	            match = re.search(pattern, text)
	            if match:
	                return match.group(1).strip()

	        return ""


	# Main extraction function for compatibility
	def extract_sections_openai(text: str, api_key: Optional[str] = None) -> Dict[str, Any]:
	    """
	    Extract resume sections using OpenAI API.

	    Module-level convenience wrapper: builds an OpenAIResumeExtractor
	    (default model) for this single call and delegates to its method of
	    the same name.

	    Args:
	        text: Raw resume text.
	        api_key: Optional OpenAI API key forwarded to the extractor.

	    Returns:
	        Whatever the extractor's extract_sections_openai returns.
	    """
	    return OpenAIResumeExtractor(api_key=api_key).extract_sections_openai(text)