# AI-powered resume section extraction backed by Hugging Face Inference API models.
import json
import logging
import os
import re
import time
from datetime import datetime
from typing import Any, Dict, List

import requests
| # Configure logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
class AIResumeExtractor:
    """Extract structured resume sections via the Hugging Face Inference API.

    Each section (name, summary, skills, experience, education) is extracted
    with a task-specific hosted model; every step falls back to regex parsing
    when no API key is configured or a model call fails, so extraction always
    yields a dict with the keys Name, Summary, Skills, StructuredExperiences,
    Education and Training.

    NOTE(review): bullet markers in the regexes below use the same literal
    character as the resume text this module is fed — presumably a bullet
    glyph that was mis-encoded in transit; verify against real input.
    """

    def __init__(self, api_key: str = None, model_name: str = "microsoft/DialoGPT-medium"):
        """Initialize the AI extractor model with Hugging Face API key.

        Args:
            api_key: Explicit HF token; when omitted, read from the
                HF_API_TOKEN or HUGGINGFACE_API_KEY environment variables.
            model_name: Default text-generation model identifier.
        """
        self.api_key = api_key or os.getenv('HF_API_TOKEN') or os.getenv('HUGGINGFACE_API_KEY')
        self.model_name = model_name
        self.base_url = "https://api-inference.huggingface.co/models"
        # Task-specific models used by the individual extraction helpers.
        self.models = {
            "text_generation": "microsoft/DialoGPT-medium",
            "instruction_following": "microsoft/DialoGPT-medium",
            "question_answering": "deepset/roberta-base-squad2",
            "summarization": "facebook/bart-large-cnn",
            "ner": "dbmdz/bert-large-cased-finetuned-conll03-english"
        }
        if not self.api_key:
            logger.warning("No valid Hugging Face API key found. Set HF_API_TOKEN or HUGGINGFACE_API_KEY environment variable.")

    def _make_api_request(self, model_name: str, payload: Dict[str, Any], max_retries: int = 3) -> Dict[str, Any]:
        """Make a request to the Hugging Face Inference API with retry logic.

        Retries on HTTP 503 (model still loading) and on transport errors;
        any other HTTP error aborts the retry loop immediately.

        Args:
            model_name: Model identifier appended to the inference base URL.
            payload: JSON-serializable request body.
            max_retries: Maximum number of attempts before giving up.

        Returns:
            The decoded JSON response.

        Raises:
            Exception: if no successful response is obtained within
                max_retries attempts.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        url = f"{self.base_url}/{model_name}"
        for attempt in range(max_retries):
            try:
                response = requests.post(url, headers=headers, json=payload, timeout=60)
                if response.status_code == 200:
                    return response.json()
                elif response.status_code == 503:
                    # Model is loading on the HF side; wait and retry.
                    logger.info(f"Model {model_name} is loading, waiting...")
                    time.sleep(15)
                    continue
                else:
                    logger.error(f"API request failed: {response.status_code} - {response.text}")
                    break
            except requests.exceptions.RequestException as e:
                logger.error(f"Request failed (attempt {attempt + 1}): {e}")
                if attempt < max_retries - 1:
                    time.sleep(3)
                    continue
                break
        raise Exception(f"Failed to get response from {model_name} after {max_retries} attempts")

    def extract_sections_ai(self, text: str) -> Dict[str, Any]:
        """Use Hugging Face AI models to extract resume sections in a structured format.

        Falls back to the project's regex-based extractor when no API key is
        available or any part of the AI pipeline raises.
        """
        if not self.api_key:
            logger.warning("No valid API key available, falling back to regex extraction")
            from utils.extractor_fixed import extract_sections_spacy_fixed
            return extract_sections_spacy_fixed(text)
        try:
            # Extract each section with its dedicated model-backed helper.
            name = self._extract_name_hf(text)
            summary = self._extract_summary_hf(text)
            skills = self._extract_skills_hf(text)
            experiences = self._extract_experiences_hf(text)
            education = self._extract_education_hf(text)
            result = {
                "Name": name,
                "Summary": summary,
                "Skills": skills,
                "StructuredExperiences": experiences,
                "Education": education,
                "Training": []
            }
            logger.info("β Hugging Face AI extraction completed")
            return self._post_process_extraction(result)
        except Exception as e:
            logger.error(f"Hugging Face AI extraction failed: {e}")
            # Fallback to regex-based extraction.
            from utils.extractor_fixed import extract_sections_spacy_fixed
            return extract_sections_spacy_fixed(text)

    def _extract_name_hf(self, text: str) -> str:
        """Extract the candidate's name via the question-answering model.

        Falls back to regex extraction when the API fails or the answer does
        not look like a "First Last" name.
        """
        try:
            payload = {
                "inputs": {
                    "question": "What is the person's full name?",
                    "context": text[:1000]  # First 1000 chars should contain name
                }
            }
            response = self._make_api_request(self.models["question_answering"], payload)
            if response and "answer" in response:
                name = response["answer"].strip()
                # Accept only answers shaped like a capitalized full name.
                if re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+', name):
                    return name
        except Exception as e:
            logger.warning(f"HF name extraction failed: {e}")
        # Fallback to regex.
        return self._extract_name_regex(text)

    def _extract_summary_hf(self, text: str) -> str:
        """Extract the professional summary, condensing long ones via BART.

        Locates the summary section with a regex first; only summaries longer
        than 500 characters are sent to the summarization model.
        """
        try:
            # Find summary section first.
            summary_match = re.search(
                r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
                text, re.DOTALL
            )
            if summary_match:
                summary_text = summary_match.group(1).strip()
                # If the summary is long, use AI to condense it.
                if len(summary_text) > 500:
                    payload = {
                        "inputs": summary_text,
                        "parameters": {
                            "max_length": 150,
                            "min_length": 50,
                            "do_sample": False
                        }
                    }
                    response = self._make_api_request(self.models["summarization"], payload)
                    if response and isinstance(response, list) and len(response) > 0:
                        return response[0].get("summary_text", summary_text)
                return summary_text
        except Exception as e:
            logger.warning(f"HF summary extraction failed: {e}")
        # Fallback to regex.
        return self._extract_summary_regex(text)

    def _extract_skills_hf(self, text: str) -> List[str]:
        """Extract skills using the NER model plus regex patterns.

        Combines three sources: the bullet-pointed "Technical Skills" section,
        high-confidence MISC/ORG entities from NER, and a keyword scan over a
        list of common technical skills. Returns a sorted, de-duplicated list.
        """
        skills = set()
        try:
            # First, find the technical skills section using regex.
            skills_match = re.search(
                r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:professional\s+experience|experience|education|projects?))',
                text, re.DOTALL
            )
            if skills_match:
                skills_text = skills_match.group(1)
                # Parse bullet-pointed skills.
                bullet_lines = re.findall(r'β\s*([^β\n]+)', skills_text)
                for line in bullet_lines:
                    if ':' in line:
                        # Format: "Category: skill1, skill2, skill3"
                        skills_part = line.split(':', 1)[1].strip()
                        individual_skills = re.split(r',\s*', skills_part)
                        for skill in individual_skills:
                            skill = skill.strip()
                            if skill and len(skill) > 1:
                                skills.add(skill)
            # Use NER model to find additional technical terms.
            try:
                payload = {
                    "inputs": text[:2000]  # Limit text length for NER
                }
                response = self._make_api_request(self.models["ner"], payload)
                if response and isinstance(response, list):
                    for entity in response:
                        if entity.get("entity_group") in ["MISC", "ORG"] and entity.get("score", 0) > 0.8:
                            word = entity.get("word", "").strip()
                            # Filter for technical-looking terms.
                            if re.match(r'^[A-Za-z][A-Za-z0-9\.\-]*$', word) and len(word) > 2:
                                skills.add(word)
            except Exception as e:
                logger.warning(f"NER extraction failed: {e}")
        except Exception as e:
            logger.warning(f"HF skills extraction failed: {e}")
        # Enhanced common technical skills detection as fallback.
        common_skills = [
            'Python', 'Java', 'JavaScript', 'TypeScript', 'C++', 'C#', 'SQL', 'NoSQL',
            'React', 'Angular', 'Vue', 'Node.js', 'Django', 'Flask', 'Spring',
            'AWS', 'Azure', 'GCP', 'Docker', 'Kubernetes', 'Jenkins',
            'Git', 'GitHub', 'GitLab', 'Jira', 'Confluence',
            'TensorFlow', 'PyTorch', 'Scikit-learn', 'Pandas', 'NumPy', 'Matplotlib',
            'MySQL', 'PostgreSQL', 'MongoDB', 'Redis',
            'Linux', 'Windows', 'MacOS', 'Ubuntu',
            'Selenium', 'Pytest', 'TestNG', 'Postman',
            'AWS Glue', 'AWS SageMaker', 'REST APIs', 'Apex', 'Bash'
        ]
        for skill in common_skills:
            if re.search(rf'\b{re.escape(skill)}\b', text, re.IGNORECASE):
                skills.add(skill)
        return sorted(list(skills))

    def _extract_experiences_hf(self, text: str) -> List[Dict[str, Any]]:
        """Extract work experiences using the question-answering model.

        Parses pipe-delimited job headers from the experience section, then
        asks the QA model for responsibilities, falling back to regex bullet
        extraction when the model yields fewer than two items.
        """
        experiences = []
        try:
            # First find the experience section using regex.
            exp_pattern = r'(?i)(?:professional\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|$))'
            match = re.search(exp_pattern, text, re.DOTALL)
            if not match:
                return experiences
            exp_text = match.group(1)
            # Parse job entries with improved patterns.
            # Pattern 1: Company | Location | Title | Date
            pattern1 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
            matches1 = re.findall(pattern1, exp_text)
            # NOTE: use a distinct loop variable so the section match above
            # is not shadowed.
            for job_fields in matches1:
                company, location, title, dates = job_fields
                # Extract responsibilities using QA model.
                responsibilities = []
                try:
                    # Find the section for this specific job.
                    job_section = self._find_job_section(exp_text, company.strip(), title.strip())
                    if job_section:
                        # Use QA model to extract responsibilities.
                        payload = {
                            "inputs": {
                                "question": "What are the main responsibilities and achievements?",
                                "context": job_section
                            }
                        }
                        response = self._make_api_request(self.models["question_answering"], payload)
                        if response and "answer" in response:
                            resp_text = response["answer"]
                            # Split into individual responsibilities.
                            responsibilities = [r.strip() for r in re.split(r'[β’β\n]', resp_text) if r.strip()]
                    # Fallback to regex if QA didn't work well.
                    if len(responsibilities) < 2:
                        responsibilities = self._extract_responsibilities_regex(exp_text, company.strip(), title.strip())
                except Exception as e:
                    logger.warning(f"HF responsibility extraction failed: {e}")
                    responsibilities = self._extract_responsibilities_regex(exp_text, company.strip(), title.strip())
                experience = {
                    "title": title.strip(),
                    "company": f"{company.strip()}, {location.strip()}",
                    "date_range": dates.strip(),
                    "responsibilities": responsibilities
                }
                experiences.append(experience)
        except Exception as e:
            logger.warning(f"HF experience extraction failed: {e}")
        return experiences

    def _extract_education_hf(self, text: str) -> List[str]:
        """Extract education entries using the question-answering model.

        Splits the model's answer on commas/semicolons; falls back to regex
        section parsing when the model returns nothing usable.
        """
        education = []
        try:
            payload = {
                "inputs": {
                    "question": "What education, degrees, or certifications does this person have?",
                    "context": text
                }
            }
            response = self._make_api_request(self.models["question_answering"], payload)
            if response and "answer" in response:
                edu_text = response["answer"]
                # Parse the education information.
                education_items = re.split(r'[,;]', edu_text)
                for item in education_items:
                    item = item.strip()
                    if item and len(item) > 5:  # Reasonable length
                        education.append(item)
        except Exception as e:
            logger.warning(f"HF education extraction failed: {e}")
        # Fallback to regex if HF extraction didn't work.
        if not education:
            education = self._extract_education_regex(text)
        return education

    def _find_job_section(self, exp_text: str, company: str, title: str) -> str:
        """Return the lines belonging to one job within the experience text.

        Starts at the line containing both company and title, and stops at
        the next pipe-delimited job header.
        """
        lines = exp_text.split('\n')
        job_lines = []
        in_job_section = False
        for line in lines:
            if company in line and title in line:
                in_job_section = True
                job_lines.append(line)
            elif in_job_section:
                if re.match(r'^[A-Z].*\|.*\|.*\|', line):  # Next job entry
                    break
                job_lines.append(line)
        return '\n'.join(job_lines)

    def _extract_name_regex(self, text: str) -> str:
        """Fallback regex name extraction from the first five lines."""
        lines = text.split('\n')[:5]
        for line in lines:
            line = line.strip()
            # Skip contact-info lines; a name line should not contain these.
            if re.search(r'@|phone|email|linkedin|github|π§|π|π', line.lower()):
                continue
            name_match = re.match(r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)', line)
            if name_match:
                return name_match.group(1)
        return ""

    def _extract_summary_regex(self, text: str) -> str:
        """Fallback regex summary extraction.

        Tries a "summary" then an "objective" heading; collapses whitespace
        and requires at least 50 characters of content.
        """
        summary_patterns = [
            r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
            r'(?i)objective[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))'
        ]
        for pattern in summary_patterns:
            match = re.search(pattern, text, re.DOTALL)
            if match:
                summary = match.group(1).strip()
                summary = re.sub(r'\n+', ' ', summary)
                summary = re.sub(r'\s+', ' ', summary)
                if len(summary) > 50:
                    return summary
        return ""

    def _extract_responsibilities_regex(self, exp_text: str, company: str, title: str) -> List[str]:
        """Extract responsibilities for one job using bullet-point regexes."""
        responsibilities = []
        # Find the section for this specific job.
        job_section = self._find_job_section(exp_text, company, title)
        if job_section:
            # Look for bullet points.
            bullet_matches = re.findall(r'β\s*([^β\n]+)', job_section)
            for bullet in bullet_matches:
                resp = bullet.strip()
                if len(resp) > 20:  # Substantial responsibility
                    responsibilities.append(resp)
        return responsibilities

    def _extract_education_regex(self, text: str) -> List[str]:
        """Fallback regex education extraction from the education section."""
        education = []
        # Look for education section.
        edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?|$))'
        match = re.search(edu_pattern, text, re.DOTALL)
        if match:
            edu_text = match.group(1)
            # Look for degree patterns (bulleted lines).
            degree_matches = re.findall(r'β\s*([^β\n]+)', edu_text)
            for degree in degree_matches:
                edu_item = degree.strip()
                if len(edu_item) > 10:
                    education.append(edu_item)
        return education

    def _post_process_extraction(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Clean up and validate the AI-extracted data.

        Fills in any missing top-level keys, de-duplicates and sorts skills,
        and strips whitespace from experience, education and training items.
        """
        # Ensure all required fields exist.
        default_structure = {
            "Name": "",
            "Summary": "",
            "Skills": [],
            "StructuredExperiences": [],
            "Education": [],
            "Training": []
        }
        # Merge with defaults.
        for key, default_value in default_structure.items():
            if key not in data:
                data[key] = default_value
        # Clean up skills (remove duplicates, empty entries).
        if data["Skills"]:
            data["Skills"] = list(set([
                skill.strip()
                for skill in data["Skills"]
                if skill and skill.strip() and len(skill.strip()) > 1
            ]))
            data["Skills"].sort()
        # Clean up experiences.
        for exp in data["StructuredExperiences"]:
            # Ensure all experience fields exist.
            exp.setdefault("title", "")
            exp.setdefault("company", "")
            exp.setdefault("date_range", "")
            exp.setdefault("responsibilities", [])
            # Clean up responsibilities.
            if exp["responsibilities"]:
                exp["responsibilities"] = [
                    resp.strip()
                    for resp in exp["responsibilities"]
                    if resp and resp.strip()
                ]
        # Clean up education and training.
        for field in ["Education", "Training"]:
            if data[field]:
                data[field] = [
                    item.strip()
                    for item in data[field]
                    if item and item.strip()
                ]
        return data
# Convenience function for backward compatibility
def extract_sections_ai(text: str) -> Dict[str, Any]:
    """
    Extract resume sections using AI
    """
    # Thin module-level wrapper: delegate to a fresh extractor instance.
    return AIResumeExtractor().extract_sections_ai(text)
# Test function
def test_ai_extraction():
    """Smoke-test the Hugging Face AI extraction against a sample resume."""
    sample_text = """
    Jonathan Generic Smith
    πSan Diego, CA | 321-123-1234 | π§ testemail@icloud.com
    Summary
    Results-driven Automation Test Engineer with 8 years of experience in Selenium and Java,
    specializing in automation frameworks for financial and insurance domains. Expert in designing,
    developing, and executing automated test scripts, ensuring quality software delivery with CI/CD
    integration. Adept at working with Agile methodologies and cross-functional teams to improve
    software reliability
    Technical Skills
    β Selenium WebDriver, Java, TestNG, Cucumber, Jenkins, Maven
    β GIT, REST APIs, Apex, Bash
    β Jira, Agile, CI/CD, Docker, Kubernetes
    Professional Experience
    Senior Automation Test Engineer | ABC Financial Services | Jan 2021 - Present
    β Led automation framework enhancements using Selenium and Java, improving test efficiency.
    β Automated end-to-end UI and API testing for financial applications, reducing manual effort by 40%.
    Automation Test Engineer | XYZ Insurance Solutions | Jun 2017 - Dec 2020
    β Designed and implemented Selenium automation framework using Java and TestNG.
    β Developed automated test scripts for insurance policy management applications.
    Education
    β Bachelor of Technology in Computer Science | ABC University | 2015
    """
    print("Testing Hugging Face AI extraction...")
    ai_extractor = AIResumeExtractor()
    extraction = ai_extractor.extract_sections_ai(sample_text)
    print("Hugging Face AI Extraction Results:")
    print(json.dumps(extraction, indent=2))
    return extraction


if __name__ == "__main__":
    test_ai_extraction()