Spaces:
Sleeping
Sleeping
| # parser.py | |
| import fitz # PyMuPDF | |
| import re | |
| from io import BytesIO | |
| from docx import Document | |
| from config import supabase, embedding_model, HF_ENDPOINTS, query | |
def extract_name(resume_text: str) -> str:
    """Guess the candidate's full name from raw resume text.

    Strategy:
      1. Scan the first five lines for a line that consists solely of two or
         three capitalized words (e.g. "John Smith", "Mary Jane Watson").
      2. Otherwise fall back to the first run of two or more capitalized
         words found anywhere in the text, on a single line.

    Args:
        resume_text: Raw text of the resume. ``None`` or an empty string is
            treated as "no name found".

    Returns:
        The detected name, or the placeholder "Candidate Name" when nothing
        matches.
    """
    if not resume_text:
        return "Candidate Name"

    # Look at the very top lines for a capitalized full name.
    for line in resume_text.splitlines()[:5]:
        candidate = line.strip()
        if re.fullmatch(r"[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}", candidate):
            return candidate

    # Last-ditch: pull the first multiword "Title Case" run anywhere.
    # The separator is whitespace *excluding* CR/LF so that two unrelated
    # one-word lines (e.g. "Email\nPhone") are not glued into a bogus name.
    m = re.search(r"[A-Z][a-z]+(?:[^\S\r\n]+[A-Z][a-z]+)+", resume_text)
    return m.group(0) if m else "Candidate Name"
def parse_resume(file_obj, file_type=None):
    """Extract raw text from a PDF or DOCX resume.

    Args:
        file_obj: File-like object holding the document. For PDFs its
            ``read()`` result is passed to PyMuPDF as an in-memory stream;
            for DOCX the object itself is handed to ``docx.Document``.
        file_type: "pdf" or "docx". Case-insensitive; a leading dot
            (".pdf") is accepted. When ``None``, the type is taken from the
            extension of ``file_obj.name`` if that attribute exists.

    Returns:
        The document text as a single newline-joined string. For DOCX this
        is all non-blank paragraphs followed by all non-blank table cells.

    Raises:
        ValueError: If the file type is neither PDF nor DOCX (or cannot be
            determined).
    """
    if file_type is None and hasattr(file_obj, 'name'):
        file_type = str(file_obj.name).split('.')[-1]

    # Normalize so callers may pass "PDF", ".pdf", " docx " and so on.
    if file_type is not None:
        file_type = str(file_type).strip().lower().lstrip('.')

    if file_type == 'pdf':
        # Use the document as a context manager so it is closed once the
        # text has been pulled out instead of leaking until garbage collection.
        with fitz.open(stream=file_obj.read(), filetype='pdf') as doc:
            return "\n".join(page.get_text('text') for page in doc)

    if file_type == 'docx':
        doc = Document(file_obj)
        # Paragraph text is kept as-is (only blank paragraphs are dropped).
        text = [para.text for para in doc.paragraphs if para.text.strip()]
        # Table cells are stripped before being collected.
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    cell_text = cell.text.strip()
                    if cell_text:
                        text.append(cell_text)
        return "\n".join(text)

    raise ValueError("Unsupported file format")
def extract_email(resume_text):
    """Extract the first email address found in the text.

    The address must have a local part that starts with a word character
    (letters, digits, underscore) and may contain ``.``, ``+`` and ``-``,
    followed by ``@`` and a domain made of at least two dot-separated
    labels. Trailing sentence punctuation ("... john@example.com.") is not
    included in the result.

    Args:
        resume_text: Text to search. ``None`` or an empty string yields
            ``None``.

    Returns:
        The first matching address as a string, or ``None`` if there is none.
    """
    if not resume_text:
        return None
    # Domain is written as label(.label)+ so that a trailing "." or a
    # dot-less host such as "a@b" is not accepted as part of an address.
    match = re.search(r"\w[\w.+-]*@[\w-]+(?:\.[\w-]+)+", resume_text)
    return match.group(0) if match else None
def summarize_resume(resume_text, llm_client=None):
    """Generate a recruiter-style professional summary for a resume.

    Builds a prompt around the resume text, sends it as a single user
    message to a chat-completions endpoint, and strips generic lead-ins
    (e.g. "Sure, here is the professional summary:") from the reply.

    Args:
        resume_text: Raw resume text; embedded verbatim in the prompt.
        llm_client: Optional object exposing
            ``chat.completions.create(model=..., messages=..., ...)``.
            When omitted, the module-level name ``client`` is used.

    Returns:
        The cleaned summary text, or the fixed string
        "Summary unavailable due to API issues." if anything goes wrong
        while calling the endpoint or reading its response.
    """
    prompt = (
        "You are an expert technical recruiter. Extract a professional summary for this candidate based on their resume text. "
        "Include: full name (if found), job title, years of experience, key technologies/tools, industries worked in, and certifications. "
        "Format it as a professional summary paragraph.\n\n"
        f"Resume:\n{resume_text}\n\n"
        "Summary:"
    )

    try:
        # NOTE(review): `client` is not imported or defined in the visible
        # part of this file. If it is not defined elsewhere, this lookup
        # raises NameError, which the handler below turns into the fallback
        # string -- confirm where `client` is supposed to come from.
        active_client = llm_client if llm_client is not None else client
        response = active_client.chat.completions.create(
            model="tgi",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=300,
        )
        result = response.choices[0].message.content.strip()

        # Clean up generic lead-ins from the model.
        cleaned = re.sub(
            r"^(Sure,|Certainly,)?\s*(here is|here’s|this is)?\s*(the)?\s*(extracted)?\s*(professional)?\s*summary.*?:\s*",
            "", result, flags=re.IGNORECASE
        ).strip()
        return cleaned
    except Exception as e:
        # Best-effort boundary: any failure degrades to a placeholder summary.
        print(f"❌ Error generating structured summary: {e}")
        return "Summary unavailable due to API issues."