Spaces:

gauravbox
/

TalentLensAI

Sleeping

Johnny

feat: Update resume builder with LFS-tracked assets

79b5c9c 8 months ago

2.83 kB

	# parser.py
	import fitz # PyMuPDF
	import re
	from io import BytesIO
	from docx import Document
	from config import supabase, embedding_model, HF_ENDPOINTS, query

	def extract_name(resume_text: str) -> str:
	    """
	    Guess the candidate's name from raw resume text.

	    The first five lines are checked for a line that, once stripped, consists
	    solely of two or three capitalised words (an uppercase ASCII letter
	    followed by one or more lowercase ASCII letters). If none qualifies, the
	    first run of two or more such words anywhere in the text is used.

	    Returns:
	        The matched name, or the placeholder "Candidate Name" when nothing
	        in the text looks like a name.
	    """
	    header_pattern = r"^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}$"
	    stripped_head = (raw.strip() for raw in resume_text.splitlines()[:5])
	    top_hit = next(
	        (entry for entry in stripped_head if re.match(header_pattern, entry)),
	        None,
	    )
	    if top_hit is not None:
	        return top_hit

	    # Fallback: first multi-word Title Case sequence anywhere in the document.
	    fallback = re.search(r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)", resume_text)
	    if fallback is None:
	        return "Candidate Name"
	    return fallback.group(1)

	def parse_resume(file_obj, file_type=None):
	    """
	    Extract raw text from a PDF or DOCX resume.

	    Args:
	        file_obj: File-like object holding the document. For PDFs its
	            ``read()`` result is handed to PyMuPDF as a byte stream; for
	            DOCX the object itself is passed to ``Document``.
	        file_type: ``"pdf"`` or ``"docx"``. Matching is case-insensitive and
	            a leading dot is tolerated. When omitted, the extension of
	            ``file_obj.name`` is used if that attribute is a string.

	    Returns:
	        The text of every PDF page, or of every non-blank DOCX paragraph
	        followed by every non-blank table cell, joined with newlines.

	    Raises:
	        ValueError: If the file type is missing or is neither PDF nor DOCX.
	    """
	    if file_type is None:
	        name = getattr(file_obj, 'name', None)
	        # Only string names carry an extension; anything else (e.g. an
	        # integer file descriptor) falls through to the ValueError below.
	        if isinstance(name, str):
	            file_type = name.split('.')[-1]
	    if isinstance(file_type, str):
	        # Accept "PDF", ".docx", " pdf " and similar spellings.
	        file_type = file_type.strip().lower().lstrip('.')

	    if file_type == 'pdf':
	        # The context manager closes the PyMuPDF document once the text has
	        # been collected; join() consumes the generator before the exit.
	        with fitz.open(stream=file_obj.read(), filetype='pdf') as doc:
	            return "\n".join(page.get_text('text') for page in doc)

	    if file_type == 'docx':
	        doc = Document(file_obj)
	        # Paragraph text is kept as-is; table cell text is stripped.
	        text = [para.text for para in doc.paragraphs if para.text.strip()]
	        text.extend(
	            cell.text.strip()
	            for table in doc.tables
	            for row in table.rows
	            for cell in row.cells
	            if cell.text.strip()
	        )
	        return "\n".join(text)

	    raise ValueError("Unsupported file format")

	def extract_email(resume_text):
	    """
	    Extract the first email address found in the text.

	    An address is a local part (word characters, dots, plus signs, hyphens,
	    starting with a word character), an ``@``, and a dotted domain whose
	    last label is at least two letters. Requiring a letter-only final label
	    keeps sentence punctuation such as a trailing full stop out of the
	    result.

	    Args:
	        resume_text: Text to scan; ``None`` or an empty string is allowed.

	    Returns:
	        The first matching address, or ``None`` when there is none.
	    """
	    if not resume_text:
	        return None
	    # [^\W\d_] is "any letter": a word character that is neither a digit
	    # nor an underscore.
	    match = re.search(
	        r"\w[\w.+-]*@[\w-]+(?:\.[\w-]+)*\.[^\W\d_]{2,}", resume_text
	    )
	    return match.group(0) if match else None

	def summarize_resume(resume_text):
	    """
	    Ask the chat-completion endpoint for a professional summary of a resume.

	    The resume text is embedded in a recruiter-style prompt and sent as a
	    single user message. Any conversational lead-in the model puts before
	    the summary (e.g. "Sure, here is the professional summary:") is removed.

	    Args:
	        resume_text: Raw resume text to summarise.

	    Returns:
	        The cleaned summary, or the fixed string
	        "Summary unavailable due to API issues." if anything fails.
	    """
	    prompt = (
	        "You are an expert technical recruiter. Extract a professional summary for this candidate based on their resume text. "
	        "Include: full name (if found), job title, years of experience, key technologies/tools, industries worked in, and certifications. "
	        "Format it as a professional summary paragraph.\n\n"
	        f"Resume:\n{resume_text}\n\n"
	        "Summary:"
	    )

	    try:
	        # NOTE(review): `client` is neither defined nor imported in the
	        # visible part of this file. If it is not provided elsewhere, the
	        # resulting NameError is swallowed below and every call returns the
	        # fallback string — confirm where `client` comes from.
	        response = client.chat.completions.create(
	            model="tgi",
	            messages=[{"role": "user", "content": prompt}],
	            temperature=0.5,
	            max_tokens=300,
	        )
	        result = response.choices[0].message.content.strip()

	        # Strip generic lead-ins from the model. Every word before
	        # "summary" is optional and separated by optional whitespace, so
	        # "Summary:", "Here's the summary:" and
	        # "Certainly, here is the extracted professional summary:" all match.
	        cleaned = re.sub(
	            r"^(Sure,|Certainly,)?\s*(here is|here['’]s|this is)?\s*(the)?"
	            r"\s*(extracted)?\s*(professional)?\s*summary.?:\s*",
	            "", result, flags=re.IGNORECASE
	        ).strip()

	        return cleaned

	    except Exception as e:
	        # Deliberately best-effort: callers get a placeholder, not a crash.
	        print(f"❌ Error generating structured summary: {e}")
	        return "Summary unavailable due to API issues."