Spaces:

ByteRiot
/

CandidateExplorer

Running

App Files Files Community

CandidateExplorer / utils /utils.py

ishaq101

Update Azure Blob Service Config

df5a9e3 5 days ago

raw

history blame contribute delete

14.4 kB

	import os
	import sys
	import io
	import time
	import PyPDF2
	import asyncio
	# import fitz
	import pytesseract
	import dotenv
	from config.env_constant import EnvFilepath
	dotenv.load_dotenv(EnvFilepath.ENVPATH)
	from PyPDF2 import PdfReader
	from functools import wraps
	from typing import ByteString
	from pdf2image import convert_from_bytes


	def measure_runtime(func):
	    """Decorator that prints how long each call to ``func`` takes.

	    Works for both plain functions and coroutine functions: when ``func``
	    is a coroutine function the returned wrapper is itself a coroutine
	    function and the awaited call is what gets timed.

	    Args:
	        func: The callable (sync or async) to instrument.

	    Returns:
	        A wrapper with the same name/docstring as ``func`` (via
	        ``functools.wraps``) that forwards all positional and keyword
	        arguments unchanged and returns ``func``'s result.
	    """
	    if asyncio.iscoroutinefunction(func):
	        @wraps(func)
	        async def async_wrapper(*args, **kwargs):
	            # perf_counter is monotonic, so the delta is a valid duration.
	            start = time.perf_counter()
	            result = await func(*args, **kwargs)
	            end = time.perf_counter()
	            print(f"⏱️ Async function '{func.__name__}' executed in {end - start:.10f} seconds")
	            return result
	        return async_wrapper

	    else:
	        @wraps(func)
	        def sync_wrapper(*args, **kwargs):
	            start = time.perf_counter()
	            result = func(*args, **kwargs)
	            end = time.perf_counter()
	            print(f"⏱️ Function '{func.__name__}' executed in {end - start:.10f} seconds")
	            return result
	        return sync_wrapper

	# async def is_nonsearchable_pdf(pdf_path: str) -> str:
	# try:
	# doc = fitz.open(pdf_path)
	# for page_num in range(doc.page_count):
	# page = doc.load_page(page_num)
	# # Attempt to extract text from the page
	# text = page.get_text("text")
	# text = f"__{text.strip()}__"
	# if text == "____":
	# print("Non Searchable")
	# return True
	# else:
	# print("Searchable")
	# return False
	# except Exception as E:
	# print(f"Failed to identify nonsearchable, {E}")
	# return False
	# finally:
	# doc.close()

	def pdf_decoder(pdf_bytes: bytes) -> str:
	    """Return the text extracted from every page of an in-memory PDF.

	    The bytes are wrapped in a ``BytesIO`` and parsed with ``PdfReader``.
	    Each page's extracted text (pages yielding no text are skipped) is
	    followed by a newline, and the combined string is stripped of
	    leading/trailing whitespace before being returned.
	    """
	    document = PdfReader(io.BytesIO(pdf_bytes))

	    # Lazily pull the text of each page; falsy results (None / "") are dropped.
	    page_texts = (page.extract_text() for page in document.pages)
	    combined = "".join(f"{chunk}\n" for chunk in page_texts if chunk)

	    return combined.strip()

	# from fastapi.datastructures import UploadedFile

	# async def pdf_reader(pdf_path: str) -> str:
	# """Decode PDF bytes to a string."""
	# try:
	# is_empty = await is_nonsearchable_pdf(pdf_path)
	# print(f">> Is nonsearchable: {is_empty}")

	# if is_empty:
	# pdf_writer = PyPDF2.PdfWriter()
	# poppler_path = None
	# if sys.platform == "win32":
	# poppler_path = "src/software/poppler-24.08.0/Library/bin"
	# print(f"pdf_path",pdf_path)
	# print(f"type pdf_path",type(pdf_path))
	# # if type(pdf_path) != str:
	# # images = convert_from_bytes(pdf_path, poppler_path=poppler_path)
	# # else:
	# images = convert_from_bytes(pdf_path, poppler_path=poppler_path)

	# pytesseract.pytesseract.tesseract_cmd = r"src/software/Tesseract-OCR/tesseract.exe"

	# for image in images:
	# page = pytesseract.image_to_pdf_or_hocr(image, extension='pdf')
	# pdf = PyPDF2.PdfReader(io.BytesIO(page))
	# pdf_writer.add_page(pdf.pages[0])

	# output_bytes_stream = io.BytesIO()
	# pdf_writer.write(output_bytes_stream)
	# reader = PyPDF2.PdfReader(output_bytes_stream)
	# user_profile = ""
	# for page in reader.pages:
	# text = page.extract_text()
	# user_profile += text + "\n"
	# return user_profile
	# else:
	# reader = PdfReader(pdf_path)
	# # Extract and concatenate text from all pages
	# user_profile = ""
	# for page in reader.pages:
	# text = page.extract_text()
	# if text:
	# user_profile += text + "\n"
	# print(f">>> user profile: {user_profile.strip()}")
	# return user_profile.strip()
	# except Exception as E:
	# print(f"pdf reader error, {E}")
	# exc_type, exc_obj, exc_tb = sys.exc_info()
	# fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
	# print(exc_type, fname, exc_tb.tb_lineno)
	# import tempfile


	def _pdf_source_to_bytes(source) -> bytes:
	    """Return the raw PDF bytes for a bytes-like object, stream, or file path."""
	    if isinstance(source, (bytes, bytearray, memoryview)):
	        return bytes(source)
	    if hasattr(source, "getvalue"):
	        return source.getvalue()
	    if hasattr(source, "read"):
	        # Rewind first: the stream may already have been consumed by PdfReader.
	        source.seek(0)
	        return source.read()
	    with open(source, "rb") as handle:
	        return handle.read()


	def _join_page_text(reader) -> str:
	    """Concatenate the non-empty text of every page, one page per line."""
	    page_texts = (page.extract_text() for page in reader.pages)
	    return "\n".join(chunk for chunk in page_texts if chunk).strip()


	async def pdf_reader(pdf_path: ByteString) -> str:
	    """Read a PDF and return its text, falling back to OCR when needed.

	    The embedded text layer is tried first. If no text comes out of it
	    (e.g. a scanned document), every page is rendered to an image with
	    pdf2image, run through Tesseract to produce a searchable one-page PDF,
	    and the text of those OCR'd pages is returned instead.

	    Args:
	        pdf_path: The PDF content as bytes-like data. Anything ``PdfReader``
	            accepts directly (a file path or a binary stream) is also
	            handled; it is converted to bytes only if OCR is required.

	    Returns:
	        The stripped text of the document (possibly ``""`` if OCR finds
	        nothing). On any exception the error is printed and ``None`` is
	        returned; this function never raises.
	    """
	    try:
	        if isinstance(pdf_path, (bytes, bytearray, memoryview)):
	            reader = PdfReader(io.BytesIO(bytes(pdf_path)))
	        else:
	            reader = PdfReader(pdf_path)

	        user_profile = _join_page_text(reader)
	        if user_profile:
	            return user_profile

	        # No text layer: OCR each rendered page.
	        pdf_writer = PyPDF2.PdfWriter()
	        poppler_path = None
	        if sys.platform == "win32":
	            # Bundled binaries are only shipped for Windows; on other
	            # platforms poppler and tesseract are resolved from PATH.
	            poppler_path = "src/software/poppler-24.08.0/Library/bin"
	            pytesseract.pytesseract.tesseract_cmd = r"src/software/Tesseract-OCR/tesseract.exe"

	        # convert_from_bytes needs raw bytes, not a path or stream object.
	        images = convert_from_bytes(
	            _pdf_source_to_bytes(pdf_path), poppler_path=poppler_path
	        )

	        for image in images:
	            page = pytesseract.image_to_pdf_or_hocr(image, extension='pdf')
	            pdf = PyPDF2.PdfReader(io.BytesIO(page))
	            pdf_writer.add_page(pdf.pages[0])

	        output_bytes_stream = io.BytesIO()
	        pdf_writer.write(output_bytes_stream)
	        ocr_reader = PyPDF2.PdfReader(output_bytes_stream)
	        return _join_page_text(ocr_reader)
	    except Exception as E:
	        # Best-effort by design: report where the failure happened and
	        # hand None back to the caller instead of propagating.
	        print(f"pdf reader error, {E}")
	        exc_type, exc_obj, exc_tb = sys.exc_info()
	        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
	        print(exc_type, fname, exc_tb.tb_lineno)
	        return None


	# cv = pdf_reader("src/data/cvs/1. Balu Rama Chandra_Data Scientist_Linkedin.pdf")
	# print(cv)
	# len(cv.pages)
	# for page in cv.pages:
	# print(page.extract_text())


	# ------------------
	# QDRANT
	# ------------------
	# from qdrant_client import QdrantClient, models
	# import uuid
	# from src.embed_model.embed_model import embed_model
	# from qdrant_client import AsyncQdrantClient, models
	# from fastapi import HTTPException
	# from typing import Dict, List, Union
	# from src.models.data_model import Profile, OutProfile


	# qdrant_client = AsyncQdrantClient(
	# url=os.environ.get('ss--qdrant--endpoint--url'),
	# api_key=os.environ.get('ss--qdrant--api-key'),
	# )
	# qdrant_collection_name = os.environ.get('ss--qdrant--collection--name')


	# async def check_collection(qdrant_client:AsyncQdrantClient=qdrant_client, collection_name: str=qdrant_collection_name):
	# try:
	# colls = await qdrant_client.get_collections()
	# if collection_name not in [item.name for item in colls.collections]:
	# await qdrant_client.create_collection(
	# collection_name=qdrant_collection_name,
	# vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
	# )
	# print(f"✅ collection '{collection_name}' is created!")
	# return True
	# else:
	# print(f"✅ collection '{collection_name}' already created!")
	# except Exception as E:
	# print(f"❌ Something when wrong!, {E}")
	# return False


	# async def prettyfy_profile(profile:Dict) -> str:
	# template = "----\n"
	# for k, v in profile.items():
	# template += f"{k}: {v} \n"
	# template += "----"
	# return template


	# async def ingest_one_profile(profile:Profile, qdrant_client:AsyncQdrantClient=qdrant_client, collection_name:str=qdrant_collection_name):
	# try:
	# await check_collection(qdrant_client, collection_name)
	# text = await prettyfy_profile(profile.profile.model_dump())
	# doc_id = profile.profile_id
	# embeddings = await embed_model.aembed_query(text = text)

	# qdrant_client.upload_points(
	# collection_name=collection_name,
	# points=[
	# models.PointStruct(
	# id=doc_id,
	# payload=profile.model_dump(),
	# vector=embeddings,
	# )
	# ]
	# )
	# print(f"✅ Ingest one profile succeeded!")
	# except Exception as E:
	# print(f"❌ Ingest one profile error!, {E}")
	# raise HTTPException(status_code=500, detail=f"❌ Ingest one profile error!, {E}")


	# async def ingest_bulk_profile(profiles:List[Profile], qdrant_client:AsyncQdrantClient=qdrant_client, collection_name:str=qdrant_collection_name):
	# try:
	# await check_collection(qdrant_client, collection_name)
	# points = []
	# for profile in profiles:
	# text = await prettyfy_profile(profile.profile.model_dump())
	# doc_id = profile.profile_id
	# embeddings = await embed_model.aembed_query(text = text)
	# points.append(
	# models.PointStruct(
	# id=doc_id,
	# payload=profile.model_dump(),
	# vector=embeddings,
	# )
	# )

	# qdrant_client.upload_points(
	# collection_name=collection_name,
	# points=points
	# )
	# print(f"✅ Ingest bulk profile succeeded!")
	# except Exception as E:
	# print(f"❌ Ingest bulk profile error!, {E}")
	# raise HTTPException(status_code=500, detail=f"❌ Ingest bulk profile error!, {E}")



	# async def pretty_profiles(profiles:List[Union[Profile, Dict]]) -> pd.DataFrame:
	# try:
	# records = []
	# for profile in profiles:
	# temp = {}
	# # text = await prettyfy_profile(profile.profile.model_dump())
	# # doc_id = profile.profile_id
	# filename = profile.filename

	# # if type(profile.profile) != Dict:
	# # temp = {{"filename":filename}, profile.profile.model_dump()}
	# # else:
	# if type(profile.profile) == dict:
	# temp = {{"filename":filename}, profile.profile}
	# elif type(profile.profile) == OutProfile:
	# temp = {{"filename":filename}, profile.profile.model_dump()}


	# if type(temp["hardskills"]) == list and temp["hardskills"] != []:
	# temp["hardskills"] = ", ".join(temp["hardskills"])
	# else:
	# temp["hardskills"] = "-"

	# if type(temp["softskills"]) == list and temp["softskills"] != []:
	# temp["softskills"] = ", ".join(temp["softskills"])
	# else:
	# temp["softskills"] = "-"

	# if type(temp["certifications"]) == list and temp["certifications"] != []:
	# temp["certifications"] = ", ".join(temp["certifications"])
	# else:
	# temp["certifications"] = "-"

	# if type(temp["business_domain_experiences"]) == list and temp["business_domain_experiences"] != []:
	# temp["business_domain_experiences"] = ", ".join(temp["business_domain_experiences"])
	# else:
	# temp["business_domain_experiences"] = "-"

	# records.append(temp)
	# # embeddings = await embed_model.aembed_query(text = text)
	# print(f"✅ Export profile succeeded!")
	# df = pd.DataFrame(records)
	# return df
	# except Exception as E:
	# print(f"❌ Export profile error!, {E}")
	# error_message = f"Processing pretty profile error: {E}"
	# print(error_message)
	# exc_type, exc_obj, exc_tb = sys.exc_info()
	# fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
	# print(exc_type, fname, exc_tb.tb_lineno)
	# raise HTTPException(status_code=500, detail=f"❌ Export profile error!, {E}")


	# async def helper_prepare_profiles(file_names:List, output_profiles:List[Union[OutProfile, Dict]]):
	# if len(file_names) == len(output_profiles):
	# profiles = []
	# for i in range(len(output_profiles)):
	# one_profile = Profile(
	# filename=file_names[i].split('\\')[-1],
	# profile_id=str(uuid.uuid4()),
	# profile=output_profiles[i]
	# )
	# profiles.append(one_profile)
	# return profiles
	# else:
	# return []


	# asyncio.run(ingest_one_profile(profile))
	# asyncio.run(ingest_one_profile(fake_profile))


	# async def retrieve_profile(input_user: str, qdrant_client:AsyncQdrantClient=qdrant_client, collection_name:str=qdrant_collection_name, limit:int=5):
	# try:
	# embeddings = await embed_model.aembed_query(text = input_user)
	# query_result = await qdrant_client.query_points(
	# collection_name=collection_name,
	# query=embeddings,
	# limit=limit,
	# )
	# return query_result.points
	# except Exception as E:
	# print(f"❌ retrieve_profile error, {E}")
	# return []


	# criteria1 = """latest_university: Institut Teknologi Sepuluh November (ITS)
	# major: Matematika
	# gpa: >3.6
	# hardskill: Certified Business Strategic Business Analyst, analytics
	# business_domain_experience: people analytics"""

	# criteria1 = """universitas: Institut Teknologi Sepuluh November (ITS)"""

	# retrieved_profiles = asyncio.run(retrieve_profile(criteria1, limit=None))
	# len(retrieved_profiles)
	# retrieved_profiles[-1].payload


	# from langchain_community.document_loaders import PyPDFLoader

	# loader = PyPDFLoader(files_path[0])
	# pages = []
	# for page in loader.lazy_load():
	# pages.append(page)

	# len(pages)