"""OCR and contact-detail extraction utilities.

Pipeline implemented by this module:

1. ``process_image`` loads an image and upscales, denoises, sharpens and
   contrast-enhances it with OpenCV / Pillow.
2. ``ocr_with_paddle`` runs PaddleOCR on the image and draws the detected
   text boxes onto it.
3. ``Data_Extractor`` sends the recognised text to the Groq chat-completions
   API and parses the JSON the model returns.
4. ``extract_contact_details`` / ``process_extracted_text`` pull phone
   numbers, e-mail addresses and links out of the text with regular
   expressions.
5. ``process_resume_data`` merges the LLM output with the regex output into
   the final result dictionary.
"""
# libraries
import json
import logging
import os
import re
from datetime import datetime

import cv2
import numpy as np
import requests
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
from paddleocr import PaddleOCR
from PIL import Image, ImageDraw, ImageEnhance

# Configure logging: console only (no FileHandler).
logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.StreamHandler()
    ]
)

# Set the PaddleOCR home directory to a writable location.
_PADDLEOCR_HOME = '/tmp/.paddleocr'
os.environ['PADDLEOCR_HOME'] = _PADDLEOCR_HOME

RESULT_FOLDER = 'static/results/'
JSON_FOLDER = 'static/json/'

# The annotated images are written to RESULT_FOLDER, so it must exist no
# matter whether the PaddleOCR directory is already present.
try:
    os.makedirs(RESULT_FOLDER, exist_ok=True)
except OSError as exc:
    logging.warning("Could not create result folder %s: %s", RESULT_FOLDER, exc)

# Make sure the PaddleOCR home directory exists.
if not os.path.exists(_PADDLEOCR_HOME):
    os.makedirs(_PADDLEOCR_HOME, exist_ok=True)
    logging.info("Created PaddleOCR home directory.")
else:
    logging.info("PaddleOCR home directory exists.")

# Load environment variables from .env file
load_dotenv()

# Authenticate with Hugging Face
HFT = os.getenv('HF_TOKEN')

# Initialize the InferenceClient
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)

# File extensions accepted by load_image.
_SUPPORTED_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.webp', '.tiff')

# Endpoint used by Data_Extractor.
_GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"

# Seconds to wait for the Groq API before giving up.
_GROQ_TIMEOUT = 60

# Keys the LLM is asked to return (see the prompt in Data_Extractor).
_LLM_KEYS = ("Name", "Designation", "Company", "Contact", "Address", "Email", "Link")

# Phone-number patterns. Alternatives are tried left to right and the first
# one that matches at a position wins, so their order is significant.
# Exact duplicates of earlier alternatives have been removed; they could
# never be reached and dropping them does not change what is matched.
_PHONE_REGEX = re.compile(r'''
    (?:
        \+1\s\(\d{3}\)\s\d{3}-\d{4}          |  # USA/Canada Intl +1 (XXX) XXX-XXXX
        \(\d{3}\)\s\d{3}-\d{4}               |  # USA/Canada STD (XXX) XXX-XXXX
        \(\d{3}\)\s\d{3}\s\d{4}              |  # USA/Canada (XXX) XXX XXXX
        \(\d{3}\)\s\d{3}\s\d{3}              |  # USA/Canada (XXX) XXX XXX
        \+1\d{10}                            |  # +1 XXXXXXXXXX
        \d{10}                               |  # XXXXXXXXXX
        \+44\s\d{4}\s\d{6}                   |  # UK Intl +44 XXXX XXXXXX
        \+44\s\d{3}\s\d{3}\s\d{4}            |  # UK Intl +44 XXX XXX XXXX
        0\d{4}\s\d{6}                        |  # UK STD 0XXXX XXXXXX
        0\d{3}\s\d{3}\s\d{4}                 |  # UK / Nigeria STD 0XXX XXX XXXX
        \+44\d{10}                           |  # +44 XXXXXXXXXX
        0\d{10}                              |  # 0XXXXXXXXXX
        \+61\s\d\s\d{4}\s\d{4}               |  # Australia Intl +61 X XXXX XXXX
        0\d\s\d{4}\s\d{4}                    |  # Australia / Japan STD 0X XXXX XXXX
        \+61\d{9}                            |  # +61 XXXXXXXXX
        0\d{9}                               |  # 0XXXXXXXXX
        \+91\s\d{5}-\d{5}                    |  # India Intl +91 XXXXX-XXXXX
        \+91\s\d{4}-\d{6}                    |  # India Intl +91 XXXX-XXXXXX
        \+91\s\d{10}                         |  # India Intl +91 XXXXXXXXXX
        \+91\s\d{3}\s\d{3}\s\d{4}            |  # India Intl +91 XXX XXX XXXX
        \+91\s\d{3}-\d{3}-\d{4}              |  # India Intl +91 XXX-XXX-XXXX
        \+91\s\d{2}\s\d{4}\s\d{4}            |  # India Intl +91 XX XXXX XXXX
        \+91\s\d{2}-\d{4}-\d{4}              |  # India Intl +91 XX-XXXX-XXXX
        \+91\s\d{5}\s\d{5}                   |  # India Intl +91 XXXXX XXXXX
        \d{5}\s\d{5}                         |  # India XXXXX XXXXX
        \d{5}-\d{5}                          |  # India XXXXX-XXXXX
        0\d{2}-\d{7}                         |  # India STD 0XX-XXXXXXX
        \+91\d{10}                           |  # +91 XXXXXXXXXX
        # Generic groupings of a ten-digit number
        \d{6}-\d{4}                          |  # XXXXXX-XXXX
        \d{4}-\d{6}                          |  # XXXX-XXXXXX
        \d{3}\s\d{3}\s\d{4}                  |  # XXX XXX XXXX
        \d{3}-\d{3}-\d{4}                    |  # XXX-XXX-XXXX
        \d{4}\s\d{3}\s\d{3}                  |  # XXXX XXX XXX
        \d{4}-\d{3}-\d{3}                    |  # XXXX-XXX-XXX
        \+49\s\d{4}\s\d{8}                   |  # Germany Intl +49 XXXX XXXXXXXX
        \+49\s\d{3}\s\d{7}                   |  # Germany Intl +49 XXX XXXXXXX
        0\d{3}\s\d{8}                        |  # Germany STD 0XXX XXXXXXXX
        \+49\d{12}                           |  # +49 XXXXXXXXXXXX
        \+49\d{10}                           |  # +49 XXXXXXXXXX
        0\d{11}                              |  # 0XXXXXXXXXXX
        \+86\s\d{3}\s\d{4}\s\d{4}            |  # China Intl +86 XXX XXXX XXXX
        0\d{3}\s\d{4}\s\d{4}                 |  # China STD 0XXX XXXX XXXX
        \+86\d{11}                           |  # +86 XXXXXXXXXXX
        \+81\s\d\s\d{4}\s\d{4}               |  # Japan Intl +81 X XXXX XXXX
        \+81\s\d{2}\s\d{4}\s\d{4}            |  # Japan Intl +81 XX XXXX XXXX
        \+81\d{10}                           |  # +81 XXXXXXXXXX
        \+81\d{9}                            |  # +81 XXXXXXXXX
        \+55\s\d{2}\s\d{5}-\d{4}             |  # Brazil Intl +55 XX XXXXX-XXXX
        \+55\s\d{2}\s\d{4}-\d{4}             |  # Brazil Intl +55 XX XXXX-XXXX
        0\d{2}\s\d{4}\s\d{4}                 |  # Brazil STD 0XX XXXX XXXX
        \+55\d{11}                           |  # +55 XXXXXXXXXXX
        \+55\d{10}                           |  # +55 XXXXXXXXXX
        \+33\s\d\s\d{2}\s\d{2}\s\d{2}\s\d{2} |  # France Intl +33 X XX XX XX XX
        0\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}      |  # France STD 0X XX XX XX XX
        \+33\d{9}                            |  # +33 XXXXXXXXX
        \+7\s\d{3}\s\d{3}-\d{2}-\d{2}        |  # Russia Intl +7 XXX XXX-XX-XX
        8\s\d{3}\s\d{3}-\d{2}-\d{2}          |  # Russia STD 8 XXX XXX-XX-XX
        \+7\d{10}                            |  # +7 XXXXXXXXXX
        8\d{10}                              |  # 8 XXXXXXXXXX
        \+27\s\d{2}\s\d{3}\s\d{4}            |  # South Africa Intl +27 XX XXX XXXX
        0\d{2}\s\d{3}\s\d{4}                 |  # South Africa STD 0XX XXX XXXX
        \+27\d{9}                            |  # +27 XXXXXXXXX
        \+52\s\d{3}\s\d{3}\s\d{4}            |  # Mexico Intl +52 XXX XXX XXXX
        \+52\s\d{2}\s\d{4}\s\d{4}            |  # Mexico Intl +52 XX XXXX XXXX
        01\s\d{3}\s\d{4}                     |  # Mexico STD 01 XXX XXXX
        \+52\d{10}                           |  # +52 XXXXXXXXXX
        01\d{7}                              |  # 01 XXXXXXX
        \+234\s\d{3}\s\d{3}\s\d{4}           |  # Nigeria Intl +234 XXX XXX XXXX
        \+234\d{10}                          |  # +234 XXXXXXXXXX
        \+971\s\d\s\d{3}\s\d{4}              |  # UAE Intl +971 X XXX XXXX
        0\d\s\d{3}\s\d{4}                    |  # UAE / Saudi STD 0X XXX XXXX
        \+971\d{8}                           |  # +971 XXXXXXXX
        0\d{8}                               |  # 0XXXXXXXX
        \+54\s9\s\d{3}\s\d{3}\s\d{4}         |  # Argentina Intl +54 9 XXX XXX XXXX
        \+54\s\d{1}\s\d{4}\s\d{4}            |  # Argentina Intl +54 X XXXX XXXX
        0\d{3}\s\d{4}                        |  # Argentina STD 0XXX XXXX
        \+54\d{10}                           |  # +54 XXXXXXXXXX
        \+54\d{9}                            |  # +54 XXXXXXXXX
        0\d{7}                               |  # 0XXXXXXX
        \+966\s\d\s\d{3}\s\d{4}              |  # Saudi Intl +966 X XXX XXXX
        \+966\d{8}                           |  # +966 XXXXXXXX
        \+1\s\d{3}\s\d{3}\s\d{4}             |  # +1 XXX XXX XXXX
        \+54\s9\s\d{10}                      |  # +54 9 XXXXXXXXXX
        \+\d{3}-\d{3}-\d{4}                     # +XXX-XXX-XXXX
    )
''', re.VERBOSE)

# Email regex
_EMAIL_REGEX = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

# URL regex. The "www." prefix is mandatory so that the domain part of an
# e-mail address is not reported as a link.
_LINK_REGEX = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')


def load_image(image_path):
    """Read an image file into a BGR array with OpenCV.

    Args:
        image_path: Path to a .png, .jpg, .jpeg, .webp or .tiff file.

    Returns:
        The image as returned by ``cv2.imread``.

    Raises:
        ValueError: If the extension is not supported or the file cannot
            be decoded.
    """
    ext = os.path.splitext(image_path)[1].lower()
    if ext not in _SUPPORTED_EXTENSIONS:
        raise ValueError(f"Unsupported image format: {ext}")

    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Failed to load image from {image_path}. The file may be corrupted or unreadable.")
    return image


def upscale_image(image, scale=2):
    """Return ``image`` enlarged by ``scale`` using bicubic interpolation."""
    height, width = image.shape[:2]
    # cv2.resize needs integer dimensions, so round in case scale is a float.
    new_size = (int(round(width * scale)), int(round(height * scale)))
    return cv2.resize(image, new_size, interpolation=cv2.INTER_CUBIC)


def reduce_noise(image):
    """Return a denoised copy of a colour image (non-local means)."""
    return cv2.fastNlMeansDenoisingColored(image, None, 10, 10, 7, 21)


def sharpen_image(image):
    """Return ``image`` convolved with a 3x3 sharpening kernel."""
    kernel = np.array([[0, -1, 0],
                       [-1, 5, -1],
                       [0, -1, 0]])
    return cv2.filter2D(image, -1, kernel)


def enhance_image(image):
    """Return a BGR image with contrast increased by a factor of 1.5."""
    # Pillow works in RGB, OpenCV in BGR, hence the two conversions.
    pil_img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    enhanced = ImageEnhance.Contrast(pil_img).enhance(1.5)
    return cv2.cvtColor(np.array(enhanced), cv2.COLOR_RGB2BGR)


def process_image(image_path, scale=2):
    """Load an image and prepare it for OCR.

    The steps are, in order: load, upscale, denoise, sharpen, enhance
    contrast.

    Args:
        image_path: Path of the image file.
        scale: Upscaling factor passed to ``upscale_image``.

    Returns:
        The processed BGR image.

    Raises:
        ValueError: Propagated from ``load_image``.
    """
    image = load_image(image_path)
    upscaled_image = upscale_image(image, scale)
    denoised_image = reduce_noise(upscaled_image)
    sharpened_image = sharpen_image(denoised_image)
    return enhance_image(sharpened_image)


def ocr_with_paddle(img):
    """Run PaddleOCR on an image and draw the detected text boxes on it.

    Args:
        img: Either a path to an image file or an image array. An array is
            modified in place (the boxes are drawn onto it).

    Returns:
        A tuple ``(final_text, img_with_boxes)``: the recognised text
        fragments, each preceded by a single space, and the image with the
        bounding boxes drawn in green.

    Raises:
        ValueError: If ``img`` is a path that cannot be read.
    """
    ocr = PaddleOCR(
        use_angle_cls=True,
        lang='en',
        enable_mkldnn=False,  # disables the MKL-DNN optimisation that was failing
        use_gpu=False,        # keep False on a CPU-only container
    )

    # Accept a file path as well as an already loaded image.
    if isinstance(img, str):
        image_path = img
        img = cv2.imread(image_path)
        if img is None:
            raise ValueError(f"Failed to load image from {image_path}. The file may be corrupted or unreadable.")

    result = ocr.ocr(img)

    # The first element holds the detections of the (single) image; it is
    # None when nothing was recognised.
    detections = result[0] if result and result[0] else []

    texts = []
    for line in detections:
        # An entry is (box, (text, confidence)), optionally followed by a
        # third value. Anything shorter cannot be interpreted.
        if len(line) < 2:
            continue
        box = line[0]
        text = line[1][0]
        texts.append(text)

        # cv2.polylines requires 32-bit integer points.
        points = np.array(
            [(int(point[0]), int(point[1])) for point in box],
            dtype=np.int32,
        )
        cv2.polylines(img, [points], isClosed=True, color=(0, 255, 0), thickness=2)

    final_text = ''.join(' ' + text for text in texts)
    return final_text, img


def extract_text_from_images(image_paths):
    """OCR every image and save an annotated copy of each.

    Images that raise ``ValueError`` (unsupported or unreadable files) are
    logged and skipped.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        A tuple ``(texts, result_paths)``. ``texts`` maps each input path to
        its recognised text; ``result_paths`` maps each input path (as a
        string) to the path of the annotated image in ``RESULT_FOLDER``.
    """
    all_extracted_texts = {}
    all_extracted_imgs = {}

    for image_path in image_paths:
        try:
            # Enhance the image before OCR, then OCR the enhanced image.
            enhanced_image = process_image(image_path, scale=2)
            result, img_with_boxes = ocr_with_paddle(enhanced_image)
        except ValueError as ve:
            logging.error("Error processing image %s: %s", image_path, ve)
            continue  # Continue to the next image if there's an error

        # A microsecond timestamp gives every saved image a unique name.
        unique_id = datetime.now().strftime("%Y%m%d%H%M%S%f")
        result_image_path = os.path.join(
            RESULT_FOLDER,
            f'result_{unique_id}_{os.path.basename(image_path)}',
        )

        # cv2.imwrite reports failure through its return value.
        if not cv2.imwrite(result_image_path, img_with_boxes):
            logging.warning("Could not write annotated image to %s", result_image_path)

        all_extracted_texts[image_path] = result
        all_extracted_imgs[image_path] = result_image_path

    # Convert to JSON-compatible structure
    all_extracted_imgs_json = {str(k): str(v) for k, v in all_extracted_imgs.items()}
    return all_extracted_texts, all_extracted_imgs_json


def _strip_code_fence(content):
    """Return ``content`` without a surrounding Markdown code fence.

    Handles an optional language tag after the opening fence (for example
    ``json``), which would otherwise make the payload invalid JSON.
    """
    content = (content or "").strip()
    if content.startswith("```"):
        match = re.match(r'```[A-Za-z]*\s*(.*?)\s*(?:```|\Z)', content, re.DOTALL)
        if match:
            content = match.group(1)
    return content


def Data_Extractor(data):
    """Ask the Groq chat-completions API to structure ``data`` as JSON.

    Args:
        data: The text to extract the contact fields from.

    Returns:
        The decoded JSON returned by the model on success. On failure a
        dict with an ``"error"`` key (and, where available, a ``"raw"`` key
        holding the unparsed text) is returned instead of raising.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('GROQ_API_KEY')}"
    }

    prompt = f"""
You are a strict JSON generator.

Extract structured data from the following text.

Return ONLY valid JSON. No explanation. No markdown.

Schema:
{{
  "Name": [],
  "Designation": [],
  "Company": [],
  "Contact": [],
  "Address": [],
  "Email": [],
  "Link": []
}}

Rules:
- Always return all keys
- If nothing found → return empty list []
- Do NOT return "Not found"
- Ensure valid JSON format

Text:
{data}
"""

    payload = {
        "model": "llama-3.3-70b-versatile",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.2,  # lower = more structured output
        "max_tokens": 1024,
        "top_p": 1,
        "stream": False
    }

    # Without a timeout a stalled connection would block forever.
    try:
        response = requests.post(_GROQ_URL, headers=headers, json=payload, timeout=_GROQ_TIMEOUT)
    except requests.RequestException as exc:
        logging.error("Request to Groq API failed: %s", exc)
        return {"error": f"Request to Groq API failed: {exc}"}

    if response.status_code != 200:
        return {"error": response.text}

    # Extract model output
    try:
        content = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as exc:
        logging.error("Unexpected response from Groq API: %s", exc)
        return {"error": f"Unexpected response from Groq API: {exc}", "raw": response.text}

    logging.info("RAW LLM OUTPUT:\n%s", content)

    # Remove markdown if the model wrapped the JSON in a ``` fence.
    content = _strip_code_fence(content)

    try:
        return json.loads(content)
    except json.JSONDecodeError as e:
        logging.error("JSON ERROR: %s", e)
        return {"error": "Invalid JSON from model", "raw": content}


def json_to_llm_str(textJson):
    """Concatenate all values of ``textJson``, each followed by a space."""
    return ''.join(f'{item} ' for item in textJson.values())


def extract_contact_details(text):
    """Find phone numbers, e-mail addresses and links in ``text``.

    Args:
        text: The text to search. ``None`` is treated as empty text.

    Returns:
        A dict with the keys ``"phone_numbers"``, ``"emails"`` and
        ``"links_RE"``, each holding a list of matched strings in order of
        appearance.
    """
    if text is None:
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    phone_numbers = [num for num in _PHONE_REGEX.findall(text) if len(num) >= 5]
    emails = _EMAIL_REGEX.findall(text)
    links_RE = [link for link in _LINK_REGEX.findall(text) if len(link) >= 11]

    # Remove links that contain one of the e-mail addresses.
    links_RE = [link for link in links_RE if not any(email in link for email in emails)]

    return {
        "phone_numbers": phone_numbers,
        "emails": emails,
        "links_RE": links_RE
    }


def process_extracted_text(extracted_text):
    """Collect the contact details of every text in ``extracted_text``.

    Args:
        extracted_text: Mapping of file name to recognised text.

    Returns:
        A dict with the keys ``"phone_numbers"``, ``"emails"`` and
        ``"links_RE"`` holding the combined matches of all texts.
    """
    combined_results = {
        "phone_numbers": [],
        "emails": [],
        "links_RE": []
    }

    for text in (extracted_text or {}).values():
        contact_details = extract_contact_details(text)
        for key, found in combined_results.items():
            found.extend(contact_details[key])

    logging.info("Combined contact details in JSON format: %s", combined_results)
    return combined_results


def remove_duplicates_case_insensitive(data_dict):
    """Drop case-insensitive duplicates from every list in ``data_dict``.

    The first occurrence of each value is kept with its original case.
    Items are compared by ``str(item).lower()`` so non-string items are
    supported. Values that are not lists are left untouched.

    Args:
        data_dict: Dict whose list values are de-duplicated in place.

    Returns:
        The same ``data_dict`` object.
    """
    for key, value_list in data_dict.items():
        if not isinstance(value_list, list):
            continue

        seen = set()
        unique_list = []
        for item in value_list:
            folded = str(item).lower()
            if folded not in seen:
                seen.add(folded)
                unique_list.append(item)  # keep the original item and case

        # Re-assigning an existing key is safe while iterating.
        data_dict[key] = unique_list
    return data_dict


def _phone_key(number):
    """Return the last ten digits of ``number`` for duplicate detection."""
    raw = str(number)
    digits = re.sub(r'\D', '', raw)
    # Fall back to the raw text when it contains no digits at all.
    return digits[-10:] if digits else raw[-10:]


def process_resume_data(LLMdata, cont_data, extracted_text):
    """Merge the LLM output with the regex matches into the final record.

    Neither ``LLMdata`` nor ``cont_data`` is modified.

    Args:
        LLMdata: Dict returned by ``Data_Extractor``. Missing keys, ``None``
            values and scalar values are tolerated; anything that is not a
            dict is treated as empty.
        cont_data: Dict returned by ``process_extracted_text`` (may be
            ``None``).
        extracted_text: The OCR text, stored unchanged in the result.

    Returns:
        A dict with the keys ``name``, ``contact_number``, ``Designation``,
        ``email``, ``Location``, ``Link``, ``Company`` (lists without
        duplicates or "not found" placeholders) and ``extracted_text``.
    """
    if not isinstance(LLMdata, dict):
        LLMdata = {}
    if not isinstance(cont_data, dict):
        cont_data = {}

    # Step 1: normalise the LLM schema so that every key holds a list.
    merged = {}
    for key in _LLM_KEYS:
        value = LLMdata.get(key)
        if value is None:
            merged[key] = []
        elif isinstance(value, list):
            merged[key] = list(value)
        else:
            merged[key] = [value]

    # Step 2: merge e-mails found by the regex (case-insensitive).
    known_emails = {str(e).lower() for e in merged["Email"]}
    for email in cont_data.get("emails") or []:
        folded = str(email).lower()
        if folded not in known_emails:
            merged["Email"].append(email)
            known_emails.add(folded)

    # Step 3: merge links found by the regex (case-insensitive).
    known_links = {str(link).lower() for link in merged["Link"]}
    for link in cont_data.get("links_RE") or []:
        folded = str(link).lower()
        if folded not in known_links:
            merged["Link"].append(link)
            known_links.add(folded)

    # Step 4: merge phone numbers. Numbers are compared by their last ten
    # digits so that differently formatted copies count as duplicates.
    known_numbers = {_phone_key(num) for num in merged["Contact"] if num}
    for num in cont_data.get("phone_numbers") or []:
        norm = _phone_key(num)
        if norm not in known_numbers:
            merged["Contact"].append(num)
            known_numbers.add(norm)

    # Step 5: remove duplicates (case-insensitive).
    merged = remove_duplicates_case_insensitive(merged)

    # Step 6: build the final structure.
    processed_data = {
        "name": merged["Name"],
        "contact_number": merged["Contact"],
        "Designation": merged["Designation"],
        "email": merged["Email"],
        "Location": merged["Address"],
        "Link": merged["Link"],
        "Company": merged["Company"],
        "extracted_text": extracted_text
    }

    # Step 7: drop "Not found" placeholders the model may have produced.
    for key in ["name", "contact_number", "Designation", "email", "Location", "Link", "Company"]:
        processed_data[key] = [
            v for v in processed_data[key]
            if str(v).strip().lower() != "not found"
        ]

    return processed_data