WebashalarForML committed on
Commit
04ba4bb
·
verified ·
1 Parent(s): ab72f2c

Update utility/utils.py

Browse files
Files changed (1) hide show
  1. utility/utils.py +69 -17
utility/utils.py CHANGED
@@ -280,12 +280,51 @@ Rules:
280
  """
281
 
282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  def extract_text_from_images(image_paths):
284
  """
285
- Groq VLM single-pass extraction.
286
  Returns:
287
- merged_llm_data: dict with the schema above
288
- all_extracted_texts: dict[path] -> JSON string per image
289
  all_extracted_imgs: dict[path] -> processed image path
290
  """
291
  merged_llm_data = _empty_schema()
@@ -304,29 +343,42 @@ def extract_text_from_images(image_paths):
304
  )
305
 
306
  cv2.imwrite(result_image_path, enhanced_image)
307
-
308
- single_data = call_groq_vlm(
309
- enhanced_image,
310
- build_vlm_prompt()
311
- )
312
-
313
- # Merge into combined schema
314
- for key in merged_llm_data.keys():
315
- merged_llm_data[key].extend(_coerce_list(single_data.get(key)))
316
-
317
- # Keep per-image extracted JSON as text for downstream regex processing
318
- all_extracted_texts[image_path] = json.dumps(single_data, ensure_ascii=False)
319
  all_extracted_imgs[image_path] = result_image_path
320
 
321
- logging.info(f"Processed image: {image_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
 
323
  except Exception as e:
324
- logging.exception(f"Error processing image {image_path}: {e}")
325
  continue
326
 
327
  return merged_llm_data, all_extracted_texts, all_extracted_imgs
328
 
329
 
 
330
  def extract_contact_details(text):
331
  # Regex patterns
332
  # Phone numbers with at least 5 digits in any segment
 
280
  """
281
 
282
 
283
+ from paddleocr import PaddleOCR
284
+
285
+ # Global PaddleOCR instance (lazy initialized)
286
+ _PADDLE_OCR = None
287
+
288
def get_paddle_ocr():
    """Return the shared PaddleOCR engine, creating it lazily on first use.

    Returns:
        The module-level PaddleOCR instance, or None if initialization failed
        (failure is logged; a later call will retry initialization).
    """
    global _PADDLE_OCR
    # Fast path: engine already built.
    if _PADDLE_OCR is not None:
        return _PADDLE_OCR
    try:
        _PADDLE_OCR = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
    except Exception as e:
        logging.error(f"Failed to initialize PaddleOCR: {e}")
        return None
    return _PADDLE_OCR
297
+
298
def call_paddle_ocr(image_bgr):
    """Run the local PaddleOCR engine as a backup OCR pass.

    Args:
        image_bgr: Image array (BGR, as produced by cv2) to recognize.

    Returns:
        All detected text fragments joined by single spaces, or "" when the
        engine is unavailable, nothing is detected, or recognition errors out.
    """
    engine = get_paddle_ocr()
    if not engine:
        return ""

    try:
        results = engine.ocr(image_bgr, cls=True)
        # results[0] holds the per-line detections for the (single) input image.
        if not results or not results[0]:
            return ""
        # Each detection is shaped like: [[(x1,y1), ...], (text, confidence)]
        return " ".join(detection[1][0] for detection in results[0])
    except Exception as e:
        logging.error(f"PaddleOCR error: {e}")
        return ""
321
+
322
  def extract_text_from_images(image_paths):
323
  """
324
+ Groq VLM single-pass extraction with local PaddleOCR fallback.
325
  Returns:
326
+ merged_llm_data: dict with the normalized schema
327
+ all_extracted_texts: dict[path] -> Raw text (json from VLM or string from OCR)
328
  all_extracted_imgs: dict[path] -> processed image path
329
  """
330
  merged_llm_data = _empty_schema()
 
343
  )
344
 
345
  cv2.imwrite(result_image_path, enhanced_image)
 
 
 
 
 
 
 
 
 
 
 
 
346
  all_extracted_imgs[image_path] = result_image_path
347
 
348
+ # Attempt Primary: Groq VLM
349
+ try:
350
+ single_data = call_groq_vlm(
351
+ enhanced_image,
352
+ build_vlm_prompt()
353
+ )
354
+
355
+ # Merge into combined schema
356
+ for key in merged_llm_data.keys():
357
+ merged_llm_data[key].extend(_coerce_list(single_data.get(key)))
358
+
359
+ # Store VLM output JSON
360
+ all_extracted_texts[image_path] = json.dumps(single_data, ensure_ascii=False)
361
+ logging.info(f"Groq VLM success for: {image_path}")
362
+
363
+ except Exception as vlm_e:
364
+ logging.warning(f"Groq VLM failed for {image_path}, trying PaddleOCR: {vlm_e}")
365
+
366
+ # Attempt Fallback: PaddleOCR
367
+ raw_text = call_paddle_ocr(enhanced_image)
368
+ if raw_text:
369
+ all_extracted_texts[image_path] = raw_text
370
+ logging.info(f"PaddleOCR success for: {image_path}")
371
+ else:
372
+ logging.error(f"All OCR/VLM failed for: {image_path}")
373
 
374
  except Exception as e:
375
+ logging.exception(f"Fatal error processing image {image_path}: {e}")
376
  continue
377
 
378
  return merged_llm_data, all_extracted_texts, all_extracted_imgs
379
 
380
 
381
+
382
  def extract_contact_details(text):
383
  # Regex patterns
384
  # Phone numbers with at least 5 digits in any segment