Update utility/utils.py
utility/utils.py  +69 -17
@@ -280,12 +280,51 @@ Rules:
 """


+from paddleocr import PaddleOCR
+
+# Global PaddleOCR instance (lazy initialized)
+_PADDLE_OCR = None
+
+def get_paddle_ocr():
+    global _PADDLE_OCR
+    if _PADDLE_OCR is None:
+        try:
+            _PADDLE_OCR = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
+        except Exception as e:
+            logging.error(f"Failed to initialize PaddleOCR: {e}")
+            return None
+    return _PADDLE_OCR
+
+def call_paddle_ocr(image_bgr):
+    """
+    Backup OCR using local PaddleOCR.
+    Returns: A string of all detected text joined by spaces.
+    """
+    ocr_engine = get_paddle_ocr()
+    if not ocr_engine:
+        return ""
+
+    try:
+        results = ocr_engine.ocr(image_bgr, cls=True)
+        if not results or not results[0]:
+            return ""
+
+        text_blobs = []
+        for line in results[0]:
+            # Each entry is like: [[(x1,y1), ...], (text, confidence)]
+            text_blobs.append(line[1][0])
+
+        return " ".join(text_blobs)
+    except Exception as e:
+        logging.error(f"PaddleOCR error: {e}")
+        return ""
+
 def extract_text_from_images(image_paths):
     """
-    Groq VLM single-pass extraction.
+    Groq VLM single-pass extraction with local PaddleOCR fallback.
     Returns:
-        merged_llm_data: dict with the
-        all_extracted_texts: dict[path] ->
+        merged_llm_data: dict with the normalized schema
+        all_extracted_texts: dict[path] -> Raw text (json from VLM or string from OCR)
         all_extracted_imgs: dict[path] -> processed image path
     """
     merged_llm_data = _empty_schema()
@@ -304,29 +343,42 @@ def extract_text_from_images(image_paths):
             )

             cv2.imwrite(result_image_path, enhanced_image)
-
-            single_data = call_groq_vlm(
-                enhanced_image,
-                build_vlm_prompt()
-            )
-
-            # Merge into combined schema
-            for key in merged_llm_data.keys():
-                merged_llm_data[key].extend(_coerce_list(single_data.get(key)))
-
-            # Keep per-image extracted JSON as text for downstream regex processing
-            all_extracted_texts[image_path] = json.dumps(single_data, ensure_ascii=False)
             all_extracted_imgs[image_path] = result_image_path

-
+            # Attempt Primary: Groq VLM
+            try:
+                single_data = call_groq_vlm(
+                    enhanced_image,
+                    build_vlm_prompt()
+                )
+
+                # Merge into combined schema
+                for key in merged_llm_data.keys():
+                    merged_llm_data[key].extend(_coerce_list(single_data.get(key)))
+
+                # Store VLM output JSON
+                all_extracted_texts[image_path] = json.dumps(single_data, ensure_ascii=False)
+                logging.info(f"Groq VLM success for: {image_path}")
+
+            except Exception as vlm_e:
+                logging.warning(f"Groq VLM failed for {image_path}, trying PaddleOCR: {vlm_e}")
+
+                # Attempt Fallback: PaddleOCR
+                raw_text = call_paddle_ocr(enhanced_image)
+                if raw_text:
+                    all_extracted_texts[image_path] = raw_text
+                    logging.info(f"PaddleOCR success for: {image_path}")
+                else:
+                    logging.error(f"All OCR/VLM failed for: {image_path}")

         except Exception as e:
-            logging.exception(f"
+            logging.exception(f"Fatal error processing image {image_path}: {e}")
             continue

     return merged_llm_data, all_extracted_texts, all_extracted_imgs


+
 def extract_contact_details(text):
     # Regex patterns
     # Phone numbers with at least 5 digits in any segment
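
For reference, a minimal sketch of how the updated extraction path might be exercised end to end. It assumes the module is importable as utility.utils and uses placeholder image paths; whether a given image went through the Groq VLM (a JSON string) or the PaddleOCR fallback (plain joined text), the per-image value in all_extracted_texts remains a string, so it can still be passed to the existing regex-based extract_contact_details.

# Hypothetical driver (not part of this commit); file names below are placeholders.
from utility.utils import extract_text_from_images, extract_contact_details

image_paths = ["scans/card_001.jpg", "scans/card_002.jpg"]  # assumed sample inputs

merged_llm_data, all_extracted_texts, all_extracted_imgs = extract_text_from_images(image_paths)

# Each value is a string either way: JSON dumped from the VLM, or space-joined OCR text.
for path, raw_text in all_extracted_texts.items():
    contacts = extract_contact_details(raw_text)
    print(path, "->", contacts)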