|
|
| """
|
| Simplified Document Text Extraction API
|
| Uses regex patterns instead of ML model for demonstration
|
| """
|
|
|
| import json
|
| import re
|
| from datetime import datetime
|
| from typing import Dict, List, Any, Optional
|
| from pathlib import Path
|
| import sys
|
| import os
|
|
|
|
|
| sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
|
|
| try:
|
| from fastapi import FastAPI, HTTPException, File, UploadFile
|
| from fastapi.responses import HTMLResponse, FileResponse
|
| from fastapi.middleware.cors import CORSMiddleware
|
| from pydantic import BaseModel
|
| import uvicorn
|
| HAS_FASTAPI = True
|
| except ImportError:
|
| print("FastAPI not installed. Install with: pip install fastapi uvicorn python-multipart")
|
| HAS_FASTAPI = False
|
|
|
class SimpleDocumentProcessor:
    """Extract typed entities (names, dates, amounts, ...) from text using regexes.

    ``patterns`` maps each entity type to the regex strings tried for it and is
    read on every call, so entries added after construction are honoured.
    ``confidence_scores`` holds the fixed confidence reported per entity type.
    """

    # Entity types whose patterns use letter case as the signal itself.
    # Matching these with re.IGNORECASE would turn ``[A-Z][a-z]+`` into
    # "any word", so they are matched case-sensitively; their keyword parts
    # opt back in to case-insensitivity with scoped ``(?i:...)`` groups.
    CASE_SENSITIVE_TYPES = frozenset({'NAME'})

    # Entity type -> key used in the structured output.
    FIELD_NAMES = {
        'NAME': 'Name',
        'DATE': 'Date',
        'AMOUNT': 'Amount',
        'INVOICE_NO': 'InvoiceNo',
        'EMAIL': 'Email',
        'PHONE': 'Phone',
        'ADDRESS': 'Address',
    }

    def __init__(self):
        """Set up the default regex patterns and per-type confidence scores."""
        self.patterns = {
            'NAME': [
                r'\b(?i:Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)',
                r'\b([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b',
                r'(?i:Invoice|Bill|Receipt)\s+(?i:sent\s+)?(?i:to\s+|for\s+)?([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)',
            ],
            'DATE': [
                r'\b(\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4})\b',
                r'\b(\d{2,4}[\/\-]\d{1,2}[\/\-]\d{1,2})\b',
                r'\b((?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{2,4})\b',
                r'\b((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{2,4})\b',
            ],
            'AMOUNT': [
                # ``\d+`` (not ``\d{1,3}``) so "1500.00" is not truncated to "150".
                r'\$\s*(\d+(?:,\d{3})*(?:\.\d{2})?)',
                r'(?:Amount|Total|Sum):\s*\$?\s*(\d+(?:,\d{3})*(?:\.\d{2})?)',
                # The lookbehind stops a match from starting inside a digit run.
                r'(?<!\d)(\d+(?:,\d{3})*(?:\.\d{2})?\s*(?:USD|dollars?))',
            ],
            'INVOICE_NO': [
                r'(?:Invoice|Bill|Receipt)(?:\s+No\.?|#|Number):\s*([A-Z]{2,4}[-\s]?\d{3,6})',
                r'(?:INV|BL|REC)[-\s]?(\d{3,6})',
                r'Reference:\s*([A-Z]{2,4}[-\s]?\d{3,6})',
            ],
            'EMAIL': [
                r'\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b',
            ],
            'PHONE': [
                # ``(?<!\w)`` instead of ``\b``: a word boundary never exists
                # between whitespace and "+" or "(", so ``\b`` rejected (or
                # clipped) numbers written with a leading "+" or "(".
                r'(?<!\w)(\+?1[-.\s]?\(?[2-9]\d{2}\)?[-.\s]?\d{3}[-.\s]?\d{4})\b',
                r'(?<!\w)(\([2-9]\d{2}\)\s*[2-9]\d{2}[-.\s]?\d{4})\b',
                r'\b([2-9]\d{2}[-.\s]?[2-9]\d{2}[-.\s]?\d{4})\b',
            ],
            'ADDRESS': [
                r'\b(\d+\s+[A-Z][a-z]+\s+(?:Street|St|Avenue|Ave|Road|Rd|Lane|Ln|Drive|Dr|Boulevard|Blvd|Way))\b',
            ],
        }

        self.confidence_scores = {
            'NAME': 0.80,
            'DATE': 0.85,
            'AMOUNT': 0.85,
            'INVOICE_NO': 0.90,
            'EMAIL': 0.95,
            'PHONE': 0.90,
            'ADDRESS': 0.75,
        }

    def extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """Find every entity in ``text``.

        Args:
            text: The document text to scan.

        Returns:
            A list of dicts with keys ``entity`` (type), ``text`` (captured
            value), ``start``/``end`` (offsets of that captured value, so
            ``text[start:end]`` equals the value) and ``confidence``. A span
            found by several patterns of the same type is reported once.
        """
        entities = []
        seen = set()

        for entity_type, patterns in self.patterns.items():
            flags = 0 if entity_type in self.CASE_SENSITIVE_TYPES else re.IGNORECASE
            for pattern in patterns:
                for match in re.finditer(pattern, text, flags):
                    # Report the captured group when there is one; fall back
                    # to the whole match for patterns without a usable group.
                    has_group = match.re.groups and match.group(1) is not None
                    group = 1 if has_group else 0
                    start, end = match.span(group)

                    key = (entity_type, start, end)
                    if key in seen:
                        continue
                    seen.add(key)

                    entities.append({
                        'entity': entity_type,
                        'text': match.group(group),
                        'start': start,
                        'end': end,
                        'confidence': self.confidence_scores[entity_type],
                    })

        return entities

    def create_structured_data(self, entities: List[Dict]) -> Dict[str, str]:
        """Collapse extracted entities into one value per entity type.

        For each type the entity with the highest confidence wins; ties keep
        the earliest entity in ``entities``.

        Args:
            entities: Entity dicts with at least ``entity``, ``text`` and
                ``confidence`` keys.

        Returns:
            A dict mapping the output field name (see ``FIELD_NAMES``; unknown
            types keep their own name) to the chosen entity text.
        """
        best_by_type = {}
        for entity in entities:
            current = best_by_type.get(entity['entity'])
            # Strict ">" keeps the first entity on ties.
            if current is None or entity['confidence'] > current['confidence']:
                best_by_type[entity['entity']] = entity

        return {
            self.FIELD_NAMES.get(entity_type, entity_type): best['text']
            for entity_type, best in best_by_type.items()
        }

    def process_text(self, text: str) -> Dict[str, Any]:
        """Run extraction on ``text`` and wrap the results in a response payload.

        Args:
            text: The document text to process.

        Returns:
            ``{'status': 'success', 'data': {...}}`` where ``data`` holds the
            original text, the entity list, the structured summary, an
            ISO-format timestamp from ``datetime.now()``, the entity count
            and the sorted list of entity types found.
        """
        entities = self.extract_entities(text)
        structured_data = self.create_structured_data(entities)
        entity_types = sorted({entity['entity'] for entity in entities})

        return {
            'status': 'success',
            'data': {
                'original_text': text,
                'entities': entities,
                'structured_data': structured_data,
                'processing_timestamp': datetime.now().isoformat(),
                'total_entities_found': len(entities),
                'entity_types_found': entity_types,
            },
        }
|
|
|
|
|
# The request model subclasses pydantic's BaseModel, which is only bound when
# the optional FastAPI imports succeeded, so it is defined conditionally.
if HAS_FASTAPI:
    class TextRequest(BaseModel):
        # Raw document text submitted for entity extraction.
        text: str
|
|
|
# Single-page demo UI served at "/". Kept as a module-level constant so that
# create_app() stays readable. All server-provided values are HTML-escaped in
# the page's script before being inserted into the DOM.
_INTERFACE_HTML = """
<!DOCTYPE html>
<html>
<head>
    <title>Document Text Extraction Demo</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 1200px;
            margin: 0 auto;
            padding: 20px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: #333;
        }
        .container {
            background: white;
            padding: 30px;
            border-radius: 10px;
            box-shadow: 0 10px 30px rgba(0,0,0,0.2);
        }
        .header {
            text-align: center;
            margin-bottom: 30px;
        }
        .header h1 {
            color: #2c3e50;
            font-size: 2.5em;
            margin-bottom: 10px;
        }
        .header p {
            color: #7f8c8d;
            font-size: 1.2em;
        }
        .tabs {
            display: flex;
            margin-bottom: 20px;
        }
        .tab {
            flex: 1;
            text-align: center;
            padding: 15px;
            background: #ecf0f1;
            border: none;
            cursor: pointer;
            font-size: 16px;
            transition: background 0.3s;
        }
        .tab.active {
            background: #3498db;
            color: white;
        }
        .tab:hover {
            background: #3498db;
            color: white;
        }
        .tab-content {
            display: none;
            padding: 20px;
            border: 1px solid #ddd;
            border-radius: 5px;
        }
        .tab-content.active {
            display: block;
        }
        textarea {
            width: 100%;
            height: 150px;
            margin-bottom: 15px;
            padding: 10px;
            border: 1px solid #ddd;
            border-radius: 5px;
            font-size: 14px;
        }
        input[type="file"] {
            margin-bottom: 15px;
            padding: 10px;
        }
        button {
            background: #27ae60;
            color: white;
            padding: 12px 25px;
            border: none;
            border-radius: 5px;
            cursor: pointer;
            font-size: 16px;
            transition: background 0.3s;
        }
        button:hover {
            background: #2ecc71;
        }
        .results {
            margin-top: 20px;
            padding: 20px;
            background: #f8f9fa;
            border-radius: 5px;
            border-left: 4px solid #27ae60;
        }
        .entity {
            background: #e8f4fd;
            padding: 8px 12px;
            margin: 5px;
            border-radius: 20px;
            display: inline-block;
            font-size: 12px;
            border: 1px solid #3498db;
        }
        .entity.NAME { background: #ffeb3b; border-color: #ff9800; }
        .entity.DATE { background: #4caf50; border-color: #2e7d32; color: white; }
        .entity.AMOUNT { background: #f44336; border-color: #c62828; color: white; }
        .entity.INVOICE_NO { background: #9c27b0; border-color: #6a1b9a; color: white; }
        .entity.EMAIL { background: #00bcd4; border-color: #00838f; color: white; }
        .entity.PHONE { background: #ff5722; border-color: #d84315; color: white; }
        .entity.ADDRESS { background: #795548; border-color: #5d4037; color: white; }
        .structured-data {
            background: #e8f5e8;
            padding: 15px;
            border-radius: 5px;
            margin-top: 15px;
        }
        .examples {
            background: #fff3cd;
            padding: 15px;
            border-radius: 5px;
            margin-top: 20px;
        }
        .example-btn {
            background: #6c757d;
            font-size: 12px;
            padding: 5px 10px;
            margin: 2px;
        }
        pre {
            background: #f8f9fa;
            padding: 15px;
            border-radius: 5px;
            overflow-x: auto;
            font-size: 12px;
            border: 1px solid #dee2e6;
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1> Document Text Extraction</h1>
            <p>Extract structured information from documents using AI patterns</p>
        </div>

        <div class="tabs">
            <button class="tab active" data-tab="text" onclick="showTab('text')">Enter Text</button>
            <button class="tab" data-tab="file" onclick="showTab('file')">Upload File</button>
            <button class="tab" data-tab="api" onclick="showTab('api')">API Docs</button>
        </div>

        <div id="text-tab" class="tab-content active">
            <h3>Enter Text to Extract:</h3>
            <textarea id="textInput" placeholder="Paste your document text here...">Invoice sent to Robert White on 15/09/2025 Invoice No: INV-1024 Amount: $1,250.00 Phone: (555) 123-4567 Email: robert.white@email.com</textarea>
            <button onclick="extractFromText()">Extract Information</button>

            <div class="examples">
                <h4>Try These Examples:</h4>
                <button class="example-btn" onclick="useExample(0)">Invoice Example</button>
                <button class="example-btn" onclick="useExample(1)">Receipt Example</button>
                <button class="example-btn" onclick="useExample(2)">Business Document</button>
                <button class="example-btn" onclick="useExample(3)">Payment Notice</button>
            </div>
        </div>

        <div id="file-tab" class="tab-content">
            <h3>Upload Document:</h3>
            <input type="file" id="fileInput" accept=".pdf,.docx,.txt,.jpg,.png,.tiff">
            <br>
            <button onclick="extractFromFile()">Upload & Extract</button>
            <p><em>Note: File upload processing is simplified in this demo</em></p>
        </div>

        <div id="api-tab" class="tab-content">
            <h3>API Documentation</h3>
            <h4>Endpoints:</h4>
<pre><strong>POST /extract-from-text</strong>
Content-Type: application/json
{
  "text": "Invoice sent to John Doe on 01/15/2025 Invoice No: INV-1001 Amount: $1,500.00"
}</pre>

<pre><strong>POST /extract-from-file</strong>
Content-Type: multipart/form-data
file: [uploaded file]</pre>

            <h4>Response Format:</h4>
<pre>{
  "status": "success",
  "data": {
    "original_text": "...",
    "entities": [...],
    "structured_data": {...},
    "processing_timestamp": "2025-09-27T...",
    "total_entities_found": 7,
    "entity_types_found": ["NAME", "DATE", "AMOUNT", "INVOICE_NO"]
  }
}</pre>
        </div>

        <div id="results"></div>
    </div>

    <script>
        const examples = [
            "Invoice sent to Robert White on 15/09/2025 Invoice No: INV-1024 Amount: $1,250.00 Phone: (555) 123-4567 Email: robert.white@email.com",
            "Receipt for Michael Brown Invoice: REC-3089 Date: 2025-04-22 Amount: $890.75 Contact: +1-555-987-6543",
            "Ms. Emma Wilson 456 Oak Street Payment due: January 15, 2025 Reference: INV-4567 Total: $1,750.25",
            "Bill for Dr. Sarah Johnson dated March 10, 2025. Invoice Number: BL-2045. Total: $2,300.50 Email: sarah.johnson@email.com"
        ];

        // Escape a value before it is concatenated into innerHTML markup.
        function escapeHtml(value) {
            const replacements = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'};
            return String(value).replace(/[&<>"']/g, ch => replacements[ch]);
        }

        function showTab(tabName) {
            // Hide all panels, then show the selected one
            document.querySelectorAll('.tab-content').forEach(content => {
                content.classList.remove('active');
            });
            document.getElementById(tabName + '-tab').classList.add('active');

            // Highlight the button that belongs to tabName. This works for
            // clicks and for programmatic calls alike, unlike the implicit
            // global event object.
            document.querySelectorAll('.tab').forEach(tab => {
                tab.classList.toggle('active', tab.dataset.tab === tabName);
            });
        }

        function useExample(index) {
            document.getElementById('textInput').value = examples[index];
        }

        async function extractFromText() {
            const text = document.getElementById('textInput').value;
            if (!text.trim()) {
                alert('Please enter some text');
                return;
            }

            try {
                const response = await fetch('/extract-from-text', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json',
                    },
                    body: JSON.stringify({ text: text })
                });

                const result = await response.json();
                displayResults(result);
            } catch (error) {
                alert('Error: ' + error.message);
            }
        }

        async function extractFromFile() {
            const fileInput = document.getElementById('fileInput');
            const file = fileInput.files[0];
            if (!file) {
                alert('Please select a file');
                return;
            }

            // Send the selected file to the upload endpoint as multipart form data
            const formData = new FormData();
            formData.append('file', file);

            try {
                const response = await fetch('/extract-from-file', {
                    method: 'POST',
                    body: formData
                });

                const result = await response.json();
                displayResults(result);
            } catch (error) {
                alert('Error: ' + error.message);
            }
        }

        function displayResults(result) {
            const resultsDiv = document.getElementById('results');

            if (result.status !== 'success') {
                // Server errors arrive as {"detail": ...}; detail may be a string or a list
                let message = result.detail !== undefined ? result.detail : result.message;
                if (message === undefined) {
                    message = 'Unknown error';
                } else if (typeof message !== 'string') {
                    message = JSON.stringify(message);
                }
                resultsDiv.innerHTML = '<div class="results"><h3>Error</h3><p>' + escapeHtml(message) + '</p></div>';
                return;
            }

            const data = result.data;
            let html = '<div class="results">';
            html += '<h3>Extraction Results</h3>';
            html += '<p><strong>Found:</strong> ' + escapeHtml(data.total_entities_found) + ' entities of ' + data.entity_types_found.length + ' types</p>';

            // Show entities
            html += '<h4>Detected Entities:</h4>';
            data.entities.forEach(entity => {
                html += '<span class="entity ' + escapeHtml(entity.entity) + '">' + escapeHtml(entity.entity) + ': ' + escapeHtml(entity.text) + ' (' + Math.round(entity.confidence * 100) + '%)</span> ';
            });

            // Show structured data
            if (Object.keys(data.structured_data).length > 0) {
                html += '<div class="structured-data">';
                html += '<h4>Structured Information:</h4>';
                html += '<ul>';
                for (const [key, value] of Object.entries(data.structured_data)) {
                    html += '<li><strong>' + escapeHtml(key) + ':</strong> ' + escapeHtml(value) + '</li>';
                }
                html += '</ul>';
                html += '</div>';
            }

            // Show processing info
            html += '<p><small>🕒 Processed at: ' + escapeHtml(new Date(data.processing_timestamp).toLocaleString()) + '</small></p>';
            html += '</div>';

            resultsDiv.innerHTML = html;
        }
    </script>
</body>
</html>
"""


def create_app():
    """Create and configure the FastAPI application.

    Registers permissive CORS (any origin, no credentials), the HTML demo
    page at ``/``, the two extraction endpoints and a health check.

    Returns:
        The configured ``FastAPI`` instance.

    Raises:
        ImportError: If the FastAPI dependencies could not be imported.
    """
    if not HAS_FASTAPI:
        raise ImportError("FastAPI dependencies not installed")

    app = FastAPI(
        title="Simple Document Text Extraction API",
        description="Extract structured information from documents using regex patterns",
        version="1.0.0"
    )

    # Wildcard origins must not be combined with credentialed requests: that
    # would let any site issue cookie-bearing calls. The API uses no cookies
    # or auth, so credentials are simply disabled.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=False,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # One processor instance is shared by all request handlers below.
    processor = SimpleDocumentProcessor()

    @app.get("/", response_class=HTMLResponse)
    async def get_interface():
        """Serve the web interface"""
        return _INTERFACE_HTML

    @app.post("/extract-from-text")
    async def extract_from_text(request: TextRequest):
        """Extract entities from text"""
        try:
            return processor.process_text(request.text)
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    @app.post("/extract-from-file")
    async def extract_from_file(file: UploadFile = File(...)):
        """Extract entities from uploaded file.

        ``.txt`` uploads are decoded as UTF-8 and processed; any other file
        type is answered with canned demo text (no real parsing is done).
        """
        # The filename is optional in a multipart upload; treat it as empty.
        filename = file.filename or ""

        try:
            content = await file.read()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

        if filename.lower().endswith('.txt'):
            try:
                text = content.decode('utf-8')
            except UnicodeDecodeError as e:
                # Bad input is the client's problem, not a server failure.
                raise HTTPException(
                    status_code=400,
                    detail="Text files must be UTF-8 encoded",
                ) from e
        else:
            text = "Demo processing for " + filename + ": Invoice sent to John Doe on 01/15/2025 Invoice No: INV-1001 Amount: $1,500.00"

        try:
            return processor.process_text(text)
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    @app.get("/health")
    async def health_check():
        """Health check endpoint"""
        return {"status": "healthy", "timestamp": datetime.now().isoformat()}

    return app
|
|
|
def main(host: str = "0.0.0.0", port: int = 7000):
    """Run the API server with uvicorn.

    Prints install instructions and returns without starting anything when
    the FastAPI dependencies are missing.

    Args:
        host: Interface to bind to. The default listens on all interfaces.
        port: TCP port to listen on; also used in the printed URLs so they
            always match the port actually served.
    """
    if not HAS_FASTAPI:
        print("FastAPI dependencies not installed.")
        print("📦 Install with: pip install fastapi uvicorn python-multipart")
        return

    base_url = f"http://localhost:{port}"
    print("Starting Simple Document Text Extraction API...")
    print(f"Access the web interface at: {base_url}")
    print(f"API documentation at: {base_url}/docs")
    print(f"Health check at: {base_url}/health")
    print("\nServer starting...")

    app = create_app()
    uvicorn.run(app, host=host, port=port, log_level="info")


if __name__ == "__main__":
    main()