|
|
| """
|
| Setup script for the Document Text Extraction system.
|
| Creates directories, checks dependencies, and initializes the project.
|
| """
|
|
|
| import os
|
| import sys
|
| import subprocess
|
| from pathlib import Path
|
| import importlib.util
|
|
|
|
|
def check_python_version():
    """Report whether the running interpreter is Python 3.8 or newer.

    Prints the detected ``major.minor.micro`` version when it is supported,
    or the requirement plus the full ``sys.version`` string when it is not.

    Returns:
        bool: True when the interpreter is at least 3.8, False otherwise.
    """
    info = sys.version_info
    supported = info >= (3, 8)

    if supported:
        # The first three fields are major, minor and micro.
        print("Python " + ".".join(str(part) for part in info[:3]))
    else:
        print("Python 3.8 or higher is required.")
        print(f"Current version: {sys.version}")

    return supported
|
|
|
|
|
def create_directories():
    """Ensure the project's working directories exist.

    Paths are relative to the current working directory. Missing parents
    are created and directories that already exist are left untouched.
    Each path is printed once it has been ensured.
    """
    print("\n📁 Creating project directories...")

    for relative_path in (
        "data/raw",
        "data/processed",
        "models",
        "results/plots",
        "results/metrics",
        "logs",
    ):
        Path(relative_path).mkdir(parents=True, exist_ok=True)
        print(f" {relative_path}")
|
|
|
|
|
def check_dependencies():
    """Look up each required package and report which ones are missing.

    Uses ``importlib.util.find_spec`` on the importable module name, so the
    packages are located without being imported.

    Returns:
        list[str]: Display names of the packages that could not be found,
        in the order they were checked. Empty when everything is present.
    """
    print("\n📦 Checking dependencies...")

    # Importable module name -> human-readable package name.
    module_to_label = {
        'torch': 'PyTorch',
        'transformers': 'Transformers',
        'PIL': 'Pillow',
        'cv2': 'OpenCV',
        'pandas': 'Pandas',
        'numpy': 'NumPy',
        'sklearn': 'Scikit-learn',
    }

    not_installed = []
    for module_name, label in module_to_label.items():
        if importlib.util.find_spec(module_name) is not None:
            print(f" {label}")
            continue
        not_installed.append(label)
        print(f" {label} not found")

    return not_installed
|
|
|
|
|
def check_ocr_dependencies():
    """Report which OCR back ends can be used.

    Tries to import EasyOCR and PyTesseract. When PyTesseract imports, it
    also asks it for the Tesseract engine version; any failure there is
    reported together with per-platform install hints. Nothing is returned;
    all results are printed.
    """
    print("\nChecking OCR dependencies...")

    try:
        import easyocr  # noqa: F401 - imported only to probe availability
    except ImportError:
        print(" EasyOCR not found")
    else:
        print(" EasyOCR")

    try:
        import pytesseract
    except ImportError:
        print(" PyTesseract not found")
        return

    print(" PyTesseract")

    # The Python wrapper can be present while the engine binary is not.
    try:
        pytesseract.get_tesseract_version()
    except Exception:
        for hint in (
            " Tesseract OCR engine not found or not in PATH",
            " Please install Tesseract OCR:",
            " - Windows: https://github.com/UB-Mannheim/tesseract/wiki",
            " - Ubuntu: sudo apt install tesseract-ocr",
            " - macOS: brew install tesseract",
        ):
            print(hint)
    else:
        print(" Tesseract OCR engine")
|
|
|
|
|
def install_dependencies(requirements_file="requirements.txt"):
    """Install the project's dependencies with pip.

    Runs ``<current interpreter> -m pip install -r <requirements_file>`` and
    captures pip's output so it can be shown if the installation fails.

    Args:
        requirements_file: Path to the pip requirements file. Defaults to
            ``"requirements.txt"`` in the current working directory.

    Returns:
        bool: True when pip exits successfully. False when the requirements
        file does not exist, the interpreter could not be launched, or pip
        exits with a non-zero status.
    """
    print(f"\nInstalling dependencies from {requirements_file}...")

    # Fail early with a clear message instead of an opaque pip error.
    if not Path(requirements_file).is_file():
        print(f" Failed to install dependencies: {requirements_file} not found")
        return False

    command = [
        sys.executable, "-m", "pip", "install", "-r", str(requirements_file)
    ]

    try:
        subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f" Failed to install dependencies: {e}")
        print(f" Output: {e.stdout}")
        print(f" Error: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be started.
        print(f" Failed to install dependencies: {e}")
        return False

    print(" Dependencies installed successfully")
    return True
|
|
|
|
|
def check_gpu_support():
    """Print whether PyTorch can see a CUDA device.

    Reports the number of CUDA devices and the name of device 0 when CUDA
    is available, a CPU fallback notice when it is not, and a separate
    notice when PyTorch cannot be imported. Nothing is returned.
    """
    print("\n🖥️ Checking GPU support...")

    try:
        import torch

        if not torch.cuda.is_available():
            print(" CUDA not available - will use CPU")
            return

        # Query both values before printing so a failure prints nothing partial.
        device_total = torch.cuda.device_count()
        primary_name = torch.cuda.get_device_name(0)
        print(f" CUDA available - {device_total} GPU(s)")
        print(f" Primary GPU: {primary_name}")
    except ImportError:
        print(" PyTorch not installed")
|
|
|
|
|
def create_sample_documents():
    """Write three small invoice-style text files for testing.

    The files are written as UTF-8 to ``data/raw/samples`` (created if
    missing) and named ``sample_document_<n>.txt`` with ``n`` starting at 1.
    Existing files with those names are overwritten. Each file name is
    printed after it is written.
    """
    print("\nCreating sample test documents...")

    target_dir = Path("data/raw/samples")
    target_dir.mkdir(parents=True, exist_ok=True)

    documents = (
        "Invoice sent to John Doe on 01/15/2025\nInvoice No: INV-1001\nAmount: $1,500.00\nPhone: (555) 123-4567",
        "Bill for Dr. Sarah Johnson dated March 10, 2025.\nInvoice Number: BL-2045.\nTotal: $2,300.50\nEmail: sarah@email.com",
        "Receipt for Michael Brown\n456 Oak Street, Boston MA 02101\nInvoice: REC-3089\nDate: 2025-04-22\nAmount: $890.75",
    )

    for number, body in enumerate(documents, start=1):
        destination = target_dir / f"sample_document_{number}.txt"
        destination.write_text(body, encoding='utf-8')
        print(f" {destination.name}")
|
|
|
|
|
def run_initial_test():
    """Smoke-test the project's core modules.

    Imports ``DocumentProcessor`` and ``NERDatasetCreator`` from
    ``src.data_preparation`` and ``ModelConfig`` from ``src.model``, then
    exercises each once: cleans a sample sentence, builds the sample
    dataset, and reads ``num_labels`` from a default config. Progress is
    printed after each step.

    Returns:
        bool: True when every step completes; False as soon as any step
        raises, after printing the error.
    """
    print("\nRunning initial setup test...")

    try:
        from src.data_preparation import DocumentProcessor, NERDatasetCreator
        from src.model import ModelConfig
        print(" Core modules imported successfully")

        doc_processor = DocumentProcessor()
        # Only checks that the call succeeds; the cleaned text is not needed.
        doc_processor.clean_text(
            "Invoice sent to John Doe on 01/15/2025 Amount: $500.00"
        )
        print(" Document processor working")

        samples = NERDatasetCreator(doc_processor).create_sample_dataset()
        print(f" Dataset creator working - {len(samples)} samples")

        label_total = ModelConfig().num_labels
        print(f" Model config created - {label_total} labels")
    except Exception as error:
        # Any failure here means the setup is incomplete, so report it
        # instead of crashing the setup script.
        print(f" Setup test failed: {error}")
        return False

    return True
|
|
|
|
|
def display_next_steps():
    """Print the completion banner followed by usage instructions.

    The output covers the demo, training, web API, tests, documentation
    pointers and a short list of tips. Nothing is returned.
    """
    rule = "=" * 30

    # One entry per printed line; a leading "\n" adds a blank separator line.
    lines = (
        "\n" + rule,
        "SETUP COMPLETED SUCCESSFULLY!",
        rule,
        "\nNext Steps:",
        "1. Quick Demo:",
        " python demo.py",
        "\n2. Train Your Model:",
        " # Add your documents to data/raw/",
        " # Then run:",
        " python src/training_pipeline.py",
        "\n3. 🌐 Start Web API:",
        " python api/app.py",
        " # Then open: http://localhost:8000",
        "\n4. Run Tests:",
        " python tests/test_extraction.py",
        "\n5. 📚 Documentation:",
        " # View README.md for detailed usage",
        " # API docs: http://localhost:8000/docs",
        "\nPro Tips:",
        " - Place your documents in data/raw/ for training",
        " - Use GPU for faster training (if available)",
        " - Adjust batch_size in config if you get memory errors",
        " - Check logs/ directory for debugging information",
    )

    for line in lines:
        print(line)
|
|
|
|
|
def main():
    """Run the interactive setup sequence from start to finish.

    Steps, in order: verify the Python version, create the project
    directories, check the required packages (offering to install missing
    ones with pip), check OCR dependencies, check GPU support, write the
    sample documents, run the smoke test, and print the next steps.

    Returns:
        bool: False when the Python version is unsupported or a requested
        dependency installation fails. True otherwise; a failing smoke
        test only produces a warning and does not change the result.
    """
    print("DOCUMENT TEXT EXTRACTION - SETUP SCRIPT")
    print("=" * 60)

    if not check_python_version():
        return False

    create_directories()

    missing_packages = check_dependencies()
    if missing_packages:
        print(f"\nMissing packages: {', '.join(missing_packages)}")
        try:
            install_deps = input("Install missing dependencies? (y/n): ").lower().strip()
        except EOFError:
            # stdin is closed (CI, piped run): nobody can answer, so treat
            # it as declining instead of crashing with a traceback.
            print()
            install_deps = 'n'

        if install_deps in ('y', 'yes'):
            if not install_dependencies():
                print("Failed to install dependencies. Please install manually:")
                print(" pip install -r requirements.txt")
                return False
        else:
            print("Some features may not work without required dependencies.")

    check_ocr_dependencies()

    check_gpu_support()

    create_sample_documents()

    # A failing smoke test is reported but does not abort the setup.
    if not run_initial_test():
        print("Setup test failed. Some features may not work correctly.")
        print(" Check error messages above and ensure all dependencies are installed.")

    display_next_steps()

    return True
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the setup and exit with status 1 on failure.
    setup_ok = main()

    if not setup_ok:
        print("\nSetup encountered issues. Please check the messages above.")
        sys.exit(1)

    print("\nSetup completed! Ready to extract text from documents!")