Spaces:

rajmethun0
/

Data_Extractor_Using_Gemini

Sleeping

Data_Extractor_Using_Gemini / models /data_models.py

methunraj

feat: initialize project structure with core components

cfeb3a6 8 months ago

8.39 kB

	from pydantic import BaseModel, Field
	from typing import List, Dict, Any, Optional
	from datetime import datetime


	class FileInfo(BaseModel):
	    """Information about the file being processed."""

	    # All four fields are required: none of them declares a default.
	    name: str = Field(
	        description="File name",
	    )
	    # NOTE: the attribute is called ``type`` (same spelling as the builtin);
	    # it is part of the model's public interface and therefore kept as-is.
	    type: str = Field(
	        description="File type/extension",
	    )
	    size_mb: float = Field(
	        description="File size in MB",
	    )
	    path: str = Field(
	        description="Full file path",
	    )


	class SimplifiedAgentConfig(BaseModel):
	    """Simplified configuration for agent creation without complex nesting."""

	    # Only ``instructions`` is required; the other two fields have defaults.
	    instructions: str = Field(
	        description="Single string instructions",
	    )
	    requirement_type: str = Field(
	        default="standard",
	        description="Type of requirements",
	    )
	    # default_factory=list gives every instance its own fresh empty list.
	    custom_notes: List[str] = Field(
	        default_factory=list,
	        description="Simple notes",
	    )


	class ProcessingPlan(BaseModel):
	    """Simplified processing plan for document analysis."""

	    # --- Basic plan information -------------------------------------------
	    # Required: document_type, analysis_objective, processing_strategy.
	    # ``complexity`` falls back to "moderate" when omitted.
	    document_type: str = Field(
	        description="Document type (financial, legal, technical, etc.)",
	    )
	    analysis_objective: str = Field(
	        description="Primary analysis objective",
	    )
	    complexity: str = Field(
	        default="moderate",
	        description="Complexity level",
	    )
	    processing_strategy: str = Field(
	        description="Overall processing strategy",
	    )

	    # --- Essential configurations (simplified) ----------------------------
	    # A flat string-to-string mapping; defaults to a new empty dict.
	    agent_configs: Dict[str, str] = Field(
	        default_factory=dict,
	        description="Simple agent configuration summaries",
	    )

	    # --- Simple schema suggestions using basic types ----------------------
	    # ``data_fields`` is required; the two lists after it default to empty.
	    data_fields: List[str] = Field(
	        description="List of suggested data fields to extract",
	    )
	    validation_rules: List[str] = Field(
	        default_factory=list,
	        description="Validation rules",
	    )
	    output_formats: List[str] = Field(
	        default_factory=list,
	        description="Required output formats",
	    )

	    # --- Simple notes and requirements ------------------------------------
	    requirements: List[str] = Field(
	        default_factory=list,
	        description="Processing requirements",
	    )
	    notes: str = Field(
	        default="",
	        description="Additional notes",
	    )


	class AgentConfiguration(BaseModel):
	    """Configuration for a dynamically created agent."""

	    # Required list of instruction strings.
	    instructions: List[str] = Field(
	        description="Specific instructions for this agent",
	    )
	    # Annotated Optional (None is accepted) but the default is the empty
	    # string, not None.
	    custom_prompt_template: Optional[str] = Field(
	        default="",
	        description="Custom prompt template for this agent",
	    )
	    # Defaults to a new empty list per instance.
	    special_requirements: List[str] = Field(
	        default_factory=list,
	        description="Special requirements or constraints",
	    )


	class DataPoint(BaseModel):
	    """Individual data point extracted from document."""

	    # Required identification of the data point; ``value`` is always a string.
	    field_name: str = Field(
	        description="Name of the data field",
	    )
	    value: str = Field(
	        description="Value of the field",
	    )

	    # Optional descriptive metadata. Each of these accepts None but defaults
	    # to the empty string when omitted.
	    data_type: Optional[str] = Field(
	        default="",
	        description="Type of data (text, number, date, etc.)",
	    )
	    category: Optional[str] = Field(
	        default="",
	        description="Category or section this data belongs to",
	    )
	    unit: Optional[str] = Field(
	        default="",
	        description="Unit of measurement if applicable",
	    )
	    period: Optional[str] = Field(
	        default="",
	        description="Time period if applicable",
	    )

	    # Required. The 0-1 range is stated in the description only; no ge/le
	    # bound is declared on the field, so it is not enforced here.
	    confidence_score: float = Field(
	        description="Confidence score for the extraction (0-1)",
	    )
	    source_location: Optional[str] = Field(
	        default="",
	        description="Location in document where data was found",
	    )


	class ExtractedData(BaseModel):
	    """Structured data extracted from the document."""

	    # Required list of nested DataPoint models.
	    data_points: List[DataPoint] = Field(
	        description="List of extracted data points",
	    )
	    extraction_notes: str = Field(
	        default="",
	        description="Notes about the extraction process",
	    )
	    # Required; no numeric bounds are declared on this field.
	    confidence_score: float = Field(
	        description="Overall confidence score for the extraction",
	    )
	    # When omitted, datetime.now is called with no arguments at instance
	    # creation, which yields a naive (timezone-unaware) local timestamp.
	    extraction_timestamp: datetime = Field(
	        default_factory=datetime.now,
	        description="When extraction was performed",
	    )
	    # Accepts None, but the default is the empty string.
	    document_summary: Optional[str] = Field(
	        default="",
	        description="Brief summary of the document content",
	    )


	class DataInsight(BaseModel):
	    """Individual insight from data analysis."""

	    # All fields are required. ``insight_type`` and ``importance_level`` are
	    # free-form strings; the values listed in their descriptions are not
	    # enforced by the type annotations.
	    insight_type: str = Field(
	        description="Type of insight (trend, comparison, etc.)",
	    )
	    description: str = Field(
	        description="Description of the insight",
	    )
	    supporting_data: List[str] = Field(
	        description="Data points that support this insight",
	    )
	    importance_level: str = Field(
	        description="Importance level (high, medium, low)",
	    )


	class DataCategory(BaseModel):
	    """A category of organized data."""

	    # Both fields are required.
	    category_name: str = Field(
	        description="Name of the data category",
	    )
	    # Flat mapping: keys and values are both strings.
	    data_points: Dict[str, str] = Field(
	        description="Key-value pairs of data in this category",
	    )

	class ArrangedData(BaseModel):
	    """Organized and analyzed data."""

	    # Every field is required; the two lists hold nested models
	    # (DataCategory and DataInsight respectively).
	    organized_categories: List[DataCategory] = Field(
	        description="Data organized into logical categories",
	    )
	    insights: List[DataInsight] = Field(
	        description="Insights generated from the data",
	    )
	    summary: str = Field(
	        description="Summary of the arranged data",
	    )
	    arrangement_notes: str = Field(
	        description="Notes about the arrangement process",
	    )


	class CodeGenerationResult(BaseModel):
	    """Result of code generation and execution."""

	    # Required outcome fields.
	    generated_code: str = Field(
	        description="The generated Python code",
	    )
	    execution_result: str = Field(
	        description="Result of code execution",
	    )
	    output_files: List[str] = Field(
	        description="List of output files created",
	    )
	    execution_success: bool = Field(
	        description="Whether code execution was successful",
	    )
	    # The only optional field: defaults to a new empty list.
	    error_messages: List[str] = Field(
	        default_factory=list,
	        description="Any error messages encountered",
	    )


	class DocumentAnalysisResult(BaseModel):
	    """Complete result of document analysis team workflow."""

	    # --- Overview (all required) ------------------------------------------
	    document_type: str = Field(
	        description="Type of document analyzed",
	    )
	    analysis_objective: str = Field(
	        description="Original analysis objective",
	    )
	    processing_summary: str = Field(
	        description="Summary of the entire processing workflow",
	    )

	    # --- Results from each stage ------------------------------------------
	    # Each stage is captured as a plain-string summary, not a nested model.
	    planning_notes: str = Field(
	        description="Notes from the planning stage",
	    )
	    prompts_created: str = Field(
	        description="Summary of prompts and schemas created",
	    )
	    data_extracted: str = Field(
	        description="Summary of data extraction results",
	    )
	    data_arranged: str = Field(
	        description="Summary of data arrangement and insights",
	    )
	    code_generated: str = Field(
	        description="Summary of code generation and execution",
	    )

	    # --- Final outputs ----------------------------------------------------
	    key_findings: List[str] = Field(
	        description="Key findings from the analysis",
	    )
	    output_files_created: List[str] = Field(
	        description="List of output files created",
	    )
	    success: bool = Field(
	        description="Whether the analysis completed successfully",
	    )
	    # The only field with a default: a new empty list per instance.
	    recommendations: List[str] = Field(
	        default_factory=list,
	        description="Recommendations based on analysis",
	    )


	class ExtractionField(BaseModel):
	    """Individual field specification for data extraction."""

	    # The three string fields are required; ``field_type`` is free-form text.
	    field_name: str = Field(
	        description="Name of the field to extract",
	    )
	    field_type: str = Field(
	        description="Type of data (text, number, date, etc.)",
	    )
	    description: str = Field(
	        description="Description of what this field represents",
	    )
	    # A field spec counts as required unless explicitly set to False.
	    required: bool = Field(
	        default=True,
	        description="Whether this field is required",
	    )

	class AgentPrompt(BaseModel):
	    """Prompt configuration for a specific agent."""

	    # Every field is required; apart from the name, each is a list of strings.
	    agent_name: str = Field(
	        description="Name of the agent",
	    )
	    specialized_instructions: List[str] = Field(
	        description="Specialized instructions for this agent",
	    )
	    input_requirements: List[str] = Field(
	        description="What input this agent needs",
	    )
	    output_requirements: List[str] = Field(
	        description="What output this agent should produce",
	    )
	    success_criteria: List[str] = Field(
	        description="Criteria for successful completion",
	    )

	class PromptsAndSchemas(BaseModel):
	    """Prompts and schemas for all agents in the workflow."""

	    # --- Data extraction specific -----------------------------------------
	    # All fields in this model are required (none declares a default).
	    extraction_prompt: str = Field(
	        description="Optimized prompt for data extraction",
	    )
	    # Nested ExtractionField models describing what to pull from the document.
	    extraction_fields: List[ExtractionField] = Field(
	        description="List of fields to extract from the document",
	    )
	    arrangement_rules: List[str] = Field(
	        description="Rules for organizing extracted data",
	    )
	    validation_criteria: List[str] = Field(
	        description="Criteria for validating extracted data",
	    )

	    # --- All agent prompts ------------------------------------------------
	    # Nested AgentPrompt models, followed by two plain string lists.
	    agent_prompts: List[AgentPrompt] = Field(
	        description="Specialized prompts for each agent",
	    )
	    workflow_coordination: List[str] = Field(
	        description="Instructions for coordinating between agents",
	    )
	    quality_assurance: List[str] = Field(
	        description="Quality assurance guidelines for all agents",
	    )