| from web2json.ai_extractor import * |
| from web2json.postprocessor import * |
| from web2json.preprocessor import * |
| from pydantic import BaseModel |
|
|
class Pipeline:
    """Three-stage content pipeline: preprocess -> AI extract -> postprocess."""

    def __init__(self,
                 preprocessor: Preprocessor,
                 ai_extractor: AIExtractor,
                 postprocessor: PostProcessor):
        """Wire together the three pipeline stages.

        Args:
            preprocessor: Stage that cleans/normalizes the raw input
                (and, presumably, fetches it first when it is a URL —
                confirm against the Preprocessor implementation).
            ai_extractor: Stage that turns cleaned content into structured data.
            postprocessor: Stage that finalizes the extractor's output.
        """
        self.preprocessor = preprocessor
        self.ai_extractor = ai_extractor
        self.postprocessor = postprocessor

    def run(self, content: str, is_url: bool, schema: BaseModel) -> dict:
        """
        Run the entire pipeline: preprocess, extract, and postprocess.

        Progress is echoed to stdout after each stage (first 100 chars of
        the intermediate results, then the full final output).

        Args:
            content (str): The raw content to process.
            is_url (bool): Whether the content is a URL or raw text.
            schema (BaseModel): The schema defining the structure of the expected output.

        Returns:
            dict: The final structured data after processing.
        """
        # Visual divider printed between stages; identical to the original output.
        divider = '+' * 80

        cleaned = self.preprocessor.preprocess(content, is_url)
        print(f"Preprocessed content: {cleaned[:100]}...")
        print(divider)

        raw_extraction = self.ai_extractor.extract(cleaned, schema)
        print(f"Extracted data: {raw_extraction[:100]}...")
        print(divider)

        structured = self.postprocessor.process(raw_extraction)
        print(f"Final output: {structured}")
        print(divider)

        return structured
|
|
| |