| import asyncio |
|
|
| from src.file_handler.handlers import ( |
| convert_docx_to_markdown, |
| convert_excel_bytes_to_llm_format, |
| convert_file_to_string, |
| convert_image_to_pillow, |
| convert_pdf_to_markdown, |
| ) |
|
|
|
|
# File extensions (lowercase, without the dot) that are routed to the image handler.
_IMAGE_EXTENSIONS = frozenset({"jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp"})


async def aparse_file(task_id: str, file_name: str, api_base_url: str) -> str:
    """
    Dispatch a task's file to the converter that matches its extension.

    The extension is the text after the last "." in ``file_name`` and is
    compared case-insensitively, so "report.PDF" and "report.pdf" are handled
    the same way. A name that contains no "." is treated as having no
    extension and goes to the generic converter.

    Args:
        task_id (str): The ID of the task; forwarded unchanged to the converter.
        file_name (str): The name of the file; only used to pick the converter.
        api_base_url (str): The base URL of the API; forwarded unchanged to
            the converter.

    Returns:
        str: Whatever the selected converter returns. For "mp3" files no
        converter is called and ``None`` is returned instead.
        NOTE(review): the ``-> str`` annotation does not cover the ``None``
        case, and the image converter presumably returns a Pillow image
        rather than a string -- confirm against the handlers module.
    """
    # rpartition yields an empty separator when there is no "." at all, which
    # lets us distinguish "pdf" (no extension) from "x.pdf".
    _, dot, suffix = file_name.rpartition(".")
    file_extension = suffix.lower() if dot else ""

    if file_extension == "mp3":
        # Audio files are deliberately not converted.
        return None
    if file_extension == "xlsx":
        return await convert_excel_bytes_to_llm_format(task_id, api_base_url)
    if file_extension == "docx":
        return await convert_docx_to_markdown(task_id, api_base_url)
    if file_extension in _IMAGE_EXTENSIONS:
        return await convert_image_to_pillow(task_id, api_base_url)
    if file_extension == "pdf":
        return await convert_pdf_to_markdown(task_id, api_base_url)

    # Anything else (including names without an extension) is read as text.
    return await convert_file_to_string(task_id, api_base_url)
|
|
|
|
def parse_file(task_id: str, file_name: str, api_base_url: str) -> str:
    """
    Synchronous entry point for ``aparse_file``.

    Builds the ``aparse_file`` coroutine for the given arguments and drives it
    to completion with ``asyncio.run``.

    Args:
        task_id (str): The ID of the task.
        file_name (str): The name of the file.
        api_base_url (str): The base URL of the API.

    Returns:
        str: The value produced by ``aparse_file`` for these arguments.
    """
    # asyncio.run starts its own event loop, so this wrapper is meant for
    # plain synchronous callers.
    pending = aparse_file(task_id, file_name, api_base_url)
    result = asyncio.run(pending)
    return result
|
|