zrguo commited on
Commit ·
120e951
1
Parent(s): 60e1dab
fix lint
Browse files- docs/mineru_integration_en.md +2 -2
- docs/mineru_integration_zh.md +2 -2
- examples/mineru_example.py +27 -24
- examples/modalprocessors_example.py +90 -66
- examples/raganything_example.py +63 -38
- lightrag/mineru_parser.py +132 -73
- lightrag/modalprocessors.py +127 -136
- lightrag/raganything.py +224 -170
docs/mineru_integration_en.md
CHANGED
|
@@ -257,7 +257,7 @@ The processors support different types of content:
|
|
| 257 |
- `ImageModalProcessor`: Processes images with captions and footnotes
|
| 258 |
- `TableModalProcessor`: Processes tables with captions and footnotes
|
| 259 |
- `EquationModalProcessor`: Processes mathematical equations in LaTeX format
|
| 260 |
-
- `GenericModalProcessor`: A base processor that can be extended for custom content types
|
| 261 |
|
| 262 |
> **Note**: A complete working example can be found in `examples/modalprocessors_example.py`. You can run it using:
|
| 263 |
> ```bash
|
|
@@ -357,4 +357,4 @@ description, entity_info = await equation_processor.process_multimodal_content(
|
|
| 357 |
)
|
| 358 |
```
|
| 359 |
|
| 360 |
-
</details>
|
|
|
|
| 257 |
- `ImageModalProcessor`: Processes images with captions and footnotes
|
| 258 |
- `TableModalProcessor`: Processes tables with captions and footnotes
|
| 259 |
- `EquationModalProcessor`: Processes mathematical equations in LaTeX format
|
| 260 |
+
- `GenericModalProcessor`: A base processor that can be extended for custom content types
|
| 261 |
|
| 262 |
> **Note**: A complete working example can be found in `examples/modalprocessors_example.py`. You can run it using:
|
| 263 |
> ```bash
|
|
|
|
| 357 |
)
|
| 358 |
```
|
| 359 |
|
| 360 |
+
</details>
|
docs/mineru_integration_zh.md
CHANGED
|
@@ -256,7 +256,7 @@ MinerU 配置文件 `magic-pdf.json` 支持多种自定义选项,包括:
|
|
| 256 |
- `ImageModalProcessor`:处理带有标题和脚注的图像
|
| 257 |
- `TableModalProcessor`:处理带有标题和脚注的表格
|
| 258 |
- `EquationModalProcessor`:处理 LaTeX 格式的数学公式
|
| 259 |
-
- `GenericModalProcessor`:可用于扩展自定义内容类型的基础处理器
|
| 260 |
|
| 261 |
> **注意**:完整的可运行示例可以在 `examples/modalprocessors_example.py` 中找到。您可以使用以下命令运行它:
|
| 262 |
> ```bash
|
|
@@ -355,4 +355,4 @@ description, entity_info = await equation_processor.process_multimodal_content(
|
|
| 355 |
entity_name="质能方程"
|
| 356 |
)
|
| 357 |
```
|
| 358 |
-
</details>
|
|
|
|
| 256 |
- `ImageModalProcessor`:处理带有标题和脚注的图像
|
| 257 |
- `TableModalProcessor`:处理带有标题和脚注的表格
|
| 258 |
- `EquationModalProcessor`:处理 LaTeX 格式的数学公式
|
| 259 |
+
- `GenericModalProcessor`:可用于扩展自定义内容类型的基础处理器
|
| 260 |
|
| 261 |
> **注意**:完整的可运行示例可以在 `examples/modalprocessors_example.py` 中找到。您可以使用以下命令运行它:
|
| 262 |
> ```bash
|
|
|
|
| 355 |
entity_name="质能方程"
|
| 356 |
)
|
| 357 |
```
|
| 358 |
+
</details>
|
examples/mineru_example.py
CHANGED
|
@@ -10,13 +10,15 @@ This example shows how to:
|
|
| 10 |
|
| 11 |
import os
|
| 12 |
import argparse
|
| 13 |
-
from pathlib import Path
|
| 14 |
from lightrag.mineru_parser import MineruParser
|
| 15 |
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
| 17 |
"""
|
| 18 |
Parse a document using MinerU parser
|
| 19 |
-
|
| 20 |
Args:
|
| 21 |
file_path: Path to the document
|
| 22 |
output_dir: Output directory for parsed results
|
|
@@ -26,22 +28,20 @@ def parse_document(file_path: str, output_dir: str = None, method: str = "auto",
|
|
| 26 |
try:
|
| 27 |
# Parse the document
|
| 28 |
content_list, md_content = MineruParser.parse_document(
|
| 29 |
-
file_path=file_path,
|
| 30 |
-
parse_method=method,
|
| 31 |
-
output_dir=output_dir
|
| 32 |
)
|
| 33 |
|
| 34 |
# Display statistics if requested
|
| 35 |
if stats:
|
| 36 |
print("\nDocument Statistics:")
|
| 37 |
print(f"Total content blocks: {len(content_list)}")
|
| 38 |
-
|
| 39 |
# Count different types of content
|
| 40 |
content_types = {}
|
| 41 |
for item in content_list:
|
| 42 |
-
content_type = item.get(
|
| 43 |
content_types[content_type] = content_types.get(content_type, 0) + 1
|
| 44 |
-
|
| 45 |
print("\nContent Type Distribution:")
|
| 46 |
for content_type, count in content_types.items():
|
| 47 |
print(f"- {content_type}: {count}")
|
|
@@ -52,17 +52,22 @@ def parse_document(file_path: str, output_dir: str = None, method: str = "auto",
|
|
| 52 |
print(f"Error parsing document: {str(e)}")
|
| 53 |
return None, None
|
| 54 |
|
|
|
|
| 55 |
def main():
|
| 56 |
"""Main function to run the example"""
|
| 57 |
-
parser = argparse.ArgumentParser(description=
|
| 58 |
-
parser.add_argument(
|
| 59 |
-
parser.add_argument(
|
| 60 |
-
parser.add_argument(
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
args = parser.parse_args()
|
| 68 |
|
|
@@ -72,11 +77,9 @@ def main():
|
|
| 72 |
|
| 73 |
# Parse document
|
| 74 |
content_list, md_content = parse_document(
|
| 75 |
-
args.file_path,
|
| 76 |
-
args.output,
|
| 77 |
-
args.method,
|
| 78 |
-
args.stats
|
| 79 |
)
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
|
|
|
|
|
| 10 |
|
| 11 |
import os
|
| 12 |
import argparse
|
|
|
|
| 13 |
from lightrag.mineru_parser import MineruParser
|
| 14 |
|
| 15 |
+
|
| 16 |
+
def parse_document(
|
| 17 |
+
file_path: str, output_dir: str = None, method: str = "auto", stats: bool = False
|
| 18 |
+
):
|
| 19 |
"""
|
| 20 |
Parse a document using MinerU parser
|
| 21 |
+
|
| 22 |
Args:
|
| 23 |
file_path: Path to the document
|
| 24 |
output_dir: Output directory for parsed results
|
|
|
|
| 28 |
try:
|
| 29 |
# Parse the document
|
| 30 |
content_list, md_content = MineruParser.parse_document(
|
| 31 |
+
file_path=file_path, parse_method=method, output_dir=output_dir
|
|
|
|
|
|
|
| 32 |
)
|
| 33 |
|
| 34 |
# Display statistics if requested
|
| 35 |
if stats:
|
| 36 |
print("\nDocument Statistics:")
|
| 37 |
print(f"Total content blocks: {len(content_list)}")
|
| 38 |
+
|
| 39 |
# Count different types of content
|
| 40 |
content_types = {}
|
| 41 |
for item in content_list:
|
| 42 |
+
content_type = item.get("type", "unknown")
|
| 43 |
content_types[content_type] = content_types.get(content_type, 0) + 1
|
| 44 |
+
|
| 45 |
print("\nContent Type Distribution:")
|
| 46 |
for content_type, count in content_types.items():
|
| 47 |
print(f"- {content_type}: {count}")
|
|
|
|
| 52 |
print(f"Error parsing document: {str(e)}")
|
| 53 |
return None, None
|
| 54 |
|
| 55 |
+
|
| 56 |
def main():
|
| 57 |
"""Main function to run the example"""
|
| 58 |
+
parser = argparse.ArgumentParser(description="MinerU Parser Example")
|
| 59 |
+
parser.add_argument("file_path", help="Path to the document to parse")
|
| 60 |
+
parser.add_argument("--output", "-o", help="Output directory path")
|
| 61 |
+
parser.add_argument(
|
| 62 |
+
"--method",
|
| 63 |
+
"-m",
|
| 64 |
+
choices=["auto", "ocr", "txt"],
|
| 65 |
+
default="auto",
|
| 66 |
+
help="Parsing method (auto, ocr, txt)",
|
| 67 |
+
)
|
| 68 |
+
parser.add_argument(
|
| 69 |
+
"--stats", action="store_true", help="Display content statistics"
|
| 70 |
+
)
|
| 71 |
|
| 72 |
args = parser.parse_args()
|
| 73 |
|
|
|
|
| 77 |
|
| 78 |
# Parse document
|
| 79 |
content_list, md_content = parse_document(
|
| 80 |
+
args.file_path, args.output, args.method, args.stats
|
|
|
|
|
|
|
|
|
|
| 81 |
)
|
| 82 |
|
| 83 |
+
|
| 84 |
+
if __name__ == "__main__":
|
| 85 |
+
main()
|
examples/modalprocessors_example.py
CHANGED
|
@@ -8,94 +8,112 @@ import asyncio
|
|
| 8 |
import argparse
|
| 9 |
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
| 10 |
from lightrag.kg.shared_storage import initialize_pipeline_status
|
| 11 |
-
from pathlib import Path
|
| 12 |
from lightrag import LightRAG
|
| 13 |
from lightrag.modalprocessors import (
|
| 14 |
ImageModalProcessor,
|
| 15 |
TableModalProcessor,
|
| 16 |
EquationModalProcessor,
|
| 17 |
-
GenericModalProcessor
|
| 18 |
)
|
| 19 |
|
| 20 |
WORKING_DIR = "./rag_storage"
|
| 21 |
|
|
|
|
| 22 |
def get_llm_model_func(api_key: str, base_url: str = None):
|
| 23 |
-
return
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
)
|
| 32 |
|
|
|
|
| 33 |
def get_vision_model_func(api_key: str, base_url: str = None):
|
| 34 |
-
return
|
| 35 |
-
|
| 36 |
-
"",
|
| 37 |
system_prompt=None,
|
| 38 |
history_messages=[],
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
{
|
| 44 |
-
"
|
| 45 |
-
"
|
| 46 |
-
"
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
}
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
)
|
| 63 |
|
|
|
|
| 64 |
async def process_image_example(lightrag: LightRAG, vision_model_func):
|
| 65 |
"""Example of processing an image"""
|
| 66 |
# Create image processor
|
| 67 |
image_processor = ImageModalProcessor(
|
| 68 |
-
lightrag=lightrag,
|
| 69 |
-
modal_caption_func=vision_model_func
|
| 70 |
)
|
| 71 |
-
|
| 72 |
# Prepare image content
|
| 73 |
image_content = {
|
| 74 |
"img_path": "image.jpg",
|
| 75 |
"img_caption": ["Example image caption"],
|
| 76 |
-
"img_footnote": ["Example image footnote"]
|
| 77 |
}
|
| 78 |
-
|
| 79 |
# Process image
|
| 80 |
description, entity_info = await image_processor.process_multimodal_content(
|
| 81 |
modal_content=image_content,
|
| 82 |
content_type="image",
|
| 83 |
file_path="image_example.jpg",
|
| 84 |
-
entity_name="Example Image"
|
| 85 |
)
|
| 86 |
-
|
| 87 |
print("Image Processing Results:")
|
| 88 |
print(f"Description: {description}")
|
| 89 |
print(f"Entity Info: {entity_info}")
|
| 90 |
|
|
|
|
| 91 |
async def process_table_example(lightrag: LightRAG, llm_model_func):
|
| 92 |
"""Example of processing a table"""
|
| 93 |
# Create table processor
|
| 94 |
table_processor = TableModalProcessor(
|
| 95 |
-
lightrag=lightrag,
|
| 96 |
-
modal_caption_func=llm_model_func
|
| 97 |
)
|
| 98 |
-
|
| 99 |
# Prepare table content
|
| 100 |
table_content = {
|
| 101 |
"table_body": """
|
|
@@ -105,47 +123,45 @@ async def process_table_example(lightrag: LightRAG, llm_model_func):
|
|
| 105 |
| Mary | 30 | Designer |
|
| 106 |
""",
|
| 107 |
"table_caption": ["Employee Information Table"],
|
| 108 |
-
"table_footnote": ["Data updated as of 2024"]
|
| 109 |
}
|
| 110 |
-
|
| 111 |
# Process table
|
| 112 |
description, entity_info = await table_processor.process_multimodal_content(
|
| 113 |
modal_content=table_content,
|
| 114 |
content_type="table",
|
| 115 |
file_path="table_example.md",
|
| 116 |
-
entity_name="Employee Table"
|
| 117 |
)
|
| 118 |
-
|
| 119 |
print("\nTable Processing Results:")
|
| 120 |
print(f"Description: {description}")
|
| 121 |
print(f"Entity Info: {entity_info}")
|
| 122 |
|
|
|
|
| 123 |
async def process_equation_example(lightrag: LightRAG, llm_model_func):
|
| 124 |
"""Example of processing a mathematical equation"""
|
| 125 |
# Create equation processor
|
| 126 |
equation_processor = EquationModalProcessor(
|
| 127 |
-
lightrag=lightrag,
|
| 128 |
-
modal_caption_func=llm_model_func
|
| 129 |
)
|
| 130 |
-
|
| 131 |
# Prepare equation content
|
| 132 |
-
equation_content = {
|
| 133 |
-
|
| 134 |
-
"text_format": "LaTeX"
|
| 135 |
-
}
|
| 136 |
-
|
| 137 |
# Process equation
|
| 138 |
description, entity_info = await equation_processor.process_multimodal_content(
|
| 139 |
modal_content=equation_content,
|
| 140 |
content_type="equation",
|
| 141 |
file_path="equation_example.txt",
|
| 142 |
-
entity_name="Mass-Energy Equivalence"
|
| 143 |
)
|
| 144 |
-
|
| 145 |
print("\nEquation Processing Results:")
|
| 146 |
print(f"Description: {description}")
|
| 147 |
print(f"Entity Info: {entity_info}")
|
| 148 |
|
|
|
|
| 149 |
async def initialize_rag(api_key: str, base_url: str = None):
|
| 150 |
rag = LightRAG(
|
| 151 |
working_dir=WORKING_DIR,
|
|
@@ -155,7 +171,10 @@ async def initialize_rag(api_key: str, base_url: str = None):
|
|
| 155 |
api_key=api_key,
|
| 156 |
base_url=base_url,
|
| 157 |
),
|
| 158 |
-
llm_model_func=lambda prompt,
|
|
|
|
|
|
|
|
|
|
| 159 |
"gpt-4o-mini",
|
| 160 |
prompt,
|
| 161 |
system_prompt=system_prompt,
|
|
@@ -171,30 +190,35 @@ async def initialize_rag(api_key: str, base_url: str = None):
|
|
| 171 |
|
| 172 |
return rag
|
| 173 |
|
|
|
|
| 174 |
def main():
|
| 175 |
"""Main function to run the example"""
|
| 176 |
-
parser = argparse.ArgumentParser(description=
|
| 177 |
-
parser.add_argument(
|
| 178 |
-
parser.add_argument(
|
| 179 |
-
parser.add_argument(
|
|
|
|
|
|
|
| 180 |
|
| 181 |
args = parser.parse_args()
|
| 182 |
|
| 183 |
# Run examples
|
| 184 |
asyncio.run(main_async(args.api_key, args.base_url))
|
| 185 |
|
|
|
|
| 186 |
async def main_async(api_key: str, base_url: str = None):
|
| 187 |
# Initialize LightRAG
|
| 188 |
lightrag = await initialize_rag(api_key, base_url)
|
| 189 |
-
|
| 190 |
# Get model functions
|
| 191 |
llm_model_func = get_llm_model_func(api_key, base_url)
|
| 192 |
vision_model_func = get_vision_model_func(api_key, base_url)
|
| 193 |
-
|
| 194 |
# Run examples
|
| 195 |
await process_image_example(lightrag, vision_model_func)
|
| 196 |
await process_table_example(lightrag, llm_model_func)
|
| 197 |
await process_equation_example(lightrag, llm_model_func)
|
| 198 |
|
|
|
|
| 199 |
if __name__ == "__main__":
|
| 200 |
-
main()
|
|
|
|
| 8 |
import argparse
|
| 9 |
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
| 10 |
from lightrag.kg.shared_storage import initialize_pipeline_status
|
|
|
|
| 11 |
from lightrag import LightRAG
|
| 12 |
from lightrag.modalprocessors import (
|
| 13 |
ImageModalProcessor,
|
| 14 |
TableModalProcessor,
|
| 15 |
EquationModalProcessor,
|
|
|
|
| 16 |
)
|
| 17 |
|
| 18 |
WORKING_DIR = "./rag_storage"
|
| 19 |
|
| 20 |
+
|
| 21 |
def get_llm_model_func(api_key: str, base_url: str = None):
|
| 22 |
+
return (
|
| 23 |
+
lambda prompt,
|
| 24 |
+
system_prompt=None,
|
| 25 |
+
history_messages=[],
|
| 26 |
+
**kwargs: openai_complete_if_cache(
|
| 27 |
+
"gpt-4o-mini",
|
| 28 |
+
prompt,
|
| 29 |
+
system_prompt=system_prompt,
|
| 30 |
+
history_messages=history_messages,
|
| 31 |
+
api_key=api_key,
|
| 32 |
+
base_url=base_url,
|
| 33 |
+
**kwargs,
|
| 34 |
+
)
|
| 35 |
)
|
| 36 |
|
| 37 |
+
|
| 38 |
def get_vision_model_func(api_key: str, base_url: str = None):
|
| 39 |
+
return (
|
| 40 |
+
lambda prompt,
|
|
|
|
| 41 |
system_prompt=None,
|
| 42 |
history_messages=[],
|
| 43 |
+
image_data=None,
|
| 44 |
+
**kwargs: openai_complete_if_cache(
|
| 45 |
+
"gpt-4o",
|
| 46 |
+
"",
|
| 47 |
+
system_prompt=None,
|
| 48 |
+
history_messages=[],
|
| 49 |
+
messages=[
|
| 50 |
+
{"role": "system", "content": system_prompt} if system_prompt else None,
|
| 51 |
{
|
| 52 |
+
"role": "user",
|
| 53 |
+
"content": [
|
| 54 |
+
{"type": "text", "text": prompt},
|
| 55 |
+
{
|
| 56 |
+
"type": "image_url",
|
| 57 |
+
"image_url": {
|
| 58 |
+
"url": f"data:image/jpeg;base64,{image_data}"
|
| 59 |
+
},
|
| 60 |
+
},
|
| 61 |
+
],
|
| 62 |
}
|
| 63 |
+
if image_data
|
| 64 |
+
else {"role": "user", "content": prompt},
|
| 65 |
+
],
|
| 66 |
+
api_key=api_key,
|
| 67 |
+
base_url=base_url,
|
| 68 |
+
**kwargs,
|
| 69 |
+
)
|
| 70 |
+
if image_data
|
| 71 |
+
else openai_complete_if_cache(
|
| 72 |
+
"gpt-4o-mini",
|
| 73 |
+
prompt,
|
| 74 |
+
system_prompt=system_prompt,
|
| 75 |
+
history_messages=history_messages,
|
| 76 |
+
api_key=api_key,
|
| 77 |
+
base_url=base_url,
|
| 78 |
+
**kwargs,
|
| 79 |
+
)
|
| 80 |
)
|
| 81 |
|
| 82 |
+
|
| 83 |
async def process_image_example(lightrag: LightRAG, vision_model_func):
|
| 84 |
"""Example of processing an image"""
|
| 85 |
# Create image processor
|
| 86 |
image_processor = ImageModalProcessor(
|
| 87 |
+
lightrag=lightrag, modal_caption_func=vision_model_func
|
|
|
|
| 88 |
)
|
| 89 |
+
|
| 90 |
# Prepare image content
|
| 91 |
image_content = {
|
| 92 |
"img_path": "image.jpg",
|
| 93 |
"img_caption": ["Example image caption"],
|
| 94 |
+
"img_footnote": ["Example image footnote"],
|
| 95 |
}
|
| 96 |
+
|
| 97 |
# Process image
|
| 98 |
description, entity_info = await image_processor.process_multimodal_content(
|
| 99 |
modal_content=image_content,
|
| 100 |
content_type="image",
|
| 101 |
file_path="image_example.jpg",
|
| 102 |
+
entity_name="Example Image",
|
| 103 |
)
|
| 104 |
+
|
| 105 |
print("Image Processing Results:")
|
| 106 |
print(f"Description: {description}")
|
| 107 |
print(f"Entity Info: {entity_info}")
|
| 108 |
|
| 109 |
+
|
| 110 |
async def process_table_example(lightrag: LightRAG, llm_model_func):
|
| 111 |
"""Example of processing a table"""
|
| 112 |
# Create table processor
|
| 113 |
table_processor = TableModalProcessor(
|
| 114 |
+
lightrag=lightrag, modal_caption_func=llm_model_func
|
|
|
|
| 115 |
)
|
| 116 |
+
|
| 117 |
# Prepare table content
|
| 118 |
table_content = {
|
| 119 |
"table_body": """
|
|
|
|
| 123 |
| Mary | 30 | Designer |
|
| 124 |
""",
|
| 125 |
"table_caption": ["Employee Information Table"],
|
| 126 |
+
"table_footnote": ["Data updated as of 2024"],
|
| 127 |
}
|
| 128 |
+
|
| 129 |
# Process table
|
| 130 |
description, entity_info = await table_processor.process_multimodal_content(
|
| 131 |
modal_content=table_content,
|
| 132 |
content_type="table",
|
| 133 |
file_path="table_example.md",
|
| 134 |
+
entity_name="Employee Table",
|
| 135 |
)
|
| 136 |
+
|
| 137 |
print("\nTable Processing Results:")
|
| 138 |
print(f"Description: {description}")
|
| 139 |
print(f"Entity Info: {entity_info}")
|
| 140 |
|
| 141 |
+
|
| 142 |
async def process_equation_example(lightrag: LightRAG, llm_model_func):
|
| 143 |
"""Example of processing a mathematical equation"""
|
| 144 |
# Create equation processor
|
| 145 |
equation_processor = EquationModalProcessor(
|
| 146 |
+
lightrag=lightrag, modal_caption_func=llm_model_func
|
|
|
|
| 147 |
)
|
| 148 |
+
|
| 149 |
# Prepare equation content
|
| 150 |
+
equation_content = {"text": "E = mc^2", "text_format": "LaTeX"}
|
| 151 |
+
|
|
|
|
|
|
|
|
|
|
| 152 |
# Process equation
|
| 153 |
description, entity_info = await equation_processor.process_multimodal_content(
|
| 154 |
modal_content=equation_content,
|
| 155 |
content_type="equation",
|
| 156 |
file_path="equation_example.txt",
|
| 157 |
+
entity_name="Mass-Energy Equivalence",
|
| 158 |
)
|
| 159 |
+
|
| 160 |
print("\nEquation Processing Results:")
|
| 161 |
print(f"Description: {description}")
|
| 162 |
print(f"Entity Info: {entity_info}")
|
| 163 |
|
| 164 |
+
|
| 165 |
async def initialize_rag(api_key: str, base_url: str = None):
|
| 166 |
rag = LightRAG(
|
| 167 |
working_dir=WORKING_DIR,
|
|
|
|
| 171 |
api_key=api_key,
|
| 172 |
base_url=base_url,
|
| 173 |
),
|
| 174 |
+
llm_model_func=lambda prompt,
|
| 175 |
+
system_prompt=None,
|
| 176 |
+
history_messages=[],
|
| 177 |
+
**kwargs: openai_complete_if_cache(
|
| 178 |
"gpt-4o-mini",
|
| 179 |
prompt,
|
| 180 |
system_prompt=system_prompt,
|
|
|
|
| 190 |
|
| 191 |
return rag
|
| 192 |
|
| 193 |
+
|
| 194 |
def main():
|
| 195 |
"""Main function to run the example"""
|
| 196 |
+
parser = argparse.ArgumentParser(description="Modal Processors Example")
|
| 197 |
+
parser.add_argument("--api-key", required=True, help="OpenAI API key")
|
| 198 |
+
parser.add_argument("--base-url", help="Optional base URL for API")
|
| 199 |
+
parser.add_argument(
|
| 200 |
+
"--working-dir", "-w", default=WORKING_DIR, help="Working directory path"
|
| 201 |
+
)
|
| 202 |
|
| 203 |
args = parser.parse_args()
|
| 204 |
|
| 205 |
# Run examples
|
| 206 |
asyncio.run(main_async(args.api_key, args.base_url))
|
| 207 |
|
| 208 |
+
|
| 209 |
async def main_async(api_key: str, base_url: str = None):
|
| 210 |
# Initialize LightRAG
|
| 211 |
lightrag = await initialize_rag(api_key, base_url)
|
| 212 |
+
|
| 213 |
# Get model functions
|
| 214 |
llm_model_func = get_llm_model_func(api_key, base_url)
|
| 215 |
vision_model_func = get_vision_model_func(api_key, base_url)
|
| 216 |
+
|
| 217 |
# Run examples
|
| 218 |
await process_image_example(lightrag, vision_model_func)
|
| 219 |
await process_table_example(lightrag, llm_model_func)
|
| 220 |
await process_equation_example(lightrag, llm_model_func)
|
| 221 |
|
| 222 |
+
|
| 223 |
if __name__ == "__main__":
|
| 224 |
+
main()
|
examples/raganything_example.py
CHANGED
|
@@ -11,15 +11,20 @@ This example shows how to:
|
|
| 11 |
import os
|
| 12 |
import argparse
|
| 13 |
import asyncio
|
| 14 |
-
from pathlib import Path
|
| 15 |
-
from lightrag.mineru_parser import MineruParser
|
| 16 |
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
| 17 |
from lightrag.raganything import RAGAnything
|
| 18 |
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
"""
|
| 21 |
Process document with RAGAnything
|
| 22 |
-
|
| 23 |
Args:
|
| 24 |
file_path: Path to the document
|
| 25 |
output_dir: Output directory for RAG results
|
|
@@ -30,7 +35,10 @@ async def process_with_rag(file_path: str, output_dir: str, api_key: str, base_u
|
|
| 30 |
# Initialize RAGAnything
|
| 31 |
rag = RAGAnything(
|
| 32 |
working_dir=working_dir,
|
| 33 |
-
llm_model_func=lambda prompt,
|
|
|
|
|
|
|
|
|
|
| 34 |
"gpt-4o-mini",
|
| 35 |
prompt,
|
| 36 |
system_prompt=system_prompt,
|
|
@@ -39,27 +47,40 @@ async def process_with_rag(file_path: str, output_dir: str, api_key: str, base_u
|
|
| 39 |
base_url=base_url,
|
| 40 |
**kwargs,
|
| 41 |
),
|
| 42 |
-
vision_model_func=lambda prompt,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
"gpt-4o",
|
| 44 |
"",
|
| 45 |
system_prompt=None,
|
| 46 |
history_messages=[],
|
| 47 |
messages=[
|
| 48 |
-
{"role": "system", "content": system_prompt}
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
],
|
| 59 |
api_key=api_key,
|
| 60 |
base_url=base_url,
|
| 61 |
**kwargs,
|
| 62 |
-
)
|
|
|
|
|
|
|
| 63 |
"gpt-4o-mini",
|
| 64 |
prompt,
|
| 65 |
system_prompt=system_prompt,
|
|
@@ -75,21 +96,19 @@ async def process_with_rag(file_path: str, output_dir: str, api_key: str, base_u
|
|
| 75 |
base_url=base_url,
|
| 76 |
),
|
| 77 |
embedding_dim=3072,
|
| 78 |
-
max_token_size=8192
|
| 79 |
)
|
| 80 |
|
| 81 |
# Process document
|
| 82 |
await rag.process_document_complete(
|
| 83 |
-
file_path=file_path,
|
| 84 |
-
output_dir=output_dir,
|
| 85 |
-
parse_method="auto"
|
| 86 |
)
|
| 87 |
|
| 88 |
# Example queries
|
| 89 |
queries = [
|
| 90 |
"What is the main content of the document?",
|
| 91 |
"Describe the images and figures in the document",
|
| 92 |
-
"Tell me about the experimental results and data tables"
|
| 93 |
]
|
| 94 |
|
| 95 |
print("\nQuerying processed document:")
|
|
@@ -101,14 +120,21 @@ async def process_with_rag(file_path: str, output_dir: str, api_key: str, base_u
|
|
| 101 |
except Exception as e:
|
| 102 |
print(f"Error processing with RAG: {str(e)}")
|
| 103 |
|
|
|
|
| 104 |
def main():
|
| 105 |
"""Main function to run the example"""
|
| 106 |
-
parser = argparse.ArgumentParser(description=
|
| 107 |
-
parser.add_argument(
|
| 108 |
-
parser.add_argument(
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
parser.add_argument(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
args = parser.parse_args()
|
| 114 |
|
|
@@ -117,13 +143,12 @@ def main():
|
|
| 117 |
os.makedirs(args.output, exist_ok=True)
|
| 118 |
|
| 119 |
# Process with RAG
|
| 120 |
-
asyncio.run(
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
main()
|
|
|
|
| 11 |
import os
|
| 12 |
import argparse
|
| 13 |
import asyncio
|
|
|
|
|
|
|
| 14 |
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
| 15 |
from lightrag.raganything import RAGAnything
|
| 16 |
|
| 17 |
+
|
| 18 |
+
async def process_with_rag(
|
| 19 |
+
file_path: str,
|
| 20 |
+
output_dir: str,
|
| 21 |
+
api_key: str,
|
| 22 |
+
base_url: str = None,
|
| 23 |
+
working_dir: str = None,
|
| 24 |
+
):
|
| 25 |
"""
|
| 26 |
Process document with RAGAnything
|
| 27 |
+
|
| 28 |
Args:
|
| 29 |
file_path: Path to the document
|
| 30 |
output_dir: Output directory for RAG results
|
|
|
|
| 35 |
# Initialize RAGAnything
|
| 36 |
rag = RAGAnything(
|
| 37 |
working_dir=working_dir,
|
| 38 |
+
llm_model_func=lambda prompt,
|
| 39 |
+
system_prompt=None,
|
| 40 |
+
history_messages=[],
|
| 41 |
+
**kwargs: openai_complete_if_cache(
|
| 42 |
"gpt-4o-mini",
|
| 43 |
prompt,
|
| 44 |
system_prompt=system_prompt,
|
|
|
|
| 47 |
base_url=base_url,
|
| 48 |
**kwargs,
|
| 49 |
),
|
| 50 |
+
vision_model_func=lambda prompt,
|
| 51 |
+
system_prompt=None,
|
| 52 |
+
history_messages=[],
|
| 53 |
+
image_data=None,
|
| 54 |
+
**kwargs: openai_complete_if_cache(
|
| 55 |
"gpt-4o",
|
| 56 |
"",
|
| 57 |
system_prompt=None,
|
| 58 |
history_messages=[],
|
| 59 |
messages=[
|
| 60 |
+
{"role": "system", "content": system_prompt}
|
| 61 |
+
if system_prompt
|
| 62 |
+
else None,
|
| 63 |
+
{
|
| 64 |
+
"role": "user",
|
| 65 |
+
"content": [
|
| 66 |
+
{"type": "text", "text": prompt},
|
| 67 |
+
{
|
| 68 |
+
"type": "image_url",
|
| 69 |
+
"image_url": {
|
| 70 |
+
"url": f"data:image/jpeg;base64,{image_data}"
|
| 71 |
+
},
|
| 72 |
+
},
|
| 73 |
+
],
|
| 74 |
+
}
|
| 75 |
+
if image_data
|
| 76 |
+
else {"role": "user", "content": prompt},
|
| 77 |
],
|
| 78 |
api_key=api_key,
|
| 79 |
base_url=base_url,
|
| 80 |
**kwargs,
|
| 81 |
+
)
|
| 82 |
+
if image_data
|
| 83 |
+
else openai_complete_if_cache(
|
| 84 |
"gpt-4o-mini",
|
| 85 |
prompt,
|
| 86 |
system_prompt=system_prompt,
|
|
|
|
| 96 |
base_url=base_url,
|
| 97 |
),
|
| 98 |
embedding_dim=3072,
|
| 99 |
+
max_token_size=8192,
|
| 100 |
)
|
| 101 |
|
| 102 |
# Process document
|
| 103 |
await rag.process_document_complete(
|
| 104 |
+
file_path=file_path, output_dir=output_dir, parse_method="auto"
|
|
|
|
|
|
|
| 105 |
)
|
| 106 |
|
| 107 |
# Example queries
|
| 108 |
queries = [
|
| 109 |
"What is the main content of the document?",
|
| 110 |
"Describe the images and figures in the document",
|
| 111 |
+
"Tell me about the experimental results and data tables",
|
| 112 |
]
|
| 113 |
|
| 114 |
print("\nQuerying processed document:")
|
|
|
|
| 120 |
except Exception as e:
|
| 121 |
print(f"Error processing with RAG: {str(e)}")
|
| 122 |
|
| 123 |
+
|
| 124 |
def main():
|
| 125 |
"""Main function to run the example"""
|
| 126 |
+
parser = argparse.ArgumentParser(description="MinerU RAG Example")
|
| 127 |
+
parser.add_argument("file_path", help="Path to the document to process")
|
| 128 |
+
parser.add_argument(
|
| 129 |
+
"--working_dir", "-w", default="./rag_storage", help="Working directory path"
|
| 130 |
+
)
|
| 131 |
+
parser.add_argument(
|
| 132 |
+
"--output", "-o", default="./output", help="Output directory path"
|
| 133 |
+
)
|
| 134 |
+
parser.add_argument(
|
| 135 |
+
"--api-key", required=True, help="OpenAI API key for RAG processing"
|
| 136 |
+
)
|
| 137 |
+
parser.add_argument("--base-url", help="Optional base URL for API")
|
| 138 |
|
| 139 |
args = parser.parse_args()
|
| 140 |
|
|
|
|
| 143 |
os.makedirs(args.output, exist_ok=True)
|
| 144 |
|
| 145 |
# Process with RAG
|
| 146 |
+
asyncio.run(
|
| 147 |
+
process_with_rag(
|
| 148 |
+
args.file_path, args.output, args.api_key, args.base_url, args.working_dir
|
| 149 |
+
)
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
if __name__ == "__main__":
|
| 154 |
+
main()
|
|
|
lightrag/mineru_parser.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
|
| 2 |
"""
|
| 3 |
MinerU Document Parser Utility
|
| 4 |
|
|
@@ -14,7 +14,18 @@ import os
|
|
| 14 |
import json
|
| 15 |
import argparse
|
| 16 |
from pathlib import Path
|
| 17 |
-
from typing import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# Type stubs for magic_pdf
|
| 20 |
FileBasedDataWriter = Any
|
|
@@ -28,20 +39,27 @@ read_local_office = Any
|
|
| 28 |
read_local_images = Any
|
| 29 |
|
| 30 |
if TYPE_CHECKING:
|
| 31 |
-
from magic_pdf.data.data_reader_writer import
|
|
|
|
|
|
|
|
|
|
| 32 |
from magic_pdf.data.dataset import PymuDocDataset
|
| 33 |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
| 34 |
from magic_pdf.config.enums import SupportedPdfParseMethod
|
| 35 |
from magic_pdf.data.read_api import read_local_office, read_local_images
|
| 36 |
else:
|
| 37 |
# MinerU imports
|
| 38 |
-
from magic_pdf.data.data_reader_writer import
|
|
|
|
|
|
|
|
|
|
| 39 |
from magic_pdf.data.dataset import PymuDocDataset
|
| 40 |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
| 41 |
from magic_pdf.config.enums import SupportedPdfParseMethod
|
| 42 |
from magic_pdf.data.read_api import read_local_office, read_local_images
|
| 43 |
|
| 44 |
-
T = TypeVar(
|
|
|
|
| 45 |
|
| 46 |
class MineruParser:
|
| 47 |
"""
|
|
@@ -58,7 +76,11 @@ class MineruParser:
|
|
| 58 |
pass
|
| 59 |
|
| 60 |
@staticmethod
|
| 61 |
-
def safe_write(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
"""
|
| 63 |
Safely write content to a file, ensuring the filename is valid
|
| 64 |
|
|
@@ -80,15 +102,22 @@ class MineruParser:
|
|
| 80 |
writer.write(content, filename)
|
| 81 |
except TypeError:
|
| 82 |
# If the writer expects bytes, convert string to bytes
|
| 83 |
-
writer.write(content.encode(
|
| 84 |
else:
|
| 85 |
# For dict/list content, always encode as JSON string first
|
| 86 |
if isinstance(content, (dict, list)):
|
| 87 |
try:
|
| 88 |
-
writer.write(
|
|
|
|
|
|
|
| 89 |
except TypeError:
|
| 90 |
# If the writer expects bytes, convert JSON string to bytes
|
| 91 |
-
writer.write(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
else:
|
| 93 |
# Regular content (assumed to be bytes or compatible)
|
| 94 |
writer.write(content, filename)
|
|
@@ -97,7 +126,7 @@ class MineruParser:
|
|
| 97 |
def parse_pdf(
|
| 98 |
pdf_path: Union[str, Path],
|
| 99 |
output_dir: Optional[str] = None,
|
| 100 |
-
use_ocr: bool = False
|
| 101 |
) -> Tuple[List[Dict[str, Any]], str]:
|
| 102 |
"""
|
| 103 |
Parse PDF document
|
|
@@ -150,9 +179,15 @@ class MineruParser:
|
|
| 150 |
|
| 151 |
# Draw visualizations
|
| 152 |
try:
|
| 153 |
-
infer_result.draw_model(
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
except Exception as e:
|
| 157 |
print(f"Warning: Failed to draw visualizations: {str(e)}")
|
| 158 |
|
|
@@ -162,7 +197,9 @@ class MineruParser:
|
|
| 162 |
|
| 163 |
# Save files using dump methods (consistent with API)
|
| 164 |
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir) # type: ignore
|
| 165 |
-
pipe_result.dump_content_list(
|
|
|
|
|
|
|
| 166 |
pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
|
| 167 |
|
| 168 |
# Save model result - convert JSON string to bytes before writing
|
|
@@ -171,16 +208,24 @@ class MineruParser:
|
|
| 171 |
|
| 172 |
try:
|
| 173 |
# Try to write to a file manually to avoid FileBasedDataWriter issues
|
| 174 |
-
model_file_path = os.path.join(
|
| 175 |
-
|
|
|
|
|
|
|
| 176 |
f.write(json_str)
|
| 177 |
except Exception as e:
|
| 178 |
-
print(
|
|
|
|
|
|
|
| 179 |
try:
|
| 180 |
# If direct file write fails, try using the writer with bytes encoding
|
| 181 |
-
md_writer.write(
|
|
|
|
|
|
|
| 182 |
except Exception as e2:
|
| 183 |
-
print(
|
|
|
|
|
|
|
| 184 |
|
| 185 |
return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
|
| 186 |
|
|
@@ -190,8 +235,7 @@ class MineruParser:
|
|
| 190 |
|
| 191 |
@staticmethod
|
| 192 |
def parse_office_doc(
|
| 193 |
-
doc_path: Union[str, Path],
|
| 194 |
-
output_dir: Optional[str] = None
|
| 195 |
) -> Tuple[List[Dict[str, Any]], str]:
|
| 196 |
"""
|
| 197 |
Parse office document (Word, PPT, etc.)
|
|
@@ -231,9 +275,9 @@ class MineruParser:
|
|
| 231 |
|
| 232 |
# Apply chain of operations according to API documentation
|
| 233 |
# This follows the pattern shown in MS-Office example in the API docs
|
| 234 |
-
ds.apply(doc_analyze, ocr=True)
|
| 235 |
-
|
| 236 |
-
|
| 237 |
|
| 238 |
# Re-execute for getting the content data
|
| 239 |
infer_result = ds.apply(doc_analyze, ocr=True) # type: ignore
|
|
@@ -244,7 +288,9 @@ class MineruParser:
|
|
| 244 |
content_list = pipe_result.get_content_list(image_dir) # type: ignore
|
| 245 |
|
| 246 |
# Save additional output files
|
| 247 |
-
pipe_result.dump_content_list(
|
|
|
|
|
|
|
| 248 |
pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
|
| 249 |
|
| 250 |
# Save model result - convert JSON string to bytes before writing
|
|
@@ -253,16 +299,24 @@ class MineruParser:
|
|
| 253 |
|
| 254 |
try:
|
| 255 |
# Try to write to a file manually to avoid FileBasedDataWriter issues
|
| 256 |
-
model_file_path = os.path.join(
|
| 257 |
-
|
|
|
|
|
|
|
| 258 |
f.write(json_str)
|
| 259 |
except Exception as e:
|
| 260 |
-
print(
|
|
|
|
|
|
|
| 261 |
try:
|
| 262 |
# If direct file write fails, try using the writer with bytes encoding
|
| 263 |
-
md_writer.write(
|
|
|
|
|
|
|
| 264 |
except Exception as e2:
|
| 265 |
-
print(
|
|
|
|
|
|
|
| 266 |
|
| 267 |
return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
|
| 268 |
|
|
@@ -272,8 +326,7 @@ class MineruParser:
|
|
| 272 |
|
| 273 |
@staticmethod
|
| 274 |
def parse_image(
|
| 275 |
-
image_path: Union[str, Path],
|
| 276 |
-
output_dir: Optional[str] = None
|
| 277 |
) -> Tuple[List[Dict[str, Any]], str]:
|
| 278 |
"""
|
| 279 |
Parse image document
|
|
@@ -313,9 +366,9 @@ class MineruParser:
|
|
| 313 |
|
| 314 |
# Apply chain of operations according to API documentation
|
| 315 |
# This follows the pattern shown in Image example in the API docs
|
| 316 |
-
ds.apply(doc_analyze, ocr=True)
|
| 317 |
-
|
| 318 |
-
|
| 319 |
|
| 320 |
# Re-execute for getting the content data
|
| 321 |
infer_result = ds.apply(doc_analyze, ocr=True) # type: ignore
|
|
@@ -326,7 +379,9 @@ class MineruParser:
|
|
| 326 |
content_list = pipe_result.get_content_list(image_dir) # type: ignore
|
| 327 |
|
| 328 |
# Save additional output files
|
| 329 |
-
pipe_result.dump_content_list(
|
|
|
|
|
|
|
| 330 |
pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
|
| 331 |
|
| 332 |
# Save model result - convert JSON string to bytes before writing
|
|
@@ -335,16 +390,24 @@ class MineruParser:
|
|
| 335 |
|
| 336 |
try:
|
| 337 |
# Try to write to a file manually to avoid FileBasedDataWriter issues
|
| 338 |
-
model_file_path = os.path.join(
|
| 339 |
-
|
|
|
|
|
|
|
| 340 |
f.write(json_str)
|
| 341 |
except Exception as e:
|
| 342 |
-
print(
|
|
|
|
|
|
|
| 343 |
try:
|
| 344 |
# If direct file write fails, try using the writer with bytes encoding
|
| 345 |
-
md_writer.write(
|
|
|
|
|
|
|
| 346 |
except Exception as e2:
|
| 347 |
-
print(
|
|
|
|
|
|
|
| 348 |
|
| 349 |
return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
|
| 350 |
|
|
@@ -357,7 +420,7 @@ class MineruParser:
|
|
| 357 |
file_path: Union[str, Path],
|
| 358 |
parse_method: str = "auto",
|
| 359 |
output_dir: Optional[str] = None,
|
| 360 |
-
save_results: bool = True
|
| 361 |
) -> Tuple[List[Dict[str, Any]], str]:
|
| 362 |
"""
|
| 363 |
Parse document using MinerU based on file extension
|
|
@@ -382,64 +445,59 @@ class MineruParser:
|
|
| 382 |
# Choose appropriate parser based on file type
|
| 383 |
if ext in [".pdf"]:
|
| 384 |
return MineruParser.parse_pdf(
|
| 385 |
-
file_path,
|
| 386 |
-
output_dir,
|
| 387 |
-
use_ocr=(parse_method == "ocr")
|
| 388 |
)
|
| 389 |
elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]:
|
| 390 |
-
return MineruParser.parse_image(
|
| 391 |
-
file_path,
|
| 392 |
-
output_dir
|
| 393 |
-
)
|
| 394 |
elif ext in [".doc", ".docx", ".ppt", ".pptx"]:
|
| 395 |
-
return MineruParser.parse_office_doc(
|
| 396 |
-
file_path,
|
| 397 |
-
output_dir
|
| 398 |
-
)
|
| 399 |
else:
|
| 400 |
# For unsupported file types, default to PDF parsing
|
| 401 |
-
print(
|
|
|
|
|
|
|
| 402 |
return MineruParser.parse_pdf(
|
| 403 |
-
file_path,
|
| 404 |
-
output_dir,
|
| 405 |
-
use_ocr=(parse_method == "ocr")
|
| 406 |
)
|
| 407 |
|
|
|
|
| 408 |
def main():
|
| 409 |
"""
|
| 410 |
Main function to run the MinerU parser from command line
|
| 411 |
"""
|
| 412 |
-
parser = argparse.ArgumentParser(description=
|
| 413 |
-
parser.add_argument(
|
| 414 |
-
parser.add_argument(
|
| 415 |
-
parser.add_argument(
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
|
| 422 |
args = parser.parse_args()
|
| 423 |
|
| 424 |
try:
|
| 425 |
# Parse the document
|
| 426 |
content_list, md_content = MineruParser.parse_document(
|
| 427 |
-
file_path=args.file_path,
|
| 428 |
-
parse_method=args.method,
|
| 429 |
-
output_dir=args.output
|
| 430 |
)
|
| 431 |
|
| 432 |
# Display statistics if requested
|
| 433 |
if args.stats:
|
| 434 |
print("\nDocument Statistics:")
|
| 435 |
print(f"Total content blocks: {len(content_list)}")
|
| 436 |
-
|
| 437 |
# Count different types of content
|
| 438 |
content_types = {}
|
| 439 |
for item in content_list:
|
| 440 |
-
content_type = item.get(
|
| 441 |
content_types[content_type] = content_types.get(content_type, 0) + 1
|
| 442 |
-
|
| 443 |
print("\nContent Type Distribution:")
|
| 444 |
for content_type, count in content_types.items():
|
| 445 |
print(f"- {content_type}: {count}")
|
|
@@ -450,5 +508,6 @@ def main():
|
|
| 450 |
|
| 451 |
return 0
|
| 452 |
|
| 453 |
-
|
|
|
|
| 454 |
exit(main())
|
|
|
|
| 1 |
+
# type: ignore
|
| 2 |
"""
|
| 3 |
MinerU Document Parser Utility
|
| 4 |
|
|
|
|
| 14 |
import json
|
| 15 |
import argparse
|
| 16 |
from pathlib import Path
|
| 17 |
+
from typing import (
|
| 18 |
+
Dict,
|
| 19 |
+
List,
|
| 20 |
+
Optional,
|
| 21 |
+
Union,
|
| 22 |
+
Tuple,
|
| 23 |
+
Any,
|
| 24 |
+
TypeVar,
|
| 25 |
+
cast,
|
| 26 |
+
TYPE_CHECKING,
|
| 27 |
+
ClassVar,
|
| 28 |
+
)
|
| 29 |
|
| 30 |
# Type stubs for magic_pdf
|
| 31 |
FileBasedDataWriter = Any
|
|
|
|
| 39 |
read_local_images = Any
|
| 40 |
|
| 41 |
if TYPE_CHECKING:
|
| 42 |
+
from magic_pdf.data.data_reader_writer import (
|
| 43 |
+
FileBasedDataWriter,
|
| 44 |
+
FileBasedDataReader,
|
| 45 |
+
)
|
| 46 |
from magic_pdf.data.dataset import PymuDocDataset
|
| 47 |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
| 48 |
from magic_pdf.config.enums import SupportedPdfParseMethod
|
| 49 |
from magic_pdf.data.read_api import read_local_office, read_local_images
|
| 50 |
else:
|
| 51 |
# MinerU imports
|
| 52 |
+
from magic_pdf.data.data_reader_writer import (
|
| 53 |
+
FileBasedDataWriter,
|
| 54 |
+
FileBasedDataReader,
|
| 55 |
+
)
|
| 56 |
from magic_pdf.data.dataset import PymuDocDataset
|
| 57 |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
| 58 |
from magic_pdf.config.enums import SupportedPdfParseMethod
|
| 59 |
from magic_pdf.data.read_api import read_local_office, read_local_images
|
| 60 |
|
| 61 |
+
T = TypeVar("T")
|
| 62 |
+
|
| 63 |
|
| 64 |
class MineruParser:
|
| 65 |
"""
|
|
|
|
| 76 |
pass
|
| 77 |
|
| 78 |
@staticmethod
|
| 79 |
+
def safe_write(
|
| 80 |
+
writer: Any,
|
| 81 |
+
content: Union[str, bytes, Dict[str, Any], List[Any]],
|
| 82 |
+
filename: str,
|
| 83 |
+
) -> None:
|
| 84 |
"""
|
| 85 |
Safely write content to a file, ensuring the filename is valid
|
| 86 |
|
|
|
|
| 102 |
writer.write(content, filename)
|
| 103 |
except TypeError:
|
| 104 |
# If the writer expects bytes, convert string to bytes
|
| 105 |
+
writer.write(content.encode("utf-8"), filename)
|
| 106 |
else:
|
| 107 |
# For dict/list content, always encode as JSON string first
|
| 108 |
if isinstance(content, (dict, list)):
|
| 109 |
try:
|
| 110 |
+
writer.write(
|
| 111 |
+
json.dumps(content, ensure_ascii=False, indent=4), filename
|
| 112 |
+
)
|
| 113 |
except TypeError:
|
| 114 |
# If the writer expects bytes, convert JSON string to bytes
|
| 115 |
+
writer.write(
|
| 116 |
+
json.dumps(content, ensure_ascii=False, indent=4).encode(
|
| 117 |
+
"utf-8"
|
| 118 |
+
),
|
| 119 |
+
filename,
|
| 120 |
+
)
|
| 121 |
else:
|
| 122 |
# Regular content (assumed to be bytes or compatible)
|
| 123 |
writer.write(content, filename)
|
|
|
|
| 126 |
def parse_pdf(
|
| 127 |
pdf_path: Union[str, Path],
|
| 128 |
output_dir: Optional[str] = None,
|
| 129 |
+
use_ocr: bool = False,
|
| 130 |
) -> Tuple[List[Dict[str, Any]], str]:
|
| 131 |
"""
|
| 132 |
Parse PDF document
|
|
|
|
| 179 |
|
| 180 |
# Draw visualizations
|
| 181 |
try:
|
| 182 |
+
infer_result.draw_model(
|
| 183 |
+
os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")
|
| 184 |
+
) # type: ignore
|
| 185 |
+
pipe_result.draw_layout(
|
| 186 |
+
os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")
|
| 187 |
+
) # type: ignore
|
| 188 |
+
pipe_result.draw_span(
|
| 189 |
+
os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf")
|
| 190 |
+
) # type: ignore
|
| 191 |
except Exception as e:
|
| 192 |
print(f"Warning: Failed to draw visualizations: {str(e)}")
|
| 193 |
|
|
|
|
| 197 |
|
| 198 |
# Save files using dump methods (consistent with API)
|
| 199 |
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir) # type: ignore
|
| 200 |
+
pipe_result.dump_content_list(
|
| 201 |
+
md_writer, f"{name_without_suff}_content_list.json", image_dir
|
| 202 |
+
) # type: ignore
|
| 203 |
pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
|
| 204 |
|
| 205 |
# Save model result - convert JSON string to bytes before writing
|
|
|
|
| 208 |
|
| 209 |
try:
|
| 210 |
# Try to write to a file manually to avoid FileBasedDataWriter issues
|
| 211 |
+
model_file_path = os.path.join(
|
| 212 |
+
local_md_dir, f"{name_without_suff}_model.json"
|
| 213 |
+
)
|
| 214 |
+
with open(model_file_path, "w", encoding="utf-8") as f:
|
| 215 |
f.write(json_str)
|
| 216 |
except Exception as e:
|
| 217 |
+
print(
|
| 218 |
+
f"Warning: Failed to save model result using file write: {str(e)}"
|
| 219 |
+
)
|
| 220 |
try:
|
| 221 |
# If direct file write fails, try using the writer with bytes encoding
|
| 222 |
+
md_writer.write(
|
| 223 |
+
json_str.encode("utf-8"), f"{name_without_suff}_model.json"
|
| 224 |
+
) # type: ignore
|
| 225 |
except Exception as e2:
|
| 226 |
+
print(
|
| 227 |
+
f"Warning: Failed to save model result using writer: {str(e2)}"
|
| 228 |
+
)
|
| 229 |
|
| 230 |
return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
|
| 231 |
|
|
|
|
| 235 |
|
| 236 |
@staticmethod
|
| 237 |
def parse_office_doc(
|
| 238 |
+
doc_path: Union[str, Path], output_dir: Optional[str] = None
|
|
|
|
| 239 |
) -> Tuple[List[Dict[str, Any]], str]:
|
| 240 |
"""
|
| 241 |
Parse office document (Word, PPT, etc.)
|
|
|
|
| 275 |
|
| 276 |
# Apply chain of operations according to API documentation
|
| 277 |
# This follows the pattern shown in MS-Office example in the API docs
|
| 278 |
+
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
|
| 279 |
+
md_writer, f"{name_without_suff}.md", image_dir
|
| 280 |
+
) # type: ignore
|
| 281 |
|
| 282 |
# Re-execute for getting the content data
|
| 283 |
infer_result = ds.apply(doc_analyze, ocr=True) # type: ignore
|
|
|
|
| 288 |
content_list = pipe_result.get_content_list(image_dir) # type: ignore
|
| 289 |
|
| 290 |
# Save additional output files
|
| 291 |
+
pipe_result.dump_content_list(
|
| 292 |
+
md_writer, f"{name_without_suff}_content_list.json", image_dir
|
| 293 |
+
) # type: ignore
|
| 294 |
pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
|
| 295 |
|
| 296 |
# Save model result - convert JSON string to bytes before writing
|
|
|
|
| 299 |
|
| 300 |
try:
|
| 301 |
# Try to write to a file manually to avoid FileBasedDataWriter issues
|
| 302 |
+
model_file_path = os.path.join(
|
| 303 |
+
local_md_dir, f"{name_without_suff}_model.json"
|
| 304 |
+
)
|
| 305 |
+
with open(model_file_path, "w", encoding="utf-8") as f:
|
| 306 |
f.write(json_str)
|
| 307 |
except Exception as e:
|
| 308 |
+
print(
|
| 309 |
+
f"Warning: Failed to save model result using file write: {str(e)}"
|
| 310 |
+
)
|
| 311 |
try:
|
| 312 |
# If direct file write fails, try using the writer with bytes encoding
|
| 313 |
+
md_writer.write(
|
| 314 |
+
json_str.encode("utf-8"), f"{name_without_suff}_model.json"
|
| 315 |
+
) # type: ignore
|
| 316 |
except Exception as e2:
|
| 317 |
+
print(
|
| 318 |
+
f"Warning: Failed to save model result using writer: {str(e2)}"
|
| 319 |
+
)
|
| 320 |
|
| 321 |
return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
|
| 322 |
|
|
|
|
| 326 |
|
| 327 |
@staticmethod
|
| 328 |
def parse_image(
|
| 329 |
+
image_path: Union[str, Path], output_dir: Optional[str] = None
|
|
|
|
| 330 |
) -> Tuple[List[Dict[str, Any]], str]:
|
| 331 |
"""
|
| 332 |
Parse image document
|
|
|
|
| 366 |
|
| 367 |
# Apply chain of operations according to API documentation
|
| 368 |
# This follows the pattern shown in Image example in the API docs
|
| 369 |
+
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
|
| 370 |
+
md_writer, f"{name_without_suff}.md", image_dir
|
| 371 |
+
) # type: ignore
|
| 372 |
|
| 373 |
# Re-execute for getting the content data
|
| 374 |
infer_result = ds.apply(doc_analyze, ocr=True) # type: ignore
|
|
|
|
| 379 |
content_list = pipe_result.get_content_list(image_dir) # type: ignore
|
| 380 |
|
| 381 |
# Save additional output files
|
| 382 |
+
pipe_result.dump_content_list(
|
| 383 |
+
md_writer, f"{name_without_suff}_content_list.json", image_dir
|
| 384 |
+
) # type: ignore
|
| 385 |
pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
|
| 386 |
|
| 387 |
# Save model result - convert JSON string to bytes before writing
|
|
|
|
| 390 |
|
| 391 |
try:
|
| 392 |
# Try to write to a file manually to avoid FileBasedDataWriter issues
|
| 393 |
+
model_file_path = os.path.join(
|
| 394 |
+
local_md_dir, f"{name_without_suff}_model.json"
|
| 395 |
+
)
|
| 396 |
+
with open(model_file_path, "w", encoding="utf-8") as f:
|
| 397 |
f.write(json_str)
|
| 398 |
except Exception as e:
|
| 399 |
+
print(
|
| 400 |
+
f"Warning: Failed to save model result using file write: {str(e)}"
|
| 401 |
+
)
|
| 402 |
try:
|
| 403 |
# If direct file write fails, try using the writer with bytes encoding
|
| 404 |
+
md_writer.write(
|
| 405 |
+
json_str.encode("utf-8"), f"{name_without_suff}_model.json"
|
| 406 |
+
) # type: ignore
|
| 407 |
except Exception as e2:
|
| 408 |
+
print(
|
| 409 |
+
f"Warning: Failed to save model result using writer: {str(e2)}"
|
| 410 |
+
)
|
| 411 |
|
| 412 |
return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
|
| 413 |
|
|
|
|
| 420 |
file_path: Union[str, Path],
|
| 421 |
parse_method: str = "auto",
|
| 422 |
output_dir: Optional[str] = None,
|
| 423 |
+
save_results: bool = True,
|
| 424 |
) -> Tuple[List[Dict[str, Any]], str]:
|
| 425 |
"""
|
| 426 |
Parse document using MinerU based on file extension
|
|
|
|
| 445 |
# Choose appropriate parser based on file type
|
| 446 |
if ext in [".pdf"]:
|
| 447 |
return MineruParser.parse_pdf(
|
| 448 |
+
file_path, output_dir, use_ocr=(parse_method == "ocr")
|
|
|
|
|
|
|
| 449 |
)
|
| 450 |
elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]:
|
| 451 |
+
return MineruParser.parse_image(file_path, output_dir)
|
|
|
|
|
|
|
|
|
|
| 452 |
elif ext in [".doc", ".docx", ".ppt", ".pptx"]:
|
| 453 |
+
return MineruParser.parse_office_doc(file_path, output_dir)
|
|
|
|
|
|
|
|
|
|
| 454 |
else:
|
| 455 |
# For unsupported file types, default to PDF parsing
|
| 456 |
+
print(
|
| 457 |
+
f"Warning: Unsupported file extension '{ext}', trying generic PDF parser"
|
| 458 |
+
)
|
| 459 |
return MineruParser.parse_pdf(
|
| 460 |
+
file_path, output_dir, use_ocr=(parse_method == "ocr")
|
|
|
|
|
|
|
| 461 |
)
|
| 462 |
|
| 463 |
+
|
| 464 |
def main():
|
| 465 |
"""
|
| 466 |
Main function to run the MinerU parser from command line
|
| 467 |
"""
|
| 468 |
+
parser = argparse.ArgumentParser(description="Parse documents using MinerU")
|
| 469 |
+
parser.add_argument("file_path", help="Path to the document to parse")
|
| 470 |
+
parser.add_argument("--output", "-o", help="Output directory path")
|
| 471 |
+
parser.add_argument(
|
| 472 |
+
"--method",
|
| 473 |
+
"-m",
|
| 474 |
+
choices=["auto", "ocr", "txt"],
|
| 475 |
+
default="auto",
|
| 476 |
+
help="Parsing method (auto, ocr, txt)",
|
| 477 |
+
)
|
| 478 |
+
parser.add_argument(
|
| 479 |
+
"--stats", action="store_true", help="Display content statistics"
|
| 480 |
+
)
|
| 481 |
|
| 482 |
args = parser.parse_args()
|
| 483 |
|
| 484 |
try:
|
| 485 |
# Parse the document
|
| 486 |
content_list, md_content = MineruParser.parse_document(
|
| 487 |
+
file_path=args.file_path, parse_method=args.method, output_dir=args.output
|
|
|
|
|
|
|
| 488 |
)
|
| 489 |
|
| 490 |
# Display statistics if requested
|
| 491 |
if args.stats:
|
| 492 |
print("\nDocument Statistics:")
|
| 493 |
print(f"Total content blocks: {len(content_list)}")
|
| 494 |
+
|
| 495 |
# Count different types of content
|
| 496 |
content_types = {}
|
| 497 |
for item in content_list:
|
| 498 |
+
content_type = item.get("type", "unknown")
|
| 499 |
content_types[content_type] = content_types.get(content_type, 0) + 1
|
| 500 |
+
|
| 501 |
print("\nContent Type Distribution:")
|
| 502 |
for content_type, count in content_types.items():
|
| 503 |
print(f"- {content_type}: {count}")
|
|
|
|
| 508 |
|
| 509 |
return 0
|
| 510 |
|
| 511 |
+
|
| 512 |
+
if __name__ == "__main__":
|
| 513 |
exit(main())
|
lightrag/modalprocessors.py
CHANGED
|
@@ -31,7 +31,7 @@ class BaseModalProcessor:
|
|
| 31 |
|
| 32 |
def __init__(self, lightrag: LightRAG, modal_caption_func):
|
| 33 |
"""Initialize base processor
|
| 34 |
-
|
| 35 |
Args:
|
| 36 |
lightrag: LightRAG instance
|
| 37 |
modal_caption_func: Function for generating descriptions
|
|
@@ -65,8 +65,8 @@ class BaseModalProcessor:
|
|
| 65 |
raise NotImplementedError("Subclasses must implement this method")
|
| 66 |
|
| 67 |
async def _create_entity_and_chunk(
|
| 68 |
-
|
| 69 |
-
|
| 70 |
"""Create entity and text chunk"""
|
| 71 |
# Create chunk
|
| 72 |
chunk_id = compute_mdhash_id(str(modal_chunk), prefix="chunk-")
|
|
@@ -93,16 +93,16 @@ class BaseModalProcessor:
|
|
| 93 |
"created_at": int(time.time()),
|
| 94 |
}
|
| 95 |
|
| 96 |
-
await self.knowledge_graph_inst.upsert_node(
|
| 97 |
-
|
|
|
|
| 98 |
|
| 99 |
# Insert entity into vector database
|
| 100 |
entity_vdb_data = {
|
| 101 |
compute_mdhash_id(entity_info["entity_name"], prefix="ent-"): {
|
| 102 |
"entity_name": entity_info["entity_name"],
|
| 103 |
"entity_type": entity_info["entity_type"],
|
| 104 |
-
"content":
|
| 105 |
-
f"{entity_info['entity_name']}\n{entity_info['summary']}",
|
| 106 |
"source_id": chunk_id,
|
| 107 |
"file_path": file_path,
|
| 108 |
}
|
|
@@ -110,8 +110,7 @@ class BaseModalProcessor:
|
|
| 110 |
await self.entities_vdb.upsert(entity_vdb_data)
|
| 111 |
|
| 112 |
# Process entity and relationship extraction
|
| 113 |
-
await self._process_chunk_for_extraction(chunk_id,
|
| 114 |
-
entity_info["entity_name"])
|
| 115 |
|
| 116 |
# Ensure all storage updates are complete
|
| 117 |
await self._insert_done()
|
|
@@ -120,11 +119,12 @@ class BaseModalProcessor:
|
|
| 120 |
"entity_name": entity_info["entity_name"],
|
| 121 |
"entity_type": entity_info["entity_type"],
|
| 122 |
"description": entity_info["summary"],
|
| 123 |
-
"chunk_id": chunk_id
|
| 124 |
}
|
| 125 |
|
| 126 |
-
async def _process_chunk_for_extraction(
|
| 127 |
-
|
|
|
|
| 128 |
"""Process chunk for entity and relationship extraction"""
|
| 129 |
chunk_data = await self.text_chunks_db.get_by_id(chunk_id)
|
| 130 |
if not chunk_data:
|
|
@@ -168,37 +168,27 @@ class BaseModalProcessor:
|
|
| 168 |
if entity_name != modal_entity_name: # Skip self-relationship
|
| 169 |
# Create belongs_to relationship
|
| 170 |
relation_data = {
|
| 171 |
-
"description":
|
| 172 |
-
|
| 173 |
-
"
|
| 174 |
-
"
|
| 175 |
-
"
|
| 176 |
-
chunk_id,
|
| 177 |
-
"weight":
|
| 178 |
-
10.0,
|
| 179 |
-
"file_path":
|
| 180 |
-
chunk_data.get("file_path", "manual_creation"),
|
| 181 |
}
|
| 182 |
await self.knowledge_graph_inst.upsert_edge(
|
| 183 |
-
entity_name, modal_entity_name, relation_data
|
|
|
|
| 184 |
|
| 185 |
-
relation_id = compute_mdhash_id(
|
| 186 |
-
|
| 187 |
-
|
| 188 |
relation_vdb_data = {
|
| 189 |
relation_id: {
|
| 190 |
-
"src_id":
|
| 191 |
-
|
| 192 |
-
"
|
| 193 |
-
modal_entity_name,
|
| 194 |
-
"
|
| 195 |
-
|
| 196 |
-
"content":
|
| 197 |
-
f"{relation_data['keywords']}\t{entity_name}\n{modal_entity_name}\n{relation_data['description']}",
|
| 198 |
-
"source_id":
|
| 199 |
-
chunk_id,
|
| 200 |
-
"file_path":
|
| 201 |
-
chunk_data.get("file_path", "manual_creation"),
|
| 202 |
}
|
| 203 |
}
|
| 204 |
await self.relationships_vdb.upsert(relation_vdb_data)
|
|
@@ -215,16 +205,18 @@ class BaseModalProcessor:
|
|
| 215 |
)
|
| 216 |
|
| 217 |
async def _insert_done(self) -> None:
|
| 218 |
-
await asyncio.gather(
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
|
|
|
|
|
|
| 226 |
]
|
| 227 |
-
|
| 228 |
|
| 229 |
|
| 230 |
class ImageModalProcessor(BaseModalProcessor):
|
|
@@ -232,7 +224,7 @@ class ImageModalProcessor(BaseModalProcessor):
|
|
| 232 |
|
| 233 |
def __init__(self, lightrag: LightRAG, modal_caption_func):
|
| 234 |
"""Initialize image processor
|
| 235 |
-
|
| 236 |
Args:
|
| 237 |
lightrag: LightRAG instance
|
| 238 |
modal_caption_func: Function for generating descriptions (supporting image understanding)
|
|
@@ -243,8 +235,7 @@ class ImageModalProcessor(BaseModalProcessor):
|
|
| 243 |
"""Encode image to base64"""
|
| 244 |
try:
|
| 245 |
with open(image_path, "rb") as image_file:
|
| 246 |
-
encoded_string = base64.b64encode(
|
| 247 |
-
image_file.read()).decode('utf-8')
|
| 248 |
return encoded_string
|
| 249 |
except Exception as e:
|
| 250 |
logger.error(f"Failed to encode image {image_path}: {e}")
|
|
@@ -309,13 +300,12 @@ class ImageModalProcessor(BaseModalProcessor):
|
|
| 309 |
response = await self.modal_caption_func(
|
| 310 |
vision_prompt,
|
| 311 |
image_data=image_base64,
|
| 312 |
-
system_prompt=
|
| 313 |
-
"You are an expert image analyst. Provide detailed, accurate descriptions."
|
| 314 |
)
|
| 315 |
else:
|
| 316 |
# Analyze based on existing text information
|
| 317 |
text_prompt = f"""Based on the following image information, provide analysis:
|
| 318 |
-
|
| 319 |
Image Path: {image_path}
|
| 320 |
Captions: {captions}
|
| 321 |
Footnotes: {footnotes}
|
|
@@ -324,13 +314,11 @@ class ImageModalProcessor(BaseModalProcessor):
|
|
| 324 |
|
| 325 |
response = await self.modal_caption_func(
|
| 326 |
text_prompt,
|
| 327 |
-
system_prompt=
|
| 328 |
-
"You are an expert image analyst. Provide detailed analysis based on available information."
|
| 329 |
)
|
| 330 |
|
| 331 |
# Parse response
|
| 332 |
-
enhanced_caption, entity_info = self._parse_response(
|
| 333 |
-
response, entity_name)
|
| 334 |
|
| 335 |
# Build complete image content
|
| 336 |
modal_chunk = f"""
|
|
@@ -341,27 +329,30 @@ class ImageModalProcessor(BaseModalProcessor):
|
|
| 341 |
|
| 342 |
Visual Analysis: {enhanced_caption}"""
|
| 343 |
|
| 344 |
-
return await self._create_entity_and_chunk(
|
| 345 |
-
|
|
|
|
| 346 |
|
| 347 |
except Exception as e:
|
| 348 |
logger.error(f"Error processing image content: {e}")
|
| 349 |
# Fallback processing
|
| 350 |
fallback_entity = {
|
| 351 |
-
"entity_name": entity_name
|
| 352 |
-
|
|
|
|
| 353 |
"entity_type": "image",
|
| 354 |
-
"summary": f"Image content: {str(modal_content)[:100]}"
|
| 355 |
}
|
| 356 |
return str(modal_content), fallback_entity
|
| 357 |
|
| 358 |
-
def _parse_response(
|
| 359 |
-
|
| 360 |
-
|
| 361 |
"""Parse model response"""
|
| 362 |
try:
|
| 363 |
response_data = json.loads(
|
| 364 |
-
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
|
|
|
| 365 |
|
| 366 |
description = response_data.get("detailed_description", "")
|
| 367 |
entity_data = response_data.get("entity_info", {})
|
|
@@ -369,11 +360,14 @@ class ImageModalProcessor(BaseModalProcessor):
|
|
| 369 |
if not description or not entity_data:
|
| 370 |
raise ValueError("Missing required fields in response")
|
| 371 |
|
| 372 |
-
if not all(
|
| 373 |
-
|
|
|
|
| 374 |
raise ValueError("Missing required fields in entity_info")
|
| 375 |
|
| 376 |
-
entity_data["entity_name"] =
|
|
|
|
|
|
|
| 377 |
if entity_name:
|
| 378 |
entity_data["entity_name"] = entity_name
|
| 379 |
|
|
@@ -382,13 +376,11 @@ class ImageModalProcessor(BaseModalProcessor):
|
|
| 382 |
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
| 383 |
logger.error(f"Error parsing image analysis response: {e}")
|
| 384 |
fallback_entity = {
|
| 385 |
-
"entity_name":
|
| 386 |
-
entity_name
|
| 387 |
-
|
| 388 |
-
"entity_type":
|
| 389 |
-
"
|
| 390 |
-
"summary":
|
| 391 |
-
response[:100] + "..." if len(response) > 100 else response
|
| 392 |
}
|
| 393 |
return response, fallback_entity
|
| 394 |
|
|
@@ -447,15 +439,15 @@ class TableModalProcessor(BaseModalProcessor):
|
|
| 447 |
|
| 448 |
response = await self.modal_caption_func(
|
| 449 |
table_prompt,
|
| 450 |
-
system_prompt=
|
| 451 |
-
"You are an expert data analyst. Provide detailed table analysis with specific insights."
|
| 452 |
)
|
| 453 |
|
| 454 |
# Parse response
|
| 455 |
enhanced_caption, entity_info = self._parse_table_response(
|
| 456 |
-
response, entity_name
|
| 457 |
-
|
| 458 |
-
|
|
|
|
| 459 |
|
| 460 |
# Build complete table content
|
| 461 |
modal_chunk = f"""Table Analysis:
|
|
@@ -466,17 +458,16 @@ class TableModalProcessor(BaseModalProcessor):
|
|
| 466 |
|
| 467 |
Analysis: {enhanced_caption}"""
|
| 468 |
|
| 469 |
-
return await self._create_entity_and_chunk(modal_chunk, entity_info,
|
| 470 |
-
file_path)
|
| 471 |
|
| 472 |
def _parse_table_response(
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
entity_name: str = None) -> Tuple[str, Dict[str, Any]]:
|
| 476 |
"""Parse table analysis response"""
|
| 477 |
try:
|
| 478 |
response_data = json.loads(
|
| 479 |
-
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
|
|
|
| 480 |
|
| 481 |
description = response_data.get("detailed_description", "")
|
| 482 |
entity_data = response_data.get("entity_info", {})
|
|
@@ -484,11 +475,14 @@ class TableModalProcessor(BaseModalProcessor):
|
|
| 484 |
if not description or not entity_data:
|
| 485 |
raise ValueError("Missing required fields in response")
|
| 486 |
|
| 487 |
-
if not all(
|
| 488 |
-
|
|
|
|
| 489 |
raise ValueError("Missing required fields in entity_info")
|
| 490 |
|
| 491 |
-
entity_data["entity_name"] =
|
|
|
|
|
|
|
| 492 |
if entity_name:
|
| 493 |
entity_data["entity_name"] = entity_name
|
| 494 |
|
|
@@ -497,13 +491,11 @@ class TableModalProcessor(BaseModalProcessor):
|
|
| 497 |
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
| 498 |
logger.error(f"Error parsing table analysis response: {e}")
|
| 499 |
fallback_entity = {
|
| 500 |
-
"entity_name":
|
| 501 |
-
entity_name
|
| 502 |
-
|
| 503 |
-
"entity_type":
|
| 504 |
-
"
|
| 505 |
-
"summary":
|
| 506 |
-
response[:100] + "..." if len(response) > 100 else response
|
| 507 |
}
|
| 508 |
return response, fallback_entity
|
| 509 |
|
|
@@ -559,13 +551,13 @@ class EquationModalProcessor(BaseModalProcessor):
|
|
| 559 |
|
| 560 |
response = await self.modal_caption_func(
|
| 561 |
equation_prompt,
|
| 562 |
-
system_prompt=
|
| 563 |
-
"You are an expert mathematician. Provide detailed mathematical analysis."
|
| 564 |
)
|
| 565 |
|
| 566 |
# Parse response
|
| 567 |
enhanced_caption, entity_info = self._parse_equation_response(
|
| 568 |
-
response, entity_name
|
|
|
|
| 569 |
|
| 570 |
# Build complete equation content
|
| 571 |
modal_chunk = f"""Mathematical Equation Analysis:
|
|
@@ -574,17 +566,16 @@ class EquationModalProcessor(BaseModalProcessor):
|
|
| 574 |
|
| 575 |
Mathematical Analysis: {enhanced_caption}"""
|
| 576 |
|
| 577 |
-
return await self._create_entity_and_chunk(modal_chunk, entity_info,
|
| 578 |
-
file_path)
|
| 579 |
|
| 580 |
def _parse_equation_response(
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
entity_name: str = None) -> Tuple[str, Dict[str, Any]]:
|
| 584 |
"""Parse equation analysis response"""
|
| 585 |
try:
|
| 586 |
response_data = json.loads(
|
| 587 |
-
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
|
|
|
| 588 |
|
| 589 |
description = response_data.get("detailed_description", "")
|
| 590 |
entity_data = response_data.get("entity_info", {})
|
|
@@ -592,11 +583,14 @@ class EquationModalProcessor(BaseModalProcessor):
|
|
| 592 |
if not description or not entity_data:
|
| 593 |
raise ValueError("Missing required fields in response")
|
| 594 |
|
| 595 |
-
if not all(
|
| 596 |
-
|
|
|
|
| 597 |
raise ValueError("Missing required fields in entity_info")
|
| 598 |
|
| 599 |
-
entity_data["entity_name"] =
|
|
|
|
|
|
|
| 600 |
if entity_name:
|
| 601 |
entity_data["entity_name"] = entity_name
|
| 602 |
|
|
@@ -605,13 +599,11 @@ class EquationModalProcessor(BaseModalProcessor):
|
|
| 605 |
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
| 606 |
logger.error(f"Error parsing equation analysis response: {e}")
|
| 607 |
fallback_entity = {
|
| 608 |
-
"entity_name":
|
| 609 |
-
entity_name
|
| 610 |
-
|
| 611 |
-
"entity_type":
|
| 612 |
-
"
|
| 613 |
-
"summary":
|
| 614 |
-
response[:100] + "..." if len(response) > 100 else response
|
| 615 |
}
|
| 616 |
return response, fallback_entity
|
| 617 |
|
|
@@ -651,13 +643,13 @@ class GenericModalProcessor(BaseModalProcessor):
|
|
| 651 |
|
| 652 |
response = await self.modal_caption_func(
|
| 653 |
generic_prompt,
|
| 654 |
-
system_prompt=
|
| 655 |
-
f"You are an expert content analyst specializing in {content_type} content."
|
| 656 |
)
|
| 657 |
|
| 658 |
# Parse response
|
| 659 |
enhanced_caption, entity_info = self._parse_generic_response(
|
| 660 |
-
response, entity_name, content_type
|
|
|
|
| 661 |
|
| 662 |
# Build complete content
|
| 663 |
modal_chunk = f"""{content_type.title()} Content Analysis:
|
|
@@ -665,18 +657,16 @@ class GenericModalProcessor(BaseModalProcessor):
|
|
| 665 |
|
| 666 |
Analysis: {enhanced_caption}"""
|
| 667 |
|
| 668 |
-
return await self._create_entity_and_chunk(modal_chunk, entity_info,
|
| 669 |
-
file_path)
|
| 670 |
|
| 671 |
def _parse_generic_response(
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
entity_name: str = None,
|
| 675 |
-
content_type: str = "content") -> Tuple[str, Dict[str, Any]]:
|
| 676 |
"""Parse generic analysis response"""
|
| 677 |
try:
|
| 678 |
response_data = json.loads(
|
| 679 |
-
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
|
|
|
| 680 |
|
| 681 |
description = response_data.get("detailed_description", "")
|
| 682 |
entity_data = response_data.get("entity_info", {})
|
|
@@ -684,11 +674,14 @@ class GenericModalProcessor(BaseModalProcessor):
|
|
| 684 |
if not description or not entity_data:
|
| 685 |
raise ValueError("Missing required fields in response")
|
| 686 |
|
| 687 |
-
if not all(
|
| 688 |
-
|
|
|
|
| 689 |
raise ValueError("Missing required fields in entity_info")
|
| 690 |
|
| 691 |
-
entity_data["entity_name"] =
|
|
|
|
|
|
|
| 692 |
if entity_name:
|
| 693 |
entity_data["entity_name"] = entity_name
|
| 694 |
|
|
@@ -697,12 +690,10 @@ class GenericModalProcessor(BaseModalProcessor):
|
|
| 697 |
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
| 698 |
logger.error(f"Error parsing generic analysis response: {e}")
|
| 699 |
fallback_entity = {
|
| 700 |
-
"entity_name":
|
| 701 |
-
|
| 702 |
-
f"{content_type}_{compute_mdhash_id(response)}",
|
| 703 |
-
"entity_type":
|
| 704 |
-
|
| 705 |
-
"summary":
|
| 706 |
-
response[:100] + "..." if len(response) > 100 else response
|
| 707 |
}
|
| 708 |
return response, fallback_entity
|
|
|
|
| 31 |
|
| 32 |
def __init__(self, lightrag: LightRAG, modal_caption_func):
|
| 33 |
"""Initialize base processor
|
| 34 |
+
|
| 35 |
Args:
|
| 36 |
lightrag: LightRAG instance
|
| 37 |
modal_caption_func: Function for generating descriptions
|
|
|
|
| 65 |
raise NotImplementedError("Subclasses must implement this method")
|
| 66 |
|
| 67 |
async def _create_entity_and_chunk(
|
| 68 |
+
self, modal_chunk: str, entity_info: Dict[str, Any], file_path: str
|
| 69 |
+
) -> Tuple[str, Dict[str, Any]]:
|
| 70 |
"""Create entity and text chunk"""
|
| 71 |
# Create chunk
|
| 72 |
chunk_id = compute_mdhash_id(str(modal_chunk), prefix="chunk-")
|
|
|
|
| 93 |
"created_at": int(time.time()),
|
| 94 |
}
|
| 95 |
|
| 96 |
+
await self.knowledge_graph_inst.upsert_node(
|
| 97 |
+
entity_info["entity_name"], node_data
|
| 98 |
+
)
|
| 99 |
|
| 100 |
# Insert entity into vector database
|
| 101 |
entity_vdb_data = {
|
| 102 |
compute_mdhash_id(entity_info["entity_name"], prefix="ent-"): {
|
| 103 |
"entity_name": entity_info["entity_name"],
|
| 104 |
"entity_type": entity_info["entity_type"],
|
| 105 |
+
"content": f"{entity_info['entity_name']}\n{entity_info['summary']}",
|
|
|
|
| 106 |
"source_id": chunk_id,
|
| 107 |
"file_path": file_path,
|
| 108 |
}
|
|
|
|
| 110 |
await self.entities_vdb.upsert(entity_vdb_data)
|
| 111 |
|
| 112 |
# Process entity and relationship extraction
|
| 113 |
+
await self._process_chunk_for_extraction(chunk_id, entity_info["entity_name"])
|
|
|
|
| 114 |
|
| 115 |
# Ensure all storage updates are complete
|
| 116 |
await self._insert_done()
|
|
|
|
| 119 |
"entity_name": entity_info["entity_name"],
|
| 120 |
"entity_type": entity_info["entity_type"],
|
| 121 |
"description": entity_info["summary"],
|
| 122 |
+
"chunk_id": chunk_id,
|
| 123 |
}
|
| 124 |
|
| 125 |
+
async def _process_chunk_for_extraction(
|
| 126 |
+
self, chunk_id: str, modal_entity_name: str
|
| 127 |
+
):
|
| 128 |
"""Process chunk for entity and relationship extraction"""
|
| 129 |
chunk_data = await self.text_chunks_db.get_by_id(chunk_id)
|
| 130 |
if not chunk_data:
|
|
|
|
| 168 |
if entity_name != modal_entity_name: # Skip self-relationship
|
| 169 |
# Create belongs_to relationship
|
| 170 |
relation_data = {
|
| 171 |
+
"description": f"Entity {entity_name} belongs to {modal_entity_name}",
|
| 172 |
+
"keywords": "belongs_to,part_of,contained_in",
|
| 173 |
+
"source_id": chunk_id,
|
| 174 |
+
"weight": 10.0,
|
| 175 |
+
"file_path": chunk_data.get("file_path", "manual_creation"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
}
|
| 177 |
await self.knowledge_graph_inst.upsert_edge(
|
| 178 |
+
entity_name, modal_entity_name, relation_data
|
| 179 |
+
)
|
| 180 |
|
| 181 |
+
relation_id = compute_mdhash_id(
|
| 182 |
+
entity_name + modal_entity_name, prefix="rel-"
|
| 183 |
+
)
|
| 184 |
relation_vdb_data = {
|
| 185 |
relation_id: {
|
| 186 |
+
"src_id": entity_name,
|
| 187 |
+
"tgt_id": modal_entity_name,
|
| 188 |
+
"keywords": relation_data["keywords"],
|
| 189 |
+
"content": f"{relation_data['keywords']}\t{entity_name}\n{modal_entity_name}\n{relation_data['description']}",
|
| 190 |
+
"source_id": chunk_id,
|
| 191 |
+
"file_path": chunk_data.get("file_path", "manual_creation"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
}
|
| 193 |
}
|
| 194 |
await self.relationships_vdb.upsert(relation_vdb_data)
|
|
|
|
| 205 |
)
|
| 206 |
|
| 207 |
async def _insert_done(self) -> None:
|
| 208 |
+
await asyncio.gather(
|
| 209 |
+
*[
|
| 210 |
+
cast(StorageNameSpace, storage_inst).index_done_callback()
|
| 211 |
+
for storage_inst in [
|
| 212 |
+
self.text_chunks_db,
|
| 213 |
+
self.chunks_vdb,
|
| 214 |
+
self.entities_vdb,
|
| 215 |
+
self.relationships_vdb,
|
| 216 |
+
self.knowledge_graph_inst,
|
| 217 |
+
]
|
| 218 |
]
|
| 219 |
+
)
|
| 220 |
|
| 221 |
|
| 222 |
class ImageModalProcessor(BaseModalProcessor):
|
|
|
|
| 224 |
|
| 225 |
def __init__(self, lightrag: LightRAG, modal_caption_func):
|
| 226 |
"""Initialize image processor
|
| 227 |
+
|
| 228 |
Args:
|
| 229 |
lightrag: LightRAG instance
|
| 230 |
modal_caption_func: Function for generating descriptions (supporting image understanding)
|
|
|
|
| 235 |
"""Encode image to base64"""
|
| 236 |
try:
|
| 237 |
with open(image_path, "rb") as image_file:
|
| 238 |
+
encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
|
|
|
|
| 239 |
return encoded_string
|
| 240 |
except Exception as e:
|
| 241 |
logger.error(f"Failed to encode image {image_path}: {e}")
|
|
|
|
| 300 |
response = await self.modal_caption_func(
|
| 301 |
vision_prompt,
|
| 302 |
image_data=image_base64,
|
| 303 |
+
system_prompt="You are an expert image analyst. Provide detailed, accurate descriptions.",
|
|
|
|
| 304 |
)
|
| 305 |
else:
|
| 306 |
# Analyze based on existing text information
|
| 307 |
text_prompt = f"""Based on the following image information, provide analysis:
|
| 308 |
+
|
| 309 |
Image Path: {image_path}
|
| 310 |
Captions: {captions}
|
| 311 |
Footnotes: {footnotes}
|
|
|
|
| 314 |
|
| 315 |
response = await self.modal_caption_func(
|
| 316 |
text_prompt,
|
| 317 |
+
system_prompt="You are an expert image analyst. Provide detailed analysis based on available information.",
|
|
|
|
| 318 |
)
|
| 319 |
|
| 320 |
# Parse response
|
| 321 |
+
enhanced_caption, entity_info = self._parse_response(response, entity_name)
|
|
|
|
| 322 |
|
| 323 |
# Build complete image content
|
| 324 |
modal_chunk = f"""
|
|
|
|
| 329 |
|
| 330 |
Visual Analysis: {enhanced_caption}"""
|
| 331 |
|
| 332 |
+
return await self._create_entity_and_chunk(
|
| 333 |
+
modal_chunk, entity_info, file_path
|
| 334 |
+
)
|
| 335 |
|
| 336 |
except Exception as e:
|
| 337 |
logger.error(f"Error processing image content: {e}")
|
| 338 |
# Fallback processing
|
| 339 |
fallback_entity = {
|
| 340 |
+
"entity_name": entity_name
|
| 341 |
+
if entity_name
|
| 342 |
+
else f"image_{compute_mdhash_id(str(modal_content))}",
|
| 343 |
"entity_type": "image",
|
| 344 |
+
"summary": f"Image content: {str(modal_content)[:100]}",
|
| 345 |
}
|
| 346 |
return str(modal_content), fallback_entity
|
| 347 |
|
| 348 |
+
def _parse_response(
|
| 349 |
+
self, response: str, entity_name: str = None
|
| 350 |
+
) -> Tuple[str, Dict[str, Any]]:
|
| 351 |
"""Parse model response"""
|
| 352 |
try:
|
| 353 |
response_data = json.loads(
|
| 354 |
+
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
| 355 |
+
)
|
| 356 |
|
| 357 |
description = response_data.get("detailed_description", "")
|
| 358 |
entity_data = response_data.get("entity_info", {})
|
|
|
|
| 360 |
if not description or not entity_data:
|
| 361 |
raise ValueError("Missing required fields in response")
|
| 362 |
|
| 363 |
+
if not all(
|
| 364 |
+
key in entity_data for key in ["entity_name", "entity_type", "summary"]
|
| 365 |
+
):
|
| 366 |
raise ValueError("Missing required fields in entity_info")
|
| 367 |
|
| 368 |
+
entity_data["entity_name"] = (
|
| 369 |
+
entity_data["entity_name"] + f" ({entity_data['entity_type']})"
|
| 370 |
+
)
|
| 371 |
if entity_name:
|
| 372 |
entity_data["entity_name"] = entity_name
|
| 373 |
|
|
|
|
| 376 |
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
| 377 |
logger.error(f"Error parsing image analysis response: {e}")
|
| 378 |
fallback_entity = {
|
| 379 |
+
"entity_name": entity_name
|
| 380 |
+
if entity_name
|
| 381 |
+
else f"image_{compute_mdhash_id(response)}",
|
| 382 |
+
"entity_type": "image",
|
| 383 |
+
"summary": response[:100] + "..." if len(response) > 100 else response,
|
|
|
|
|
|
|
| 384 |
}
|
| 385 |
return response, fallback_entity
|
| 386 |
|
|
|
|
| 439 |
|
| 440 |
response = await self.modal_caption_func(
|
| 441 |
table_prompt,
|
| 442 |
+
system_prompt="You are an expert data analyst. Provide detailed table analysis with specific insights.",
|
|
|
|
| 443 |
)
|
| 444 |
|
| 445 |
# Parse response
|
| 446 |
enhanced_caption, entity_info = self._parse_table_response(
|
| 447 |
+
response, entity_name
|
| 448 |
+
)
|
| 449 |
+
|
| 450 |
+
# TODO: Add Retry Mechanism
|
| 451 |
|
| 452 |
# Build complete table content
|
| 453 |
modal_chunk = f"""Table Analysis:
|
|
|
|
| 458 |
|
| 459 |
Analysis: {enhanced_caption}"""
|
| 460 |
|
| 461 |
+
return await self._create_entity_and_chunk(modal_chunk, entity_info, file_path)
|
|
|
|
| 462 |
|
| 463 |
def _parse_table_response(
|
| 464 |
+
self, response: str, entity_name: str = None
|
| 465 |
+
) -> Tuple[str, Dict[str, Any]]:
|
|
|
|
| 466 |
"""Parse table analysis response"""
|
| 467 |
try:
|
| 468 |
response_data = json.loads(
|
| 469 |
+
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
| 470 |
+
)
|
| 471 |
|
| 472 |
description = response_data.get("detailed_description", "")
|
| 473 |
entity_data = response_data.get("entity_info", {})
|
|
|
|
| 475 |
if not description or not entity_data:
|
| 476 |
raise ValueError("Missing required fields in response")
|
| 477 |
|
| 478 |
+
if not all(
|
| 479 |
+
key in entity_data for key in ["entity_name", "entity_type", "summary"]
|
| 480 |
+
):
|
| 481 |
raise ValueError("Missing required fields in entity_info")
|
| 482 |
|
| 483 |
+
entity_data["entity_name"] = (
|
| 484 |
+
entity_data["entity_name"] + f" ({entity_data['entity_type']})"
|
| 485 |
+
)
|
| 486 |
if entity_name:
|
| 487 |
entity_data["entity_name"] = entity_name
|
| 488 |
|
|
|
|
| 491 |
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
| 492 |
logger.error(f"Error parsing table analysis response: {e}")
|
| 493 |
fallback_entity = {
|
| 494 |
+
"entity_name": entity_name
|
| 495 |
+
if entity_name
|
| 496 |
+
else f"table_{compute_mdhash_id(response)}",
|
| 497 |
+
"entity_type": "table",
|
| 498 |
+
"summary": response[:100] + "..." if len(response) > 100 else response,
|
|
|
|
|
|
|
| 499 |
}
|
| 500 |
return response, fallback_entity
|
| 501 |
|
|
|
|
| 551 |
|
| 552 |
response = await self.modal_caption_func(
|
| 553 |
equation_prompt,
|
| 554 |
+
system_prompt="You are an expert mathematician. Provide detailed mathematical analysis.",
|
|
|
|
| 555 |
)
|
| 556 |
|
| 557 |
# Parse response
|
| 558 |
enhanced_caption, entity_info = self._parse_equation_response(
|
| 559 |
+
response, entity_name
|
| 560 |
+
)
|
| 561 |
|
| 562 |
# Build complete equation content
|
| 563 |
modal_chunk = f"""Mathematical Equation Analysis:
|
|
|
|
| 566 |
|
| 567 |
Mathematical Analysis: {enhanced_caption}"""
|
| 568 |
|
| 569 |
+
return await self._create_entity_and_chunk(modal_chunk, entity_info, file_path)
|
|
|
|
| 570 |
|
| 571 |
def _parse_equation_response(
|
| 572 |
+
self, response: str, entity_name: str = None
|
| 573 |
+
) -> Tuple[str, Dict[str, Any]]:
|
|
|
|
| 574 |
"""Parse equation analysis response"""
|
| 575 |
try:
|
| 576 |
response_data = json.loads(
|
| 577 |
+
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
| 578 |
+
)
|
| 579 |
|
| 580 |
description = response_data.get("detailed_description", "")
|
| 581 |
entity_data = response_data.get("entity_info", {})
|
|
|
|
| 583 |
if not description or not entity_data:
|
| 584 |
raise ValueError("Missing required fields in response")
|
| 585 |
|
| 586 |
+
if not all(
|
| 587 |
+
key in entity_data for key in ["entity_name", "entity_type", "summary"]
|
| 588 |
+
):
|
| 589 |
raise ValueError("Missing required fields in entity_info")
|
| 590 |
|
| 591 |
+
entity_data["entity_name"] = (
|
| 592 |
+
entity_data["entity_name"] + f" ({entity_data['entity_type']})"
|
| 593 |
+
)
|
| 594 |
if entity_name:
|
| 595 |
entity_data["entity_name"] = entity_name
|
| 596 |
|
|
|
|
| 599 |
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
| 600 |
logger.error(f"Error parsing equation analysis response: {e}")
|
| 601 |
fallback_entity = {
|
| 602 |
+
"entity_name": entity_name
|
| 603 |
+
if entity_name
|
| 604 |
+
else f"equation_{compute_mdhash_id(response)}",
|
| 605 |
+
"entity_type": "equation",
|
| 606 |
+
"summary": response[:100] + "..." if len(response) > 100 else response,
|
|
|
|
|
|
|
| 607 |
}
|
| 608 |
return response, fallback_entity
|
| 609 |
|
|
|
|
| 643 |
|
| 644 |
response = await self.modal_caption_func(
|
| 645 |
generic_prompt,
|
| 646 |
+
system_prompt=f"You are an expert content analyst specializing in {content_type} content.",
|
|
|
|
| 647 |
)
|
| 648 |
|
| 649 |
# Parse response
|
| 650 |
enhanced_caption, entity_info = self._parse_generic_response(
|
| 651 |
+
response, entity_name, content_type
|
| 652 |
+
)
|
| 653 |
|
| 654 |
# Build complete content
|
| 655 |
modal_chunk = f"""{content_type.title()} Content Analysis:
|
|
|
|
| 657 |
|
| 658 |
Analysis: {enhanced_caption}"""
|
| 659 |
|
| 660 |
+
return await self._create_entity_and_chunk(modal_chunk, entity_info, file_path)
|
|
|
|
| 661 |
|
| 662 |
def _parse_generic_response(
|
| 663 |
+
self, response: str, entity_name: str = None, content_type: str = "content"
|
| 664 |
+
) -> Tuple[str, Dict[str, Any]]:
|
|
|
|
|
|
|
| 665 |
"""Parse generic analysis response"""
|
| 666 |
try:
|
| 667 |
response_data = json.loads(
|
| 668 |
+
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
| 669 |
+
)
|
| 670 |
|
| 671 |
description = response_data.get("detailed_description", "")
|
| 672 |
entity_data = response_data.get("entity_info", {})
|
|
|
|
| 674 |
if not description or not entity_data:
|
| 675 |
raise ValueError("Missing required fields in response")
|
| 676 |
|
| 677 |
+
if not all(
|
| 678 |
+
key in entity_data for key in ["entity_name", "entity_type", "summary"]
|
| 679 |
+
):
|
| 680 |
raise ValueError("Missing required fields in entity_info")
|
| 681 |
|
| 682 |
+
entity_data["entity_name"] = (
|
| 683 |
+
entity_data["entity_name"] + f" ({entity_data['entity_type']})"
|
| 684 |
+
)
|
| 685 |
if entity_name:
|
| 686 |
entity_data["entity_name"] = entity_name
|
| 687 |
|
|
|
|
| 690 |
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
| 691 |
logger.error(f"Error parsing generic analysis response: {e}")
|
| 692 |
fallback_entity = {
|
| 693 |
+
"entity_name": entity_name
|
| 694 |
+
if entity_name
|
| 695 |
+
else f"{content_type}_{compute_mdhash_id(response)}",
|
| 696 |
+
"entity_type": content_type,
|
| 697 |
+
"summary": response[:100] + "..." if len(response) > 100 else response,
|
|
|
|
|
|
|
| 698 |
}
|
| 699 |
return response, fallback_entity
|
lightrag/raganything.py
CHANGED
|
@@ -26,15 +26,15 @@ from lightrag.mineru_parser import MineruParser
|
|
| 26 |
# Import specialized processors
|
| 27 |
from lightrag.modalprocessors import (
|
| 28 |
ImageModalProcessor,
|
| 29 |
-
TableModalProcessor,
|
| 30 |
EquationModalProcessor,
|
| 31 |
-
GenericModalProcessor
|
| 32 |
)
|
| 33 |
|
| 34 |
|
| 35 |
class RAGAnything:
|
| 36 |
"""Multimodal Document Processing Pipeline - Complete document parsing and insertion pipeline"""
|
| 37 |
-
|
| 38 |
def __init__(
|
| 39 |
self,
|
| 40 |
lightrag: Optional[LightRAG] = None,
|
|
@@ -43,11 +43,11 @@ class RAGAnything:
|
|
| 43 |
embedding_func: Optional[Callable] = None,
|
| 44 |
working_dir: str = "./rag_storage",
|
| 45 |
embedding_dim: int = 3072,
|
| 46 |
-
max_token_size: int = 8192
|
| 47 |
):
|
| 48 |
"""
|
| 49 |
Initialize Multimodal Document Processing Pipeline
|
| 50 |
-
|
| 51 |
Args:
|
| 52 |
lightrag: Optional pre-initialized LightRAG instance
|
| 53 |
llm_model_func: LLM model function for text analysis
|
|
@@ -63,64 +63,67 @@ class RAGAnything:
|
|
| 63 |
self.embedding_func = embedding_func
|
| 64 |
self.embedding_dim = embedding_dim
|
| 65 |
self.max_token_size = max_token_size
|
| 66 |
-
|
| 67 |
# Set up logging
|
| 68 |
setup_logger("RAGAnything")
|
| 69 |
self.logger = logging.getLogger("RAGAnything")
|
| 70 |
-
|
| 71 |
# Create working directory if needed
|
| 72 |
if not os.path.exists(working_dir):
|
| 73 |
os.makedirs(working_dir)
|
| 74 |
-
|
| 75 |
# Use provided LightRAG or mark for later initialization
|
| 76 |
self.lightrag = lightrag
|
| 77 |
self.modal_processors = {}
|
| 78 |
-
|
| 79 |
# If LightRAG is provided, initialize processors immediately
|
| 80 |
if self.lightrag is not None:
|
| 81 |
self._initialize_processors()
|
| 82 |
-
|
| 83 |
def _initialize_processors(self):
|
| 84 |
"""Initialize multimodal processors with appropriate model functions"""
|
| 85 |
if self.lightrag is None:
|
| 86 |
-
raise ValueError(
|
| 87 |
-
|
|
|
|
|
|
|
| 88 |
# Create different multimodal processors
|
| 89 |
self.modal_processors = {
|
| 90 |
"image": ImageModalProcessor(
|
| 91 |
lightrag=self.lightrag,
|
| 92 |
-
modal_caption_func=self.vision_model_func or self.llm_model_func
|
| 93 |
),
|
| 94 |
"table": TableModalProcessor(
|
| 95 |
-
lightrag=self.lightrag,
|
| 96 |
-
modal_caption_func=self.llm_model_func
|
| 97 |
),
|
| 98 |
"equation": EquationModalProcessor(
|
| 99 |
-
lightrag=self.lightrag,
|
| 100 |
-
modal_caption_func=self.llm_model_func
|
| 101 |
),
|
| 102 |
"generic": GenericModalProcessor(
|
| 103 |
-
lightrag=self.lightrag,
|
| 104 |
-
|
| 105 |
-
)
|
| 106 |
}
|
| 107 |
-
|
| 108 |
self.logger.info("Multimodal processors initialized")
|
| 109 |
self.logger.info(f"Available processors: {list(self.modal_processors.keys())}")
|
| 110 |
-
|
| 111 |
async def _ensure_lightrag_initialized(self):
|
| 112 |
"""Ensure LightRAG instance is initialized, create if necessary"""
|
| 113 |
if self.lightrag is not None:
|
| 114 |
return
|
| 115 |
-
|
| 116 |
# Validate required functions
|
| 117 |
if self.llm_model_func is None:
|
| 118 |
-
raise ValueError(
|
|
|
|
|
|
|
| 119 |
if self.embedding_func is None:
|
| 120 |
-
raise ValueError(
|
| 121 |
-
|
|
|
|
|
|
|
| 122 |
from lightrag.kg.shared_storage import initialize_pipeline_status
|
| 123 |
-
|
| 124 |
# Create LightRAG instance with provided functions
|
| 125 |
self.lightrag = LightRAG(
|
| 126 |
working_dir=self.working_dir,
|
|
@@ -134,88 +137,86 @@ class RAGAnything:
|
|
| 134 |
|
| 135 |
await self.lightrag.initialize_storages()
|
| 136 |
await initialize_pipeline_status()
|
| 137 |
-
|
| 138 |
# Initialize processors after LightRAG is ready
|
| 139 |
self._initialize_processors()
|
| 140 |
-
|
| 141 |
self.logger.info("LightRAG and multimodal processors initialized")
|
| 142 |
|
| 143 |
def parse_document(
|
| 144 |
-
self,
|
| 145 |
-
file_path: str,
|
| 146 |
output_dir: str = "./output",
|
| 147 |
parse_method: str = "auto",
|
| 148 |
-
display_stats: bool = True
|
| 149 |
) -> Tuple[List[Dict[str, Any]], str]:
|
| 150 |
"""
|
| 151 |
Parse document using MinerU
|
| 152 |
-
|
| 153 |
Args:
|
| 154 |
file_path: Path to the file to parse
|
| 155 |
output_dir: Output directory
|
| 156 |
parse_method: Parse method ("auto", "ocr", "txt")
|
| 157 |
display_stats: Whether to display content statistics
|
| 158 |
-
|
| 159 |
Returns:
|
| 160 |
(content_list, md_content): Content list and markdown text
|
| 161 |
"""
|
| 162 |
self.logger.info(f"Starting document parsing: {file_path}")
|
| 163 |
-
|
| 164 |
file_path = Path(file_path)
|
| 165 |
if not file_path.exists():
|
| 166 |
raise FileNotFoundError(f"File not found: {file_path}")
|
| 167 |
-
|
| 168 |
# Choose appropriate parsing method based on file extension
|
| 169 |
ext = file_path.suffix.lower()
|
| 170 |
-
|
| 171 |
try:
|
| 172 |
if ext in [".pdf"]:
|
| 173 |
-
self.logger.info(
|
|
|
|
|
|
|
| 174 |
content_list, md_content = MineruParser.parse_pdf(
|
| 175 |
-
file_path,
|
| 176 |
-
output_dir,
|
| 177 |
-
use_ocr=(parse_method == "ocr")
|
| 178 |
)
|
| 179 |
elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]:
|
| 180 |
self.logger.info("Detected image file, using image parser...")
|
| 181 |
content_list, md_content = MineruParser.parse_image(
|
| 182 |
-
file_path,
|
| 183 |
-
output_dir
|
| 184 |
)
|
| 185 |
elif ext in [".doc", ".docx", ".ppt", ".pptx"]:
|
| 186 |
self.logger.info("Detected Office document, using Office parser...")
|
| 187 |
content_list, md_content = MineruParser.parse_office_doc(
|
| 188 |
-
file_path,
|
| 189 |
-
output_dir
|
| 190 |
)
|
| 191 |
else:
|
| 192 |
# For other or unknown formats, use generic parser
|
| 193 |
-
self.logger.info(
|
|
|
|
|
|
|
| 194 |
content_list, md_content = MineruParser.parse_document(
|
| 195 |
-
file_path,
|
| 196 |
-
parse_method=parse_method,
|
| 197 |
-
output_dir=output_dir
|
| 198 |
)
|
| 199 |
-
|
| 200 |
except Exception as e:
|
| 201 |
self.logger.error(f"Error during parsing with specific parser: {str(e)}")
|
| 202 |
self.logger.warning("Falling back to generic parser...")
|
| 203 |
# If specific parser fails, fall back to generic parser
|
| 204 |
content_list, md_content = MineruParser.parse_document(
|
| 205 |
-
file_path,
|
| 206 |
-
parse_method=parse_method,
|
| 207 |
-
output_dir=output_dir
|
| 208 |
)
|
| 209 |
-
|
| 210 |
-
self.logger.info(
|
|
|
|
|
|
|
| 211 |
self.logger.info(f"Markdown text length: {len(md_content)} characters")
|
| 212 |
-
|
| 213 |
# Display content statistics if requested
|
| 214 |
if display_stats:
|
| 215 |
self.logger.info("\nContent Information:")
|
| 216 |
self.logger.info(f"* Total blocks in content_list: {len(content_list)}")
|
| 217 |
self.logger.info(f"* Markdown content length: {len(md_content)} characters")
|
| 218 |
-
|
| 219 |
# Count elements by type
|
| 220 |
block_types: Dict[str, int] = {}
|
| 221 |
for block in content_list:
|
|
@@ -223,29 +224,31 @@ class RAGAnything:
|
|
| 223 |
block_type = block.get("type", "unknown")
|
| 224 |
if isinstance(block_type, str):
|
| 225 |
block_types[block_type] = block_types.get(block_type, 0) + 1
|
| 226 |
-
|
| 227 |
self.logger.info("* Content block types:")
|
| 228 |
for block_type, count in block_types.items():
|
| 229 |
self.logger.info(f" - {block_type}: {count}")
|
| 230 |
-
|
| 231 |
return content_list, md_content
|
| 232 |
|
| 233 |
-
def _separate_content(
|
|
|
|
|
|
|
| 234 |
"""
|
| 235 |
Separate text content and multimodal content
|
| 236 |
-
|
| 237 |
Args:
|
| 238 |
content_list: Content list from MinerU parsing
|
| 239 |
-
|
| 240 |
Returns:
|
| 241 |
(text_content, multimodal_items): Pure text content and multimodal items list
|
| 242 |
"""
|
| 243 |
text_parts = []
|
| 244 |
multimodal_items = []
|
| 245 |
-
|
| 246 |
for item in content_list:
|
| 247 |
content_type = item.get("type", "text")
|
| 248 |
-
|
| 249 |
if content_type == "text":
|
| 250 |
# Text content
|
| 251 |
text = item.get("text", "")
|
|
@@ -254,27 +257,27 @@ class RAGAnything:
|
|
| 254 |
else:
|
| 255 |
# Multimodal content (image, table, equation, etc.)
|
| 256 |
multimodal_items.append(item)
|
| 257 |
-
|
| 258 |
# Merge all text content
|
| 259 |
text_content = "\n\n".join(text_parts)
|
| 260 |
-
|
| 261 |
-
self.logger.info(
|
| 262 |
self.logger.info(f" - Text content length: {len(text_content)} characters")
|
| 263 |
self.logger.info(f" - Multimodal items count: {len(multimodal_items)}")
|
| 264 |
-
|
| 265 |
# Count multimodal types
|
| 266 |
modal_types = {}
|
| 267 |
for item in multimodal_items:
|
| 268 |
modal_type = item.get("type", "unknown")
|
| 269 |
modal_types[modal_type] = modal_types.get(modal_type, 0) + 1
|
| 270 |
-
|
| 271 |
if modal_types:
|
| 272 |
self.logger.info(f" - Multimodal type distribution: {modal_types}")
|
| 273 |
-
|
| 274 |
return text_content, multimodal_items
|
| 275 |
|
| 276 |
async def _insert_text_content(
|
| 277 |
-
self,
|
| 278 |
input: str | list[str],
|
| 279 |
split_by_character: str | None = None,
|
| 280 |
split_by_character_only: bool = False,
|
|
@@ -283,7 +286,7 @@ class RAGAnything:
|
|
| 283 |
):
|
| 284 |
"""
|
| 285 |
Insert pure text content into LightRAG
|
| 286 |
-
|
| 287 |
Args:
|
| 288 |
input: Single document string or list of document strings
|
| 289 |
split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
|
|
@@ -292,24 +295,26 @@ class RAGAnything:
|
|
| 292 |
split_by_character is None, this parameter is ignored.
|
| 293 |
ids: single string of the document ID or list of unique document IDs, if not provided, MD5 hash IDs will be generated
|
| 294 |
file_paths: single string of the file path or list of file paths, used for citation
|
| 295 |
-
"""
|
| 296 |
self.logger.info("Starting text content insertion into LightRAG...")
|
| 297 |
-
|
| 298 |
# Use LightRAG's insert method with all parameters
|
| 299 |
await self.lightrag.ainsert(
|
| 300 |
input=input,
|
| 301 |
file_paths=file_paths,
|
| 302 |
split_by_character=split_by_character,
|
| 303 |
split_by_character_only=split_by_character_only,
|
| 304 |
-
ids=ids
|
| 305 |
)
|
| 306 |
-
|
| 307 |
self.logger.info("Text content insertion complete")
|
| 308 |
|
| 309 |
-
async def _process_multimodal_content(
|
|
|
|
|
|
|
| 310 |
"""
|
| 311 |
Process multimodal content (using specialized processors)
|
| 312 |
-
|
| 313 |
Args:
|
| 314 |
multimodal_items: List of multimodal items
|
| 315 |
file_path: File path (for reference)
|
|
@@ -317,43 +322,52 @@ class RAGAnything:
|
|
| 317 |
if not multimodal_items:
|
| 318 |
self.logger.debug("No multimodal content to process")
|
| 319 |
return
|
| 320 |
-
|
| 321 |
self.logger.info("Starting multimodal content processing...")
|
| 322 |
-
|
| 323 |
file_name = os.path.basename(file_path)
|
| 324 |
-
|
| 325 |
for i, item in enumerate(multimodal_items):
|
| 326 |
try:
|
| 327 |
content_type = item.get("type", "unknown")
|
| 328 |
-
self.logger.info(
|
| 329 |
-
|
|
|
|
|
|
|
| 330 |
# Select appropriate processor
|
| 331 |
processor = self._get_processor_for_type(content_type)
|
| 332 |
-
|
| 333 |
if processor:
|
| 334 |
-
|
|
|
|
|
|
|
|
|
|
| 335 |
modal_content=item,
|
| 336 |
content_type=content_type,
|
| 337 |
-
file_path=file_name
|
|
|
|
|
|
|
|
|
|
| 338 |
)
|
| 339 |
-
self.logger.info(f"{content_type} processing complete: {entity_info.get('entity_name', 'Unknown')}")
|
| 340 |
else:
|
| 341 |
-
self.logger.warning(
|
| 342 |
-
|
|
|
|
|
|
|
| 343 |
except Exception as e:
|
| 344 |
self.logger.error(f"Error processing multimodal content: {str(e)}")
|
| 345 |
self.logger.debug("Exception details:", exc_info=True)
|
| 346 |
continue
|
| 347 |
-
|
| 348 |
self.logger.info("Multimodal content processing complete")
|
| 349 |
|
| 350 |
def _get_processor_for_type(self, content_type: str):
|
| 351 |
"""
|
| 352 |
Get appropriate processor based on content type
|
| 353 |
-
|
| 354 |
Args:
|
| 355 |
content_type: Content type
|
| 356 |
-
|
| 357 |
Returns:
|
| 358 |
Corresponding processor instance
|
| 359 |
"""
|
|
@@ -369,18 +383,18 @@ class RAGAnything:
|
|
| 369 |
return self.modal_processors.get("generic")
|
| 370 |
|
| 371 |
async def process_document_complete(
|
| 372 |
-
self,
|
| 373 |
-
file_path: str,
|
| 374 |
output_dir: str = "./output",
|
| 375 |
parse_method: str = "auto",
|
| 376 |
display_stats: bool = True,
|
| 377 |
split_by_character: str | None = None,
|
| 378 |
split_by_character_only: bool = False,
|
| 379 |
-
doc_id: str | None = None
|
| 380 |
):
|
| 381 |
"""
|
| 382 |
Complete document processing workflow
|
| 383 |
-
|
| 384 |
Args:
|
| 385 |
file_path: Path to the file to process
|
| 386 |
output_dir: MinerU output directory
|
|
@@ -392,35 +406,32 @@ class RAGAnything:
|
|
| 392 |
"""
|
| 393 |
# Ensure LightRAG is initialized
|
| 394 |
await self._ensure_lightrag_initialized()
|
| 395 |
-
|
| 396 |
self.logger.info(f"Starting complete document processing: {file_path}")
|
| 397 |
-
|
| 398 |
# Step 1: Parse document using MinerU
|
| 399 |
content_list, md_content = self.parse_document(
|
| 400 |
-
file_path,
|
| 401 |
-
output_dir,
|
| 402 |
-
parse_method,
|
| 403 |
-
display_stats
|
| 404 |
)
|
| 405 |
-
|
| 406 |
# Step 2: Separate text and multimodal content
|
| 407 |
text_content, multimodal_items = self._separate_content(content_list)
|
| 408 |
-
|
| 409 |
# Step 3: Insert pure text content with all parameters
|
| 410 |
if text_content.strip():
|
| 411 |
file_name = os.path.basename(file_path)
|
| 412 |
await self._insert_text_content(
|
| 413 |
-
text_content,
|
| 414 |
file_paths=file_name,
|
| 415 |
split_by_character=split_by_character,
|
| 416 |
split_by_character_only=split_by_character_only,
|
| 417 |
-
ids=doc_id
|
| 418 |
)
|
| 419 |
-
|
| 420 |
# Step 4: Process multimodal content (using specialized processors)
|
| 421 |
if multimodal_items:
|
| 422 |
await self._process_multimodal_content(multimodal_items, file_path)
|
| 423 |
-
|
| 424 |
self.logger.info(f"Document {file_path} processing complete!")
|
| 425 |
|
| 426 |
async def process_folder_complete(
|
|
@@ -433,11 +444,11 @@ class RAGAnything:
|
|
| 433 |
split_by_character_only: bool = False,
|
| 434 |
file_extensions: Optional[List[str]] = None,
|
| 435 |
recursive: bool = True,
|
| 436 |
-
max_workers: int = 1
|
| 437 |
):
|
| 438 |
"""
|
| 439 |
Process all files in a folder in batch
|
| 440 |
-
|
| 441 |
Args:
|
| 442 |
folder_path: Path to the folder to process
|
| 443 |
output_dir: MinerU output directory
|
|
@@ -451,75 +462,98 @@ class RAGAnything:
|
|
| 451 |
"""
|
| 452 |
# Ensure LightRAG is initialized
|
| 453 |
await self._ensure_lightrag_initialized()
|
| 454 |
-
|
| 455 |
folder_path = Path(folder_path)
|
| 456 |
if not folder_path.exists() or not folder_path.is_dir():
|
| 457 |
-
raise ValueError(
|
| 458 |
-
|
|
|
|
|
|
|
| 459 |
# Supported file formats
|
| 460 |
supported_extensions = {
|
| 461 |
-
".pdf",
|
| 462 |
-
".
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 463 |
}
|
| 464 |
-
|
| 465 |
# Use specified extensions or all supported formats
|
| 466 |
if file_extensions:
|
| 467 |
target_extensions = set(ext.lower() for ext in file_extensions)
|
| 468 |
# Validate if all are supported formats
|
| 469 |
unsupported = target_extensions - supported_extensions
|
| 470 |
if unsupported:
|
| 471 |
-
self.logger.warning(
|
|
|
|
|
|
|
| 472 |
else:
|
| 473 |
target_extensions = supported_extensions
|
| 474 |
-
|
| 475 |
# Collect all files to process
|
| 476 |
files_to_process = []
|
| 477 |
-
|
| 478 |
if recursive:
|
| 479 |
# Recursively traverse all subfolders
|
| 480 |
for file_path in folder_path.rglob("*"):
|
| 481 |
-
if
|
|
|
|
|
|
|
|
|
|
| 482 |
files_to_process.append(file_path)
|
| 483 |
else:
|
| 484 |
# Process only current folder
|
| 485 |
for file_path in folder_path.glob("*"):
|
| 486 |
-
if
|
|
|
|
|
|
|
|
|
|
| 487 |
files_to_process.append(file_path)
|
| 488 |
-
|
| 489 |
if not files_to_process:
|
| 490 |
self.logger.info(f"No files to process found in {folder_path}")
|
| 491 |
return
|
| 492 |
-
|
| 493 |
self.logger.info(f"Found {len(files_to_process)} files to process")
|
| 494 |
-
self.logger.info(
|
| 495 |
-
|
| 496 |
# Count file types
|
| 497 |
file_type_count = {}
|
| 498 |
for file_path in files_to_process:
|
| 499 |
ext = file_path.suffix.lower()
|
| 500 |
file_type_count[ext] = file_type_count.get(ext, 0) + 1
|
| 501 |
-
|
| 502 |
for ext, count in sorted(file_type_count.items()):
|
| 503 |
self.logger.info(f" {ext}: {count} files")
|
| 504 |
-
|
| 505 |
# Create progress tracking
|
| 506 |
processed_count = 0
|
| 507 |
failed_files = []
|
| 508 |
-
|
| 509 |
# Use semaphore to control concurrency
|
| 510 |
semaphore = asyncio.Semaphore(max_workers)
|
| 511 |
-
|
| 512 |
async def process_single_file(file_path: Path, index: int) -> None:
|
| 513 |
"""Process a single file"""
|
| 514 |
async with semaphore:
|
| 515 |
nonlocal processed_count
|
| 516 |
try:
|
| 517 |
-
self.logger.info(
|
| 518 |
-
|
|
|
|
|
|
|
| 519 |
# Create separate output directory for each file
|
| 520 |
file_output_dir = Path(output_dir) / file_path.stem
|
| 521 |
file_output_dir.mkdir(parents=True, exist_ok=True)
|
| 522 |
-
|
| 523 |
# Process file
|
| 524 |
await self.process_document_complete(
|
| 525 |
file_path=str(file_path),
|
|
@@ -527,56 +561,56 @@ class RAGAnything:
|
|
| 527 |
parse_method=parse_method,
|
| 528 |
display_stats=display_stats,
|
| 529 |
split_by_character=split_by_character,
|
| 530 |
-
split_by_character_only=split_by_character_only
|
| 531 |
)
|
| 532 |
-
|
| 533 |
processed_count += 1
|
| 534 |
-
self.logger.info(
|
| 535 |
-
|
|
|
|
|
|
|
| 536 |
except Exception as e:
|
| 537 |
-
self.logger.error(
|
|
|
|
|
|
|
| 538 |
self.logger.error(f"Error: {str(e)}")
|
| 539 |
failed_files.append((file_path, str(e)))
|
| 540 |
-
|
| 541 |
# Create all processing tasks
|
| 542 |
tasks = []
|
| 543 |
for index, file_path in enumerate(files_to_process, 1):
|
| 544 |
task = process_single_file(file_path, index)
|
| 545 |
tasks.append(task)
|
| 546 |
-
|
| 547 |
# Wait for all tasks to complete
|
| 548 |
await asyncio.gather(*tasks, return_exceptions=True)
|
| 549 |
-
|
| 550 |
# Output processing statistics
|
| 551 |
self.logger.info("\n===== Batch Processing Complete =====")
|
| 552 |
self.logger.info(f"Total files: {len(files_to_process)}")
|
| 553 |
self.logger.info(f"Successfully processed: {processed_count}")
|
| 554 |
self.logger.info(f"Failed: {len(failed_files)}")
|
| 555 |
-
|
| 556 |
if failed_files:
|
| 557 |
self.logger.info("\nFailed files:")
|
| 558 |
for file_path, error in failed_files:
|
| 559 |
self.logger.info(f" - {file_path}: {error}")
|
| 560 |
-
|
| 561 |
return {
|
| 562 |
"total": len(files_to_process),
|
| 563 |
"success": processed_count,
|
| 564 |
"failed": len(failed_files),
|
| 565 |
-
"failed_files": failed_files
|
| 566 |
}
|
| 567 |
|
| 568 |
-
async def query_with_multimodal(
|
| 569 |
-
self,
|
| 570 |
-
query: str,
|
| 571 |
-
mode: str = "hybrid"
|
| 572 |
-
) -> str:
|
| 573 |
"""
|
| 574 |
Query with multimodal content support
|
| 575 |
-
|
| 576 |
Args:
|
| 577 |
query: Query content
|
| 578 |
mode: Query mode
|
| 579 |
-
|
| 580 |
Returns:
|
| 581 |
Query result
|
| 582 |
"""
|
|
@@ -588,45 +622,65 @@ class RAGAnything:
|
|
| 588 |
"2. Process documents first using process_document_complete() or process_folder_complete() "
|
| 589 |
"to create and populate the LightRAG instance."
|
| 590 |
)
|
| 591 |
-
|
| 592 |
-
result = await self.lightrag.aquery(
|
| 593 |
-
|
| 594 |
-
param=QueryParam(mode=mode)
|
| 595 |
-
)
|
| 596 |
-
|
| 597 |
return result
|
| 598 |
|
| 599 |
def get_processor_info(self) -> Dict[str, Any]:
|
| 600 |
"""Get processor information"""
|
| 601 |
if not self.modal_processors:
|
| 602 |
return {"status": "Not initialized"}
|
| 603 |
-
|
| 604 |
info = {
|
| 605 |
"status": "Initialized",
|
| 606 |
"processors": {},
|
| 607 |
"models": {
|
| 608 |
-
"llm_model": "External function"
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 612 |
}
|
| 613 |
-
|
| 614 |
for proc_type, processor in self.modal_processors.items():
|
| 615 |
info["processors"][proc_type] = {
|
| 616 |
"class": processor.__class__.__name__,
|
| 617 |
-
"supports": self._get_processor_supports(proc_type)
|
| 618 |
}
|
| 619 |
-
|
| 620 |
return info
|
| 621 |
|
| 622 |
def _get_processor_supports(self, proc_type: str) -> List[str]:
|
| 623 |
"""Get processor supported features"""
|
| 624 |
supports_map = {
|
| 625 |
-
"image": [
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 629 |
}
|
| 630 |
return supports_map.get(proc_type, ["Basic processing"])
|
| 631 |
-
|
| 632 |
-
|
|
|
|
| 26 |
# Import specialized processors
|
| 27 |
from lightrag.modalprocessors import (
|
| 28 |
ImageModalProcessor,
|
| 29 |
+
TableModalProcessor,
|
| 30 |
EquationModalProcessor,
|
| 31 |
+
GenericModalProcessor,
|
| 32 |
)
|
| 33 |
|
| 34 |
|
| 35 |
class RAGAnything:
|
| 36 |
"""Multimodal Document Processing Pipeline - Complete document parsing and insertion pipeline"""
|
| 37 |
+
|
| 38 |
def __init__(
|
| 39 |
self,
|
| 40 |
lightrag: Optional[LightRAG] = None,
|
|
|
|
| 43 |
embedding_func: Optional[Callable] = None,
|
| 44 |
working_dir: str = "./rag_storage",
|
| 45 |
embedding_dim: int = 3072,
|
| 46 |
+
max_token_size: int = 8192,
|
| 47 |
):
|
| 48 |
"""
|
| 49 |
Initialize Multimodal Document Processing Pipeline
|
| 50 |
+
|
| 51 |
Args:
|
| 52 |
lightrag: Optional pre-initialized LightRAG instance
|
| 53 |
llm_model_func: LLM model function for text analysis
|
|
|
|
| 63 |
self.embedding_func = embedding_func
|
| 64 |
self.embedding_dim = embedding_dim
|
| 65 |
self.max_token_size = max_token_size
|
| 66 |
+
|
| 67 |
# Set up logging
|
| 68 |
setup_logger("RAGAnything")
|
| 69 |
self.logger = logging.getLogger("RAGAnything")
|
| 70 |
+
|
| 71 |
# Create working directory if needed
|
| 72 |
if not os.path.exists(working_dir):
|
| 73 |
os.makedirs(working_dir)
|
| 74 |
+
|
| 75 |
# Use provided LightRAG or mark for later initialization
|
| 76 |
self.lightrag = lightrag
|
| 77 |
self.modal_processors = {}
|
| 78 |
+
|
| 79 |
# If LightRAG is provided, initialize processors immediately
|
| 80 |
if self.lightrag is not None:
|
| 81 |
self._initialize_processors()
|
| 82 |
+
|
| 83 |
def _initialize_processors(self):
|
| 84 |
"""Initialize multimodal processors with appropriate model functions"""
|
| 85 |
if self.lightrag is None:
|
| 86 |
+
raise ValueError(
|
| 87 |
+
"LightRAG instance must be initialized before creating processors"
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
# Create different multimodal processors
|
| 91 |
self.modal_processors = {
|
| 92 |
"image": ImageModalProcessor(
|
| 93 |
lightrag=self.lightrag,
|
| 94 |
+
modal_caption_func=self.vision_model_func or self.llm_model_func,
|
| 95 |
),
|
| 96 |
"table": TableModalProcessor(
|
| 97 |
+
lightrag=self.lightrag, modal_caption_func=self.llm_model_func
|
|
|
|
| 98 |
),
|
| 99 |
"equation": EquationModalProcessor(
|
| 100 |
+
lightrag=self.lightrag, modal_caption_func=self.llm_model_func
|
|
|
|
| 101 |
),
|
| 102 |
"generic": GenericModalProcessor(
|
| 103 |
+
lightrag=self.lightrag, modal_caption_func=self.llm_model_func
|
| 104 |
+
),
|
|
|
|
| 105 |
}
|
| 106 |
+
|
| 107 |
self.logger.info("Multimodal processors initialized")
|
| 108 |
self.logger.info(f"Available processors: {list(self.modal_processors.keys())}")
|
| 109 |
+
|
| 110 |
async def _ensure_lightrag_initialized(self):
|
| 111 |
"""Ensure LightRAG instance is initialized, create if necessary"""
|
| 112 |
if self.lightrag is not None:
|
| 113 |
return
|
| 114 |
+
|
| 115 |
# Validate required functions
|
| 116 |
if self.llm_model_func is None:
|
| 117 |
+
raise ValueError(
|
| 118 |
+
"llm_model_func must be provided when LightRAG is not pre-initialized"
|
| 119 |
+
)
|
| 120 |
if self.embedding_func is None:
|
| 121 |
+
raise ValueError(
|
| 122 |
+
"embedding_func must be provided when LightRAG is not pre-initialized"
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
from lightrag.kg.shared_storage import initialize_pipeline_status
|
| 126 |
+
|
| 127 |
# Create LightRAG instance with provided functions
|
| 128 |
self.lightrag = LightRAG(
|
| 129 |
working_dir=self.working_dir,
|
|
|
|
| 137 |
|
| 138 |
await self.lightrag.initialize_storages()
|
| 139 |
await initialize_pipeline_status()
|
| 140 |
+
|
| 141 |
# Initialize processors after LightRAG is ready
|
| 142 |
self._initialize_processors()
|
| 143 |
+
|
| 144 |
self.logger.info("LightRAG and multimodal processors initialized")
|
| 145 |
|
| 146 |
def parse_document(
|
| 147 |
+
self,
|
| 148 |
+
file_path: str,
|
| 149 |
output_dir: str = "./output",
|
| 150 |
parse_method: str = "auto",
|
| 151 |
+
display_stats: bool = True,
|
| 152 |
) -> Tuple[List[Dict[str, Any]], str]:
|
| 153 |
"""
|
| 154 |
Parse document using MinerU
|
| 155 |
+
|
| 156 |
Args:
|
| 157 |
file_path: Path to the file to parse
|
| 158 |
output_dir: Output directory
|
| 159 |
parse_method: Parse method ("auto", "ocr", "txt")
|
| 160 |
display_stats: Whether to display content statistics
|
| 161 |
+
|
| 162 |
Returns:
|
| 163 |
(content_list, md_content): Content list and markdown text
|
| 164 |
"""
|
| 165 |
self.logger.info(f"Starting document parsing: {file_path}")
|
| 166 |
+
|
| 167 |
file_path = Path(file_path)
|
| 168 |
if not file_path.exists():
|
| 169 |
raise FileNotFoundError(f"File not found: {file_path}")
|
| 170 |
+
|
| 171 |
# Choose appropriate parsing method based on file extension
|
| 172 |
ext = file_path.suffix.lower()
|
| 173 |
+
|
| 174 |
try:
|
| 175 |
if ext in [".pdf"]:
|
| 176 |
+
self.logger.info(
|
| 177 |
+
f"Detected PDF file, using PDF parser (OCR={parse_method == 'ocr'})..."
|
| 178 |
+
)
|
| 179 |
content_list, md_content = MineruParser.parse_pdf(
|
| 180 |
+
file_path, output_dir, use_ocr=(parse_method == "ocr")
|
|
|
|
|
|
|
| 181 |
)
|
| 182 |
elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]:
|
| 183 |
self.logger.info("Detected image file, using image parser...")
|
| 184 |
content_list, md_content = MineruParser.parse_image(
|
| 185 |
+
file_path, output_dir
|
|
|
|
| 186 |
)
|
| 187 |
elif ext in [".doc", ".docx", ".ppt", ".pptx"]:
|
| 188 |
self.logger.info("Detected Office document, using Office parser...")
|
| 189 |
content_list, md_content = MineruParser.parse_office_doc(
|
| 190 |
+
file_path, output_dir
|
|
|
|
| 191 |
)
|
| 192 |
else:
|
| 193 |
# For other or unknown formats, use generic parser
|
| 194 |
+
self.logger.info(
|
| 195 |
+
f"Using generic parser for {ext} file (method={parse_method})..."
|
| 196 |
+
)
|
| 197 |
content_list, md_content = MineruParser.parse_document(
|
| 198 |
+
file_path, parse_method=parse_method, output_dir=output_dir
|
|
|
|
|
|
|
| 199 |
)
|
| 200 |
+
|
| 201 |
except Exception as e:
|
| 202 |
self.logger.error(f"Error during parsing with specific parser: {str(e)}")
|
| 203 |
self.logger.warning("Falling back to generic parser...")
|
| 204 |
# If specific parser fails, fall back to generic parser
|
| 205 |
content_list, md_content = MineruParser.parse_document(
|
| 206 |
+
file_path, parse_method=parse_method, output_dir=output_dir
|
|
|
|
|
|
|
| 207 |
)
|
| 208 |
+
|
| 209 |
+
self.logger.info(
|
| 210 |
+
f"Parsing complete! Extracted {len(content_list)} content blocks"
|
| 211 |
+
)
|
| 212 |
self.logger.info(f"Markdown text length: {len(md_content)} characters")
|
| 213 |
+
|
| 214 |
# Display content statistics if requested
|
| 215 |
if display_stats:
|
| 216 |
self.logger.info("\nContent Information:")
|
| 217 |
self.logger.info(f"* Total blocks in content_list: {len(content_list)}")
|
| 218 |
self.logger.info(f"* Markdown content length: {len(md_content)} characters")
|
| 219 |
+
|
| 220 |
# Count elements by type
|
| 221 |
block_types: Dict[str, int] = {}
|
| 222 |
for block in content_list:
|
|
|
|
| 224 |
block_type = block.get("type", "unknown")
|
| 225 |
if isinstance(block_type, str):
|
| 226 |
block_types[block_type] = block_types.get(block_type, 0) + 1
|
| 227 |
+
|
| 228 |
self.logger.info("* Content block types:")
|
| 229 |
for block_type, count in block_types.items():
|
| 230 |
self.logger.info(f" - {block_type}: {count}")
|
| 231 |
+
|
| 232 |
return content_list, md_content
|
| 233 |
|
| 234 |
+
def _separate_content(
|
| 235 |
+
self, content_list: List[Dict[str, Any]]
|
| 236 |
+
) -> Tuple[str, List[Dict[str, Any]]]:
|
| 237 |
"""
|
| 238 |
Separate text content and multimodal content
|
| 239 |
+
|
| 240 |
Args:
|
| 241 |
content_list: Content list from MinerU parsing
|
| 242 |
+
|
| 243 |
Returns:
|
| 244 |
(text_content, multimodal_items): Pure text content and multimodal items list
|
| 245 |
"""
|
| 246 |
text_parts = []
|
| 247 |
multimodal_items = []
|
| 248 |
+
|
| 249 |
for item in content_list:
|
| 250 |
content_type = item.get("type", "text")
|
| 251 |
+
|
| 252 |
if content_type == "text":
|
| 253 |
# Text content
|
| 254 |
text = item.get("text", "")
|
|
|
|
| 257 |
else:
|
| 258 |
# Multimodal content (image, table, equation, etc.)
|
| 259 |
multimodal_items.append(item)
|
| 260 |
+
|
| 261 |
# Merge all text content
|
| 262 |
text_content = "\n\n".join(text_parts)
|
| 263 |
+
|
| 264 |
+
self.logger.info("Content separation complete:")
|
| 265 |
self.logger.info(f" - Text content length: {len(text_content)} characters")
|
| 266 |
self.logger.info(f" - Multimodal items count: {len(multimodal_items)}")
|
| 267 |
+
|
| 268 |
# Count multimodal types
|
| 269 |
modal_types = {}
|
| 270 |
for item in multimodal_items:
|
| 271 |
modal_type = item.get("type", "unknown")
|
| 272 |
modal_types[modal_type] = modal_types.get(modal_type, 0) + 1
|
| 273 |
+
|
| 274 |
if modal_types:
|
| 275 |
self.logger.info(f" - Multimodal type distribution: {modal_types}")
|
| 276 |
+
|
| 277 |
return text_content, multimodal_items
|
| 278 |
|
| 279 |
async def _insert_text_content(
|
| 280 |
+
self,
|
| 281 |
input: str | list[str],
|
| 282 |
split_by_character: str | None = None,
|
| 283 |
split_by_character_only: bool = False,
|
|
|
|
| 286 |
):
|
| 287 |
"""
|
| 288 |
Insert pure text content into LightRAG
|
| 289 |
+
|
| 290 |
Args:
|
| 291 |
input: Single document string or list of document strings
|
| 292 |
split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
|
|
|
|
| 295 |
split_by_character is None, this parameter is ignored.
|
| 296 |
ids: single string of the document ID or list of unique document IDs, if not provided, MD5 hash IDs will be generated
|
| 297 |
file_paths: single string of the file path or list of file paths, used for citation
|
| 298 |
+
"""
|
| 299 |
self.logger.info("Starting text content insertion into LightRAG...")
|
| 300 |
+
|
| 301 |
# Use LightRAG's insert method with all parameters
|
| 302 |
await self.lightrag.ainsert(
|
| 303 |
input=input,
|
| 304 |
file_paths=file_paths,
|
| 305 |
split_by_character=split_by_character,
|
| 306 |
split_by_character_only=split_by_character_only,
|
| 307 |
+
ids=ids,
|
| 308 |
)
|
| 309 |
+
|
| 310 |
self.logger.info("Text content insertion complete")
|
| 311 |
|
| 312 |
+
async def _process_multimodal_content(
|
| 313 |
+
self, multimodal_items: List[Dict[str, Any]], file_path: str
|
| 314 |
+
):
|
| 315 |
"""
|
| 316 |
Process multimodal content (using specialized processors)
|
| 317 |
+
|
| 318 |
Args:
|
| 319 |
multimodal_items: List of multimodal items
|
| 320 |
file_path: File path (for reference)
|
|
|
|
| 322 |
if not multimodal_items:
|
| 323 |
self.logger.debug("No multimodal content to process")
|
| 324 |
return
|
| 325 |
+
|
| 326 |
self.logger.info("Starting multimodal content processing...")
|
| 327 |
+
|
| 328 |
file_name = os.path.basename(file_path)
|
| 329 |
+
|
| 330 |
for i, item in enumerate(multimodal_items):
|
| 331 |
try:
|
| 332 |
content_type = item.get("type", "unknown")
|
| 333 |
+
self.logger.info(
|
| 334 |
+
f"Processing item {i+1}/{len(multimodal_items)}: {content_type} content"
|
| 335 |
+
)
|
| 336 |
+
|
| 337 |
# Select appropriate processor
|
| 338 |
processor = self._get_processor_for_type(content_type)
|
| 339 |
+
|
| 340 |
if processor:
|
| 341 |
+
(
|
| 342 |
+
enhanced_caption,
|
| 343 |
+
entity_info,
|
| 344 |
+
) = await processor.process_multimodal_content(
|
| 345 |
modal_content=item,
|
| 346 |
content_type=content_type,
|
| 347 |
+
file_path=file_name,
|
| 348 |
+
)
|
| 349 |
+
self.logger.info(
|
| 350 |
+
f"{content_type} processing complete: {entity_info.get('entity_name', 'Unknown')}"
|
| 351 |
)
|
|
|
|
| 352 |
else:
|
| 353 |
+
self.logger.warning(
|
| 354 |
+
f"No suitable processor found for {content_type} type content"
|
| 355 |
+
)
|
| 356 |
+
|
| 357 |
except Exception as e:
|
| 358 |
self.logger.error(f"Error processing multimodal content: {str(e)}")
|
| 359 |
self.logger.debug("Exception details:", exc_info=True)
|
| 360 |
continue
|
| 361 |
+
|
| 362 |
self.logger.info("Multimodal content processing complete")
|
| 363 |
|
| 364 |
def _get_processor_for_type(self, content_type: str):
|
| 365 |
"""
|
| 366 |
Get appropriate processor based on content type
|
| 367 |
+
|
| 368 |
Args:
|
| 369 |
content_type: Content type
|
| 370 |
+
|
| 371 |
Returns:
|
| 372 |
Corresponding processor instance
|
| 373 |
"""
|
|
|
|
| 383 |
return self.modal_processors.get("generic")
|
| 384 |
|
| 385 |
async def process_document_complete(
|
| 386 |
+
self,
|
| 387 |
+
file_path: str,
|
| 388 |
output_dir: str = "./output",
|
| 389 |
parse_method: str = "auto",
|
| 390 |
display_stats: bool = True,
|
| 391 |
split_by_character: str | None = None,
|
| 392 |
split_by_character_only: bool = False,
|
| 393 |
+
doc_id: str | None = None,
|
| 394 |
):
|
| 395 |
"""
|
| 396 |
Complete document processing workflow
|
| 397 |
+
|
| 398 |
Args:
|
| 399 |
file_path: Path to the file to process
|
| 400 |
output_dir: MinerU output directory
|
|
|
|
| 406 |
"""
|
| 407 |
# Ensure LightRAG is initialized
|
| 408 |
await self._ensure_lightrag_initialized()
|
| 409 |
+
|
| 410 |
self.logger.info(f"Starting complete document processing: {file_path}")
|
| 411 |
+
|
| 412 |
# Step 1: Parse document using MinerU
|
| 413 |
content_list, md_content = self.parse_document(
|
| 414 |
+
file_path, output_dir, parse_method, display_stats
|
|
|
|
|
|
|
|
|
|
| 415 |
)
|
| 416 |
+
|
| 417 |
# Step 2: Separate text and multimodal content
|
| 418 |
text_content, multimodal_items = self._separate_content(content_list)
|
| 419 |
+
|
| 420 |
# Step 3: Insert pure text content with all parameters
|
| 421 |
if text_content.strip():
|
| 422 |
file_name = os.path.basename(file_path)
|
| 423 |
await self._insert_text_content(
|
| 424 |
+
text_content,
|
| 425 |
file_paths=file_name,
|
| 426 |
split_by_character=split_by_character,
|
| 427 |
split_by_character_only=split_by_character_only,
|
| 428 |
+
ids=doc_id,
|
| 429 |
)
|
| 430 |
+
|
| 431 |
# Step 4: Process multimodal content (using specialized processors)
|
| 432 |
if multimodal_items:
|
| 433 |
await self._process_multimodal_content(multimodal_items, file_path)
|
| 434 |
+
|
| 435 |
self.logger.info(f"Document {file_path} processing complete!")
|
| 436 |
|
| 437 |
async def process_folder_complete(
|
|
|
|
| 444 |
split_by_character_only: bool = False,
|
| 445 |
file_extensions: Optional[List[str]] = None,
|
| 446 |
recursive: bool = True,
|
| 447 |
+
max_workers: int = 1,
|
| 448 |
):
|
| 449 |
"""
|
| 450 |
Process all files in a folder in batch
|
| 451 |
+
|
| 452 |
Args:
|
| 453 |
folder_path: Path to the folder to process
|
| 454 |
output_dir: MinerU output directory
|
|
|
|
| 462 |
"""
|
| 463 |
# Ensure LightRAG is initialized
|
| 464 |
await self._ensure_lightrag_initialized()
|
| 465 |
+
|
| 466 |
folder_path = Path(folder_path)
|
| 467 |
if not folder_path.exists() or not folder_path.is_dir():
|
| 468 |
+
raise ValueError(
|
| 469 |
+
f"Folder does not exist or is not a valid directory: {folder_path}"
|
| 470 |
+
)
|
| 471 |
+
|
| 472 |
# Supported file formats
|
| 473 |
supported_extensions = {
|
| 474 |
+
".pdf",
|
| 475 |
+
".jpg",
|
| 476 |
+
".jpeg",
|
| 477 |
+
".png",
|
| 478 |
+
".bmp",
|
| 479 |
+
".tiff",
|
| 480 |
+
".tif",
|
| 481 |
+
".doc",
|
| 482 |
+
".docx",
|
| 483 |
+
".ppt",
|
| 484 |
+
".pptx",
|
| 485 |
+
".txt",
|
| 486 |
+
".md",
|
| 487 |
}
|
| 488 |
+
|
| 489 |
# Use specified extensions or all supported formats
|
| 490 |
if file_extensions:
|
| 491 |
target_extensions = set(ext.lower() for ext in file_extensions)
|
| 492 |
# Validate if all are supported formats
|
| 493 |
unsupported = target_extensions - supported_extensions
|
| 494 |
if unsupported:
|
| 495 |
+
self.logger.warning(
|
| 496 |
+
f"The following file formats may not be fully supported: {unsupported}"
|
| 497 |
+
)
|
| 498 |
else:
|
| 499 |
target_extensions = supported_extensions
|
| 500 |
+
|
| 501 |
# Collect all files to process
|
| 502 |
files_to_process = []
|
| 503 |
+
|
| 504 |
if recursive:
|
| 505 |
# Recursively traverse all subfolders
|
| 506 |
for file_path in folder_path.rglob("*"):
|
| 507 |
+
if (
|
| 508 |
+
file_path.is_file()
|
| 509 |
+
and file_path.suffix.lower() in target_extensions
|
| 510 |
+
):
|
| 511 |
files_to_process.append(file_path)
|
| 512 |
else:
|
| 513 |
# Process only current folder
|
| 514 |
for file_path in folder_path.glob("*"):
|
| 515 |
+
if (
|
| 516 |
+
file_path.is_file()
|
| 517 |
+
and file_path.suffix.lower() in target_extensions
|
| 518 |
+
):
|
| 519 |
files_to_process.append(file_path)
|
| 520 |
+
|
| 521 |
if not files_to_process:
|
| 522 |
self.logger.info(f"No files to process found in {folder_path}")
|
| 523 |
return
|
| 524 |
+
|
| 525 |
self.logger.info(f"Found {len(files_to_process)} files to process")
|
| 526 |
+
self.logger.info("File type distribution:")
|
| 527 |
+
|
| 528 |
# Count file types
|
| 529 |
file_type_count = {}
|
| 530 |
for file_path in files_to_process:
|
| 531 |
ext = file_path.suffix.lower()
|
| 532 |
file_type_count[ext] = file_type_count.get(ext, 0) + 1
|
| 533 |
+
|
| 534 |
for ext, count in sorted(file_type_count.items()):
|
| 535 |
self.logger.info(f" {ext}: {count} files")
|
| 536 |
+
|
| 537 |
# Create progress tracking
|
| 538 |
processed_count = 0
|
| 539 |
failed_files = []
|
| 540 |
+
|
| 541 |
# Use semaphore to control concurrency
|
| 542 |
semaphore = asyncio.Semaphore(max_workers)
|
| 543 |
+
|
| 544 |
async def process_single_file(file_path: Path, index: int) -> None:
|
| 545 |
"""Process a single file"""
|
| 546 |
async with semaphore:
|
| 547 |
nonlocal processed_count
|
| 548 |
try:
|
| 549 |
+
self.logger.info(
|
| 550 |
+
f"[{index}/{len(files_to_process)}] Processing: {file_path}"
|
| 551 |
+
)
|
| 552 |
+
|
| 553 |
# Create separate output directory for each file
|
| 554 |
file_output_dir = Path(output_dir) / file_path.stem
|
| 555 |
file_output_dir.mkdir(parents=True, exist_ok=True)
|
| 556 |
+
|
| 557 |
# Process file
|
| 558 |
await self.process_document_complete(
|
| 559 |
file_path=str(file_path),
|
|
|
|
| 561 |
parse_method=parse_method,
|
| 562 |
display_stats=display_stats,
|
| 563 |
split_by_character=split_by_character,
|
| 564 |
+
split_by_character_only=split_by_character_only,
|
| 565 |
)
|
| 566 |
+
|
| 567 |
processed_count += 1
|
| 568 |
+
self.logger.info(
|
| 569 |
+
f"[{index}/{len(files_to_process)}] Successfully processed: {file_path}"
|
| 570 |
+
)
|
| 571 |
+
|
| 572 |
except Exception as e:
|
| 573 |
+
self.logger.error(
|
| 574 |
+
f"[{index}/{len(files_to_process)}] Failed to process: {file_path}"
|
| 575 |
+
)
|
| 576 |
self.logger.error(f"Error: {str(e)}")
|
| 577 |
failed_files.append((file_path, str(e)))
|
| 578 |
+
|
| 579 |
# Create all processing tasks
|
| 580 |
tasks = []
|
| 581 |
for index, file_path in enumerate(files_to_process, 1):
|
| 582 |
task = process_single_file(file_path, index)
|
| 583 |
tasks.append(task)
|
| 584 |
+
|
| 585 |
# Wait for all tasks to complete
|
| 586 |
await asyncio.gather(*tasks, return_exceptions=True)
|
| 587 |
+
|
| 588 |
# Output processing statistics
|
| 589 |
self.logger.info("\n===== Batch Processing Complete =====")
|
| 590 |
self.logger.info(f"Total files: {len(files_to_process)}")
|
| 591 |
self.logger.info(f"Successfully processed: {processed_count}")
|
| 592 |
self.logger.info(f"Failed: {len(failed_files)}")
|
| 593 |
+
|
| 594 |
if failed_files:
|
| 595 |
self.logger.info("\nFailed files:")
|
| 596 |
for file_path, error in failed_files:
|
| 597 |
self.logger.info(f" - {file_path}: {error}")
|
| 598 |
+
|
| 599 |
return {
|
| 600 |
"total": len(files_to_process),
|
| 601 |
"success": processed_count,
|
| 602 |
"failed": len(failed_files),
|
| 603 |
+
"failed_files": failed_files,
|
| 604 |
}
|
| 605 |
|
| 606 |
+
async def query_with_multimodal(self, query: str, mode: str = "hybrid") -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 607 |
"""
|
| 608 |
Query with multimodal content support
|
| 609 |
+
|
| 610 |
Args:
|
| 611 |
query: Query content
|
| 612 |
mode: Query mode
|
| 613 |
+
|
| 614 |
Returns:
|
| 615 |
Query result
|
| 616 |
"""
|
|
|
|
| 622 |
"2. Process documents first using process_document_complete() or process_folder_complete() "
|
| 623 |
"to create and populate the LightRAG instance."
|
| 624 |
)
|
| 625 |
+
|
| 626 |
+
result = await self.lightrag.aquery(query, param=QueryParam(mode=mode))
|
| 627 |
+
|
|
|
|
|
|
|
|
|
|
| 628 |
return result
|
| 629 |
|
| 630 |
def get_processor_info(self) -> Dict[str, Any]:
|
| 631 |
"""Get processor information"""
|
| 632 |
if not self.modal_processors:
|
| 633 |
return {"status": "Not initialized"}
|
| 634 |
+
|
| 635 |
info = {
|
| 636 |
"status": "Initialized",
|
| 637 |
"processors": {},
|
| 638 |
"models": {
|
| 639 |
+
"llm_model": "External function"
|
| 640 |
+
if self.llm_model_func
|
| 641 |
+
else "Not provided",
|
| 642 |
+
"vision_model": "External function"
|
| 643 |
+
if self.vision_model_func
|
| 644 |
+
else "Not provided",
|
| 645 |
+
"embedding_model": "External function"
|
| 646 |
+
if self.embedding_func
|
| 647 |
+
else "Not provided",
|
| 648 |
+
},
|
| 649 |
}
|
| 650 |
+
|
| 651 |
for proc_type, processor in self.modal_processors.items():
|
| 652 |
info["processors"][proc_type] = {
|
| 653 |
"class": processor.__class__.__name__,
|
| 654 |
+
"supports": self._get_processor_supports(proc_type),
|
| 655 |
}
|
| 656 |
+
|
| 657 |
return info
|
| 658 |
|
| 659 |
def _get_processor_supports(self, proc_type: str) -> List[str]:
|
| 660 |
"""Get processor supported features"""
|
| 661 |
supports_map = {
|
| 662 |
+
"image": [
|
| 663 |
+
"Image content analysis",
|
| 664 |
+
"Visual understanding",
|
| 665 |
+
"Image description generation",
|
| 666 |
+
"Image entity extraction",
|
| 667 |
+
],
|
| 668 |
+
"table": [
|
| 669 |
+
"Table structure analysis",
|
| 670 |
+
"Data statistics",
|
| 671 |
+
"Trend identification",
|
| 672 |
+
"Table entity extraction",
|
| 673 |
+
],
|
| 674 |
+
"equation": [
|
| 675 |
+
"Mathematical formula parsing",
|
| 676 |
+
"Variable identification",
|
| 677 |
+
"Formula meaning explanation",
|
| 678 |
+
"Formula entity extraction",
|
| 679 |
+
],
|
| 680 |
+
"generic": [
|
| 681 |
+
"General content analysis",
|
| 682 |
+
"Structured processing",
|
| 683 |
+
"Entity extraction",
|
| 684 |
+
],
|
| 685 |
}
|
| 686 |
return supports_map.get(proc_type, ["Basic processing"])
|
|
|
|
|
|