zrguo commited on
Commit
120e951
·
1 Parent(s): 60e1dab
docs/mineru_integration_en.md CHANGED
@@ -257,7 +257,7 @@ The processors support different types of content:
257
  - `ImageModalProcessor`: Processes images with captions and footnotes
258
  - `TableModalProcessor`: Processes tables with captions and footnotes
259
  - `EquationModalProcessor`: Processes mathematical equations in LaTeX format
260
- - `GenericModalProcessor`: A base processor that can be extended for custom content types
261
 
262
  > **Note**: A complete working example can be found in `examples/modalprocessors_example.py`. You can run it using:
263
  > ```bash
@@ -357,4 +357,4 @@ description, entity_info = await equation_processor.process_multimodal_content(
357
  )
358
  ```
359
 
360
- </details>
 
257
  - `ImageModalProcessor`: Processes images with captions and footnotes
258
  - `TableModalProcessor`: Processes tables with captions and footnotes
259
  - `EquationModalProcessor`: Processes mathematical equations in LaTeX format
260
+ - `GenericModalProcessor`: A base processor that can be extended for custom content types
261
 
262
  > **Note**: A complete working example can be found in `examples/modalprocessors_example.py`. You can run it using:
263
  > ```bash
 
357
  )
358
  ```
359
 
360
+ </details>
docs/mineru_integration_zh.md CHANGED
@@ -256,7 +256,7 @@ MinerU 配置文件 `magic-pdf.json` 支持多种自定义选项,包括:
256
  - `ImageModalProcessor`:处理带有标题和脚注的图像
257
  - `TableModalProcessor`:处理带有标题和脚注的表格
258
  - `EquationModalProcessor`:处理 LaTeX 格式的数学公式
259
- - `GenericModalProcessor`:可用于扩展自定义内容类型的基础处理器
260
 
261
  > **注意**:完整的可运行示例可以在 `examples/modalprocessors_example.py` 中找到。您可以使用以下命令运行它:
262
  > ```bash
@@ -355,4 +355,4 @@ description, entity_info = await equation_processor.process_multimodal_content(
355
  entity_name="质能方程"
356
  )
357
  ```
358
- </details>
 
256
  - `ImageModalProcessor`:处理带有标题和脚注的图像
257
  - `TableModalProcessor`:处理带有标题和脚注的表格
258
  - `EquationModalProcessor`:处理 LaTeX 格式的数学公式
259
+ - `GenericModalProcessor`:可用于扩展自定义内容类型的基础处理器
260
 
261
  > **注意**:完整的可运行示例可以在 `examples/modalprocessors_example.py` 中找到。您可以使用以下命令运行它:
262
  > ```bash
 
355
  entity_name="质能方程"
356
  )
357
  ```
358
+ </details>
examples/mineru_example.py CHANGED
@@ -10,13 +10,15 @@ This example shows how to:
10
 
11
  import os
12
  import argparse
13
- from pathlib import Path
14
  from lightrag.mineru_parser import MineruParser
15
 
16
- def parse_document(file_path: str, output_dir: str = None, method: str = "auto", stats: bool = False):
 
 
 
17
  """
18
  Parse a document using MinerU parser
19
-
20
  Args:
21
  file_path: Path to the document
22
  output_dir: Output directory for parsed results
@@ -26,22 +28,20 @@ def parse_document(file_path: str, output_dir: str = None, method: str = "auto",
26
  try:
27
  # Parse the document
28
  content_list, md_content = MineruParser.parse_document(
29
- file_path=file_path,
30
- parse_method=method,
31
- output_dir=output_dir
32
  )
33
 
34
  # Display statistics if requested
35
  if stats:
36
  print("\nDocument Statistics:")
37
  print(f"Total content blocks: {len(content_list)}")
38
-
39
  # Count different types of content
40
  content_types = {}
41
  for item in content_list:
42
- content_type = item.get('type', 'unknown')
43
  content_types[content_type] = content_types.get(content_type, 0) + 1
44
-
45
  print("\nContent Type Distribution:")
46
  for content_type, count in content_types.items():
47
  print(f"- {content_type}: {count}")
@@ -52,17 +52,22 @@ def parse_document(file_path: str, output_dir: str = None, method: str = "auto",
52
  print(f"Error parsing document: {str(e)}")
53
  return None, None
54
 
 
55
  def main():
56
  """Main function to run the example"""
57
- parser = argparse.ArgumentParser(description='MinerU Parser Example')
58
- parser.add_argument('file_path', help='Path to the document to parse')
59
- parser.add_argument('--output', '-o', help='Output directory path')
60
- parser.add_argument('--method', '-m',
61
- choices=['auto', 'ocr', 'txt'],
62
- default='auto',
63
- help='Parsing method (auto, ocr, txt)')
64
- parser.add_argument('--stats', action='store_true',
65
- help='Display content statistics')
 
 
 
 
66
 
67
  args = parser.parse_args()
68
 
@@ -72,11 +77,9 @@ def main():
72
 
73
  # Parse document
74
  content_list, md_content = parse_document(
75
- args.file_path,
76
- args.output,
77
- args.method,
78
- args.stats
79
  )
80
 
81
- if __name__ == '__main__':
82
- main()
 
 
10
 
11
  import os
12
  import argparse
 
13
  from lightrag.mineru_parser import MineruParser
14
 
15
+
16
+ def parse_document(
17
+ file_path: str, output_dir: str = None, method: str = "auto", stats: bool = False
18
+ ):
19
  """
20
  Parse a document using MinerU parser
21
+
22
  Args:
23
  file_path: Path to the document
24
  output_dir: Output directory for parsed results
 
28
  try:
29
  # Parse the document
30
  content_list, md_content = MineruParser.parse_document(
31
+ file_path=file_path, parse_method=method, output_dir=output_dir
 
 
32
  )
33
 
34
  # Display statistics if requested
35
  if stats:
36
  print("\nDocument Statistics:")
37
  print(f"Total content blocks: {len(content_list)}")
38
+
39
  # Count different types of content
40
  content_types = {}
41
  for item in content_list:
42
+ content_type = item.get("type", "unknown")
43
  content_types[content_type] = content_types.get(content_type, 0) + 1
44
+
45
  print("\nContent Type Distribution:")
46
  for content_type, count in content_types.items():
47
  print(f"- {content_type}: {count}")
 
52
  print(f"Error parsing document: {str(e)}")
53
  return None, None
54
 
55
+
56
  def main():
57
  """Main function to run the example"""
58
+ parser = argparse.ArgumentParser(description="MinerU Parser Example")
59
+ parser.add_argument("file_path", help="Path to the document to parse")
60
+ parser.add_argument("--output", "-o", help="Output directory path")
61
+ parser.add_argument(
62
+ "--method",
63
+ "-m",
64
+ choices=["auto", "ocr", "txt"],
65
+ default="auto",
66
+ help="Parsing method (auto, ocr, txt)",
67
+ )
68
+ parser.add_argument(
69
+ "--stats", action="store_true", help="Display content statistics"
70
+ )
71
 
72
  args = parser.parse_args()
73
 
 
77
 
78
  # Parse document
79
  content_list, md_content = parse_document(
80
+ args.file_path, args.output, args.method, args.stats
 
 
 
81
  )
82
 
83
+
84
+ if __name__ == "__main__":
85
+ main()
examples/modalprocessors_example.py CHANGED
@@ -8,94 +8,112 @@ import asyncio
8
  import argparse
9
  from lightrag.llm.openai import openai_complete_if_cache, openai_embed
10
  from lightrag.kg.shared_storage import initialize_pipeline_status
11
- from pathlib import Path
12
  from lightrag import LightRAG
13
  from lightrag.modalprocessors import (
14
  ImageModalProcessor,
15
  TableModalProcessor,
16
  EquationModalProcessor,
17
- GenericModalProcessor
18
  )
19
 
20
  WORKING_DIR = "./rag_storage"
21
 
 
22
  def get_llm_model_func(api_key: str, base_url: str = None):
23
- return lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
24
- "gpt-4o-mini",
25
- prompt,
26
- system_prompt=system_prompt,
27
- history_messages=history_messages,
28
- api_key=api_key,
29
- base_url=base_url,
30
- **kwargs,
 
 
 
 
 
31
  )
32
 
 
33
  def get_vision_model_func(api_key: str, base_url: str = None):
34
- return lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
35
- "gpt-4o",
36
- "",
37
  system_prompt=None,
38
  history_messages=[],
39
- messages=[
40
- {"role": "system", "content": system_prompt} if system_prompt else None,
41
- {"role": "user", "content": [
42
- {"type": "text", "text": prompt},
 
 
 
 
43
  {
44
- "type": "image_url",
45
- "image_url": {
46
- "url": f"data:image/jpeg;base64,{image_data}"
47
- }
 
 
 
 
 
 
48
  }
49
- ]} if image_data else {"role": "user", "content": prompt}
50
- ],
51
- api_key=api_key,
52
- base_url=base_url,
53
- **kwargs,
54
- ) if image_data else openai_complete_if_cache(
55
- "gpt-4o-mini",
56
- prompt,
57
- system_prompt=system_prompt,
58
- history_messages=history_messages,
59
- api_key=api_key,
60
- base_url=base_url,
61
- **kwargs,
 
 
 
 
62
  )
63
 
 
64
  async def process_image_example(lightrag: LightRAG, vision_model_func):
65
  """Example of processing an image"""
66
  # Create image processor
67
  image_processor = ImageModalProcessor(
68
- lightrag=lightrag,
69
- modal_caption_func=vision_model_func
70
  )
71
-
72
  # Prepare image content
73
  image_content = {
74
  "img_path": "image.jpg",
75
  "img_caption": ["Example image caption"],
76
- "img_footnote": ["Example image footnote"]
77
  }
78
-
79
  # Process image
80
  description, entity_info = await image_processor.process_multimodal_content(
81
  modal_content=image_content,
82
  content_type="image",
83
  file_path="image_example.jpg",
84
- entity_name="Example Image"
85
  )
86
-
87
  print("Image Processing Results:")
88
  print(f"Description: {description}")
89
  print(f"Entity Info: {entity_info}")
90
 
 
91
  async def process_table_example(lightrag: LightRAG, llm_model_func):
92
  """Example of processing a table"""
93
  # Create table processor
94
  table_processor = TableModalProcessor(
95
- lightrag=lightrag,
96
- modal_caption_func=llm_model_func
97
  )
98
-
99
  # Prepare table content
100
  table_content = {
101
  "table_body": """
@@ -105,47 +123,45 @@ async def process_table_example(lightrag: LightRAG, llm_model_func):
105
  | Mary | 30 | Designer |
106
  """,
107
  "table_caption": ["Employee Information Table"],
108
- "table_footnote": ["Data updated as of 2024"]
109
  }
110
-
111
  # Process table
112
  description, entity_info = await table_processor.process_multimodal_content(
113
  modal_content=table_content,
114
  content_type="table",
115
  file_path="table_example.md",
116
- entity_name="Employee Table"
117
  )
118
-
119
  print("\nTable Processing Results:")
120
  print(f"Description: {description}")
121
  print(f"Entity Info: {entity_info}")
122
 
 
123
  async def process_equation_example(lightrag: LightRAG, llm_model_func):
124
  """Example of processing a mathematical equation"""
125
  # Create equation processor
126
  equation_processor = EquationModalProcessor(
127
- lightrag=lightrag,
128
- modal_caption_func=llm_model_func
129
  )
130
-
131
  # Prepare equation content
132
- equation_content = {
133
- "text": "E = mc^2",
134
- "text_format": "LaTeX"
135
- }
136
-
137
  # Process equation
138
  description, entity_info = await equation_processor.process_multimodal_content(
139
  modal_content=equation_content,
140
  content_type="equation",
141
  file_path="equation_example.txt",
142
- entity_name="Mass-Energy Equivalence"
143
  )
144
-
145
  print("\nEquation Processing Results:")
146
  print(f"Description: {description}")
147
  print(f"Entity Info: {entity_info}")
148
 
 
149
  async def initialize_rag(api_key: str, base_url: str = None):
150
  rag = LightRAG(
151
  working_dir=WORKING_DIR,
@@ -155,7 +171,10 @@ async def initialize_rag(api_key: str, base_url: str = None):
155
  api_key=api_key,
156
  base_url=base_url,
157
  ),
158
- llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
 
 
 
159
  "gpt-4o-mini",
160
  prompt,
161
  system_prompt=system_prompt,
@@ -171,30 +190,35 @@ async def initialize_rag(api_key: str, base_url: str = None):
171
 
172
  return rag
173
 
 
174
  def main():
175
  """Main function to run the example"""
176
- parser = argparse.ArgumentParser(description='Modal Processors Example')
177
- parser.add_argument('--api-key', required=True, help='OpenAI API key')
178
- parser.add_argument('--base-url', help='Optional base URL for API')
179
- parser.add_argument('--working-dir', '-w', default=WORKING_DIR, help='Working directory path')
 
 
180
 
181
  args = parser.parse_args()
182
 
183
  # Run examples
184
  asyncio.run(main_async(args.api_key, args.base_url))
185
 
 
186
  async def main_async(api_key: str, base_url: str = None):
187
  # Initialize LightRAG
188
  lightrag = await initialize_rag(api_key, base_url)
189
-
190
  # Get model functions
191
  llm_model_func = get_llm_model_func(api_key, base_url)
192
  vision_model_func = get_vision_model_func(api_key, base_url)
193
-
194
  # Run examples
195
  await process_image_example(lightrag, vision_model_func)
196
  await process_table_example(lightrag, llm_model_func)
197
  await process_equation_example(lightrag, llm_model_func)
198
 
 
199
  if __name__ == "__main__":
200
- main()
 
8
  import argparse
9
  from lightrag.llm.openai import openai_complete_if_cache, openai_embed
10
  from lightrag.kg.shared_storage import initialize_pipeline_status
 
11
  from lightrag import LightRAG
12
  from lightrag.modalprocessors import (
13
  ImageModalProcessor,
14
  TableModalProcessor,
15
  EquationModalProcessor,
 
16
  )
17
 
18
  WORKING_DIR = "./rag_storage"
19
 
20
+
21
  def get_llm_model_func(api_key: str, base_url: str = None):
22
+ return (
23
+ lambda prompt,
24
+ system_prompt=None,
25
+ history_messages=[],
26
+ **kwargs: openai_complete_if_cache(
27
+ "gpt-4o-mini",
28
+ prompt,
29
+ system_prompt=system_prompt,
30
+ history_messages=history_messages,
31
+ api_key=api_key,
32
+ base_url=base_url,
33
+ **kwargs,
34
+ )
35
  )
36
 
37
+
38
  def get_vision_model_func(api_key: str, base_url: str = None):
39
+ return (
40
+ lambda prompt,
 
41
  system_prompt=None,
42
  history_messages=[],
43
+ image_data=None,
44
+ **kwargs: openai_complete_if_cache(
45
+ "gpt-4o",
46
+ "",
47
+ system_prompt=None,
48
+ history_messages=[],
49
+ messages=[
50
+ {"role": "system", "content": system_prompt} if system_prompt else None,
51
  {
52
+ "role": "user",
53
+ "content": [
54
+ {"type": "text", "text": prompt},
55
+ {
56
+ "type": "image_url",
57
+ "image_url": {
58
+ "url": f"data:image/jpeg;base64,{image_data}"
59
+ },
60
+ },
61
+ ],
62
  }
63
+ if image_data
64
+ else {"role": "user", "content": prompt},
65
+ ],
66
+ api_key=api_key,
67
+ base_url=base_url,
68
+ **kwargs,
69
+ )
70
+ if image_data
71
+ else openai_complete_if_cache(
72
+ "gpt-4o-mini",
73
+ prompt,
74
+ system_prompt=system_prompt,
75
+ history_messages=history_messages,
76
+ api_key=api_key,
77
+ base_url=base_url,
78
+ **kwargs,
79
+ )
80
  )
81
 
82
+
83
  async def process_image_example(lightrag: LightRAG, vision_model_func):
84
  """Example of processing an image"""
85
  # Create image processor
86
  image_processor = ImageModalProcessor(
87
+ lightrag=lightrag, modal_caption_func=vision_model_func
 
88
  )
89
+
90
  # Prepare image content
91
  image_content = {
92
  "img_path": "image.jpg",
93
  "img_caption": ["Example image caption"],
94
+ "img_footnote": ["Example image footnote"],
95
  }
96
+
97
  # Process image
98
  description, entity_info = await image_processor.process_multimodal_content(
99
  modal_content=image_content,
100
  content_type="image",
101
  file_path="image_example.jpg",
102
+ entity_name="Example Image",
103
  )
104
+
105
  print("Image Processing Results:")
106
  print(f"Description: {description}")
107
  print(f"Entity Info: {entity_info}")
108
 
109
+
110
  async def process_table_example(lightrag: LightRAG, llm_model_func):
111
  """Example of processing a table"""
112
  # Create table processor
113
  table_processor = TableModalProcessor(
114
+ lightrag=lightrag, modal_caption_func=llm_model_func
 
115
  )
116
+
117
  # Prepare table content
118
  table_content = {
119
  "table_body": """
 
123
  | Mary | 30 | Designer |
124
  """,
125
  "table_caption": ["Employee Information Table"],
126
+ "table_footnote": ["Data updated as of 2024"],
127
  }
128
+
129
  # Process table
130
  description, entity_info = await table_processor.process_multimodal_content(
131
  modal_content=table_content,
132
  content_type="table",
133
  file_path="table_example.md",
134
+ entity_name="Employee Table",
135
  )
136
+
137
  print("\nTable Processing Results:")
138
  print(f"Description: {description}")
139
  print(f"Entity Info: {entity_info}")
140
 
141
+
142
  async def process_equation_example(lightrag: LightRAG, llm_model_func):
143
  """Example of processing a mathematical equation"""
144
  # Create equation processor
145
  equation_processor = EquationModalProcessor(
146
+ lightrag=lightrag, modal_caption_func=llm_model_func
 
147
  )
148
+
149
  # Prepare equation content
150
+ equation_content = {"text": "E = mc^2", "text_format": "LaTeX"}
151
+
 
 
 
152
  # Process equation
153
  description, entity_info = await equation_processor.process_multimodal_content(
154
  modal_content=equation_content,
155
  content_type="equation",
156
  file_path="equation_example.txt",
157
+ entity_name="Mass-Energy Equivalence",
158
  )
159
+
160
  print("\nEquation Processing Results:")
161
  print(f"Description: {description}")
162
  print(f"Entity Info: {entity_info}")
163
 
164
+
165
  async def initialize_rag(api_key: str, base_url: str = None):
166
  rag = LightRAG(
167
  working_dir=WORKING_DIR,
 
171
  api_key=api_key,
172
  base_url=base_url,
173
  ),
174
+ llm_model_func=lambda prompt,
175
+ system_prompt=None,
176
+ history_messages=[],
177
+ **kwargs: openai_complete_if_cache(
178
  "gpt-4o-mini",
179
  prompt,
180
  system_prompt=system_prompt,
 
190
 
191
  return rag
192
 
193
+
194
  def main():
195
  """Main function to run the example"""
196
+ parser = argparse.ArgumentParser(description="Modal Processors Example")
197
+ parser.add_argument("--api-key", required=True, help="OpenAI API key")
198
+ parser.add_argument("--base-url", help="Optional base URL for API")
199
+ parser.add_argument(
200
+ "--working-dir", "-w", default=WORKING_DIR, help="Working directory path"
201
+ )
202
 
203
  args = parser.parse_args()
204
 
205
  # Run examples
206
  asyncio.run(main_async(args.api_key, args.base_url))
207
 
208
+
209
  async def main_async(api_key: str, base_url: str = None):
210
  # Initialize LightRAG
211
  lightrag = await initialize_rag(api_key, base_url)
212
+
213
  # Get model functions
214
  llm_model_func = get_llm_model_func(api_key, base_url)
215
  vision_model_func = get_vision_model_func(api_key, base_url)
216
+
217
  # Run examples
218
  await process_image_example(lightrag, vision_model_func)
219
  await process_table_example(lightrag, llm_model_func)
220
  await process_equation_example(lightrag, llm_model_func)
221
 
222
+
223
  if __name__ == "__main__":
224
+ main()
examples/raganything_example.py CHANGED
@@ -11,15 +11,20 @@ This example shows how to:
11
  import os
12
  import argparse
13
  import asyncio
14
- from pathlib import Path
15
- from lightrag.mineru_parser import MineruParser
16
  from lightrag.llm.openai import openai_complete_if_cache, openai_embed
17
  from lightrag.raganything import RAGAnything
18
 
19
- async def process_with_rag(file_path: str, output_dir: str, api_key: str, base_url: str = None, working_dir: str = None):
 
 
 
 
 
 
 
20
  """
21
  Process document with RAGAnything
22
-
23
  Args:
24
  file_path: Path to the document
25
  output_dir: Output directory for RAG results
@@ -30,7 +35,10 @@ async def process_with_rag(file_path: str, output_dir: str, api_key: str, base_u
30
  # Initialize RAGAnything
31
  rag = RAGAnything(
32
  working_dir=working_dir,
33
- llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
 
 
 
34
  "gpt-4o-mini",
35
  prompt,
36
  system_prompt=system_prompt,
@@ -39,27 +47,40 @@ async def process_with_rag(file_path: str, output_dir: str, api_key: str, base_u
39
  base_url=base_url,
40
  **kwargs,
41
  ),
42
- vision_model_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
 
 
 
 
43
  "gpt-4o",
44
  "",
45
  system_prompt=None,
46
  history_messages=[],
47
  messages=[
48
- {"role": "system", "content": system_prompt} if system_prompt else None,
49
- {"role": "user", "content": [
50
- {"type": "text", "text": prompt},
51
- {
52
- "type": "image_url",
53
- "image_url": {
54
- "url": f"data:image/jpeg;base64,{image_data}"
55
- }
56
- }
57
- ]} if image_data else {"role": "user", "content": prompt}
 
 
 
 
 
 
 
58
  ],
59
  api_key=api_key,
60
  base_url=base_url,
61
  **kwargs,
62
- ) if image_data else openai_complete_if_cache(
 
 
63
  "gpt-4o-mini",
64
  prompt,
65
  system_prompt=system_prompt,
@@ -75,21 +96,19 @@ async def process_with_rag(file_path: str, output_dir: str, api_key: str, base_u
75
  base_url=base_url,
76
  ),
77
  embedding_dim=3072,
78
- max_token_size=8192
79
  )
80
 
81
  # Process document
82
  await rag.process_document_complete(
83
- file_path=file_path,
84
- output_dir=output_dir,
85
- parse_method="auto"
86
  )
87
 
88
  # Example queries
89
  queries = [
90
  "What is the main content of the document?",
91
  "Describe the images and figures in the document",
92
- "Tell me about the experimental results and data tables"
93
  ]
94
 
95
  print("\nQuerying processed document:")
@@ -101,14 +120,21 @@ async def process_with_rag(file_path: str, output_dir: str, api_key: str, base_u
101
  except Exception as e:
102
  print(f"Error processing with RAG: {str(e)}")
103
 
 
104
  def main():
105
  """Main function to run the example"""
106
- parser = argparse.ArgumentParser(description='MinerU RAG Example')
107
- parser.add_argument('file_path', help='Path to the document to process')
108
- parser.add_argument('--working_dir', '-w', default="./rag_storage", help='Working directory path')
109
- parser.add_argument('--output', '-o', default="./output", help='Output directory path')
110
- parser.add_argument('--api-key', required=True, help='OpenAI API key for RAG processing')
111
- parser.add_argument('--base-url', help='Optional base URL for API')
 
 
 
 
 
 
112
 
113
  args = parser.parse_args()
114
 
@@ -117,13 +143,12 @@ def main():
117
  os.makedirs(args.output, exist_ok=True)
118
 
119
  # Process with RAG
120
- asyncio.run(process_with_rag(
121
- args.file_path,
122
- args.output,
123
- args.api_key,
124
- args.base_url,
125
- args.working_dir
126
- ))
127
-
128
- if __name__ == '__main__':
129
- main()
 
11
  import os
12
  import argparse
13
  import asyncio
 
 
14
  from lightrag.llm.openai import openai_complete_if_cache, openai_embed
15
  from lightrag.raganything import RAGAnything
16
 
17
+
18
+ async def process_with_rag(
19
+ file_path: str,
20
+ output_dir: str,
21
+ api_key: str,
22
+ base_url: str = None,
23
+ working_dir: str = None,
24
+ ):
25
  """
26
  Process document with RAGAnything
27
+
28
  Args:
29
  file_path: Path to the document
30
  output_dir: Output directory for RAG results
 
35
  # Initialize RAGAnything
36
  rag = RAGAnything(
37
  working_dir=working_dir,
38
+ llm_model_func=lambda prompt,
39
+ system_prompt=None,
40
+ history_messages=[],
41
+ **kwargs: openai_complete_if_cache(
42
  "gpt-4o-mini",
43
  prompt,
44
  system_prompt=system_prompt,
 
47
  base_url=base_url,
48
  **kwargs,
49
  ),
50
+ vision_model_func=lambda prompt,
51
+ system_prompt=None,
52
+ history_messages=[],
53
+ image_data=None,
54
+ **kwargs: openai_complete_if_cache(
55
  "gpt-4o",
56
  "",
57
  system_prompt=None,
58
  history_messages=[],
59
  messages=[
60
+ {"role": "system", "content": system_prompt}
61
+ if system_prompt
62
+ else None,
63
+ {
64
+ "role": "user",
65
+ "content": [
66
+ {"type": "text", "text": prompt},
67
+ {
68
+ "type": "image_url",
69
+ "image_url": {
70
+ "url": f"data:image/jpeg;base64,{image_data}"
71
+ },
72
+ },
73
+ ],
74
+ }
75
+ if image_data
76
+ else {"role": "user", "content": prompt},
77
  ],
78
  api_key=api_key,
79
  base_url=base_url,
80
  **kwargs,
81
+ )
82
+ if image_data
83
+ else openai_complete_if_cache(
84
  "gpt-4o-mini",
85
  prompt,
86
  system_prompt=system_prompt,
 
96
  base_url=base_url,
97
  ),
98
  embedding_dim=3072,
99
+ max_token_size=8192,
100
  )
101
 
102
  # Process document
103
  await rag.process_document_complete(
104
+ file_path=file_path, output_dir=output_dir, parse_method="auto"
 
 
105
  )
106
 
107
  # Example queries
108
  queries = [
109
  "What is the main content of the document?",
110
  "Describe the images and figures in the document",
111
+ "Tell me about the experimental results and data tables",
112
  ]
113
 
114
  print("\nQuerying processed document:")
 
120
  except Exception as e:
121
  print(f"Error processing with RAG: {str(e)}")
122
 
123
+
124
  def main():
125
  """Main function to run the example"""
126
+ parser = argparse.ArgumentParser(description="MinerU RAG Example")
127
+ parser.add_argument("file_path", help="Path to the document to process")
128
+ parser.add_argument(
129
+ "--working_dir", "-w", default="./rag_storage", help="Working directory path"
130
+ )
131
+ parser.add_argument(
132
+ "--output", "-o", default="./output", help="Output directory path"
133
+ )
134
+ parser.add_argument(
135
+ "--api-key", required=True, help="OpenAI API key for RAG processing"
136
+ )
137
+ parser.add_argument("--base-url", help="Optional base URL for API")
138
 
139
  args = parser.parse_args()
140
 
 
143
  os.makedirs(args.output, exist_ok=True)
144
 
145
  # Process with RAG
146
+ asyncio.run(
147
+ process_with_rag(
148
+ args.file_path, args.output, args.api_key, args.base_url, args.working_dir
149
+ )
150
+ )
151
+
152
+
153
+ if __name__ == "__main__":
154
+ main()
 
lightrag/mineru_parser.py CHANGED
@@ -1,4 +1,4 @@
1
- # type: ignore
2
  """
3
  MinerU Document Parser Utility
4
 
@@ -14,7 +14,18 @@ import os
14
  import json
15
  import argparse
16
  from pathlib import Path
17
- from typing import Dict, List, Optional, Union, Tuple, Any, TypeVar, cast, TYPE_CHECKING, ClassVar
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  # Type stubs for magic_pdf
20
  FileBasedDataWriter = Any
@@ -28,20 +39,27 @@ read_local_office = Any
28
  read_local_images = Any
29
 
30
  if TYPE_CHECKING:
31
- from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
 
 
 
32
  from magic_pdf.data.dataset import PymuDocDataset
33
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
34
  from magic_pdf.config.enums import SupportedPdfParseMethod
35
  from magic_pdf.data.read_api import read_local_office, read_local_images
36
  else:
37
  # MinerU imports
38
- from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
 
 
 
39
  from magic_pdf.data.dataset import PymuDocDataset
40
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
41
  from magic_pdf.config.enums import SupportedPdfParseMethod
42
  from magic_pdf.data.read_api import read_local_office, read_local_images
43
 
44
- T = TypeVar('T')
 
45
 
46
  class MineruParser:
47
  """
@@ -58,7 +76,11 @@ class MineruParser:
58
  pass
59
 
60
  @staticmethod
61
- def safe_write(writer: Any, content: Union[str, bytes, Dict[str, Any], List[Any]], filename: str) -> None:
 
 
 
 
62
  """
63
  Safely write content to a file, ensuring the filename is valid
64
 
@@ -80,15 +102,22 @@ class MineruParser:
80
  writer.write(content, filename)
81
  except TypeError:
82
  # If the writer expects bytes, convert string to bytes
83
- writer.write(content.encode('utf-8'), filename)
84
  else:
85
  # For dict/list content, always encode as JSON string first
86
  if isinstance(content, (dict, list)):
87
  try:
88
- writer.write(json.dumps(content, ensure_ascii=False, indent=4), filename)
 
 
89
  except TypeError:
90
  # If the writer expects bytes, convert JSON string to bytes
91
- writer.write(json.dumps(content, ensure_ascii=False, indent=4).encode('utf-8'), filename)
 
 
 
 
 
92
  else:
93
  # Regular content (assumed to be bytes or compatible)
94
  writer.write(content, filename)
@@ -97,7 +126,7 @@ class MineruParser:
97
  def parse_pdf(
98
  pdf_path: Union[str, Path],
99
  output_dir: Optional[str] = None,
100
- use_ocr: bool = False
101
  ) -> Tuple[List[Dict[str, Any]], str]:
102
  """
103
  Parse PDF document
@@ -150,9 +179,15 @@ class MineruParser:
150
 
151
  # Draw visualizations
152
  try:
153
- infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")) # type: ignore
154
- pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")) # type: ignore
155
- pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf")) # type: ignore
 
 
 
 
 
 
156
  except Exception as e:
157
  print(f"Warning: Failed to draw visualizations: {str(e)}")
158
 
@@ -162,7 +197,9 @@ class MineruParser:
162
 
163
  # Save files using dump methods (consistent with API)
164
  pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir) # type: ignore
165
- pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir) # type: ignore
 
 
166
  pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
167
 
168
  # Save model result - convert JSON string to bytes before writing
@@ -171,16 +208,24 @@ class MineruParser:
171
 
172
  try:
173
  # Try to write to a file manually to avoid FileBasedDataWriter issues
174
- model_file_path = os.path.join(local_md_dir, f"{name_without_suff}_model.json")
175
- with open(model_file_path, 'w', encoding='utf-8') as f:
 
 
176
  f.write(json_str)
177
  except Exception as e:
178
- print(f"Warning: Failed to save model result using file write: {str(e)}")
 
 
179
  try:
180
  # If direct file write fails, try using the writer with bytes encoding
181
- md_writer.write(json_str.encode('utf-8'), f"{name_without_suff}_model.json") # type: ignore
 
 
182
  except Exception as e2:
183
- print(f"Warning: Failed to save model result using writer: {str(e2)}")
 
 
184
 
185
  return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
186
 
@@ -190,8 +235,7 @@ class MineruParser:
190
 
191
  @staticmethod
192
  def parse_office_doc(
193
- doc_path: Union[str, Path],
194
- output_dir: Optional[str] = None
195
  ) -> Tuple[List[Dict[str, Any]], str]:
196
  """
197
  Parse office document (Word, PPT, etc.)
@@ -231,9 +275,9 @@ class MineruParser:
231
 
232
  # Apply chain of operations according to API documentation
233
  # This follows the pattern shown in MS-Office example in the API docs
234
- ds.apply(doc_analyze, ocr=True)\
235
- .pipe_txt_mode(image_writer)\
236
- .dump_md(md_writer, f"{name_without_suff}.md", image_dir) # type: ignore
237
 
238
  # Re-execute for getting the content data
239
  infer_result = ds.apply(doc_analyze, ocr=True) # type: ignore
@@ -244,7 +288,9 @@ class MineruParser:
244
  content_list = pipe_result.get_content_list(image_dir) # type: ignore
245
 
246
  # Save additional output files
247
- pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir) # type: ignore
 
 
248
  pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
249
 
250
  # Save model result - convert JSON string to bytes before writing
@@ -253,16 +299,24 @@ class MineruParser:
253
 
254
  try:
255
  # Try to write to a file manually to avoid FileBasedDataWriter issues
256
- model_file_path = os.path.join(local_md_dir, f"{name_without_suff}_model.json")
257
- with open(model_file_path, 'w', encoding='utf-8') as f:
 
 
258
  f.write(json_str)
259
  except Exception as e:
260
- print(f"Warning: Failed to save model result using file write: {str(e)}")
 
 
261
  try:
262
  # If direct file write fails, try using the writer with bytes encoding
263
- md_writer.write(json_str.encode('utf-8'), f"{name_without_suff}_model.json") # type: ignore
 
 
264
  except Exception as e2:
265
- print(f"Warning: Failed to save model result using writer: {str(e2)}")
 
 
266
 
267
  return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
268
 
@@ -272,8 +326,7 @@ class MineruParser:
272
 
273
  @staticmethod
274
  def parse_image(
275
- image_path: Union[str, Path],
276
- output_dir: Optional[str] = None
277
  ) -> Tuple[List[Dict[str, Any]], str]:
278
  """
279
  Parse image document
@@ -313,9 +366,9 @@ class MineruParser:
313
 
314
  # Apply chain of operations according to API documentation
315
  # This follows the pattern shown in Image example in the API docs
316
- ds.apply(doc_analyze, ocr=True)\
317
- .pipe_ocr_mode(image_writer)\
318
- .dump_md(md_writer, f"{name_without_suff}.md", image_dir) # type: ignore
319
 
320
  # Re-execute for getting the content data
321
  infer_result = ds.apply(doc_analyze, ocr=True) # type: ignore
@@ -326,7 +379,9 @@ class MineruParser:
326
  content_list = pipe_result.get_content_list(image_dir) # type: ignore
327
 
328
  # Save additional output files
329
- pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir) # type: ignore
 
 
330
  pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
331
 
332
  # Save model result - convert JSON string to bytes before writing
@@ -335,16 +390,24 @@ class MineruParser:
335
 
336
  try:
337
  # Try to write to a file manually to avoid FileBasedDataWriter issues
338
- model_file_path = os.path.join(local_md_dir, f"{name_without_suff}_model.json")
339
- with open(model_file_path, 'w', encoding='utf-8') as f:
 
 
340
  f.write(json_str)
341
  except Exception as e:
342
- print(f"Warning: Failed to save model result using file write: {str(e)}")
 
 
343
  try:
344
  # If direct file write fails, try using the writer with bytes encoding
345
- md_writer.write(json_str.encode('utf-8'), f"{name_without_suff}_model.json") # type: ignore
 
 
346
  except Exception as e2:
347
- print(f"Warning: Failed to save model result using writer: {str(e2)}")
 
 
348
 
349
  return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
350
 
@@ -357,7 +420,7 @@ class MineruParser:
357
  file_path: Union[str, Path],
358
  parse_method: str = "auto",
359
  output_dir: Optional[str] = None,
360
- save_results: bool = True
361
  ) -> Tuple[List[Dict[str, Any]], str]:
362
  """
363
  Parse document using MinerU based on file extension
@@ -382,64 +445,59 @@ class MineruParser:
382
  # Choose appropriate parser based on file type
383
  if ext in [".pdf"]:
384
  return MineruParser.parse_pdf(
385
- file_path,
386
- output_dir,
387
- use_ocr=(parse_method == "ocr")
388
  )
389
  elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]:
390
- return MineruParser.parse_image(
391
- file_path,
392
- output_dir
393
- )
394
  elif ext in [".doc", ".docx", ".ppt", ".pptx"]:
395
- return MineruParser.parse_office_doc(
396
- file_path,
397
- output_dir
398
- )
399
  else:
400
  # For unsupported file types, default to PDF parsing
401
- print(f"Warning: Unsupported file extension '{ext}', trying generic PDF parser")
 
 
402
  return MineruParser.parse_pdf(
403
- file_path,
404
- output_dir,
405
- use_ocr=(parse_method == "ocr")
406
  )
407
 
 
408
  def main():
409
  """
410
  Main function to run the MinerU parser from command line
411
  """
412
- parser = argparse.ArgumentParser(description='Parse documents using MinerU')
413
- parser.add_argument('file_path', help='Path to the document to parse')
414
- parser.add_argument('--output', '-o', help='Output directory path')
415
- parser.add_argument('--method', '-m',
416
- choices=['auto', 'ocr', 'txt'],
417
- default='auto',
418
- help='Parsing method (auto, ocr, txt)')
419
- parser.add_argument('--stats', action='store_true',
420
- help='Display content statistics')
 
 
 
 
421
 
422
  args = parser.parse_args()
423
 
424
  try:
425
  # Parse the document
426
  content_list, md_content = MineruParser.parse_document(
427
- file_path=args.file_path,
428
- parse_method=args.method,
429
- output_dir=args.output
430
  )
431
 
432
  # Display statistics if requested
433
  if args.stats:
434
  print("\nDocument Statistics:")
435
  print(f"Total content blocks: {len(content_list)}")
436
-
437
  # Count different types of content
438
  content_types = {}
439
  for item in content_list:
440
- content_type = item.get('type', 'unknown')
441
  content_types[content_type] = content_types.get(content_type, 0) + 1
442
-
443
  print("\nContent Type Distribution:")
444
  for content_type, count in content_types.items():
445
  print(f"- {content_type}: {count}")
@@ -450,5 +508,6 @@ def main():
450
 
451
  return 0
452
 
453
- if __name__ == '__main__':
 
454
  exit(main())
 
1
+ # type: ignore
2
  """
3
  MinerU Document Parser Utility
4
 
 
14
  import json
15
  import argparse
16
  from pathlib import Path
17
+ from typing import (
18
+ Dict,
19
+ List,
20
+ Optional,
21
+ Union,
22
+ Tuple,
23
+ Any,
24
+ TypeVar,
25
+ cast,
26
+ TYPE_CHECKING,
27
+ ClassVar,
28
+ )
29
 
30
  # Type stubs for magic_pdf
31
  FileBasedDataWriter = Any
 
39
  read_local_images = Any
40
 
41
  if TYPE_CHECKING:
42
+ from magic_pdf.data.data_reader_writer import (
43
+ FileBasedDataWriter,
44
+ FileBasedDataReader,
45
+ )
46
  from magic_pdf.data.dataset import PymuDocDataset
47
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
48
  from magic_pdf.config.enums import SupportedPdfParseMethod
49
  from magic_pdf.data.read_api import read_local_office, read_local_images
50
  else:
51
  # MinerU imports
52
+ from magic_pdf.data.data_reader_writer import (
53
+ FileBasedDataWriter,
54
+ FileBasedDataReader,
55
+ )
56
  from magic_pdf.data.dataset import PymuDocDataset
57
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
58
  from magic_pdf.config.enums import SupportedPdfParseMethod
59
  from magic_pdf.data.read_api import read_local_office, read_local_images
60
 
61
+ T = TypeVar("T")
62
+
63
 
64
  class MineruParser:
65
  """
 
76
  pass
77
 
78
  @staticmethod
79
+ def safe_write(
80
+ writer: Any,
81
+ content: Union[str, bytes, Dict[str, Any], List[Any]],
82
+ filename: str,
83
+ ) -> None:
84
  """
85
  Safely write content to a file, ensuring the filename is valid
86
 
 
102
  writer.write(content, filename)
103
  except TypeError:
104
  # If the writer expects bytes, convert string to bytes
105
+ writer.write(content.encode("utf-8"), filename)
106
  else:
107
  # For dict/list content, always encode as JSON string first
108
  if isinstance(content, (dict, list)):
109
  try:
110
+ writer.write(
111
+ json.dumps(content, ensure_ascii=False, indent=4), filename
112
+ )
113
  except TypeError:
114
  # If the writer expects bytes, convert JSON string to bytes
115
+ writer.write(
116
+ json.dumps(content, ensure_ascii=False, indent=4).encode(
117
+ "utf-8"
118
+ ),
119
+ filename,
120
+ )
121
  else:
122
  # Regular content (assumed to be bytes or compatible)
123
  writer.write(content, filename)
 
126
  def parse_pdf(
127
  pdf_path: Union[str, Path],
128
  output_dir: Optional[str] = None,
129
+ use_ocr: bool = False,
130
  ) -> Tuple[List[Dict[str, Any]], str]:
131
  """
132
  Parse PDF document
 
179
 
180
  # Draw visualizations
181
  try:
182
+ infer_result.draw_model(
183
+ os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")
184
+ ) # type: ignore
185
+ pipe_result.draw_layout(
186
+ os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")
187
+ ) # type: ignore
188
+ pipe_result.draw_span(
189
+ os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf")
190
+ ) # type: ignore
191
  except Exception as e:
192
  print(f"Warning: Failed to draw visualizations: {str(e)}")
193
 
 
197
 
198
  # Save files using dump methods (consistent with API)
199
  pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir) # type: ignore
200
+ pipe_result.dump_content_list(
201
+ md_writer, f"{name_without_suff}_content_list.json", image_dir
202
+ ) # type: ignore
203
  pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
204
 
205
  # Save model result - convert JSON string to bytes before writing
 
208
 
209
  try:
210
  # Try to write to a file manually to avoid FileBasedDataWriter issues
211
+ model_file_path = os.path.join(
212
+ local_md_dir, f"{name_without_suff}_model.json"
213
+ )
214
+ with open(model_file_path, "w", encoding="utf-8") as f:
215
  f.write(json_str)
216
  except Exception as e:
217
+ print(
218
+ f"Warning: Failed to save model result using file write: {str(e)}"
219
+ )
220
  try:
221
  # If direct file write fails, try using the writer with bytes encoding
222
+ md_writer.write(
223
+ json_str.encode("utf-8"), f"{name_without_suff}_model.json"
224
+ ) # type: ignore
225
  except Exception as e2:
226
+ print(
227
+ f"Warning: Failed to save model result using writer: {str(e2)}"
228
+ )
229
 
230
  return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
231
 
 
235
 
236
  @staticmethod
237
  def parse_office_doc(
238
+ doc_path: Union[str, Path], output_dir: Optional[str] = None
 
239
  ) -> Tuple[List[Dict[str, Any]], str]:
240
  """
241
  Parse office document (Word, PPT, etc.)
 
275
 
276
  # Apply chain of operations according to API documentation
277
  # This follows the pattern shown in MS-Office example in the API docs
278
+ ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
279
+ md_writer, f"{name_without_suff}.md", image_dir
280
+ ) # type: ignore
281
 
282
  # Re-execute for getting the content data
283
  infer_result = ds.apply(doc_analyze, ocr=True) # type: ignore
 
288
  content_list = pipe_result.get_content_list(image_dir) # type: ignore
289
 
290
  # Save additional output files
291
+ pipe_result.dump_content_list(
292
+ md_writer, f"{name_without_suff}_content_list.json", image_dir
293
+ ) # type: ignore
294
  pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
295
 
296
  # Save model result - convert JSON string to bytes before writing
 
299
 
300
  try:
301
  # Try to write to a file manually to avoid FileBasedDataWriter issues
302
+ model_file_path = os.path.join(
303
+ local_md_dir, f"{name_without_suff}_model.json"
304
+ )
305
+ with open(model_file_path, "w", encoding="utf-8") as f:
306
  f.write(json_str)
307
  except Exception as e:
308
+ print(
309
+ f"Warning: Failed to save model result using file write: {str(e)}"
310
+ )
311
  try:
312
  # If direct file write fails, try using the writer with bytes encoding
313
+ md_writer.write(
314
+ json_str.encode("utf-8"), f"{name_without_suff}_model.json"
315
+ ) # type: ignore
316
  except Exception as e2:
317
+ print(
318
+ f"Warning: Failed to save model result using writer: {str(e2)}"
319
+ )
320
 
321
  return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
322
 
 
326
 
327
  @staticmethod
328
  def parse_image(
329
+ image_path: Union[str, Path], output_dir: Optional[str] = None
 
330
  ) -> Tuple[List[Dict[str, Any]], str]:
331
  """
332
  Parse image document
 
366
 
367
  # Apply chain of operations according to API documentation
368
  # This follows the pattern shown in Image example in the API docs
369
+ ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
370
+ md_writer, f"{name_without_suff}.md", image_dir
371
+ ) # type: ignore
372
 
373
  # Re-execute for getting the content data
374
  infer_result = ds.apply(doc_analyze, ocr=True) # type: ignore
 
379
  content_list = pipe_result.get_content_list(image_dir) # type: ignore
380
 
381
  # Save additional output files
382
+ pipe_result.dump_content_list(
383
+ md_writer, f"{name_without_suff}_content_list.json", image_dir
384
+ ) # type: ignore
385
  pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
386
 
387
  # Save model result - convert JSON string to bytes before writing
 
390
 
391
  try:
392
  # Try to write to a file manually to avoid FileBasedDataWriter issues
393
+ model_file_path = os.path.join(
394
+ local_md_dir, f"{name_without_suff}_model.json"
395
+ )
396
+ with open(model_file_path, "w", encoding="utf-8") as f:
397
  f.write(json_str)
398
  except Exception as e:
399
+ print(
400
+ f"Warning: Failed to save model result using file write: {str(e)}"
401
+ )
402
  try:
403
  # If direct file write fails, try using the writer with bytes encoding
404
+ md_writer.write(
405
+ json_str.encode("utf-8"), f"{name_without_suff}_model.json"
406
+ ) # type: ignore
407
  except Exception as e2:
408
+ print(
409
+ f"Warning: Failed to save model result using writer: {str(e2)}"
410
+ )
411
 
412
  return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
413
 
 
420
  file_path: Union[str, Path],
421
  parse_method: str = "auto",
422
  output_dir: Optional[str] = None,
423
+ save_results: bool = True,
424
  ) -> Tuple[List[Dict[str, Any]], str]:
425
  """
426
  Parse document using MinerU based on file extension
 
445
  # Choose appropriate parser based on file type
446
  if ext in [".pdf"]:
447
  return MineruParser.parse_pdf(
448
+ file_path, output_dir, use_ocr=(parse_method == "ocr")
 
 
449
  )
450
  elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]:
451
+ return MineruParser.parse_image(file_path, output_dir)
 
 
 
452
  elif ext in [".doc", ".docx", ".ppt", ".pptx"]:
453
+ return MineruParser.parse_office_doc(file_path, output_dir)
 
 
 
454
  else:
455
  # For unsupported file types, default to PDF parsing
456
+ print(
457
+ f"Warning: Unsupported file extension '{ext}', trying generic PDF parser"
458
+ )
459
  return MineruParser.parse_pdf(
460
+ file_path, output_dir, use_ocr=(parse_method == "ocr")
 
 
461
  )
462
 
463
+
464
  def main():
465
  """
466
  Main function to run the MinerU parser from command line
467
  """
468
+ parser = argparse.ArgumentParser(description="Parse documents using MinerU")
469
+ parser.add_argument("file_path", help="Path to the document to parse")
470
+ parser.add_argument("--output", "-o", help="Output directory path")
471
+ parser.add_argument(
472
+ "--method",
473
+ "-m",
474
+ choices=["auto", "ocr", "txt"],
475
+ default="auto",
476
+ help="Parsing method (auto, ocr, txt)",
477
+ )
478
+ parser.add_argument(
479
+ "--stats", action="store_true", help="Display content statistics"
480
+ )
481
 
482
  args = parser.parse_args()
483
 
484
  try:
485
  # Parse the document
486
  content_list, md_content = MineruParser.parse_document(
487
+ file_path=args.file_path, parse_method=args.method, output_dir=args.output
 
 
488
  )
489
 
490
  # Display statistics if requested
491
  if args.stats:
492
  print("\nDocument Statistics:")
493
  print(f"Total content blocks: {len(content_list)}")
494
+
495
  # Count different types of content
496
  content_types = {}
497
  for item in content_list:
498
+ content_type = item.get("type", "unknown")
499
  content_types[content_type] = content_types.get(content_type, 0) + 1
500
+
501
  print("\nContent Type Distribution:")
502
  for content_type, count in content_types.items():
503
  print(f"- {content_type}: {count}")
 
508
 
509
  return 0
510
 
511
+
512
+ if __name__ == "__main__":
513
  exit(main())
lightrag/modalprocessors.py CHANGED
@@ -31,7 +31,7 @@ class BaseModalProcessor:
31
 
32
  def __init__(self, lightrag: LightRAG, modal_caption_func):
33
  """Initialize base processor
34
-
35
  Args:
36
  lightrag: LightRAG instance
37
  modal_caption_func: Function for generating descriptions
@@ -65,8 +65,8 @@ class BaseModalProcessor:
65
  raise NotImplementedError("Subclasses must implement this method")
66
 
67
  async def _create_entity_and_chunk(
68
- self, modal_chunk: str, entity_info: Dict[str, Any],
69
- file_path: str) -> Tuple[str, Dict[str, Any]]:
70
  """Create entity and text chunk"""
71
  # Create chunk
72
  chunk_id = compute_mdhash_id(str(modal_chunk), prefix="chunk-")
@@ -93,16 +93,16 @@ class BaseModalProcessor:
93
  "created_at": int(time.time()),
94
  }
95
 
96
- await self.knowledge_graph_inst.upsert_node(entity_info["entity_name"],
97
- node_data)
 
98
 
99
  # Insert entity into vector database
100
  entity_vdb_data = {
101
  compute_mdhash_id(entity_info["entity_name"], prefix="ent-"): {
102
  "entity_name": entity_info["entity_name"],
103
  "entity_type": entity_info["entity_type"],
104
- "content":
105
- f"{entity_info['entity_name']}\n{entity_info['summary']}",
106
  "source_id": chunk_id,
107
  "file_path": file_path,
108
  }
@@ -110,8 +110,7 @@ class BaseModalProcessor:
110
  await self.entities_vdb.upsert(entity_vdb_data)
111
 
112
  # Process entity and relationship extraction
113
- await self._process_chunk_for_extraction(chunk_id,
114
- entity_info["entity_name"])
115
 
116
  # Ensure all storage updates are complete
117
  await self._insert_done()
@@ -120,11 +119,12 @@ class BaseModalProcessor:
120
  "entity_name": entity_info["entity_name"],
121
  "entity_type": entity_info["entity_type"],
122
  "description": entity_info["summary"],
123
- "chunk_id": chunk_id
124
  }
125
 
126
- async def _process_chunk_for_extraction(self, chunk_id: str,
127
- modal_entity_name: str):
 
128
  """Process chunk for entity and relationship extraction"""
129
  chunk_data = await self.text_chunks_db.get_by_id(chunk_id)
130
  if not chunk_data:
@@ -168,37 +168,27 @@ class BaseModalProcessor:
168
  if entity_name != modal_entity_name: # Skip self-relationship
169
  # Create belongs_to relationship
170
  relation_data = {
171
- "description":
172
- f"Entity {entity_name} belongs to {modal_entity_name}",
173
- "keywords":
174
- "belongs_to,part_of,contained_in",
175
- "source_id":
176
- chunk_id,
177
- "weight":
178
- 10.0,
179
- "file_path":
180
- chunk_data.get("file_path", "manual_creation"),
181
  }
182
  await self.knowledge_graph_inst.upsert_edge(
183
- entity_name, modal_entity_name, relation_data)
 
184
 
185
- relation_id = compute_mdhash_id(entity_name +
186
- modal_entity_name,
187
- prefix="rel-")
188
  relation_vdb_data = {
189
  relation_id: {
190
- "src_id":
191
- entity_name,
192
- "tgt_id":
193
- modal_entity_name,
194
- "keywords":
195
- relation_data["keywords"],
196
- "content":
197
- f"{relation_data['keywords']}\t{entity_name}\n{modal_entity_name}\n{relation_data['description']}",
198
- "source_id":
199
- chunk_id,
200
- "file_path":
201
- chunk_data.get("file_path", "manual_creation"),
202
  }
203
  }
204
  await self.relationships_vdb.upsert(relation_vdb_data)
@@ -215,16 +205,18 @@ class BaseModalProcessor:
215
  )
216
 
217
  async def _insert_done(self) -> None:
218
- await asyncio.gather(*[
219
- cast(StorageNameSpace, storage_inst).index_done_callback()
220
- for storage_inst in [
221
- self.text_chunks_db,
222
- self.chunks_vdb,
223
- self.entities_vdb,
224
- self.relationships_vdb,
225
- self.knowledge_graph_inst,
 
 
226
  ]
227
- ])
228
 
229
 
230
  class ImageModalProcessor(BaseModalProcessor):
@@ -232,7 +224,7 @@ class ImageModalProcessor(BaseModalProcessor):
232
 
233
  def __init__(self, lightrag: LightRAG, modal_caption_func):
234
  """Initialize image processor
235
-
236
  Args:
237
  lightrag: LightRAG instance
238
  modal_caption_func: Function for generating descriptions (supporting image understanding)
@@ -243,8 +235,7 @@ class ImageModalProcessor(BaseModalProcessor):
243
  """Encode image to base64"""
244
  try:
245
  with open(image_path, "rb") as image_file:
246
- encoded_string = base64.b64encode(
247
- image_file.read()).decode('utf-8')
248
  return encoded_string
249
  except Exception as e:
250
  logger.error(f"Failed to encode image {image_path}: {e}")
@@ -309,13 +300,12 @@ class ImageModalProcessor(BaseModalProcessor):
309
  response = await self.modal_caption_func(
310
  vision_prompt,
311
  image_data=image_base64,
312
- system_prompt=
313
- "You are an expert image analyst. Provide detailed, accurate descriptions."
314
  )
315
  else:
316
  # Analyze based on existing text information
317
  text_prompt = f"""Based on the following image information, provide analysis:
318
-
319
  Image Path: {image_path}
320
  Captions: {captions}
321
  Footnotes: {footnotes}
@@ -324,13 +314,11 @@ class ImageModalProcessor(BaseModalProcessor):
324
 
325
  response = await self.modal_caption_func(
326
  text_prompt,
327
- system_prompt=
328
- "You are an expert image analyst. Provide detailed analysis based on available information."
329
  )
330
 
331
  # Parse response
332
- enhanced_caption, entity_info = self._parse_response(
333
- response, entity_name)
334
 
335
  # Build complete image content
336
  modal_chunk = f"""
@@ -341,27 +329,30 @@ class ImageModalProcessor(BaseModalProcessor):
341
 
342
  Visual Analysis: {enhanced_caption}"""
343
 
344
- return await self._create_entity_and_chunk(modal_chunk,
345
- entity_info, file_path)
 
346
 
347
  except Exception as e:
348
  logger.error(f"Error processing image content: {e}")
349
  # Fallback processing
350
  fallback_entity = {
351
- "entity_name": entity_name if entity_name else
352
- f"image_{compute_mdhash_id(str(modal_content))}",
 
353
  "entity_type": "image",
354
- "summary": f"Image content: {str(modal_content)[:100]}"
355
  }
356
  return str(modal_content), fallback_entity
357
 
358
- def _parse_response(self,
359
- response: str,
360
- entity_name: str = None) -> Tuple[str, Dict[str, Any]]:
361
  """Parse model response"""
362
  try:
363
  response_data = json.loads(
364
- re.search(r"\{.*\}", response, re.DOTALL).group(0))
 
365
 
366
  description = response_data.get("detailed_description", "")
367
  entity_data = response_data.get("entity_info", {})
@@ -369,11 +360,14 @@ class ImageModalProcessor(BaseModalProcessor):
369
  if not description or not entity_data:
370
  raise ValueError("Missing required fields in response")
371
 
372
- if not all(key in entity_data
373
- for key in ["entity_name", "entity_type", "summary"]):
 
374
  raise ValueError("Missing required fields in entity_info")
375
 
376
- entity_data["entity_name"] = entity_data["entity_name"] + f" ({entity_data['entity_type']})"
 
 
377
  if entity_name:
378
  entity_data["entity_name"] = entity_name
379
 
@@ -382,13 +376,11 @@ class ImageModalProcessor(BaseModalProcessor):
382
  except (json.JSONDecodeError, AttributeError, ValueError) as e:
383
  logger.error(f"Error parsing image analysis response: {e}")
384
  fallback_entity = {
385
- "entity_name":
386
- entity_name
387
- if entity_name else f"image_{compute_mdhash_id(response)}",
388
- "entity_type":
389
- "image",
390
- "summary":
391
- response[:100] + "..." if len(response) > 100 else response
392
  }
393
  return response, fallback_entity
394
 
@@ -447,15 +439,15 @@ class TableModalProcessor(BaseModalProcessor):
447
 
448
  response = await self.modal_caption_func(
449
  table_prompt,
450
- system_prompt=
451
- "You are an expert data analyst. Provide detailed table analysis with specific insights."
452
  )
453
 
454
  # Parse response
455
  enhanced_caption, entity_info = self._parse_table_response(
456
- response, entity_name)
457
-
458
- #TODO: Add Retry Mechanism
 
459
 
460
  # Build complete table content
461
  modal_chunk = f"""Table Analysis:
@@ -466,17 +458,16 @@ class TableModalProcessor(BaseModalProcessor):
466
 
467
  Analysis: {enhanced_caption}"""
468
 
469
- return await self._create_entity_and_chunk(modal_chunk, entity_info,
470
- file_path)
471
 
472
  def _parse_table_response(
473
- self,
474
- response: str,
475
- entity_name: str = None) -> Tuple[str, Dict[str, Any]]:
476
  """Parse table analysis response"""
477
  try:
478
  response_data = json.loads(
479
- re.search(r"\{.*\}", response, re.DOTALL).group(0))
 
480
 
481
  description = response_data.get("detailed_description", "")
482
  entity_data = response_data.get("entity_info", {})
@@ -484,11 +475,14 @@ class TableModalProcessor(BaseModalProcessor):
484
  if not description or not entity_data:
485
  raise ValueError("Missing required fields in response")
486
 
487
- if not all(key in entity_data
488
- for key in ["entity_name", "entity_type", "summary"]):
 
489
  raise ValueError("Missing required fields in entity_info")
490
 
491
- entity_data["entity_name"] = entity_data["entity_name"] + f" ({entity_data['entity_type']})"
 
 
492
  if entity_name:
493
  entity_data["entity_name"] = entity_name
494
 
@@ -497,13 +491,11 @@ class TableModalProcessor(BaseModalProcessor):
497
  except (json.JSONDecodeError, AttributeError, ValueError) as e:
498
  logger.error(f"Error parsing table analysis response: {e}")
499
  fallback_entity = {
500
- "entity_name":
501
- entity_name
502
- if entity_name else f"table_{compute_mdhash_id(response)}",
503
- "entity_type":
504
- "table",
505
- "summary":
506
- response[:100] + "..." if len(response) > 100 else response
507
  }
508
  return response, fallback_entity
509
 
@@ -559,13 +551,13 @@ class EquationModalProcessor(BaseModalProcessor):
559
 
560
  response = await self.modal_caption_func(
561
  equation_prompt,
562
- system_prompt=
563
- "You are an expert mathematician. Provide detailed mathematical analysis."
564
  )
565
 
566
  # Parse response
567
  enhanced_caption, entity_info = self._parse_equation_response(
568
- response, entity_name)
 
569
 
570
  # Build complete equation content
571
  modal_chunk = f"""Mathematical Equation Analysis:
@@ -574,17 +566,16 @@ class EquationModalProcessor(BaseModalProcessor):
574
 
575
  Mathematical Analysis: {enhanced_caption}"""
576
 
577
- return await self._create_entity_and_chunk(modal_chunk, entity_info,
578
- file_path)
579
 
580
  def _parse_equation_response(
581
- self,
582
- response: str,
583
- entity_name: str = None) -> Tuple[str, Dict[str, Any]]:
584
  """Parse equation analysis response"""
585
  try:
586
  response_data = json.loads(
587
- re.search(r"\{.*\}", response, re.DOTALL).group(0))
 
588
 
589
  description = response_data.get("detailed_description", "")
590
  entity_data = response_data.get("entity_info", {})
@@ -592,11 +583,14 @@ class EquationModalProcessor(BaseModalProcessor):
592
  if not description or not entity_data:
593
  raise ValueError("Missing required fields in response")
594
 
595
- if not all(key in entity_data
596
- for key in ["entity_name", "entity_type", "summary"]):
 
597
  raise ValueError("Missing required fields in entity_info")
598
 
599
- entity_data["entity_name"] = entity_data["entity_name"] + f" ({entity_data['entity_type']})"
 
 
600
  if entity_name:
601
  entity_data["entity_name"] = entity_name
602
 
@@ -605,13 +599,11 @@ class EquationModalProcessor(BaseModalProcessor):
605
  except (json.JSONDecodeError, AttributeError, ValueError) as e:
606
  logger.error(f"Error parsing equation analysis response: {e}")
607
  fallback_entity = {
608
- "entity_name":
609
- entity_name
610
- if entity_name else f"equation_{compute_mdhash_id(response)}",
611
- "entity_type":
612
- "equation",
613
- "summary":
614
- response[:100] + "..." if len(response) > 100 else response
615
  }
616
  return response, fallback_entity
617
 
@@ -651,13 +643,13 @@ class GenericModalProcessor(BaseModalProcessor):
651
 
652
  response = await self.modal_caption_func(
653
  generic_prompt,
654
- system_prompt=
655
- f"You are an expert content analyst specializing in {content_type} content."
656
  )
657
 
658
  # Parse response
659
  enhanced_caption, entity_info = self._parse_generic_response(
660
- response, entity_name, content_type)
 
661
 
662
  # Build complete content
663
  modal_chunk = f"""{content_type.title()} Content Analysis:
@@ -665,18 +657,16 @@ class GenericModalProcessor(BaseModalProcessor):
665
 
666
  Analysis: {enhanced_caption}"""
667
 
668
- return await self._create_entity_and_chunk(modal_chunk, entity_info,
669
- file_path)
670
 
671
  def _parse_generic_response(
672
- self,
673
- response: str,
674
- entity_name: str = None,
675
- content_type: str = "content") -> Tuple[str, Dict[str, Any]]:
676
  """Parse generic analysis response"""
677
  try:
678
  response_data = json.loads(
679
- re.search(r"\{.*\}", response, re.DOTALL).group(0))
 
680
 
681
  description = response_data.get("detailed_description", "")
682
  entity_data = response_data.get("entity_info", {})
@@ -684,11 +674,14 @@ class GenericModalProcessor(BaseModalProcessor):
684
  if not description or not entity_data:
685
  raise ValueError("Missing required fields in response")
686
 
687
- if not all(key in entity_data
688
- for key in ["entity_name", "entity_type", "summary"]):
 
689
  raise ValueError("Missing required fields in entity_info")
690
 
691
- entity_data["entity_name"] = entity_data["entity_name"] + f" ({entity_data['entity_type']})"
 
 
692
  if entity_name:
693
  entity_data["entity_name"] = entity_name
694
 
@@ -697,12 +690,10 @@ class GenericModalProcessor(BaseModalProcessor):
697
  except (json.JSONDecodeError, AttributeError, ValueError) as e:
698
  logger.error(f"Error parsing generic analysis response: {e}")
699
  fallback_entity = {
700
- "entity_name":
701
- entity_name if entity_name else
702
- f"{content_type}_{compute_mdhash_id(response)}",
703
- "entity_type":
704
- content_type,
705
- "summary":
706
- response[:100] + "..." if len(response) > 100 else response
707
  }
708
  return response, fallback_entity
 
31
 
32
  def __init__(self, lightrag: LightRAG, modal_caption_func):
33
  """Initialize base processor
34
+
35
  Args:
36
  lightrag: LightRAG instance
37
  modal_caption_func: Function for generating descriptions
 
65
  raise NotImplementedError("Subclasses must implement this method")
66
 
67
  async def _create_entity_and_chunk(
68
+ self, modal_chunk: str, entity_info: Dict[str, Any], file_path: str
69
+ ) -> Tuple[str, Dict[str, Any]]:
70
  """Create entity and text chunk"""
71
  # Create chunk
72
  chunk_id = compute_mdhash_id(str(modal_chunk), prefix="chunk-")
 
93
  "created_at": int(time.time()),
94
  }
95
 
96
+ await self.knowledge_graph_inst.upsert_node(
97
+ entity_info["entity_name"], node_data
98
+ )
99
 
100
  # Insert entity into vector database
101
  entity_vdb_data = {
102
  compute_mdhash_id(entity_info["entity_name"], prefix="ent-"): {
103
  "entity_name": entity_info["entity_name"],
104
  "entity_type": entity_info["entity_type"],
105
+ "content": f"{entity_info['entity_name']}\n{entity_info['summary']}",
 
106
  "source_id": chunk_id,
107
  "file_path": file_path,
108
  }
 
110
  await self.entities_vdb.upsert(entity_vdb_data)
111
 
112
  # Process entity and relationship extraction
113
+ await self._process_chunk_for_extraction(chunk_id, entity_info["entity_name"])
 
114
 
115
  # Ensure all storage updates are complete
116
  await self._insert_done()
 
119
  "entity_name": entity_info["entity_name"],
120
  "entity_type": entity_info["entity_type"],
121
  "description": entity_info["summary"],
122
+ "chunk_id": chunk_id,
123
  }
124
 
125
+ async def _process_chunk_for_extraction(
126
+ self, chunk_id: str, modal_entity_name: str
127
+ ):
128
  """Process chunk for entity and relationship extraction"""
129
  chunk_data = await self.text_chunks_db.get_by_id(chunk_id)
130
  if not chunk_data:
 
168
  if entity_name != modal_entity_name: # Skip self-relationship
169
  # Create belongs_to relationship
170
  relation_data = {
171
+ "description": f"Entity {entity_name} belongs to {modal_entity_name}",
172
+ "keywords": "belongs_to,part_of,contained_in",
173
+ "source_id": chunk_id,
174
+ "weight": 10.0,
175
+ "file_path": chunk_data.get("file_path", "manual_creation"),
 
 
 
 
 
176
  }
177
  await self.knowledge_graph_inst.upsert_edge(
178
+ entity_name, modal_entity_name, relation_data
179
+ )
180
 
181
+ relation_id = compute_mdhash_id(
182
+ entity_name + modal_entity_name, prefix="rel-"
183
+ )
184
  relation_vdb_data = {
185
  relation_id: {
186
+ "src_id": entity_name,
187
+ "tgt_id": modal_entity_name,
188
+ "keywords": relation_data["keywords"],
189
+ "content": f"{relation_data['keywords']}\t{entity_name}\n{modal_entity_name}\n{relation_data['description']}",
190
+ "source_id": chunk_id,
191
+ "file_path": chunk_data.get("file_path", "manual_creation"),
 
 
 
 
 
 
192
  }
193
  }
194
  await self.relationships_vdb.upsert(relation_vdb_data)
 
205
  )
206
 
207
  async def _insert_done(self) -> None:
208
+ await asyncio.gather(
209
+ *[
210
+ cast(StorageNameSpace, storage_inst).index_done_callback()
211
+ for storage_inst in [
212
+ self.text_chunks_db,
213
+ self.chunks_vdb,
214
+ self.entities_vdb,
215
+ self.relationships_vdb,
216
+ self.knowledge_graph_inst,
217
+ ]
218
  ]
219
+ )
220
 
221
 
222
  class ImageModalProcessor(BaseModalProcessor):
 
224
 
225
  def __init__(self, lightrag: LightRAG, modal_caption_func):
226
  """Initialize image processor
227
+
228
  Args:
229
  lightrag: LightRAG instance
230
  modal_caption_func: Function for generating descriptions (supporting image understanding)
 
235
  """Encode image to base64"""
236
  try:
237
  with open(image_path, "rb") as image_file:
238
+ encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
 
239
  return encoded_string
240
  except Exception as e:
241
  logger.error(f"Failed to encode image {image_path}: {e}")
 
300
  response = await self.modal_caption_func(
301
  vision_prompt,
302
  image_data=image_base64,
303
+ system_prompt="You are an expert image analyst. Provide detailed, accurate descriptions.",
 
304
  )
305
  else:
306
  # Analyze based on existing text information
307
  text_prompt = f"""Based on the following image information, provide analysis:
308
+
309
  Image Path: {image_path}
310
  Captions: {captions}
311
  Footnotes: {footnotes}
 
314
 
315
  response = await self.modal_caption_func(
316
  text_prompt,
317
+ system_prompt="You are an expert image analyst. Provide detailed analysis based on available information.",
 
318
  )
319
 
320
  # Parse response
321
+ enhanced_caption, entity_info = self._parse_response(response, entity_name)
 
322
 
323
  # Build complete image content
324
  modal_chunk = f"""
 
329
 
330
  Visual Analysis: {enhanced_caption}"""
331
 
332
+ return await self._create_entity_and_chunk(
333
+ modal_chunk, entity_info, file_path
334
+ )
335
 
336
  except Exception as e:
337
  logger.error(f"Error processing image content: {e}")
338
  # Fallback processing
339
  fallback_entity = {
340
+ "entity_name": entity_name
341
+ if entity_name
342
+ else f"image_{compute_mdhash_id(str(modal_content))}",
343
  "entity_type": "image",
344
+ "summary": f"Image content: {str(modal_content)[:100]}",
345
  }
346
  return str(modal_content), fallback_entity
347
 
348
+ def _parse_response(
349
+ self, response: str, entity_name: str = None
350
+ ) -> Tuple[str, Dict[str, Any]]:
351
  """Parse model response"""
352
  try:
353
  response_data = json.loads(
354
+ re.search(r"\{.*\}", response, re.DOTALL).group(0)
355
+ )
356
 
357
  description = response_data.get("detailed_description", "")
358
  entity_data = response_data.get("entity_info", {})
 
360
  if not description or not entity_data:
361
  raise ValueError("Missing required fields in response")
362
 
363
+ if not all(
364
+ key in entity_data for key in ["entity_name", "entity_type", "summary"]
365
+ ):
366
  raise ValueError("Missing required fields in entity_info")
367
 
368
+ entity_data["entity_name"] = (
369
+ entity_data["entity_name"] + f" ({entity_data['entity_type']})"
370
+ )
371
  if entity_name:
372
  entity_data["entity_name"] = entity_name
373
 
 
376
  except (json.JSONDecodeError, AttributeError, ValueError) as e:
377
  logger.error(f"Error parsing image analysis response: {e}")
378
  fallback_entity = {
379
+ "entity_name": entity_name
380
+ if entity_name
381
+ else f"image_{compute_mdhash_id(response)}",
382
+ "entity_type": "image",
383
+ "summary": response[:100] + "..." if len(response) > 100 else response,
 
 
384
  }
385
  return response, fallback_entity
386
 
 
439
 
440
  response = await self.modal_caption_func(
441
  table_prompt,
442
+ system_prompt="You are an expert data analyst. Provide detailed table analysis with specific insights.",
 
443
  )
444
 
445
  # Parse response
446
  enhanced_caption, entity_info = self._parse_table_response(
447
+ response, entity_name
448
+ )
449
+
450
+ # TODO: Add Retry Mechanism
451
 
452
  # Build complete table content
453
  modal_chunk = f"""Table Analysis:
 
458
 
459
  Analysis: {enhanced_caption}"""
460
 
461
+ return await self._create_entity_and_chunk(modal_chunk, entity_info, file_path)
 
462
 
463
  def _parse_table_response(
464
+ self, response: str, entity_name: str = None
465
+ ) -> Tuple[str, Dict[str, Any]]:
 
466
  """Parse table analysis response"""
467
  try:
468
  response_data = json.loads(
469
+ re.search(r"\{.*\}", response, re.DOTALL).group(0)
470
+ )
471
 
472
  description = response_data.get("detailed_description", "")
473
  entity_data = response_data.get("entity_info", {})
 
475
  if not description or not entity_data:
476
  raise ValueError("Missing required fields in response")
477
 
478
+ if not all(
479
+ key in entity_data for key in ["entity_name", "entity_type", "summary"]
480
+ ):
481
  raise ValueError("Missing required fields in entity_info")
482
 
483
+ entity_data["entity_name"] = (
484
+ entity_data["entity_name"] + f" ({entity_data['entity_type']})"
485
+ )
486
  if entity_name:
487
  entity_data["entity_name"] = entity_name
488
 
 
491
  except (json.JSONDecodeError, AttributeError, ValueError) as e:
492
  logger.error(f"Error parsing table analysis response: {e}")
493
  fallback_entity = {
494
+ "entity_name": entity_name
495
+ if entity_name
496
+ else f"table_{compute_mdhash_id(response)}",
497
+ "entity_type": "table",
498
+ "summary": response[:100] + "..." if len(response) > 100 else response,
 
 
499
  }
500
  return response, fallback_entity
501
 
 
551
 
552
  response = await self.modal_caption_func(
553
  equation_prompt,
554
+ system_prompt="You are an expert mathematician. Provide detailed mathematical analysis.",
 
555
  )
556
 
557
  # Parse response
558
  enhanced_caption, entity_info = self._parse_equation_response(
559
+ response, entity_name
560
+ )
561
 
562
  # Build complete equation content
563
  modal_chunk = f"""Mathematical Equation Analysis:
 
566
 
567
  Mathematical Analysis: {enhanced_caption}"""
568
 
569
+ return await self._create_entity_and_chunk(modal_chunk, entity_info, file_path)
 
570
 
571
  def _parse_equation_response(
572
+ self, response: str, entity_name: str = None
573
+ ) -> Tuple[str, Dict[str, Any]]:
 
574
  """Parse equation analysis response"""
575
  try:
576
  response_data = json.loads(
577
+ re.search(r"\{.*\}", response, re.DOTALL).group(0)
578
+ )
579
 
580
  description = response_data.get("detailed_description", "")
581
  entity_data = response_data.get("entity_info", {})
 
583
  if not description or not entity_data:
584
  raise ValueError("Missing required fields in response")
585
 
586
+ if not all(
587
+ key in entity_data for key in ["entity_name", "entity_type", "summary"]
588
+ ):
589
  raise ValueError("Missing required fields in entity_info")
590
 
591
+ entity_data["entity_name"] = (
592
+ entity_data["entity_name"] + f" ({entity_data['entity_type']})"
593
+ )
594
  if entity_name:
595
  entity_data["entity_name"] = entity_name
596
 
 
599
  except (json.JSONDecodeError, AttributeError, ValueError) as e:
600
  logger.error(f"Error parsing equation analysis response: {e}")
601
  fallback_entity = {
602
+ "entity_name": entity_name
603
+ if entity_name
604
+ else f"equation_{compute_mdhash_id(response)}",
605
+ "entity_type": "equation",
606
+ "summary": response[:100] + "..." if len(response) > 100 else response,
 
 
607
  }
608
  return response, fallback_entity
609
 
 
643
 
644
  response = await self.modal_caption_func(
645
  generic_prompt,
646
+ system_prompt=f"You are an expert content analyst specializing in {content_type} content.",
 
647
  )
648
 
649
  # Parse response
650
  enhanced_caption, entity_info = self._parse_generic_response(
651
+ response, entity_name, content_type
652
+ )
653
 
654
  # Build complete content
655
  modal_chunk = f"""{content_type.title()} Content Analysis:
 
657
 
658
  Analysis: {enhanced_caption}"""
659
 
660
+ return await self._create_entity_and_chunk(modal_chunk, entity_info, file_path)
 
661
 
662
  def _parse_generic_response(
663
+ self, response: str, entity_name: str = None, content_type: str = "content"
664
+ ) -> Tuple[str, Dict[str, Any]]:
 
 
665
  """Parse generic analysis response"""
666
  try:
667
  response_data = json.loads(
668
+ re.search(r"\{.*\}", response, re.DOTALL).group(0)
669
+ )
670
 
671
  description = response_data.get("detailed_description", "")
672
  entity_data = response_data.get("entity_info", {})
 
674
  if not description or not entity_data:
675
  raise ValueError("Missing required fields in response")
676
 
677
+ if not all(
678
+ key in entity_data for key in ["entity_name", "entity_type", "summary"]
679
+ ):
680
  raise ValueError("Missing required fields in entity_info")
681
 
682
+ entity_data["entity_name"] = (
683
+ entity_data["entity_name"] + f" ({entity_data['entity_type']})"
684
+ )
685
  if entity_name:
686
  entity_data["entity_name"] = entity_name
687
 
 
690
  except (json.JSONDecodeError, AttributeError, ValueError) as e:
691
  logger.error(f"Error parsing generic analysis response: {e}")
692
  fallback_entity = {
693
+ "entity_name": entity_name
694
+ if entity_name
695
+ else f"{content_type}_{compute_mdhash_id(response)}",
696
+ "entity_type": content_type,
697
+ "summary": response[:100] + "..." if len(response) > 100 else response,
 
 
698
  }
699
  return response, fallback_entity
lightrag/raganything.py CHANGED
@@ -26,15 +26,15 @@ from lightrag.mineru_parser import MineruParser
26
  # Import specialized processors
27
  from lightrag.modalprocessors import (
28
  ImageModalProcessor,
29
- TableModalProcessor,
30
  EquationModalProcessor,
31
- GenericModalProcessor
32
  )
33
 
34
 
35
  class RAGAnything:
36
  """Multimodal Document Processing Pipeline - Complete document parsing and insertion pipeline"""
37
-
38
  def __init__(
39
  self,
40
  lightrag: Optional[LightRAG] = None,
@@ -43,11 +43,11 @@ class RAGAnything:
43
  embedding_func: Optional[Callable] = None,
44
  working_dir: str = "./rag_storage",
45
  embedding_dim: int = 3072,
46
- max_token_size: int = 8192
47
  ):
48
  """
49
  Initialize Multimodal Document Processing Pipeline
50
-
51
  Args:
52
  lightrag: Optional pre-initialized LightRAG instance
53
  llm_model_func: LLM model function for text analysis
@@ -63,64 +63,67 @@ class RAGAnything:
63
  self.embedding_func = embedding_func
64
  self.embedding_dim = embedding_dim
65
  self.max_token_size = max_token_size
66
-
67
  # Set up logging
68
  setup_logger("RAGAnything")
69
  self.logger = logging.getLogger("RAGAnything")
70
-
71
  # Create working directory if needed
72
  if not os.path.exists(working_dir):
73
  os.makedirs(working_dir)
74
-
75
  # Use provided LightRAG or mark for later initialization
76
  self.lightrag = lightrag
77
  self.modal_processors = {}
78
-
79
  # If LightRAG is provided, initialize processors immediately
80
  if self.lightrag is not None:
81
  self._initialize_processors()
82
-
83
  def _initialize_processors(self):
84
  """Initialize multimodal processors with appropriate model functions"""
85
  if self.lightrag is None:
86
- raise ValueError("LightRAG instance must be initialized before creating processors")
87
-
 
 
88
  # Create different multimodal processors
89
  self.modal_processors = {
90
  "image": ImageModalProcessor(
91
  lightrag=self.lightrag,
92
- modal_caption_func=self.vision_model_func or self.llm_model_func
93
  ),
94
  "table": TableModalProcessor(
95
- lightrag=self.lightrag,
96
- modal_caption_func=self.llm_model_func
97
  ),
98
  "equation": EquationModalProcessor(
99
- lightrag=self.lightrag,
100
- modal_caption_func=self.llm_model_func
101
  ),
102
  "generic": GenericModalProcessor(
103
- lightrag=self.lightrag,
104
- modal_caption_func=self.llm_model_func
105
- )
106
  }
107
-
108
  self.logger.info("Multimodal processors initialized")
109
  self.logger.info(f"Available processors: {list(self.modal_processors.keys())}")
110
-
111
  async def _ensure_lightrag_initialized(self):
112
  """Ensure LightRAG instance is initialized, create if necessary"""
113
  if self.lightrag is not None:
114
  return
115
-
116
  # Validate required functions
117
  if self.llm_model_func is None:
118
- raise ValueError("llm_model_func must be provided when LightRAG is not pre-initialized")
 
 
119
  if self.embedding_func is None:
120
- raise ValueError("embedding_func must be provided when LightRAG is not pre-initialized")
121
-
 
 
122
  from lightrag.kg.shared_storage import initialize_pipeline_status
123
-
124
  # Create LightRAG instance with provided functions
125
  self.lightrag = LightRAG(
126
  working_dir=self.working_dir,
@@ -134,88 +137,86 @@ class RAGAnything:
134
 
135
  await self.lightrag.initialize_storages()
136
  await initialize_pipeline_status()
137
-
138
  # Initialize processors after LightRAG is ready
139
  self._initialize_processors()
140
-
141
  self.logger.info("LightRAG and multimodal processors initialized")
142
 
143
  def parse_document(
144
- self,
145
- file_path: str,
146
  output_dir: str = "./output",
147
  parse_method: str = "auto",
148
- display_stats: bool = True
149
  ) -> Tuple[List[Dict[str, Any]], str]:
150
  """
151
  Parse document using MinerU
152
-
153
  Args:
154
  file_path: Path to the file to parse
155
  output_dir: Output directory
156
  parse_method: Parse method ("auto", "ocr", "txt")
157
  display_stats: Whether to display content statistics
158
-
159
  Returns:
160
  (content_list, md_content): Content list and markdown text
161
  """
162
  self.logger.info(f"Starting document parsing: {file_path}")
163
-
164
  file_path = Path(file_path)
165
  if not file_path.exists():
166
  raise FileNotFoundError(f"File not found: {file_path}")
167
-
168
  # Choose appropriate parsing method based on file extension
169
  ext = file_path.suffix.lower()
170
-
171
  try:
172
  if ext in [".pdf"]:
173
- self.logger.info(f"Detected PDF file, using PDF parser (OCR={parse_method == 'ocr'})...")
 
 
174
  content_list, md_content = MineruParser.parse_pdf(
175
- file_path,
176
- output_dir,
177
- use_ocr=(parse_method == "ocr")
178
  )
179
  elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]:
180
  self.logger.info("Detected image file, using image parser...")
181
  content_list, md_content = MineruParser.parse_image(
182
- file_path,
183
- output_dir
184
  )
185
  elif ext in [".doc", ".docx", ".ppt", ".pptx"]:
186
  self.logger.info("Detected Office document, using Office parser...")
187
  content_list, md_content = MineruParser.parse_office_doc(
188
- file_path,
189
- output_dir
190
  )
191
  else:
192
  # For other or unknown formats, use generic parser
193
- self.logger.info(f"Using generic parser for {ext} file (method={parse_method})...")
 
 
194
  content_list, md_content = MineruParser.parse_document(
195
- file_path,
196
- parse_method=parse_method,
197
- output_dir=output_dir
198
  )
199
-
200
  except Exception as e:
201
  self.logger.error(f"Error during parsing with specific parser: {str(e)}")
202
  self.logger.warning("Falling back to generic parser...")
203
  # If specific parser fails, fall back to generic parser
204
  content_list, md_content = MineruParser.parse_document(
205
- file_path,
206
- parse_method=parse_method,
207
- output_dir=output_dir
208
  )
209
-
210
- self.logger.info(f"Parsing complete! Extracted {len(content_list)} content blocks")
 
 
211
  self.logger.info(f"Markdown text length: {len(md_content)} characters")
212
-
213
  # Display content statistics if requested
214
  if display_stats:
215
  self.logger.info("\nContent Information:")
216
  self.logger.info(f"* Total blocks in content_list: {len(content_list)}")
217
  self.logger.info(f"* Markdown content length: {len(md_content)} characters")
218
-
219
  # Count elements by type
220
  block_types: Dict[str, int] = {}
221
  for block in content_list:
@@ -223,29 +224,31 @@ class RAGAnything:
223
  block_type = block.get("type", "unknown")
224
  if isinstance(block_type, str):
225
  block_types[block_type] = block_types.get(block_type, 0) + 1
226
-
227
  self.logger.info("* Content block types:")
228
  for block_type, count in block_types.items():
229
  self.logger.info(f" - {block_type}: {count}")
230
-
231
  return content_list, md_content
232
 
233
- def _separate_content(self, content_list: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]]]:
 
 
234
  """
235
  Separate text content and multimodal content
236
-
237
  Args:
238
  content_list: Content list from MinerU parsing
239
-
240
  Returns:
241
  (text_content, multimodal_items): Pure text content and multimodal items list
242
  """
243
  text_parts = []
244
  multimodal_items = []
245
-
246
  for item in content_list:
247
  content_type = item.get("type", "text")
248
-
249
  if content_type == "text":
250
  # Text content
251
  text = item.get("text", "")
@@ -254,27 +257,27 @@ class RAGAnything:
254
  else:
255
  # Multimodal content (image, table, equation, etc.)
256
  multimodal_items.append(item)
257
-
258
  # Merge all text content
259
  text_content = "\n\n".join(text_parts)
260
-
261
- self.logger.info(f"Content separation complete:")
262
  self.logger.info(f" - Text content length: {len(text_content)} characters")
263
  self.logger.info(f" - Multimodal items count: {len(multimodal_items)}")
264
-
265
  # Count multimodal types
266
  modal_types = {}
267
  for item in multimodal_items:
268
  modal_type = item.get("type", "unknown")
269
  modal_types[modal_type] = modal_types.get(modal_type, 0) + 1
270
-
271
  if modal_types:
272
  self.logger.info(f" - Multimodal type distribution: {modal_types}")
273
-
274
  return text_content, multimodal_items
275
 
276
  async def _insert_text_content(
277
- self,
278
  input: str | list[str],
279
  split_by_character: str | None = None,
280
  split_by_character_only: bool = False,
@@ -283,7 +286,7 @@ class RAGAnything:
283
  ):
284
  """
285
  Insert pure text content into LightRAG
286
-
287
  Args:
288
  input: Single document string or list of document strings
289
  split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
@@ -292,24 +295,26 @@ class RAGAnything:
292
  split_by_character is None, this parameter is ignored.
293
  ids: single string of the document ID or list of unique document IDs, if not provided, MD5 hash IDs will be generated
294
  file_paths: single string of the file path or list of file paths, used for citation
295
- """
296
  self.logger.info("Starting text content insertion into LightRAG...")
297
-
298
  # Use LightRAG's insert method with all parameters
299
  await self.lightrag.ainsert(
300
  input=input,
301
  file_paths=file_paths,
302
  split_by_character=split_by_character,
303
  split_by_character_only=split_by_character_only,
304
- ids=ids
305
  )
306
-
307
  self.logger.info("Text content insertion complete")
308
 
309
- async def _process_multimodal_content(self, multimodal_items: List[Dict[str, Any]], file_path: str):
 
 
310
  """
311
  Process multimodal content (using specialized processors)
312
-
313
  Args:
314
  multimodal_items: List of multimodal items
315
  file_path: File path (for reference)
@@ -317,43 +322,52 @@ class RAGAnything:
317
  if not multimodal_items:
318
  self.logger.debug("No multimodal content to process")
319
  return
320
-
321
  self.logger.info("Starting multimodal content processing...")
322
-
323
  file_name = os.path.basename(file_path)
324
-
325
  for i, item in enumerate(multimodal_items):
326
  try:
327
  content_type = item.get("type", "unknown")
328
- self.logger.info(f"Processing item {i+1}/{len(multimodal_items)}: {content_type} content")
329
-
 
 
330
  # Select appropriate processor
331
  processor = self._get_processor_for_type(content_type)
332
-
333
  if processor:
334
- enhanced_caption, entity_info = await processor.process_multimodal_content(
 
 
 
335
  modal_content=item,
336
  content_type=content_type,
337
- file_path=file_name
 
 
 
338
  )
339
- self.logger.info(f"{content_type} processing complete: {entity_info.get('entity_name', 'Unknown')}")
340
  else:
341
- self.logger.warning(f"No suitable processor found for {content_type} type content")
342
-
 
 
343
  except Exception as e:
344
  self.logger.error(f"Error processing multimodal content: {str(e)}")
345
  self.logger.debug("Exception details:", exc_info=True)
346
  continue
347
-
348
  self.logger.info("Multimodal content processing complete")
349
 
350
  def _get_processor_for_type(self, content_type: str):
351
  """
352
  Get appropriate processor based on content type
353
-
354
  Args:
355
  content_type: Content type
356
-
357
  Returns:
358
  Corresponding processor instance
359
  """
@@ -369,18 +383,18 @@ class RAGAnything:
369
  return self.modal_processors.get("generic")
370
 
371
  async def process_document_complete(
372
- self,
373
- file_path: str,
374
  output_dir: str = "./output",
375
  parse_method: str = "auto",
376
  display_stats: bool = True,
377
  split_by_character: str | None = None,
378
  split_by_character_only: bool = False,
379
- doc_id: str | None = None
380
  ):
381
  """
382
  Complete document processing workflow
383
-
384
  Args:
385
  file_path: Path to the file to process
386
  output_dir: MinerU output directory
@@ -392,35 +406,32 @@ class RAGAnything:
392
  """
393
  # Ensure LightRAG is initialized
394
  await self._ensure_lightrag_initialized()
395
-
396
  self.logger.info(f"Starting complete document processing: {file_path}")
397
-
398
  # Step 1: Parse document using MinerU
399
  content_list, md_content = self.parse_document(
400
- file_path,
401
- output_dir,
402
- parse_method,
403
- display_stats
404
  )
405
-
406
  # Step 2: Separate text and multimodal content
407
  text_content, multimodal_items = self._separate_content(content_list)
408
-
409
  # Step 3: Insert pure text content with all parameters
410
  if text_content.strip():
411
  file_name = os.path.basename(file_path)
412
  await self._insert_text_content(
413
- text_content,
414
  file_paths=file_name,
415
  split_by_character=split_by_character,
416
  split_by_character_only=split_by_character_only,
417
- ids=doc_id
418
  )
419
-
420
  # Step 4: Process multimodal content (using specialized processors)
421
  if multimodal_items:
422
  await self._process_multimodal_content(multimodal_items, file_path)
423
-
424
  self.logger.info(f"Document {file_path} processing complete!")
425
 
426
  async def process_folder_complete(
@@ -433,11 +444,11 @@ class RAGAnything:
433
  split_by_character_only: bool = False,
434
  file_extensions: Optional[List[str]] = None,
435
  recursive: bool = True,
436
- max_workers: int = 1
437
  ):
438
  """
439
  Process all files in a folder in batch
440
-
441
  Args:
442
  folder_path: Path to the folder to process
443
  output_dir: MinerU output directory
@@ -451,75 +462,98 @@ class RAGAnything:
451
  """
452
  # Ensure LightRAG is initialized
453
  await self._ensure_lightrag_initialized()
454
-
455
  folder_path = Path(folder_path)
456
  if not folder_path.exists() or not folder_path.is_dir():
457
- raise ValueError(f"Folder does not exist or is not a valid directory: {folder_path}")
458
-
 
 
459
  # Supported file formats
460
  supported_extensions = {
461
- ".pdf", ".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif",
462
- ".doc", ".docx", ".ppt", ".pptx", ".txt", ".md"
 
 
 
 
 
 
 
 
 
 
 
463
  }
464
-
465
  # Use specified extensions or all supported formats
466
  if file_extensions:
467
  target_extensions = set(ext.lower() for ext in file_extensions)
468
  # Validate if all are supported formats
469
  unsupported = target_extensions - supported_extensions
470
  if unsupported:
471
- self.logger.warning(f"The following file formats may not be fully supported: {unsupported}")
 
 
472
  else:
473
  target_extensions = supported_extensions
474
-
475
  # Collect all files to process
476
  files_to_process = []
477
-
478
  if recursive:
479
  # Recursively traverse all subfolders
480
  for file_path in folder_path.rglob("*"):
481
- if file_path.is_file() and file_path.suffix.lower() in target_extensions:
 
 
 
482
  files_to_process.append(file_path)
483
  else:
484
  # Process only current folder
485
  for file_path in folder_path.glob("*"):
486
- if file_path.is_file() and file_path.suffix.lower() in target_extensions:
 
 
 
487
  files_to_process.append(file_path)
488
-
489
  if not files_to_process:
490
  self.logger.info(f"No files to process found in {folder_path}")
491
  return
492
-
493
  self.logger.info(f"Found {len(files_to_process)} files to process")
494
- self.logger.info(f"File type distribution:")
495
-
496
  # Count file types
497
  file_type_count = {}
498
  for file_path in files_to_process:
499
  ext = file_path.suffix.lower()
500
  file_type_count[ext] = file_type_count.get(ext, 0) + 1
501
-
502
  for ext, count in sorted(file_type_count.items()):
503
  self.logger.info(f" {ext}: {count} files")
504
-
505
  # Create progress tracking
506
  processed_count = 0
507
  failed_files = []
508
-
509
  # Use semaphore to control concurrency
510
  semaphore = asyncio.Semaphore(max_workers)
511
-
512
  async def process_single_file(file_path: Path, index: int) -> None:
513
  """Process a single file"""
514
  async with semaphore:
515
  nonlocal processed_count
516
  try:
517
- self.logger.info(f"[{index}/{len(files_to_process)}] Processing: {file_path}")
518
-
 
 
519
  # Create separate output directory for each file
520
  file_output_dir = Path(output_dir) / file_path.stem
521
  file_output_dir.mkdir(parents=True, exist_ok=True)
522
-
523
  # Process file
524
  await self.process_document_complete(
525
  file_path=str(file_path),
@@ -527,56 +561,56 @@ class RAGAnything:
527
  parse_method=parse_method,
528
  display_stats=display_stats,
529
  split_by_character=split_by_character,
530
- split_by_character_only=split_by_character_only
531
  )
532
-
533
  processed_count += 1
534
- self.logger.info(f"[{index}/{len(files_to_process)}] Successfully processed: {file_path}")
535
-
 
 
536
  except Exception as e:
537
- self.logger.error(f"[{index}/{len(files_to_process)}] Failed to process: {file_path}")
 
 
538
  self.logger.error(f"Error: {str(e)}")
539
  failed_files.append((file_path, str(e)))
540
-
541
  # Create all processing tasks
542
  tasks = []
543
  for index, file_path in enumerate(files_to_process, 1):
544
  task = process_single_file(file_path, index)
545
  tasks.append(task)
546
-
547
  # Wait for all tasks to complete
548
  await asyncio.gather(*tasks, return_exceptions=True)
549
-
550
  # Output processing statistics
551
  self.logger.info("\n===== Batch Processing Complete =====")
552
  self.logger.info(f"Total files: {len(files_to_process)}")
553
  self.logger.info(f"Successfully processed: {processed_count}")
554
  self.logger.info(f"Failed: {len(failed_files)}")
555
-
556
  if failed_files:
557
  self.logger.info("\nFailed files:")
558
  for file_path, error in failed_files:
559
  self.logger.info(f" - {file_path}: {error}")
560
-
561
  return {
562
  "total": len(files_to_process),
563
  "success": processed_count,
564
  "failed": len(failed_files),
565
- "failed_files": failed_files
566
  }
567
 
568
- async def query_with_multimodal(
569
- self,
570
- query: str,
571
- mode: str = "hybrid"
572
- ) -> str:
573
  """
574
  Query with multimodal content support
575
-
576
  Args:
577
  query: Query content
578
  mode: Query mode
579
-
580
  Returns:
581
  Query result
582
  """
@@ -588,45 +622,65 @@ class RAGAnything:
588
  "2. Process documents first using process_document_complete() or process_folder_complete() "
589
  "to create and populate the LightRAG instance."
590
  )
591
-
592
- result = await self.lightrag.aquery(
593
- query,
594
- param=QueryParam(mode=mode)
595
- )
596
-
597
  return result
598
 
599
  def get_processor_info(self) -> Dict[str, Any]:
600
  """Get processor information"""
601
  if not self.modal_processors:
602
  return {"status": "Not initialized"}
603
-
604
  info = {
605
  "status": "Initialized",
606
  "processors": {},
607
  "models": {
608
- "llm_model": "External function" if self.llm_model_func else "Not provided",
609
- "vision_model": "External function" if self.vision_model_func else "Not provided",
610
- "embedding_model": "External function" if self.embedding_func else "Not provided"
611
- }
 
 
 
 
 
 
612
  }
613
-
614
  for proc_type, processor in self.modal_processors.items():
615
  info["processors"][proc_type] = {
616
  "class": processor.__class__.__name__,
617
- "supports": self._get_processor_supports(proc_type)
618
  }
619
-
620
  return info
621
 
622
  def _get_processor_supports(self, proc_type: str) -> List[str]:
623
  """Get processor supported features"""
624
  supports_map = {
625
- "image": ["Image content analysis", "Visual understanding", "Image description generation", "Image entity extraction"],
626
- "table": ["Table structure analysis", "Data statistics", "Trend identification", "Table entity extraction"],
627
- "equation": ["Mathematical formula parsing", "Variable identification", "Formula meaning explanation", "Formula entity extraction"],
628
- "generic": ["General content analysis", "Structured processing", "Entity extraction"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
629
  }
630
  return supports_map.get(proc_type, ["Basic processing"])
631
-
632
-
 
26
  # Import specialized processors
27
  from lightrag.modalprocessors import (
28
  ImageModalProcessor,
29
+ TableModalProcessor,
30
  EquationModalProcessor,
31
+ GenericModalProcessor,
32
  )
33
 
34
 
35
  class RAGAnything:
36
  """Multimodal Document Processing Pipeline - Complete document parsing and insertion pipeline"""
37
+
38
  def __init__(
39
  self,
40
  lightrag: Optional[LightRAG] = None,
 
43
  embedding_func: Optional[Callable] = None,
44
  working_dir: str = "./rag_storage",
45
  embedding_dim: int = 3072,
46
+ max_token_size: int = 8192,
47
  ):
48
  """
49
  Initialize Multimodal Document Processing Pipeline
50
+
51
  Args:
52
  lightrag: Optional pre-initialized LightRAG instance
53
  llm_model_func: LLM model function for text analysis
 
63
  self.embedding_func = embedding_func
64
  self.embedding_dim = embedding_dim
65
  self.max_token_size = max_token_size
66
+
67
  # Set up logging
68
  setup_logger("RAGAnything")
69
  self.logger = logging.getLogger("RAGAnything")
70
+
71
  # Create working directory if needed
72
  if not os.path.exists(working_dir):
73
  os.makedirs(working_dir)
74
+
75
  # Use provided LightRAG or mark for later initialization
76
  self.lightrag = lightrag
77
  self.modal_processors = {}
78
+
79
  # If LightRAG is provided, initialize processors immediately
80
  if self.lightrag is not None:
81
  self._initialize_processors()
82
+
83
  def _initialize_processors(self):
84
  """Initialize multimodal processors with appropriate model functions"""
85
  if self.lightrag is None:
86
+ raise ValueError(
87
+ "LightRAG instance must be initialized before creating processors"
88
+ )
89
+
90
  # Create different multimodal processors
91
  self.modal_processors = {
92
  "image": ImageModalProcessor(
93
  lightrag=self.lightrag,
94
+ modal_caption_func=self.vision_model_func or self.llm_model_func,
95
  ),
96
  "table": TableModalProcessor(
97
+ lightrag=self.lightrag, modal_caption_func=self.llm_model_func
 
98
  ),
99
  "equation": EquationModalProcessor(
100
+ lightrag=self.lightrag, modal_caption_func=self.llm_model_func
 
101
  ),
102
  "generic": GenericModalProcessor(
103
+ lightrag=self.lightrag, modal_caption_func=self.llm_model_func
104
+ ),
 
105
  }
106
+
107
  self.logger.info("Multimodal processors initialized")
108
  self.logger.info(f"Available processors: {list(self.modal_processors.keys())}")
109
+
110
  async def _ensure_lightrag_initialized(self):
111
  """Ensure LightRAG instance is initialized, create if necessary"""
112
  if self.lightrag is not None:
113
  return
114
+
115
  # Validate required functions
116
  if self.llm_model_func is None:
117
+ raise ValueError(
118
+ "llm_model_func must be provided when LightRAG is not pre-initialized"
119
+ )
120
  if self.embedding_func is None:
121
+ raise ValueError(
122
+ "embedding_func must be provided when LightRAG is not pre-initialized"
123
+ )
124
+
125
  from lightrag.kg.shared_storage import initialize_pipeline_status
126
+
127
  # Create LightRAG instance with provided functions
128
  self.lightrag = LightRAG(
129
  working_dir=self.working_dir,
 
137
 
138
  await self.lightrag.initialize_storages()
139
  await initialize_pipeline_status()
140
+
141
  # Initialize processors after LightRAG is ready
142
  self._initialize_processors()
143
+
144
  self.logger.info("LightRAG and multimodal processors initialized")
145
 
146
  def parse_document(
147
+ self,
148
+ file_path: str,
149
  output_dir: str = "./output",
150
  parse_method: str = "auto",
151
+ display_stats: bool = True,
152
  ) -> Tuple[List[Dict[str, Any]], str]:
153
  """
154
  Parse document using MinerU
155
+
156
  Args:
157
  file_path: Path to the file to parse
158
  output_dir: Output directory
159
  parse_method: Parse method ("auto", "ocr", "txt")
160
  display_stats: Whether to display content statistics
161
+
162
  Returns:
163
  (content_list, md_content): Content list and markdown text
164
  """
165
  self.logger.info(f"Starting document parsing: {file_path}")
166
+
167
  file_path = Path(file_path)
168
  if not file_path.exists():
169
  raise FileNotFoundError(f"File not found: {file_path}")
170
+
171
  # Choose appropriate parsing method based on file extension
172
  ext = file_path.suffix.lower()
173
+
174
  try:
175
  if ext in [".pdf"]:
176
+ self.logger.info(
177
+ f"Detected PDF file, using PDF parser (OCR={parse_method == 'ocr'})..."
178
+ )
179
  content_list, md_content = MineruParser.parse_pdf(
180
+ file_path, output_dir, use_ocr=(parse_method == "ocr")
 
 
181
  )
182
  elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]:
183
  self.logger.info("Detected image file, using image parser...")
184
  content_list, md_content = MineruParser.parse_image(
185
+ file_path, output_dir
 
186
  )
187
  elif ext in [".doc", ".docx", ".ppt", ".pptx"]:
188
  self.logger.info("Detected Office document, using Office parser...")
189
  content_list, md_content = MineruParser.parse_office_doc(
190
+ file_path, output_dir
 
191
  )
192
  else:
193
  # For other or unknown formats, use generic parser
194
+ self.logger.info(
195
+ f"Using generic parser for {ext} file (method={parse_method})..."
196
+ )
197
  content_list, md_content = MineruParser.parse_document(
198
+ file_path, parse_method=parse_method, output_dir=output_dir
 
 
199
  )
200
+
201
  except Exception as e:
202
  self.logger.error(f"Error during parsing with specific parser: {str(e)}")
203
  self.logger.warning("Falling back to generic parser...")
204
  # If specific parser fails, fall back to generic parser
205
  content_list, md_content = MineruParser.parse_document(
206
+ file_path, parse_method=parse_method, output_dir=output_dir
 
 
207
  )
208
+
209
+ self.logger.info(
210
+ f"Parsing complete! Extracted {len(content_list)} content blocks"
211
+ )
212
  self.logger.info(f"Markdown text length: {len(md_content)} characters")
213
+
214
  # Display content statistics if requested
215
  if display_stats:
216
  self.logger.info("\nContent Information:")
217
  self.logger.info(f"* Total blocks in content_list: {len(content_list)}")
218
  self.logger.info(f"* Markdown content length: {len(md_content)} characters")
219
+
220
  # Count elements by type
221
  block_types: Dict[str, int] = {}
222
  for block in content_list:
 
224
  block_type = block.get("type", "unknown")
225
  if isinstance(block_type, str):
226
  block_types[block_type] = block_types.get(block_type, 0) + 1
227
+
228
  self.logger.info("* Content block types:")
229
  for block_type, count in block_types.items():
230
  self.logger.info(f" - {block_type}: {count}")
231
+
232
  return content_list, md_content
233
 
234
+ def _separate_content(
235
+ self, content_list: List[Dict[str, Any]]
236
+ ) -> Tuple[str, List[Dict[str, Any]]]:
237
  """
238
  Separate text content and multimodal content
239
+
240
  Args:
241
  content_list: Content list from MinerU parsing
242
+
243
  Returns:
244
  (text_content, multimodal_items): Pure text content and multimodal items list
245
  """
246
  text_parts = []
247
  multimodal_items = []
248
+
249
  for item in content_list:
250
  content_type = item.get("type", "text")
251
+
252
  if content_type == "text":
253
  # Text content
254
  text = item.get("text", "")
 
257
  else:
258
  # Multimodal content (image, table, equation, etc.)
259
  multimodal_items.append(item)
260
+
261
  # Merge all text content
262
  text_content = "\n\n".join(text_parts)
263
+
264
+ self.logger.info("Content separation complete:")
265
  self.logger.info(f" - Text content length: {len(text_content)} characters")
266
  self.logger.info(f" - Multimodal items count: {len(multimodal_items)}")
267
+
268
  # Count multimodal types
269
  modal_types = {}
270
  for item in multimodal_items:
271
  modal_type = item.get("type", "unknown")
272
  modal_types[modal_type] = modal_types.get(modal_type, 0) + 1
273
+
274
  if modal_types:
275
  self.logger.info(f" - Multimodal type distribution: {modal_types}")
276
+
277
  return text_content, multimodal_items
278
 
279
  async def _insert_text_content(
280
+ self,
281
  input: str | list[str],
282
  split_by_character: str | None = None,
283
  split_by_character_only: bool = False,
 
286
  ):
287
  """
288
  Insert pure text content into LightRAG
289
+
290
  Args:
291
  input: Single document string or list of document strings
292
  split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
 
295
  split_by_character is None, this parameter is ignored.
296
  ids: single string of the document ID or list of unique document IDs, if not provided, MD5 hash IDs will be generated
297
  file_paths: single string of the file path or list of file paths, used for citation
298
+ """
299
  self.logger.info("Starting text content insertion into LightRAG...")
300
+
301
  # Use LightRAG's insert method with all parameters
302
  await self.lightrag.ainsert(
303
  input=input,
304
  file_paths=file_paths,
305
  split_by_character=split_by_character,
306
  split_by_character_only=split_by_character_only,
307
+ ids=ids,
308
  )
309
+
310
  self.logger.info("Text content insertion complete")
311
 
312
+ async def _process_multimodal_content(
313
+ self, multimodal_items: List[Dict[str, Any]], file_path: str
314
+ ):
315
  """
316
  Process multimodal content (using specialized processors)
317
+
318
  Args:
319
  multimodal_items: List of multimodal items
320
  file_path: File path (for reference)
 
322
  if not multimodal_items:
323
  self.logger.debug("No multimodal content to process")
324
  return
325
+
326
  self.logger.info("Starting multimodal content processing...")
327
+
328
  file_name = os.path.basename(file_path)
329
+
330
  for i, item in enumerate(multimodal_items):
331
  try:
332
  content_type = item.get("type", "unknown")
333
+ self.logger.info(
334
+ f"Processing item {i+1}/{len(multimodal_items)}: {content_type} content"
335
+ )
336
+
337
  # Select appropriate processor
338
  processor = self._get_processor_for_type(content_type)
339
+
340
  if processor:
341
+ (
342
+ enhanced_caption,
343
+ entity_info,
344
+ ) = await processor.process_multimodal_content(
345
  modal_content=item,
346
  content_type=content_type,
347
+ file_path=file_name,
348
+ )
349
+ self.logger.info(
350
+ f"{content_type} processing complete: {entity_info.get('entity_name', 'Unknown')}"
351
  )
 
352
  else:
353
+ self.logger.warning(
354
+ f"No suitable processor found for {content_type} type content"
355
+ )
356
+
357
  except Exception as e:
358
  self.logger.error(f"Error processing multimodal content: {str(e)}")
359
  self.logger.debug("Exception details:", exc_info=True)
360
  continue
361
+
362
  self.logger.info("Multimodal content processing complete")
363
 
364
  def _get_processor_for_type(self, content_type: str):
365
  """
366
  Get appropriate processor based on content type
367
+
368
  Args:
369
  content_type: Content type
370
+
371
  Returns:
372
  Corresponding processor instance
373
  """
 
383
  return self.modal_processors.get("generic")
384
 
385
  async def process_document_complete(
386
+ self,
387
+ file_path: str,
388
  output_dir: str = "./output",
389
  parse_method: str = "auto",
390
  display_stats: bool = True,
391
  split_by_character: str | None = None,
392
  split_by_character_only: bool = False,
393
+ doc_id: str | None = None,
394
  ):
395
  """
396
  Complete document processing workflow
397
+
398
  Args:
399
  file_path: Path to the file to process
400
  output_dir: MinerU output directory
 
406
  """
407
  # Ensure LightRAG is initialized
408
  await self._ensure_lightrag_initialized()
409
+
410
  self.logger.info(f"Starting complete document processing: {file_path}")
411
+
412
  # Step 1: Parse document using MinerU
413
  content_list, md_content = self.parse_document(
414
+ file_path, output_dir, parse_method, display_stats
 
 
 
415
  )
416
+
417
  # Step 2: Separate text and multimodal content
418
  text_content, multimodal_items = self._separate_content(content_list)
419
+
420
  # Step 3: Insert pure text content with all parameters
421
  if text_content.strip():
422
  file_name = os.path.basename(file_path)
423
  await self._insert_text_content(
424
+ text_content,
425
  file_paths=file_name,
426
  split_by_character=split_by_character,
427
  split_by_character_only=split_by_character_only,
428
+ ids=doc_id,
429
  )
430
+
431
  # Step 4: Process multimodal content (using specialized processors)
432
  if multimodal_items:
433
  await self._process_multimodal_content(multimodal_items, file_path)
434
+
435
  self.logger.info(f"Document {file_path} processing complete!")
436
 
437
  async def process_folder_complete(
 
444
  split_by_character_only: bool = False,
445
  file_extensions: Optional[List[str]] = None,
446
  recursive: bool = True,
447
+ max_workers: int = 1,
448
  ):
449
  """
450
  Process all files in a folder in batch
451
+
452
  Args:
453
  folder_path: Path to the folder to process
454
  output_dir: MinerU output directory
 
462
  """
463
  # Ensure LightRAG is initialized
464
  await self._ensure_lightrag_initialized()
465
+
466
  folder_path = Path(folder_path)
467
  if not folder_path.exists() or not folder_path.is_dir():
468
+ raise ValueError(
469
+ f"Folder does not exist or is not a valid directory: {folder_path}"
470
+ )
471
+
472
  # Supported file formats
473
  supported_extensions = {
474
+ ".pdf",
475
+ ".jpg",
476
+ ".jpeg",
477
+ ".png",
478
+ ".bmp",
479
+ ".tiff",
480
+ ".tif",
481
+ ".doc",
482
+ ".docx",
483
+ ".ppt",
484
+ ".pptx",
485
+ ".txt",
486
+ ".md",
487
  }
488
+
489
  # Use specified extensions or all supported formats
490
  if file_extensions:
491
  target_extensions = set(ext.lower() for ext in file_extensions)
492
  # Validate if all are supported formats
493
  unsupported = target_extensions - supported_extensions
494
  if unsupported:
495
+ self.logger.warning(
496
+ f"The following file formats may not be fully supported: {unsupported}"
497
+ )
498
  else:
499
  target_extensions = supported_extensions
500
+
501
  # Collect all files to process
502
  files_to_process = []
503
+
504
  if recursive:
505
  # Recursively traverse all subfolders
506
  for file_path in folder_path.rglob("*"):
507
+ if (
508
+ file_path.is_file()
509
+ and file_path.suffix.lower() in target_extensions
510
+ ):
511
  files_to_process.append(file_path)
512
  else:
513
  # Process only current folder
514
  for file_path in folder_path.glob("*"):
515
+ if (
516
+ file_path.is_file()
517
+ and file_path.suffix.lower() in target_extensions
518
+ ):
519
  files_to_process.append(file_path)
520
+
521
  if not files_to_process:
522
  self.logger.info(f"No files to process found in {folder_path}")
523
  return
524
+
525
  self.logger.info(f"Found {len(files_to_process)} files to process")
526
+ self.logger.info("File type distribution:")
527
+
528
  # Count file types
529
  file_type_count = {}
530
  for file_path in files_to_process:
531
  ext = file_path.suffix.lower()
532
  file_type_count[ext] = file_type_count.get(ext, 0) + 1
533
+
534
  for ext, count in sorted(file_type_count.items()):
535
  self.logger.info(f" {ext}: {count} files")
536
+
537
  # Create progress tracking
538
  processed_count = 0
539
  failed_files = []
540
+
541
  # Use semaphore to control concurrency
542
  semaphore = asyncio.Semaphore(max_workers)
543
+
544
  async def process_single_file(file_path: Path, index: int) -> None:
545
  """Process a single file"""
546
  async with semaphore:
547
  nonlocal processed_count
548
  try:
549
+ self.logger.info(
550
+ f"[{index}/{len(files_to_process)}] Processing: {file_path}"
551
+ )
552
+
553
  # Create separate output directory for each file
554
  file_output_dir = Path(output_dir) / file_path.stem
555
  file_output_dir.mkdir(parents=True, exist_ok=True)
556
+
557
  # Process file
558
  await self.process_document_complete(
559
  file_path=str(file_path),
 
561
  parse_method=parse_method,
562
  display_stats=display_stats,
563
  split_by_character=split_by_character,
564
+ split_by_character_only=split_by_character_only,
565
  )
566
+
567
  processed_count += 1
568
+ self.logger.info(
569
+ f"[{index}/{len(files_to_process)}] Successfully processed: {file_path}"
570
+ )
571
+
572
  except Exception as e:
573
+ self.logger.error(
574
+ f"[{index}/{len(files_to_process)}] Failed to process: {file_path}"
575
+ )
576
  self.logger.error(f"Error: {str(e)}")
577
  failed_files.append((file_path, str(e)))
578
+
579
  # Create all processing tasks
580
  tasks = []
581
  for index, file_path in enumerate(files_to_process, 1):
582
  task = process_single_file(file_path, index)
583
  tasks.append(task)
584
+
585
  # Wait for all tasks to complete
586
  await asyncio.gather(*tasks, return_exceptions=True)
587
+
588
  # Output processing statistics
589
  self.logger.info("\n===== Batch Processing Complete =====")
590
  self.logger.info(f"Total files: {len(files_to_process)}")
591
  self.logger.info(f"Successfully processed: {processed_count}")
592
  self.logger.info(f"Failed: {len(failed_files)}")
593
+
594
  if failed_files:
595
  self.logger.info("\nFailed files:")
596
  for file_path, error in failed_files:
597
  self.logger.info(f" - {file_path}: {error}")
598
+
599
  return {
600
  "total": len(files_to_process),
601
  "success": processed_count,
602
  "failed": len(failed_files),
603
+ "failed_files": failed_files,
604
  }
605
 
606
+ async def query_with_multimodal(self, query: str, mode: str = "hybrid") -> str:
 
 
 
 
607
  """
608
  Query with multimodal content support
609
+
610
  Args:
611
  query: Query content
612
  mode: Query mode
613
+
614
  Returns:
615
  Query result
616
  """
 
622
  "2. Process documents first using process_document_complete() or process_folder_complete() "
623
  "to create and populate the LightRAG instance."
624
  )
625
+
626
+ result = await self.lightrag.aquery(query, param=QueryParam(mode=mode))
627
+
 
 
 
628
  return result
629
 
630
  def get_processor_info(self) -> Dict[str, Any]:
631
  """Get processor information"""
632
  if not self.modal_processors:
633
  return {"status": "Not initialized"}
634
+
635
  info = {
636
  "status": "Initialized",
637
  "processors": {},
638
  "models": {
639
+ "llm_model": "External function"
640
+ if self.llm_model_func
641
+ else "Not provided",
642
+ "vision_model": "External function"
643
+ if self.vision_model_func
644
+ else "Not provided",
645
+ "embedding_model": "External function"
646
+ if self.embedding_func
647
+ else "Not provided",
648
+ },
649
  }
650
+
651
  for proc_type, processor in self.modal_processors.items():
652
  info["processors"][proc_type] = {
653
  "class": processor.__class__.__name__,
654
+ "supports": self._get_processor_supports(proc_type),
655
  }
656
+
657
  return info
658
 
659
  def _get_processor_supports(self, proc_type: str) -> List[str]:
660
  """Get processor supported features"""
661
  supports_map = {
662
+ "image": [
663
+ "Image content analysis",
664
+ "Visual understanding",
665
+ "Image description generation",
666
+ "Image entity extraction",
667
+ ],
668
+ "table": [
669
+ "Table structure analysis",
670
+ "Data statistics",
671
+ "Trend identification",
672
+ "Table entity extraction",
673
+ ],
674
+ "equation": [
675
+ "Mathematical formula parsing",
676
+ "Variable identification",
677
+ "Formula meaning explanation",
678
+ "Formula entity extraction",
679
+ ],
680
+ "generic": [
681
+ "General content analysis",
682
+ "Structured processing",
683
+ "Entity extraction",
684
+ ],
685
  }
686
  return supports_map.get(proc_type, ["Basic processing"])