File size: 1,567 Bytes
9d21edd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import re

# Sentence boundary: a run of whitespace that follows '.', '!' or '?'.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def extract_clauses(text_data):
    """Extract unique sentence-level clauses from text chunks with location data.

    Each chunk's text is split into sentences at whitespace following '.',
    '!' or '?'. Sentences longer than 30 characters (after stripping) that
    have not been seen before, in this or any earlier chunk, are emitted
    in order of appearance.

    Args:
        text_data: List[Dict] with 'text' and 'page' keys. A missing or
            None 'text' is treated as empty; a missing 'page' defaults to 1.

    Returns:
        List[Dict]: [{'id', 'text', 'page', 'line'}] where 'id' is a
        sequential index starting at 0 and 'line' is the 1-based line,
        within the chunk's text, on which the clause starts.
    """
    unique_clauses = []
    seen = set()

    for chunk in text_data:
        # `or ""` also covers an explicit None value, which would otherwise
        # make the regex split raise TypeError.
        raw_text = chunk.get("text") or ""
        page_num = chunk.get("page", 1)

        cursor = 0        # end offset of the previously located sentence
        line_pos = 0      # offset up to which newlines are already counted
        line_number = 1   # 1-based line of `line_pos` within raw_text

        for sentence in _SENTENCE_BOUNDARY.split(raw_text):
            s_clean = sentence.strip()
            if not s_clean:
                continue

            # Search forward from the previous sentence so that text which
            # also occurs inside an earlier sentence is not matched there.
            # Only whitespace lies between `cursor` and the real start, and
            # s_clean begins with a non-whitespace character, so the first
            # hit at or after `cursor` is the sentence's true position.
            start = raw_text.find(s_clean, cursor)
            line_number += raw_text.count("\n", line_pos, start)
            line_pos = start
            cursor = start + len(s_clean)

            # Positions are tracked for every sentence, including the ones
            # filtered out here, so later line numbers stay accurate.
            if len(s_clean) <= 30 or s_clean in seen:
                continue

            seen.add(s_clean)
            unique_clauses.append({
                "id": len(unique_clauses),
                "text": s_clean,
                "page": page_num,
                "line": line_number,
            })

    return unique_clauses