# File size: 5,225 Bytes
# 0786686
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import os
import chromadb
from pathlib import Path
from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.cohere import CohereEmbedding

# Populate the process environment via python-dotenv before reading any keys.
load_dotenv()

# The Cohere API key is mandatory: fail fast if it is absent or empty.
cohere_api_key = os.environ.get("COHERE_API_KEY")
if cohere_api_key is None or cohere_api_key == "":
    raise ValueError("COHERE_API_KEY not found in environment variables")

# Embedding model shared by every indexing call in this module.
embed_model = CohereEmbedding(
    cohere_api_key=cohere_api_key,
    model_name="embed-english-v3.0",
    input_type="search_document",
)

# Register the same model as the LlamaIndex-wide default embedder.
Settings.embed_model = embed_model

# File extensions that are read as plain text and indexed.
_SUPPORTED_SUFFIXES = ('.md', '.txt', '.csv')


def _load_text_documents(directory, department, doc_type, progress_prefix, error_prefix):
    """Read every supported text file directly inside ``directory``.

    Args:
        directory: ``pathlib.Path`` of the folder to scan (non-recursive).
        department: Value stored under the ``"department"`` metadata key.
        doc_type: Value stored under the ``"type"`` metadata key.
        progress_prefix: Text printed before the file name for each file.
        error_prefix: Text printed before the file path when a file fails.

    Returns:
        A list of ``Document`` objects, one per file that was read
        successfully. The list is empty when ``directory`` is missing or is
        not a directory.
    """
    # Kept as a function-scope import (as in the original code) because the
    # module's top-level imports do not bring ``Document`` into scope.
    from llama_index.core import Document

    loaded = []
    if not directory.is_dir():
        return loaded

    # Sort so the indexing order does not depend on filesystem enumeration order.
    for file_path in sorted(directory.glob("*")):
        if not file_path.is_file() or file_path.suffix not in _SUPPORTED_SUFFIXES:
            continue

        print(f"{progress_prefix}{file_path.name}...")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            loaded.append(
                Document(
                    text=content,
                    metadata={
                        "source": str(file_path.name),
                        "department": department,
                        "type": doc_type,
                    },
                )
            )
        except Exception as e:
            # Best-effort per file: one unreadable file must not abort the batch.
            print(f"{error_prefix}{file_path}: {str(e)}")

    return loaded


def process_documents(department: str, base_dir: str = "./resources/data") -> None:
    """Process and index documents for a specific department.

    Rebuilds the ``"documents"`` Chroma collection stored under
    ``./chroma_db/<department>`` from the text files found in
    ``<base_dir>/<department>`` and ``<base_dir>/general``.

    Args:
        department: The department name (e.g., 'hr', 'engineering').
        base_dir: Base directory containing department folders.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    print(f"Processing documents for {department} department...")

    # Define paths
    dept_path = Path(base_dir) / department
    general_path = Path(base_dir) / "general"
    persist_dir = f"./chroma_db/{department}"

    # Create directory if it doesn't exist
    os.makedirs(persist_dir, exist_ok=True)

    # Initialize Chroma client
    chroma_client = chromadb.PersistentClient(path=persist_dir)

    # Drop any previous collection so the index is rebuilt from scratch.
    # This stays best-effort (a missing collection is expected on a first
    # run), but a bare ``except`` would also swallow KeyboardInterrupt and
    # SystemExit, and a silent ``pass`` would hide a genuine reset failure.
    try:
        chroma_client.delete_collection("documents")
    except Exception as exc:
        print(f"Skipping collection reset for {department}: {exc}")

    # Create a new collection
    chroma_collection = chroma_client.get_or_create_collection("documents")

    # Create vector store
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Department-specific files first, then the shared "general" files.
    documents = _load_text_documents(
        dept_path,
        department,
        "department_specific",
        "Processing ",
        "Error processing ",
    )
    documents.extend(
        _load_text_documents(
            general_path,
            "general",
            "general",
            "Processing general document: ",
            "Error processing general document ",
        )
    )

    if not documents:
        print(f"No documents found for {department} department.")
        return

    print(f"Indexing {len(documents)} documents...")

    # Building the index writes the embeddings into the Chroma-backed storage
    # context; the returned index object itself is not needed afterwards.
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        show_progress=True,
        embed_model=embed_model
    )

    print(f"✅ Successfully indexed {len(documents)} documents for {department} department")
    print(f"Index stored in: {persist_dir}")

def main():
    """Index the documents of every known department, one after another."""
    rule = "=" * 50

    for name in ("hr", "engineering", "finance", "marketing"):
        # Banner separating each department's output in the console.
        print(f"\n{rule}")
        print(f"Processing {name.upper()} department")
        print(rule)
        process_documents(name)

    print("\n✅ Document processing completed for all departments!")

# Call main() only when this file is executed as a script.
if __name__ == "__main__":
    main()