import os
import chromadb
from pathlib import Path
from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.cohere import CohereEmbedding
# Pull variables from a local .env file into the process environment.
load_dotenv()

# The Cohere API key is mandatory: fail fast with a clear message if missing.
cohere_api_key = os.getenv("COHERE_API_KEY")
if not cohere_api_key:
    raise ValueError("COHERE_API_KEY not found in environment variables")

# Build the embedding model used for all document indexing below.
embed_model = CohereEmbedding(
    cohere_api_key=cohere_api_key,
    model_name="embed-english-v3.0",
    input_type="search_document",
)

# Register it as the process-wide default for llama_index.
Settings.embed_model = embed_model
def _load_documents(dir_path: Path, department_label: str, doc_type: str,
                    progress_prefix: str = "", error_prefix: str = "") -> list:
    """Load all .md/.txt/.csv files in dir_path as llama_index Documents.

    Args:
        dir_path: Directory to scan (non-recursive).
        department_label: Value stored in each document's "department" metadata.
        doc_type: Value stored in each document's "type" metadata.
        progress_prefix: Text inserted into the per-file progress message
            (keeps the distinct "general document" wording for general files).
        error_prefix: Text inserted into the per-file error message.

    Returns:
        List of Document objects. Files that fail to read are skipped with a
        printed error rather than aborting the whole run.
    """
    # Imported here (not at module top) to mirror the original lazy import.
    from llama_index.core import Document

    docs = []
    if not (dir_path.exists() and dir_path.is_dir()):
        return docs
    for file_path in dir_path.glob("*"):
        if not (file_path.is_file() and file_path.suffix in {'.md', '.txt', '.csv'}):
            continue
        print(f"Processing {progress_prefix}{file_path.name}...")
        try:
            content = file_path.read_text(encoding='utf-8')
        except Exception as e:
            # Best-effort ingestion: report and move on to the next file.
            print(f"Error processing {error_prefix}{file_path}: {str(e)}")
            continue
        docs.append(
            Document(
                text=content,
                metadata={
                    "source": str(file_path.name),
                    "department": department_label,
                    "type": doc_type,
                },
            )
        )
    return docs


def process_documents(department: str, base_dir: str = "./resources/data"):
    """
    Process and index documents for a specific department.

    Builds (or rebuilds from scratch) a Chroma-backed vector index from the
    department's own files plus the shared "general" files.

    Args:
        department: The department name (e.g., 'hr', 'engineering')
        base_dir: Base directory containing department folders
    """
    print(f"Processing documents for {department} department...")

    # Source directories and the per-department persistence location.
    dept_path = Path(base_dir) / department
    general_path = Path(base_dir) / "general"
    persist_dir = f"./chroma_db/{department}"
    os.makedirs(persist_dir, exist_ok=True)

    # Rebuild from scratch: drop any previous collection. Deliberately
    # best-effort — the collection may simply not exist yet — but the bare
    # `except:` is narrowed so real interrupts/errors aren't silently eaten.
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    try:
        chroma_client.delete_collection("documents")
    except Exception:
        pass

    chroma_collection = chroma_client.get_or_create_collection("documents")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Department-specific files, then the shared "general" files.
    documents = _load_documents(dept_path, department, "department_specific")
    documents += _load_documents(
        general_path, "general", "general",
        progress_prefix="general document: ",
        error_prefix="general document ",
    )

    if not documents:
        print(f"No documents found for {department} department.")
        return

    print(f"Indexing {len(documents)} documents...")
    # Embedding happens here via the module-level Cohere embed_model.
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        show_progress=True,
        embed_model=embed_model,
    )
    print(f"✅ Successfully indexed {len(documents)} documents for {department} department")
    print(f"Index stored in: {persist_dir}")
def main():
    """Build a vector index for each configured department in turn."""
    banner = "=" * 50
    for dept in ("hr", "engineering", "finance", "marketing"):
        print(f"\n{banner}")
        print(f"Processing {dept.upper()} department")
        print(f"{banner}")
        process_documents(dept)
    print("\n✅ Document processing completed for all departments!")
if __name__ == "__main__":
main()