import os
import chromadb
from pathlib import Path
from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.cohere import CohereEmbedding
# Pull variables from a local .env file into the process environment.
load_dotenv()

# The Cohere API key is mandatory: fail fast with a clear message if missing.
cohere_api_key = os.getenv("COHERE_API_KEY")
if not cohere_api_key:
    raise ValueError("COHERE_API_KEY not found in environment variables")

# Build the embedding model used for all document indexing below.
embed_model = CohereEmbedding(
    cohere_api_key=cohere_api_key,
    model_name="embed-english-v3.0",
    input_type="search_document",
)

# Register it as the process-wide default for llama_index.
Settings.embed_model = embed_model
def _load_documents(dir_path: Path, department_label: str, doc_type: str,
                    progress_prefix: str = "", error_prefix: str = "") -> list:
    """Load all .md/.txt/.csv files in dir_path as llama_index Documents.

    Args:
        dir_path: Directory to scan (non-recursive).
        department_label: Value stored in each document's "department" metadata.
        doc_type: Value stored in each document's "type" metadata.
        progress_prefix: Text inserted into the per-file progress message
            (keeps the distinct "general document" wording for general files).
        error_prefix: Text inserted into the per-file error message.

    Returns:
        List of Document objects. Files that fail to read are skipped with a
        printed error rather than aborting the whole run.
    """
    # Imported here (not at module top) to mirror the original lazy import.
    from llama_index.core import Document

    docs = []
    if not (dir_path.exists() and dir_path.is_dir()):
        return docs
    for file_path in dir_path.glob("*"):
        if not (file_path.is_file() and file_path.suffix in {'.md', '.txt', '.csv'}):
            continue
        print(f"Processing {progress_prefix}{file_path.name}...")
        try:
            content = file_path.read_text(encoding='utf-8')
        except Exception as e:
            # Best-effort ingestion: report and move on to the next file.
            print(f"Error processing {error_prefix}{file_path}: {str(e)}")
            continue
        docs.append(
            Document(
                text=content,
                metadata={
                    "source": str(file_path.name),
                    "department": department_label,
                    "type": doc_type,
                },
            )
        )
    return docs


def process_documents(department: str, base_dir: str = "./resources/data"):
    """
    Process and index documents for a specific department.

    Builds (or rebuilds from scratch) a Chroma-backed vector index from the
    department's own files plus the shared "general" files.

    Args:
        department: The department name (e.g., 'hr', 'engineering')
        base_dir: Base directory containing department folders
    """
    print(f"Processing documents for {department} department...")

    # Source directories and the per-department persistence location.
    dept_path = Path(base_dir) / department
    general_path = Path(base_dir) / "general"
    persist_dir = f"./chroma_db/{department}"
    os.makedirs(persist_dir, exist_ok=True)

    # Rebuild from scratch: drop any previous collection. Deliberately
    # best-effort — the collection may simply not exist yet — but the bare
    # `except:` is narrowed so real interrupts/errors aren't silently eaten.
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    try:
        chroma_client.delete_collection("documents")
    except Exception:
        pass

    chroma_collection = chroma_client.get_or_create_collection("documents")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Department-specific files, then the shared "general" files.
    documents = _load_documents(dept_path, department, "department_specific")
    documents += _load_documents(
        general_path, "general", "general",
        progress_prefix="general document: ",
        error_prefix="general document ",
    )

    if not documents:
        print(f"No documents found for {department} department.")
        return

    print(f"Indexing {len(documents)} documents...")
    # Embedding happens here via the module-level Cohere embed_model.
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        show_progress=True,
        embed_model=embed_model,
    )
    print(f"✅ Successfully indexed {len(documents)} documents for {department} department")
    print(f"Index stored in: {persist_dir}")
def main():
    """Build a vector index for each configured department in turn."""
    banner = "=" * 50
    for dept in ("hr", "engineering", "finance", "marketing"):
        print(f"\n{banner}")
        print(f"Processing {dept.upper()} department")
        print(f"{banner}")
        process_documents(dept)
    print("\n✅ Document processing completed for all departments!")
if __name__ == "__main__":
main()