Spaces:
Configuration error
Configuration error
| import os | |
| import chromadb | |
| from pathlib import Path | |
| from dotenv import load_dotenv | |
| from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, Settings | |
| from llama_index.vector_stores.chroma import ChromaVectorStore | |
| from llama_index.embeddings.cohere import CohereEmbedding | |
# Pull any variables defined in a local .env file into the process environment
load_dotenv()

# The Cohere API key is mandatory: abort right away when it is missing or empty
cohere_api_key = os.environ.get("COHERE_API_KEY")
if not cohere_api_key:
    raise ValueError("COHERE_API_KEY not found in environment variables")

# Build the Cohere embedding model used when indexing documents
embed_model = CohereEmbedding(
    cohere_api_key=cohere_api_key,
    model_name="embed-english-v3.0",
    input_type="search_document",
)

# Register it as the process-wide default embedding model for LlamaIndex
Settings.embed_model = embed_model
# File extensions (case-sensitive, as matched against Path.suffix) that get indexed
_SUPPORTED_SUFFIXES = (".md", ".txt", ".csv")


def _load_documents(directory, department, doc_type, label=""):
    """
    Read every supported file directly inside ``directory`` into Documents

    Args:
        directory: Path of the folder to scan (non-recursive)
        department: Value stored under the "department" metadata key
        doc_type: Value stored under the "type" metadata key
        label: Optional wording inserted into the progress/error messages
            (e.g. "general document")

    Returns:
        A list of llama_index Document objects; empty when the folder is
        missing or contains no supported files
    """
    # Imported once here instead of once per file inside the loop
    from llama_index.core import Document

    # is_dir() is already False for a path that does not exist
    if not directory.is_dir():
        return []

    progress_prefix = f"Processing {label}: " if label else "Processing "
    error_prefix = f"Error processing {label} " if label else "Error processing "

    loaded = []
    # Sorted so the processing order does not depend on directory listing order
    for file_path in sorted(directory.glob("*")):
        if not file_path.is_file() or file_path.suffix not in _SUPPORTED_SUFFIXES:
            continue
        print(f"{progress_prefix}{file_path.name}...")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            loaded.append(
                Document(
                    text=content,
                    metadata={
                        "source": str(file_path.name),
                        "department": department,
                        "type": doc_type,
                    },
                )
            )
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort the run
            print(f"{error_prefix}{file_path}: {str(e)}")
    return loaded


def process_documents(department: str, base_dir: str = "./resources/data") -> None:
    """
    Process and index documents for a specific department

    The "documents" collection under ./chroma_db/<department> is dropped and
    recreated on every call, before any files are read, so an existing index
    is cleared even when no documents are found afterwards.

    Args:
        department: The department name (e.g., 'hr', 'engineering')
        base_dir: Base directory containing department folders; files from
            its "general" sub-folder are indexed alongside the department's own
    """
    print(f"Processing documents for {department} department...")

    # Define paths
    dept_path = Path(base_dir) / department
    general_path = Path(base_dir) / "general"
    persist_dir = f"./chroma_db/{department}"

    # Create directory if it doesn't exist
    os.makedirs(persist_dir, exist_ok=True)

    # Initialize Chroma client
    chroma_client = chromadb.PersistentClient(path=persist_dir)

    # Clear existing collection if it exists
    try:
        chroma_client.delete_collection("documents")
    except Exception:
        # Best-effort: a missing collection is expected on the first run.
        # NOTE(review): the exception type raised for a missing collection
        # appears to vary between chromadb releases - confirm before narrowing.
        # Unlike a bare "except:", this lets KeyboardInterrupt/SystemExit through.
        pass

    # Create a new collection and wire it into a LlamaIndex storage context
    chroma_collection = chroma_client.get_or_create_collection("documents")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Department-specific files first, then the shared general documents
    documents = _load_documents(dept_path, department, "department_specific")
    documents += _load_documents(
        general_path, "general", "general", label="general document"
    )

    if not documents:
        print(f"No documents found for {department} department.")
        return

    print(f"Indexing {len(documents)} documents...")

    # Building the index embeds the documents and writes them to the vector
    # store; the returned index object is not needed afterwards
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        show_progress=True,
        embed_model=embed_model,
    )

    print(f"✅ Successfully indexed {len(documents)} documents for {department} department")
    print(f"Index stored in: {persist_dir}")
def main(departments=None):
    """
    Main function to process documents for all departments

    Args:
        departments: Optional iterable of department names to index. When
            omitted, the built-in list (hr, engineering, finance, marketing)
            is used. A single string is treated as one department name.
    """
    if departments is None:
        departments = ["hr", "engineering", "finance", "marketing"]
    elif isinstance(departments, str):
        # Guard against iterating a lone name character by character
        departments = [departments]

    banner = "=" * 50
    for dept in departments:
        print(f"\n{banner}")
        print(f"Processing {dept.upper()} department")
        print(banner)
        process_documents(dept)

    print("\n✅ Document processing completed for all departments!")
# Run the indexing pipeline only when executed as a script, not when imported
if __name__ == "__main__":
    main()