import os from dotenv import load_dotenv load_dotenv() from langchain_community.document_loaders import UnstructuredHTMLLoader, UnstructuredMarkdownLoader, TextLoader, BSHTMLLoader from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_huggingface import HuggingFaceEmbeddings from langchain_pinecone import PineconeVectorStore from pathlib import Path import fast_tokenizer base_dir = Path("./docs") paths = list(base_dir.rglob("*")) def load_docs(paths): all_docs = [] for p in paths: if not p.is_file(): continue ext = p.suffix.lower() try: if ext == ".html": # Try UnstructuredHTMLLoader first, fallback to BSHTMLLoader if it fails try: loader = UnstructuredHTMLLoader(p) docs = loader.load() except (AttributeError, Exception) as e: # Fallback to BSHTMLLoader for problematic HTML files print(f"Warning: UnstructuredHTMLLoader failed for {p}, using BSHTMLLoader instead. Error: {type(e).__name__}") loader = BSHTMLLoader(p) docs = loader.load() elif ext == ".md": loader = UnstructuredMarkdownLoader(p) docs = loader.load() elif ext == ".txt": loader = TextLoader(p) docs = loader.load() else: print(f"Skipping {p} because it is not a supported file type") continue all_docs.extend(docs) except Exception as e: print(f"Error loading {p}: {type(e).__name__}: {e}") continue return all_docs # Temporary Python fallback for local Windows ingestion def custom_token_length(text): # Ensure text is clean UTF-8 clean_text = text.encode('utf-8', 'ignore').decode('utf-8') # A standard rule of thumb for English text is that 1 token is roughly 4 characters. # This avoids needing the C++ fast_tokenizer on Windows! return len(clean_text) // 4 def split_docs(docs): splitter = RecursiveCharacterTextSplitter( chunk_size=350, # Max 350 tokens per chunk chunk_overlap=50, # Overlap of 50 tokens length_function=custom_token_length # Tells LangChain to use your C++ tool ) return splitter.split_documents(docs) def store_docs(texts): print("Embedding documents and uploading to Pinecone... (This may take a minute)") model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") vectorstore = PineconeVectorStore.from_documents( documents=texts, embedding=model, index_name="rag-agent", pinecone_api_key=os.environ["PINECONE_API_KEY"], ) return vectorstore if __name__ == "__main__": docs = load_docs(paths) texts = split_docs(docs) vectorstore = store_docs(texts) print("="*50) print("✅ SUCCESS!") print(f"Documents Loaded: {len(docs)}") print(f"Total Chunks Uploaded to Pinecone: {len(texts)}") print("="*50)