import os
from dotenv import load_dotenv
load_dotenv()
from langchain_community.document_loaders import UnstructuredHTMLLoader, UnstructuredMarkdownLoader, TextLoader, BSHTMLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pathlib import Path
import fast_tokenizer
# Root directory of the documentation corpus to ingest.
base_dir = Path("./docs")
# Every entry beneath it, recursively; directories and unsupported
# extensions are filtered out later in load_docs.
paths = list(base_dir.rglob("*"))
def load_docs(paths):
    """Load supported documents (.html, .md, .txt) from the given paths.

    Args:
        paths: iterable of Path objects. Non-files and unsupported
            extensions are skipped with a message.

    Returns:
        A list of loaded LangChain Document objects.
    """
    all_docs = []
    for p in paths:
        if not p.is_file():
            continue
        ext = p.suffix.lower()
        try:
            if ext == ".html":
                # Try UnstructuredHTMLLoader first, fallback to BSHTMLLoader if it fails.
                # NOTE: the original caught (AttributeError, Exception) — the
                # AttributeError entry was redundant since Exception subsumes it.
                try:
                    loader = UnstructuredHTMLLoader(p)
                    docs = loader.load()
                except Exception as e:  # broad on purpose: any parse failure triggers the fallback
                    print(f"Warning: UnstructuredHTMLLoader failed for {p}, using BSHTMLLoader instead. Error: {type(e).__name__}")
                    loader = BSHTMLLoader(p)
                    docs = loader.load()
            elif ext == ".md":
                loader = UnstructuredMarkdownLoader(p)
                docs = loader.load()
            elif ext == ".txt":
                loader = TextLoader(p)
                docs = loader.load()
            else:
                print(f"Skipping {p} because it is not a supported file type")
                continue
            all_docs.extend(docs)
        except Exception as e:
            # Keep ingesting the rest of the corpus even if one file is broken.
            print(f"Error loading {p}: {type(e).__name__}: {e}")
            continue
    return all_docs
# Temporary Python fallback for local Windows ingestion
def custom_token_length(text):
    """Approximate the token count of *text*.

    Pure-Python stand-in for the C++ fast_tokenizer, which is awkward to
    build on Windows. Relies on the rule of thumb that one token is
    roughly four characters of English text.
    """
    # Round-trip through UTF-8 with errors ignored so stray surrogates
    # or other undecodable characters do not skew the count.
    sanitized = text.encode("utf-8", "ignore").decode("utf-8")
    return len(sanitized) // 4
def split_docs(docs):
    """Chunk documents for embedding.

    Chunks are capped at ~350 tokens with a ~50-token overlap, as measured
    by custom_token_length (the pure-Python token estimator above).
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=350,
        chunk_overlap=50,
        length_function=custom_token_length,
    )
    return chunker.split_documents(docs)
def store_docs(texts):
    """Embed the chunks and upsert them into the 'rag-agent' Pinecone index.

    Returns the resulting PineconeVectorStore. Requires PINECONE_API_KEY
    in the environment (loaded from .env at module import).
    """
    print("Embedding documents and uploading to Pinecone... (This may take a minute)")
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return PineconeVectorStore.from_documents(
        documents=texts,
        embedding=embeddings,
        index_name="rag-agent",
        pinecone_api_key=os.environ["PINECONE_API_KEY"],
    )
if __name__ == "__main__":
    # Full pipeline: load -> chunk -> embed & upload.
    loaded = load_docs(paths)
    chunks = split_docs(loaded)
    store_docs(chunks)
    banner = "=" * 50
    print(banner)
    print("✅ SUCCESS!")
    print(f"Documents Loaded: {len(loaded)}")
    print(f"Total Chunks Uploaded to Pinecone: {len(chunks)}")
    print(banner)