| """Load html from files, clean up, split, ingest into Weaviate.""" |
| import os |
| from pathlib import Path |
|
|
| import weaviate |
| from bs4 import BeautifulSoup |
| from langchain.text_splitter import CharacterTextSplitter |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# Read the pre-extracted paper text. "§" characters mark section
# boundaries inserted upstream; each section becomes one raw document.
with open("paper-dir/main.txt", encoding="utf-8") as f:
    paper_text = f.read()

docs = paper_text.split("§")

# Use each section's first whitespace-delimited token as its source label.
# create_documents() — and the Weaviate ingest below, which reads
# text.metadata["source"] — expects one dict per document, so the label
# must be wrapped in {"source": ...}; a bare string would crash at
# ingest time with "string indices must be integers".
metadatas = [{"source": doc.split(" ")[0]} for doc in docs]
|
|
# Chunk each section into ~1000-character pieces with a 200-character
# overlap, splitting on newlines; every chunk inherits the metadata of
# the section it came from.
splitter_config = dict(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
text_splitter = CharacterTextSplitter(**splitter_config)
documents = text_splitter.create_documents(docs, metadatas=metadatas)
|
|
|
|
# Connect to Weaviate. The OpenAI key is forwarded in a header so the
# server-side text2vec-openai vectorizer can call the embedding API;
# both env vars are required — a missing one fails fast with KeyError.
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
openai_headers = {"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}
client = weaviate.Client(url=WEAVIATE_URL, additional_headers=openai_headers)
|
|
# Rebuild the "Paragraph" class from scratch so re-running this script
# re-ingests cleanly. delete_class raises UnexpectedStatusCodeException
# when the class does not exist yet (e.g. on the very first run), so
# treat that as a no-op rather than letting the script crash.
try:
    client.schema.delete_class("Paragraph")
except weaviate.exceptions.UnexpectedStatusCodeException:
    pass  # class not present yet — nothing to delete

schema = {
    "classes": [
        {
            "class": "Paragraph",
            "description": "A written paragraph",
            # Embeddings are computed server-side with OpenAI ada-002.
            "vectorizer": "text2vec-openai",
            "moduleConfig": {
                "text2vec-openai": {
                    "model": "ada",
                    "modelVersion": "002",
                    "type": "text",
                }
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,  # the body text IS vectorized
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
                {
                    "dataType": ["text"],
                    "description": "The link",
                    "moduleConfig": {
                        "text2vec-openai": {
                            # the source label is an identifier, not
                            # content worth embedding
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "source",
                },
            ],
        },
    ]
}

client.schema.create(schema)
|
|
# Batch-upload every chunk as a "Paragraph" object; the server computes
# the embedding for "content" via the class's configured vectorizer.
with client.batch as batch:
    for doc in documents:
        payload = {
            "content": doc.page_content,
            "source": str(doc.metadata["source"]),
        }
        batch.add_data_object(payload, "Paragraph")
|
|