import os
from datetime import date, timedelta

import bs4
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import LocalFileStore
from langchain.storage._lc_store import create_kv_docstore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import config
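
# config is expected to expose the following settings (illustrative values,
# not taken from the original project; adjust to your setup):
#   GOOGLE_EMBEDDING_MODEL = "models/embedding-001"
#   NUM_DAYS_PAST = 5
#   STORAGE_PATH = "data/"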

DATA_URL = "https://www.sikafinance.com/marches/actualites_bourse_brvm"

embeddings_model = GoogleGenerativeAIEmbeddings(
    model=config.GOOGLE_EMBEDDING_MODEL
)

# Shared headless Chrome session used by scrap_articles().
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)


def scrap_articles(
    url="https://www.sikafinance.com/marches/actualites_bourse_brvm", num_days_past=5
):
    """Collect title, date and link for every article published on the
    BRVM news page over the last `num_days_past` days."""
    today = date.today()
    driver.get(url)

    all_articles = []
    for i in range(num_days_past + 1):
        past_date = today - timedelta(days=i)
        date_str = past_date.strftime("%Y-%m-%d")

        # Fill the date filter, clearing whatever the previous iteration typed.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "dateActu"))
        )
        text_box = driver.find_element(By.ID, "dateActu")
        text_box.clear()
        text_box.send_keys(date_str)

        submit_btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "btn"))
        )
        submit_btn.click()

        # Wait for the results table, then build one entry per article link.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "tabQuotes"))
        )
        dates = driver.find_elements(By.CLASS_NAME, "sp1")
        table = driver.find_element(By.ID, "tabQuotes")
        titles = table.find_elements(By.TAG_NAME, "a")
        articles = [
            {
                "title": title.text.strip(),
                "date": date_el.text,
                "link": title.get_attribute("href"),
            }
            for title, date_el in zip(titles, dates)
        ]
        all_articles += articles

    return all_articles


def set_metadata(documents, metadatas):
    """Attach the scraped article metadata to the matching LangChain
    Document objects (both lists follow the same URL order)."""
    for doc, metadata in zip(documents, metadatas):
        doc.metadata = metadata
    print("Metadata successfully changed")
    if documents:
        print(documents[0].metadata)


def process_docs(
    articles, persist_directory, embeddings_model, chunk_size=500, chunk_overlap=0
):
    """Scrape the content of every article URL and persist it in a
    Chroma vector store paired with a parent-document store."""
    article_urls = [a["link"] for a in articles]

    print("Starting to scrape ..")

    # Only parse the article body, date and header blocks of each page.
    loader = WebBaseLoader(
        web_paths=article_urls,
        bs_kwargs=dict(
            parse_only=bs4.SoupStrainer(
                class_=("inarticle txtbig", "dt_sign", "innerUp")
            )
        ),
    )

    print("After scraping, loading ..")
    docs = loader.load()

    set_metadata(documents=docs, metadatas=articles)

    # Child chunks are what gets embedded; full articles are kept as parents.
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n"]
    )

    vectorstore = Chroma(
        persist_directory=os.path.join(persist_directory, "vectorstore"),
        collection_name="full_documents",
        embedding_function=embeddings_model,
    )

    # Parent documents are persisted in a key-value docstore on disk.
    fs = LocalFileStore(os.path.join(persist_directory, "docstore"))
    store = create_kv_docstore(fs)

    retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
    )

    retriever.add_documents(docs, ids=None)
    print(len(docs), "documents added")
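

# A minimal sketch, not part of the original pipeline: how the persisted
# stores could be reopened for querying later. It assumes the same paths,
# collection name and splitter settings used in process_docs() above.
def load_retriever(persist_directory, embeddings_model, chunk_size=500):
    vectorstore = Chroma(
        persist_directory=os.path.join(persist_directory, "vectorstore"),
        collection_name="full_documents",
        embedding_function=embeddings_model,
    )
    store = create_kv_docstore(
        LocalFileStore(os.path.join(persist_directory, "docstore"))
    )
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=0, separators=["\n"]
    )
    return ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
    )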


if __name__ == "__main__":
    data = scrap_articles(DATA_URL, num_days_past=config.NUM_DAYS_PAST)
    driver.quit()  # the browser is no longer needed once scraping is done
    process_docs(data, config.STORAGE_PATH, embeddings_model)
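
    # Hypothetical follow-up (query string is illustrative): reload the
    # persisted retriever and run a search against the stored articles.
    # retriever = load_retriever(config.STORAGE_PATH, embeddings_model)
    # print(retriever.invoke("BRVM market news")[0].page_content)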