| import os |
| import shutil |
| import git |
| from urllib.parse import urlparse |
|
|
# HEAD reference of the most recently cloned repository; assigned by clone_repo().
branch = None
# Base directory for the "staging" clones: the working directory at import time.
local_dir = os.getcwd()
|
|
| |
def get_repo_name(url):
    """Return the repository name encoded in a Git remote URL.

    The name is the last component of the URL path with a trailing ``.git``
    suffix removed when one is present, e.g.
    ``https://host/user/repo.git`` -> ``"repo"``.

    Args:
        url: Remote URL of the repository, as a ``str``.

    Returns:
        The repository name; an empty string when the URL has no path
        component to take a name from.
    """
    parsed_url = urlparse(url)
    # Drop trailing slashes so ".../repo/" yields "repo" rather than "".
    repo_name = os.path.basename(parsed_url.path.rstrip("/"))
    # Strip the suffix only when it is really there: slicing off four
    # characters unconditionally mangles URLs that omit ".git".
    if repo_name.endswith(".git"):
        repo_name = repo_name[: -len(".git")]
    return repo_name
|
|
| |
def clone_repo(url):
    """Clone the repository at ``url`` into ``<local_dir>/staging/<repo name>``.

    After a successful clone the HEAD reference of the new checkout is stored
    in the module-level ``branch`` variable.

    Args:
        url: Remote URL of the repository to clone.

    Returns:
        True when the clone completed. False when the target directory already
        exists, or when any exception was raised (the error is printed, not
        propagated).
    """
    global branch
    try:
        name = get_repo_name(url)
        target = os.path.join(local_dir, "staging", name)

        if not os.path.exists(target):
            cloned = git.Repo.clone_from(url, target)
            branch = cloned.head.reference
            print(f"{name} cloned succesfully")
            return True

        # An existing staging directory is reported as already handled.
        print(f"{name} already added in db")
        return False
    except Exception as err:
        print(f"Error cloning the git repository: {err}")
        return False
|
|
def delete_cloned_repo(url):
    """Remove the staged clone of ``url`` from ``<local_dir>/staging``.

    Deletion is best-effort: individual failures inside the tree do not stop
    the removal of the remaining files. The outcome is printed; nothing is
    returned and no exception raised inside the ``try`` block is propagated.

    Args:
        url: Remote URL whose staged clone should be removed.
    """
    local_path = os.path.join(local_dir, "staging", get_repo_name(url))
    try:
        if not os.path.exists(local_path):
            print(f"Repository at {local_path} does not exist.")
            return

        shutil.rmtree(local_path, ignore_errors=True)

        # ignore_errors=True hides every failure, so check the result before
        # claiming success (e.g. files that could not be unlinked).
        if os.path.exists(local_path):
            print(
                f"Error deleting repository: {local_path} could not be fully removed"
            )
        else:
            print(f"Repository at {local_path} successfully deleted.")
    except Exception as e:
        print(f"Error deleting repository: {e}")
|
|
| from langchain_community.document_loaders import GitLoader |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from langchain_community.vectorstores import Qdrant |
| import qdrant_client |
|
|
# Shared splitter used by load_repo() to break loaded documents into chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
|
|
| |
| |
|
|
# Qdrant connection, configured from the QDRANT_HOST and QDRANT_API_KEY
# environment variables (each is None when the variable is unset).
client = qdrant_client.QdrantClient(
    os.environ.get("QDRANT_HOST"),
    api_key=os.environ.get("QDRANT_API_KEY"),
)
|
|
| from langchain_community.embeddings.fastembed import FastEmbedEmbeddings |
# Module-level placeholder; nothing in this part of the file assigns it.
vectorstore = None
# Embedding model shared by every collection. load_repo() creates its
# collections with size=384 vectors to go with it - keep the two in sync.
embeddings = FastEmbedEmbeddings(model_name="BAAI/bge-small-en-v1.5")
|
|
def load_repo(url):
    """Index the staged clone of ``url`` into a Qdrant collection.

    The collection is named after the repository and is recreated on every
    call with 384-dimensional cosine-distance vectors. Files are then loaded
    from ``<local_dir>/staging/<repo name>`` on the module-level ``branch``
    (skipping paths ending in ``package-lock.json``), split into chunks and
    added to the collection.

    The vector store built here is local to this call; the module-level
    ``vectorstore`` variable is not updated.

    Args:
        url: Remote URL of a repository that has already been cloned.

    Returns:
        True when the documents were indexed. False when loading, splitting or
        indexing raised (the error is printed). Exceptions raised while
        setting up the collection are not caught here.
    """
    qmodels = qdrant_client.http.models
    vector_params = qmodels.VectorParams(size=384, distance=qmodels.Distance.COSINE)
    repo_name = get_repo_name(url)

    client.recreate_collection(
        collection_name=repo_name,
        vectors_config=vector_params,
    )
    store = Qdrant(client=client, collection_name=repo_name, embeddings=embeddings)
    print("collection created")

    def _wanted(file_path):
        # Everything except npm lock files is indexed.
        return not file_path.endswith("package-lock.json")

    try:
        documents = GitLoader(
            repo_path=os.path.join(local_dir, "staging", repo_name),
            branch=branch,
            file_filter=_wanted,
        ).load()
        chunks = text_splitter.split_documents(documents)
        print("chunks created")
        store.add_documents(chunks)
    except Exception as err:
        print(f"Error loading and indexing repository: {err}")
        return False
    return True
| |
def repository_loader(url):
    """Clone the repository at ``url``, index it, then remove the clone.

    Nothing is done when cloning fails or the staging directory already
    exists. Once a clone has been created it is always removed afterwards,
    whether indexing succeeded, returned False or raised: a leftover staging
    directory would make every later attempt stop at the "already exists"
    check in clone_repo().

    Args:
        url: Remote URL of the repository to process.
    """
    if not clone_repo(url):
        return
    try:
        load_repo(url)
    finally:
        # Clean up on every outcome so a failed run can be retried.
        delete_cloned_repo(url)
|
|
|
|
|
|
# Printed once, when this module is executed/imported.
print("HELLO FROM CONTAINER")
| |
|
|
| |