| import logging |
| import os |
| from dotenv import load_dotenv |
| from langchain_text_splitters import ( |
| Language, |
| RecursiveCharacterTextSplitter, |
| ) |
|
|
| from .utils.logger_utils import setup_logger |
| load_dotenv() |
|
|
|
|
# Name of the logger that CodeParser configures and writes to.
LOGGER_NAME = 'CODE_PARSER_LOGGER'

# Chunking parameters, read once at import time from the environment.
# CODE_CHUNK_SIZE defaults to 2000 and CODE_CHUNK_OVERLAP to 0 when unset.
CODE_CHUNK_SIZE = int(os.environ.get('CODE_CHUNK_SIZE', '2000'))
CODE_CHUNK_OVERLAP = int(os.environ.get('CODE_CHUNK_OVERLAP', '0'))
|
|
|
|
class CodeParser:
    """Split file contents into text chunks, language-aware where possible.

    Files whose extension appears in ``extension_mapping`` are split with a
    ``RecursiveCharacterTextSplitter`` built via ``from_language``; all other
    files are split with a plain ``RecursiveCharacterTextSplitter``. Chunk
    size and overlap come from the module-level ``CODE_CHUNK_SIZE`` and
    ``CODE_CHUNK_OVERLAP`` constants.
    """

    def __init__(self):
        """Set up the logger and the extension-to-language lookup table."""
        setup_logger(LOGGER_NAME)
        self.logger = logging.getLogger(LOGGER_NAME)

        # Keys are lower-case file extensions without the leading dot.
        self.extension_mapping = {
            'c': Language.C,
            'h': Language.C,
            'cpp': Language.CPP,
            'cc': Language.CPP,
            'cxx': Language.CPP,
            'hpp': Language.CPP,
            'hh': Language.CPP,
            'hxx': Language.CPP,
            'go': Language.GO,
            'java': Language.JAVA,
            'py': Language.PYTHON,
            'pyw': Language.PYTHON,
            'js': Language.JS,
            'mjs': Language.JS,
            'cjs': Language.JS,
            'md': Language.MARKDOWN,
            'markdown': Language.MARKDOWN,
            'html': Language.HTML,
        }

    def parse(self, file_name: str, file_content: str) -> list:
        """Split ``file_content`` into chunks and return them as strings.

        Args:
            file_name: Name (or path) of the file; only the text after the
                last ``.`` is used, to pick the splitter.
            file_content: Full text of the file to split.

        Returns:
            A list with the ``page_content`` of every produced chunk. An
            empty list is returned when splitting raises; the error is
            logged rather than propagated.
        """
        # A name without any dot has no extension; without this check the
        # whole name would be looked up (a file called "go" is not Go code).
        _, dot, suffix = file_name.rpartition('.')
        # Lower-case so that e.g. "Main.PY" or "README.MD" still match.
        file_extension = suffix.lower() if dot else ''

        try:
            self.logger.debug('Parsing file: %s', file_name)
            language = self.extension_mapping.get(file_extension)
            if language is None:
                self.logger.debug(
                    'File extension not supported: %s', file_extension
                )
                splitter = RecursiveCharacterTextSplitter(
                    chunk_size=CODE_CHUNK_SIZE,
                    chunk_overlap=CODE_CHUNK_OVERLAP,
                    length_function=len,
                    is_separator_regex=False,
                )
            else:
                self.logger.debug(
                    'File extension supported: %s', file_extension
                )
                splitter = RecursiveCharacterTextSplitter.from_language(
                    language=language,
                    chunk_size=CODE_CHUNK_SIZE,
                    chunk_overlap=CODE_CHUNK_OVERLAP,
                )
            docs = splitter.create_documents([file_content])
        except Exception as e:
            # Best-effort: report the failure (with traceback) and hand back
            # an empty result instead of failing on an unassigned ``docs``.
            self.logger.exception('Error when parsing code: %s', e)
            return []

        return [doc.page_content for doc in docs]
|
|
|
|
|
|
|
|
|
|
|
|