| |
| |
|
|
"""
Language Detector

This module provides functionality for detecting programming languages in a repository.
"""
|
|
| import os |
| import logging |
| from collections import Counter |
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
| |
# Maps a file extension (leading dot included) to the language name reported
# for files carrying that extension.
EXTENSION_TO_LANGUAGE = {
    # General-purpose languages
    ".py": "Python",
    ".js": "JavaScript",
    ".jsx": "JavaScript",
    ".ts": "TypeScript",
    ".tsx": "TypeScript",
    ".java": "Java",
    ".go": "Go",
    ".rs": "Rust",
    # C family ('.h' headers are attributed to C, '.hpp' to C++)
    ".cpp": "C++",
    ".cc": "C++",
    ".cxx": "C++",
    ".c": "C",
    ".h": "C",
    ".hpp": "C++",
    ".cs": "C#",
    ".php": "PHP",
    ".rb": "Ruby",
    ".swift": "Swift",
    ".kt": "Kotlin",
    ".scala": "Scala",
    ".r": "R",
    # Shell scripts
    ".sh": "Shell",
    ".bash": "Shell",
    ".zsh": "Shell",
    # Web markup and stylesheets
    ".html": "HTML",
    ".htm": "HTML",
    ".css": "CSS",
    ".scss": "SCSS",
    ".sass": "SCSS",
    ".less": "Less",
    # Documentation and data formats
    ".md": "Markdown",
    ".json": "JSON",
    ".xml": "XML",
    ".yaml": "YAML",
    ".yml": "YAML",
    # Query languages
    ".sql": "SQL",
    ".graphql": "GraphQL",
    ".gql": "GraphQL",
}
|
|
| |
# Maps an exact file name (not a path) to the language it indicates. These
# names are matched as-is, independently of the extension table.
SPECIAL_FILES_TO_LANGUAGE = {
    # Container and build tooling
    "Dockerfile": "Docker",
    "docker-compose.yml": "Docker",
    "docker-compose.yaml": "Docker",
    "Makefile": "Make",
    "CMakeLists.txt": "CMake",
    # Package / project manifests
    "package.json": "JavaScript",
    "tsconfig.json": "TypeScript",
    "requirements.txt": "Python",
    "setup.py": "Python",
    "pom.xml": "Java",
    "build.gradle": "Java",
    "Cargo.toml": "Rust",
    "go.mod": "Go",
}
|
|
|
|
class LanguageDetector:
    """
    Detects programming languages in a repository.

    A file is classified by its exact name first (SPECIAL_FILES_TO_LANGUAGE)
    and otherwise by its extension (EXTENSION_TO_LANGUAGE). Hidden directories
    and common dependency/build directories are never traversed.
    """

    # Directory names pruned from the walk. Hidden directories (leading '.')
    # are pruned separately, which already covers '.git'.
    _EXCLUDED_DIRS = frozenset({
        'node_modules', 'venv', '__pycache__', 'dist', 'build',
    })

    # Only these languages are reported by detect_languages(); everything
    # else (markup, data formats, build tooling) is counted by
    # get_language_breakdown() but filtered out of the detection result.
    # NOTE(review): "C" is not listed although '.c'/'.h' map to it -- confirm
    # whether that omission is intentional before adding it.
    _SUPPORTED_LANGUAGES = frozenset({
        "Python", "JavaScript", "TypeScript", "Java",
        "Go", "Rust", "C++", "C#", "PHP", "Ruby",
        "Swift", "Kotlin", "Scala", "R", "Shell",
    })

    def __init__(self):
        """
        Initialize the LanguageDetector.
        """
        logger.info("Initialized LanguageDetector")

    @staticmethod
    def _classify(file_name):
        """
        Return the language for a bare file name, or None if unrecognised.

        Args:
            file_name (str): The file name without any directory part.

        Returns:
            str or None: The language name, or None when neither table matches.
        """
        language = SPECIAL_FILES_TO_LANGUAGE.get(file_name)
        if language is not None:
            return language

        _, ext = os.path.splitext(file_name)
        # Exact match first; fall back to the lowercased extension so files
        # such as 'analysis.R' or 'MAIN.PY' are recognised as well.
        language = EXTENSION_TO_LANGUAGE.get(ext)
        if language is None:
            language = EXTENSION_TO_LANGUAGE.get(ext.lower())
        return language

    def _iter_classified_files(self, repo_path):
        """
        Yield (file_path, language) for every recognised file under repo_path.

        Args:
            repo_path (str): The path to the repository.

        Yields:
            tuple: The full file path and the language it was classified as.
        """
        if not os.path.isdir(repo_path):
            # os.walk() would silently yield nothing; make the cause visible.
            logger.warning("Repository path is not a directory: %s", repo_path)
            return

        for root, dirs, files in os.walk(repo_path):
            # Slice assignment mutates the list os.walk() holds, which is
            # what stops it from descending into the removed directories.
            dirs[:] = [
                d for d in dirs
                if not d.startswith('.') and d not in self._EXCLUDED_DIRS
            ]

            for file_name in files:
                language = self._classify(file_name)
                if language is not None:
                    yield os.path.join(root, file_name), language

    def detect_languages(self, repo_path):
        """
        Detect programming languages in a repository.

        Args:
            repo_path (str): The path to the repository.

        Returns:
            list: The supported languages found, sorted by number of files
                (most first). Ties are broken alphabetically so the result
                does not depend on filesystem enumeration order.
        """
        logger.info("Detecting languages in repository: %s", repo_path)

        file_counts = Counter(
            language for _, language in self._iter_classified_files(repo_path)
        )

        ranked = sorted(file_counts.items(), key=lambda item: (-item[1], item[0]))
        detected_languages = [
            language for language, _ in ranked
            if language in self._SUPPORTED_LANGUAGES
        ]

        logger.info("Detected languages: %s", detected_languages)
        return detected_languages

    def get_language_breakdown(self, repo_path):
        """
        Get a breakdown of programming languages in a repository by lines of code.

        Every recognised language is included, not only the ones reported by
        detect_languages(). All lines are counted, including blank lines and
        comments. Files that cannot be read are logged and skipped.

        Args:
            repo_path (str): The path to the repository.

        Returns:
            dict: A dictionary mapping languages to lines of code.
        """
        logger.info("Getting language breakdown for repository: %s", repo_path)

        line_totals = Counter()

        for file_path, language in self._iter_classified_files(repo_path):
            try:
                # errors='ignore' keeps binary or mis-encoded files from
                # raising while their lines are being counted.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
                    line_totals[language] += sum(1 for _ in handle)
            except OSError as exc:
                logger.warning("Error counting lines in %s: %s", file_path, exc)

        # Hand back a plain dict, as callers received before.
        language_loc = dict(line_totals)

        logger.info("Language breakdown: %s", language_loc)
        return language_loc