# DOME wrapper for docstring intent classification
This wrapper allows you to:
* split docstrings into sentences
* convert to required DOME inputs
* predict class for each sentence in docstring
## Model architecture
Architecture is based on https://github.com/ICSE-DOME/DOME.
## Usage
```python
docstring = "sentences of docstring"
dome = DOME("dome_location")
sentences, predictions = dome.predict(docstring)
```
## Dependencies
```
spacy
torch
transformers
```
## Code of the model
````python
"""
Model is based on replication package for ICSE23 Paper Developer-Intent Driven Code Comment Generation.
Initial solution: https://github.com/ICSE-DOME/DOME
Pipeline consists of several parts:
* split docstring into sentences
* prepare input data for DOMEBertForClassification
* predict class
How to use:
```python
docstring = "sentences of docstring"
dome = DOME("dome_location")
sentences, predictions = dome.predict(docstring)
```
"""
import re
from typing import Tuple, List
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, RobertaConfig, RobertaModel
# Cap on the number of tokens fed to the encoder; 510 keeps the sequence within
# RoBERTa's 512-token positional limit with room for special tokens
# (used by dome_preprocess below).
MAX_LENGTH_BERT = 510
class DOME:
    """
    End-to-end pipeline for docstring classification:
    * split sentences
    * prepare inputs
    * classify
    """

    def __init__(self, pretrained_model: str):
        """
        :param pretrained_model: location of pretrained model
        """
        self.model = DOMEBertForClassification.from_pretrained(pretrained_model)
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self.docstring2sentences = Docstring2Sentences()

    def predict(self, docstring: str) -> Tuple[List[str], List[str]]:
        """
        Predict DOME classes for each sentence in docstring.

        :param docstring: docstring to process
        :return: tuple with list of sentences and list of predictions for each sentence.
        """
        sentences = self.docstring2sentences.docstring2sentences(docstring)
        predictions = []
        for sentence in sentences:
            model_inputs = dome_preprocess(tokenizer=self.tokenizer, comment=sentence)
            predictions.append(self.model.predict(*model_inputs))
        return sentences, predictions
class DOMEBertForClassification(RobertaModel):
    """
    RoBERTa-based intent classifier for docstring sentences.

    Extends RobertaModel with a small feed-forward head that concatenates the
    pooled [CLS] representation with one extra scalar feature (a binary
    "short comment" flag) before projecting to the six DOME intent classes.
    """

    DOME_CLASS_NAMES = ["what", "why", "how-to-use", "how-it-is-done", "property", "others"]

    def __init__(self, config: RobertaConfig):
        """
        Initialize the DOMEBertForClassification model.

        :param config: The configuration information for the RobertaModel.
        """
        super().__init__(config)
        # Layer sizes are intentionally fixed (not configurable) so that the
        # pretrained DOME checkpoint loads without key/shape mismatches.
        self.fc1 = nn.Linear(768 + 1, 768 // 3)
        self.fc2 = nn.Linear(768 // 3, 6)
        self.dropout = nn.Dropout(0.2)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None, comment_len: torch.Tensor = None) \
            -> torch.Tensor:
        """
        Forward pass for the DOMEBertForClassification model.

        :param input_ids: Tensor of token ids to be fed to a model.
        :param attention_mask: Mask to avoid performing attention on padding token indices. Always equals 1.
        :param comment_len: Tensor representing the length of comments. Equal 1 if comment has less than 3 words,
            0 otherwise.
        :return: The logits after passing through the model.
        """
        # Encode with the base RoBERTa model.
        base_outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Pooled output: last hidden state of the [CLS] token.
        cls_repr = base_outputs.pooler_output
        # DOME uses the comment-length flag as one extra feature column.
        length_feature = comment_len.view(-1, 1).float()
        features = torch.cat([cls_repr, length_feature], dim=-1)
        # Classification head: dropout -> fc1 -> ReLU -> dropout -> fc2.
        hidden = F.relu(self.fc1(self.dropout(features)))
        logits = self.fc2(self.dropout(hidden))
        return logits

    def predict(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None, comment_len: torch.Tensor = None) \
            -> str:
        """
        Predict class for tokenized docstring.

        :param input_ids: Tensor of token ids to be fed to a model.
        :param attention_mask: Mask to avoid performing attention on padding token indices. Always equals 1.
        :param comment_len: Tensor representing the length of comments. Equal 1 if comment has less than 3 words,
            0 otherwise.
        :return: class
        """
        logits = self.forward(input_ids=input_ids, attention_mask=attention_mask, comment_len=comment_len)
        best_class = int(logits.argmax(dim=1))
        return self.DOME_CLASS_NAMES[best_class]
def dome_preprocess(tokenizer, comment, max_length: int = 510):
    """
    DOME preprocessor - returns all required values for "DOMEBertForClassification.forward".

    The comment is tokenized, wrapped in the tokenizer's CLS/SEP special tokens and
    truncated to fit into BERT. Truncation is applied to the comment tokens themselves
    so the trailing SEP token is always preserved (previously the hard cut
    ``[:MAX_LENGTH_BERT]`` could drop SEP for long comments while keeping CLS).

    :param tokenizer: tokenizer to use.
    :param comment: text of sentence from docstring/comment that should be classified
        by DOMEBertForClassification.
    :param max_length: maximum total number of tokens including the two special
        tokens; defaults to 510, mirroring the module-level MAX_LENGTH_BERT.
    :return: tuple with (input_ids, attention_mask, comment_len), each a tensor
        with a leading batch dimension of 1.
    """
    # Reserve two positions for the CLS and SEP special tokens.
    tokens = tokenizer.tokenize(comment)[:max_length - 2]
    input_ids = tokenizer.convert_tokens_to_ids(
        [tokenizer.cls_token] + tokens + [tokenizer.sep_token])
    # No padding is produced here, so every position is attended to.
    attention_mask = [1] * len(input_ids)
    # DOME's extra feature: flag comments shorter than 3 whitespace-separated words.
    comment_len = 1 if len(comment.strip().split()) < 3 else 0
    return (torch.tensor(input_ids).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
            torch.tensor(comment_len).unsqueeze(0))
class Docstring2Sentences:
    """Helper class to split docstrings into sentences"""

    def __init__(self):
        # SpaCy pipeline used for sentence segmentation of the plain-text parts.
        self.spacy_nlp = spacy.load("en_core_web_sm")

    @staticmethod
    def split_docstring(docstring: str, delimiters: List[Tuple[str, str]]):
        """
        Splits the docstring into separate parts of text and code blocks, preserving the original formatting.

        :param docstring: The docstring to split.
        :param delimiters: A list of tuples, each containing start and end delimiters for code blocks.
        :return: A list of strings, each either a text block or a code block.
        """
        # Build one alternation of capturing groups, one per delimiter pair,
        # with the delimiter text regex-escaped.
        group_patterns = []
        for start, end in delimiters:
            group_patterns.append('({}.*?{})'.format(re.escape(start), re.escape(end)))
        code_block_pattern = '|'.join(group_patterns)
        # re.split keeps captured code blocks in the result; unmatched groups
        # come back as None and empty strings may appear between matches,
        # so drop all falsy entries.
        pieces = re.split(code_block_pattern, docstring, flags=re.DOTALL)
        return [piece for piece in pieces if piece]

    @staticmethod
    def is_only_spaces_and_newlines(string):
        """
        Check if the given string contains only spaces and newlines.

        :param string: The string to check.
        :return: True if the string contains only spaces and newlines, False otherwise.
        """
        return re.fullmatch(r'[\s\n]+', string) is not None

    def docstring2sentences(self, docstring):
        """
        Splits a docstring into individual sentences, preserving code blocks.

        The docstring is first split into parts on the predefined code-block
        delimiters; the non-code parts are then segmented into sentences with
        the SpaCy model, while code blocks are kept intact as single elements.

        :param docstring: The docstring to be processed, which may contain both regular text and code blocks.
        :return: A list containing individual sentences and intact code blocks.
        """
        delimiters = [("@code", "@endcode"), ("\code", "\endcode")]
        parts = self.split_docstring(docstring=docstring, delimiters=delimiters)
        sentences = []
        for part in parts:
            # A code block starts with "@code"/"\code" and ends with "endcode".
            is_code_block = part[1:5] == "code" and part[-7:] == "endcode"
            if is_code_block:
                sentences.append(part)
            else:
                for spacy_sentence in self.spacy_nlp(part).sents:
                    sentences.append(spacy_sentence.text)
        return [s for s in sentences if not self.is_only_spaces_and_newlines(s)]
````