# DOME wrapper for docstring intent classification
This wrapper allows you to:
* split docstrings into sentences
* convert to required DOME inputs
* predict class for each sentence in docstring
## Model architecture
Architecture is based on https://github.com/ICSE-DOME/DOME.
## Usage
```python
docstring = "sentences of docstring"
dome = DOME("dome_location")
sentences, predictions = dome.predict(docstring)
```
## Dependencies
```
spacy
torch
transformers
```
## Code of the model
````python
"""
Model is based on replication package for ICSE23 Paper Developer-Intent Driven Code Comment Generation.
Initial solution: https://github.com/ICSE-DOME/DOME
Pipeline consists of several parts:
* split docstring into sentences
* prepare input data for DOMEBertForClassification
* predict class
How to use:
```python
docstring = "sentences of docstring"
dome = DOME("dome_location")
sentences, predictions = dome.predict(docstring)
```
"""
import re
from typing import Tuple, List
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, RobertaConfig, RobertaModel
# Cap on the number of tokens fed to the encoder; 510 keeps the sequence within
# RoBERTa's 512-token positional limit with room for special tokens
# (used by dome_preprocess below).
MAX_LENGTH_BERT = 510
class DOME:
    """
    End-to-end pipeline for docstring classification:
    * split sentences
    * prepare inputs
    * classify
    """

    def __init__(self, pretrained_model: str):
        """
        :param pretrained_model: location of pretrained model
        """
        self.model = DOMEBertForClassification.from_pretrained(pretrained_model)
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self.docstring2sentences = Docstring2Sentences()

    def predict(self, docstring: str) -> Tuple[List[str], List[str]]:
        """
        Predict DOME classes for each sentence in docstring.

        :param docstring: docstring to process
        :return: tuple with list of sentences and list of predictions for each sentence.
        """
        sentences = self.docstring2sentences.docstring2sentences(docstring)
        predictions = []
        for sentence in sentences:
            model_inputs = dome_preprocess(tokenizer=self.tokenizer, comment=sentence)
            predictions.append(self.model.predict(*model_inputs))
        return sentences, predictions
class DOMEBertForClassification(RobertaModel):
    """
    RoBERTa-based intent classifier for docstring sentences.

    Extends RobertaModel with a small feed-forward head that concatenates the
    pooled [CLS] representation with one extra scalar feature (a binary
    "short comment" flag) before projecting to the six DOME intent classes.
    """

    DOME_CLASS_NAMES = ["what", "why", "how-to-use", "how-it-is-done", "property", "others"]

    def __init__(self, config: RobertaConfig):
        """
        Initialize the DOMEBertForClassification model.

        :param config: The configuration information for the RobertaModel.
        """
        super().__init__(config)
        # Layer sizes are intentionally fixed (not configurable) so that the
        # pretrained DOME checkpoint loads without key/shape mismatches.
        self.fc1 = nn.Linear(768 + 1, 768 // 3)
        self.fc2 = nn.Linear(768 // 3, 6)
        self.dropout = nn.Dropout(0.2)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None, comment_len: torch.Tensor = None) \
            -> torch.Tensor:
        """
        Forward pass for the DOMEBertForClassification model.

        :param input_ids: Tensor of token ids to be fed to a model.
        :param attention_mask: Mask to avoid performing attention on padding token indices. Always equals 1.
        :param comment_len: Tensor representing the length of comments. Equal 1 if comment has less than 3 words,
            0 otherwise.
        :return: The logits after passing through the model.
        """
        # Encode with the base RoBERTa model.
        base_outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Pooled output: last hidden state of the [CLS] token.
        cls_repr = base_outputs.pooler_output
        # DOME uses the comment-length flag as one extra feature column.
        length_feature = comment_len.view(-1, 1).float()
        features = torch.cat([cls_repr, length_feature], dim=-1)
        # Classification head: dropout -> fc1 -> ReLU -> dropout -> fc2.
        hidden = F.relu(self.fc1(self.dropout(features)))
        logits = self.fc2(self.dropout(hidden))
        return logits

    def predict(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None, comment_len: torch.Tensor = None) \
            -> str:
        """
        Predict class for tokenized docstring.

        :param input_ids: Tensor of token ids to be fed to a model.
        :param attention_mask: Mask to avoid performing attention on padding token indices. Always equals 1.
        :param comment_len: Tensor representing the length of comments. Equal 1 if comment has less than 3 words,
            0 otherwise.
        :return: class
        """
        logits = self.forward(input_ids=input_ids, attention_mask=attention_mask, comment_len=comment_len)
        best_class = int(logits.argmax(dim=1))
        return self.DOME_CLASS_NAMES[best_class]
def dome_preprocess(tokenizer, comment, max_length: int = 510):
    """
    DOME preprocessor - returns all required values for "DOMEBertForClassification.forward".

    The comment is tokenized, wrapped in the tokenizer's CLS/SEP special tokens and
    truncated to fit into BERT. Truncation is applied to the comment tokens themselves
    so the trailing SEP token is always preserved (previously the hard cut
    ``[:MAX_LENGTH_BERT]`` could drop SEP for long comments while keeping CLS).

    :param tokenizer: tokenizer to use.
    :param comment: text of sentence from docstring/comment that should be classified
        by DOMEBertForClassification.
    :param max_length: maximum total number of tokens including the two special
        tokens; defaults to 510, mirroring the module-level MAX_LENGTH_BERT.
    :return: tuple with (input_ids, attention_mask, comment_len), each a tensor
        with a leading batch dimension of 1.
    """
    # Reserve two positions for the CLS and SEP special tokens.
    tokens = tokenizer.tokenize(comment)[:max_length - 2]
    input_ids = tokenizer.convert_tokens_to_ids(
        [tokenizer.cls_token] + tokens + [tokenizer.sep_token])
    # No padding is produced here, so every position is attended to.
    attention_mask = [1] * len(input_ids)
    # DOME's extra feature: flag comments shorter than 3 whitespace-separated words.
    comment_len = 1 if len(comment.strip().split()) < 3 else 0
    return (torch.tensor(input_ids).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
            torch.tensor(comment_len).unsqueeze(0))
class Docstring2Sentences:
    """Helper class to split docstrings into sentences"""

    def __init__(self):
        # SpaCy pipeline used for sentence segmentation of the plain-text parts.
        self.spacy_nlp = spacy.load("en_core_web_sm")

    @staticmethod
    def split_docstring(docstring: str, delimiters: List[Tuple[str, str]]):
        """
        Splits the docstring into separate parts of text and code blocks, preserving the original formatting.

        :param docstring: The docstring to split.
        :param delimiters: A list of tuples, each containing start and end delimiters for code blocks.
        :return: A list of strings, each either a text block or a code block.
        """
        # Build one alternation of capturing groups, one per delimiter pair,
        # with the delimiter text regex-escaped.
        group_patterns = []
        for start, end in delimiters:
            group_patterns.append('({}.*?{})'.format(re.escape(start), re.escape(end)))
        code_block_pattern = '|'.join(group_patterns)
        # re.split keeps captured code blocks in the result; unmatched groups
        # come back as None and empty strings may appear between matches,
        # so drop all falsy entries.
        pieces = re.split(code_block_pattern, docstring, flags=re.DOTALL)
        return [piece for piece in pieces if piece]

    @staticmethod
    def is_only_spaces_and_newlines(string):
        """
        Check if the given string contains only spaces and newlines.

        :param string: The string to check.
        :return: True if the string contains only spaces and newlines, False otherwise.
        """
        return re.fullmatch(r'[\s\n]+', string) is not None

    def docstring2sentences(self, docstring):
        """
        Splits a docstring into individual sentences, preserving code blocks.

        The docstring is first split into parts on the predefined code-block
        delimiters; the non-code parts are then segmented into sentences with
        the SpaCy model, while code blocks are kept intact as single elements.

        :param docstring: The docstring to be processed, which may contain both regular text and code blocks.
        :return: A list containing individual sentences and intact code blocks.
        """
        delimiters = [("@code", "@endcode"), ("\code", "\endcode")]
        parts = self.split_docstring(docstring=docstring, delimiters=delimiters)
        sentences = []
        for part in parts:
            # A code block starts with "@code"/"\code" and ends with "endcode".
            is_code_block = part[1:5] == "code" and part[-7:] == "endcode"
            if is_code_block:
                sentences.append(part)
            else:
                for spacy_sentence in self.spacy_nlp(part).sents:
                    sentences.append(spacy_sentence.text)
        return [s for s in sentences if not self.is_only_spaces_and_newlines(s)]
````