Pranav Pc committed on
Commit
4b82ab5
·
1 Parent(s): 2075aa2

Final Deploy

Browse files
Dockerfile CHANGED
@@ -1,20 +1,13 @@
1
- FROM python:3.13.5-slim
2
 
3
  WORKDIR /app
4
 
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- git \
9
- && rm -rf /var/lib/apt/lists/*
10
 
11
- COPY requirements.txt ./
12
- COPY src/ ./src/
13
 
14
- RUN pip3 install -r requirements.txt
15
 
16
- EXPOSE 8501
17
 
18
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
-
20
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
1
# Runtime image for the Streamlit vulnerability-detection app.
FROM python:3.10

WORKDIR /app

# Install Python dependencies first so this layer stays cached until
# requirements.txt itself changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source last to keep rebuilds fast.
COPY . .

# The app is served on port 7860.
EXPOSE 7860

CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
 
 
README.md CHANGED
@@ -1,19 +0,0 @@
1
- ---
2
- title: Code Vulnerability Detection
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: AI-powered code vulnerability detection.
12
- ---
13
-
14
- # Welcome to Streamlit!
15
-
16
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
17
-
18
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
19
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Streamlit UI for Vulnerability Detection
3
+ Interactive web interface
4
+ """
5
+
6
+ import streamlit as st
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ sys.path.append(str(Path(__file__).parent))
11
+
12
+ from src.inference import VulnerabilityDetector
13
+
14
+ # Page config
15
+ st.set_page_config(
16
+ page_title="Code Vulnerability Detector",
17
+ page_icon="🔒",
18
+ layout="wide"
19
+ )
20
+
21
# Cached so the (large) model is loaded only once per server process.
@st.cache_resource
def load_detector():
    """Create and cache a single VulnerabilityDetector instance."""
    return VulnerabilityDetector()
25
+
26
# Main app
def main():
    """Render the two-column analysis UI and score code on demand."""
    st.title("🔒 AI-Powered Code Vulnerability Detection")
    st.markdown("### Detect security vulnerabilities in your code using fine-tuned CodeT5")

    # --- Sidebar: static info + model bootstrap -------------------------
    with st.sidebar:
        st.header("ℹ️ About")
        st.markdown("""
This tool uses a fine-tuned CodeT5 model to detect security vulnerabilities in source code.

**Supported Languages:**
- C/C++
- Python
- JavaScript

**Detection Types:**
- Buffer Overflow
- SQL Injection
- Command Injection
- Format String Bugs
- And more...
""")

        st.header("📊 Model Info")
        try:
            detector = load_detector()
            st.success("Model loaded successfully!")
        except Exception as e:
            st.error(f"Error loading model: {e}")
            st.stop()

    # --- Main area: input on the left, results on the right -------------
    left_col, right_col = st.columns([1, 1])

    with left_col:
        st.header("📝 Enter Code")

        # Canned snippets for quick demos.
        example = st.selectbox(
            "Or try an example:",
            ["Custom", "Buffer Overflow", "SQL Injection", "Safe Code"]
        )

        snippets = {
            "Buffer Overflow": '''void copy(char *input) {
char buffer[8];
strcpy(buffer, input);
}''',
            "SQL Injection": '''def get_user(user_id):
query = "SELECT * FROM users WHERE id=" + user_id
cursor.execute(query)
return cursor.fetchone()''',
            "Safe Code": '''def add_numbers(a, b):
return a + b''',
        }
        default_code = snippets.get(example, "")

        code_input = st.text_area(
            "Paste your code here:",
            value=default_code,
            height=300,
            placeholder="Enter source code to analyze..."
        )

        analyze_button = st.button("🔍 Analyze Code", type="primary", use_container_width=True)

    with right_col:
        st.header("📊 Analysis Results")

        if analyze_button and code_input.strip():
            with st.spinner("Analyzing code..."):
                try:
                    result = detector.predict(code_input)

                    # Headline verdict plus a probability bar.
                    if result['prediction'] == 1:
                        st.error(f"⚠️ {result['label']}")
                        st.progress(result['probabilities']['vulnerable'])
                    else:
                        st.success(f"✅ {result['label']}")
                        st.progress(result['probabilities']['safe'])

                    st.subheader("Confidence Breakdown")
                    col_a, col_b = st.columns(2)

                    with col_a:
                        st.metric(
                            "Safe Probability",
                            f"{result['probabilities']['safe']:.1%}",
                            delta=None
                        )

                    with col_b:
                        st.metric(
                            "Vulnerable Probability",
                            f"{result['probabilities']['vulnerable']:.1%}",
                            delta=None
                        )

                    # Advice tailored to the verdict.
                    if result['prediction'] == 1:
                        st.subheader("🛡️ Recommendations")
                        st.warning("""
**This code appears to have security vulnerabilities.**

Common fixes:
- Use bounds-checked functions (strncpy instead of strcpy)
- Use parameterized queries for SQL
- Validate and sanitize all user inputs
- Avoid eval() and system() with user input
""")
                    else:
                        st.subheader("Good Practices")
                        st.info("""
This code appears to follow security best practices!

Remember to:
- Keep dependencies updated
- Perform regular security audits
- Use static analysis tools
- Follow OWASP guidelines
""")

                except Exception as e:
                    st.error(f"Error during analysis: {e}")

        elif analyze_button:
            st.warning("Please enter some code to analyze.")

if __name__ == "__main__":
    main()
models/best_model_clean.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a1b98dd49c9eddf98d8e95f612f6467c10a9f98a2a4b76b0770c84ea88a674c
3
+ size 894029464
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
1
+ streamlit==1.28.2
2
+ torch==2.10.0
3
+ transformers==4.57.1
4
+ sentencepiece
5
+ numpy==1.26.2
runtime.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python-3.10
save_model.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""One-off utility: strip optimizer state from the training checkpoint,
leaving a weights-only file suitable for deployment."""
import torch

from src.model import VulnerabilityCodeT5

# Load the full training checkpoint (weights + optimizer state) on CPU.
ckpt = torch.load("models/best_model.pt", map_location="cpu")

# Rebuild the network and restore only its weights.
net = VulnerabilityCodeT5(num_labels=2)
net.load_state_dict(ckpt['model_state_dict'])

# Persist the bare state dict — much smaller than the full checkpoint.
torch.save(net.state_dict(), "models/best_model_clean.pt")

print("Saved clean model.")
src/__pycache__/inference.cpython-312.pyc ADDED
Binary file (6 kB). View file
 
src/__pycache__/model.cpython-312.pyc ADDED
Binary file (3.66 kB). View file
 
src/data.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import RobertaTokenizer
2
+ from torch.utils.data import Dataset, DataLoader
3
+ import torch
4
+ import json
5
+ from pathlib import Path
6
+
7
+
8
class VulnerabilityDataset(Dataset):
    """PyTorch dataset for vulnerability detection.

    Reads a JSONL file where each record carries the source code under
    "func" and a 0/1 label under "target".
    """

    def __init__(self, data_path, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        data_path = Path(data_path)
        if not data_path.exists():
            raise FileNotFoundError(f"Dataset file not found: {data_path}")

        # One JSON object per non-empty line.
        self.data = []
        with open(data_path, "r", encoding="utf-8") as f:
            for raw in f:
                raw = raw.strip()
                if raw:
                    self.data.append(json.loads(raw))

        print(f"{data_path.name}: {len(self.data)} samples")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]

        code = record["func"]      # source-code text
        label = record["target"]   # 0 = safe, 1 = vulnerable

        encoded = self.tokenizer(
            code,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Squeeze away the batch dimension added by return_tensors="pt".
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }
51
+
52
+
53
+ def load_tokenizer(model_name="Salesforce/codet5-base"):
54
+ print(f"Tokenizer: {model_name}")
55
+ return RobertaTokenizer.from_pretrained(model_name)
56
+
57
+
58
def create_dataloader(
    train_path,
    valid_path,
    test_path,
    tokenizer,
    batch_size=8,
    max_length=512,
    num_workers=2,
):
    """Build the train/valid/test DataLoaders.

    Args:
        train_path, valid_path, test_path: JSONL dataset files.
        tokenizer: tokenizer passed through to VulnerabilityDataset.
        batch_size: samples per batch for all three splits.
        max_length: tokenizer truncation/padding length.
        num_workers: DataLoader worker processes (0 = load in-process).

    Returns:
        (train_loader, valid_loader, test_loader)

    Raises:
        RuntimeError: if the training split is empty.
    """
    train_dataset = VulnerabilityDataset(train_path, tokenizer, max_length)
    valid_dataset = VulnerabilityDataset(valid_path, tokenizer, max_length)
    test_dataset = VulnerabilityDataset(test_path, tokenizer, max_length)

    if len(train_dataset) == 0:
        raise RuntimeError(f"No samples found in {train_path}")

    # persistent_workers is only legal when worker processes exist:
    # DataLoader raises ValueError if it is True while num_workers == 0.
    keep_workers = num_workers > 0

    def _loader(dataset, shuffle):
        # Shared DataLoader settings for all three splits.
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
            pin_memory=True,
            persistent_workers=keep_workers,
        )

    train_loader = _loader(train_dataset, shuffle=True)
    valid_loader = _loader(valid_dataset, shuffle=False)
    test_loader = _loader(test_dataset, shuffle=False)

    return train_loader, valid_loader, test_loader
src/inference.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Inference module for vulnerability detection
2
+ Load trained models and make predictions"""
3
+
4
+ import torch
5
+ from transformers import RobertaTokenizer
6
+ from pathlib import Path
7
+ import sys
8
+ sys.path.append(str(Path(__file__).parent.parent.parent))
9
+
10
+ from src.model import VulnerabilityCodeT5
11
+
12
class VulnerabilityDetector:
    """Loads the fine-tuned CodeT5 classifier and scores code snippets.

    Args:
        model_path: path to the saved weights. Defaults to the
            weights-only file this repo actually ships
            (models/best_model_clean.pt); the old default
            models/best_model.pt is not committed and crashed on deploy.
        model_name: Hugging Face model id for tokenizer/backbone.
        max_length: tokenizer truncation/padding length.
    """

    def __init__(self, model_path="models/best_model_clean.pt",
                 model_name="Salesforce/codet5-base", max_length=256):

        # Deployment target is CPU-only hardware.
        self.device = torch.device('cpu')
        self.max_length = max_length

        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)

        self.model = VulnerabilityCodeT5(model_name=model_name, num_labels=2)

        state_dict = torch.load(model_path, map_location=self.device)
        # Accept both a bare state dict (best_model_clean.pt, written by
        # save_model.py) and a full training checkpoint that wraps the
        # weights under "model_state_dict" (best_model.pt from train.py).
        if isinstance(state_dict, dict) and "model_state_dict" in state_dict:
            state_dict = state_dict["model_state_dict"]
        self.model.load_state_dict(state_dict)
        self.model.to(self.device)
        self.model.eval()

        print("Model Loaded Successfully")

        self.labels = {
            0: "Safe Code",
            1: "Vulnerable Code"
        }

    def predict(self, code_snippet):
        """Predict vulnerability of a code snippet.

        Args:
            code_snippet: string containing source code.

        Returns:
            dict with 'prediction' (0/1), 'label', 'confidence' and a
            'probabilities' dict with 'safe'/'vulnerable' entries.
        """
        inputs = self.tokenizer(
            code_snippet,
            # Was hard-coded to 256, silently ignoring the constructor's
            # max_length argument.
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = inputs['input_ids'].to(self.device)
        attention_mask = inputs['attention_mask'].to(self.device)

        with torch.no_grad():
            predictions, probs = self.model.predict(input_ids, attention_mask)

        pred_label = predictions[0].item()
        confidence = probs[0][pred_label].item()

        return {
            'prediction': pred_label,
            'label': self.labels[pred_label],
            'confidence': confidence,
            'probabilities': {
                'safe': probs[0][0].item(),
                'vulnerable': probs[0][1].item()
            }
        }

    def analyze_batch(self, code_snippets):
        """Analyze multiple code snippets at once."""
        return [self.predict(code) for code in code_snippets]
78
+
79
def test_inference():
    """Smoke-test the detector on known-safe and known-vulnerable snippets."""
    det = VulnerabilityDetector()

    # Three snippets that should score safe, three that should not.
    cases = [
        {
            "name": "Safe Bounded Copy",
            "code": """void copy_input(const char *input) {
char buffer[32];
strncpy(buffer, input, sizeof(buffer) - 1);
buffer[sizeof(buffer) - 1] = '\\0';
}"""
        },
        {
            "name": "Safe fgets Input",
            "code": """void read_input() {
char buffer[64];
if (fgets(buffer, sizeof(buffer), stdin) != NULL) {
printf("%s", buffer);
}
}"""
        },
        {
            "name": "Safe malloc usage",
            "code": """void allocate() {
char *buf = (char *)malloc(128);
if (buf == NULL) {
return;
}
strcpy(buf, "safe");
free(buf);
}"""
        },
        {
            "name": "Stack Buffer Overflow",
            "code": """void copy_input(char *input) {
char buffer[8];
strcpy(buffer, input);
}"""
        },
        {
            "name": "Integer Overflow",
            "code": """void allocate(int size) {
char *buf = (char *)malloc(size * sizeof(char));
if (buf == NULL) return;
memset(buf, 'A', size + 10);
}"""
        },
        {
            "name": "Use After Free",
            "code": """void uaf() {
char *buf = (char *)malloc(16);
free(buf);
strcpy(buf, "UAF");
}"""
        }
    ]

    banner = "=" * 60
    print("\n" + banner)
    print("Testing Vulnerability Detection")
    print(banner)

    for case in cases:
        print(f"\nTest: {case['name']}")
        print(f"Code: {case['code'][:60]}...")

        result = det.predict(case['code'])

        print(f"Prediction: {result['label']}")
        print(f"Confidence: {result['confidence']:.2%}")
        print(f" - Safe: {result['probabilities']['safe']:.2%}")
        print(f" - Vulnerable: {result['probabilities']['vulnerable']:.2%}")

if __name__ == "__main__":
    test_inference()
src/model.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CodeT5 Vulnerability Detection model
2
+ Binary Classication Safe(0) vs Vulnerable(1)"""
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from transformers import T5ForConditionalGeneration, RobertaTokenizer
7
+
8
class VulnerabilityCodeT5(nn.Module):
    """CodeT5 encoder plus an MLP head for binary vulnerability classification."""

    def __init__(self, model_name="Salesforce/codet5-base", num_labels=2):
        super().__init__()

        # Full seq2seq model; only its encoder is used in forward().
        self.encoder_decoder = T5ForConditionalGeneration.from_pretrained(model_name)

        width = self.encoder_decoder.config.d_model  # 768 for the base model

        # Two-layer classification head on top of the pooled encoder output.
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(width, num_labels)
        )

        self.num_labels = num_labels

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and classification head.

        Args:
            input_ids: tokenized code [batch_size, seq_len]
            attention_mask: attention mask [batch_size, seq_len]
            labels: optional ground-truth labels [batch_size]

        Returns:
            dict with 'loss' (None when labels is None), 'logits'
            [batch, num_labels] and 'hidden_states'.
        """
        enc_out = self.encoder_decoder.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )

        # Pool by taking the first token's hidden state.
        token_states = enc_out.last_hidden_state   # [batch, seq_len, hidden]
        cls_vec = token_states[:, 0, :]            # [batch, hidden]

        logits = self.classifier(cls_vec)          # [batch, num_labels]

        loss = None
        if labels is not None:
            criterion = nn.CrossEntropyLoss()
            loss = criterion(logits, labels)

        return {
            'loss': loss,
            'logits': logits,
            'hidden_states': token_states
        }

    def predict(self, input_ids, attention_mask):
        """Return (argmax predictions, softmax probabilities), gradient-free."""
        self.eval()
        with torch.no_grad():
            scores = self.forward(input_ids, attention_mask)
            probs = torch.softmax(scores["logits"], dim=1)
            predictions = torch.argmax(probs, dim=1)

        return predictions, probs
74
+
75
def count_parameters(model):
    """Return the number of trainable (requires_grad) parameters in *model*."""
    total = 0
    for p in model.parameters():
        if p.requires_grad:
            total += p.numel()
    return total
src/train.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import gc
import json
import sys
from pathlib import Path

import torch
import torch.nn as nn
from torch.amp import autocast, GradScaler
from torch.optim import AdamW
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup

# Make the repository root importable when running `python src/train.py`.
sys.path.append(str(Path(__file__).parent.parent))

# The modules committed with this repo live at src/data.py and
# src/model.py and define exactly these names; the previous
# `src.v2.data_processor` / `src.v2.model` paths do not exist here and
# failed at import time.
from src.data import load_tokenizer, create_dataloader
from src.model import VulnerabilityCodeT5, count_parameters
17
+
18
+
19
class Trainer:
    """Training loop with AMP mixed precision and gradient accumulation."""

    def __init__(
        self,
        model,
        train_loader,
        valid_loader,
        device,
        learning_rate=2e-5,
        num_epochs=5,
        gradient_accumulation_steps=4,
    ):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.device = device
        self.num_epochs = num_epochs
        self.gradient_accumulation_steps = gradient_accumulation_steps

        # Mixed precision only makes sense on CUDA.
        self.use_amp = device.type == "cuda"
        self.scaler = GradScaler(enabled=self.use_amp)

        self.optimizer = AdamW(
            self.model.parameters(), lr=learning_rate, weight_decay=0.01
        )

        # One optimizer step per `gradient_accumulation_steps` batches.
        total_steps = (
            len(self.train_loader) * num_epochs
        ) // gradient_accumulation_steps

        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=max(1, total_steps // 10),
            num_training_steps=total_steps,
        )

        self.best_val_acc = 0.0
        self.history = {
            "train_loss": [],
            "train_acc": [],
            "val_loss": [],
            "val_acc": [],
        }

    def clear_memory(self):
        """Release cached GPU memory and force a garbage-collection pass."""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

    def train_epoch(self):
        """One pass over the training set; returns (mean loss, accuracy %)."""
        self.model.train()
        running_loss = 0.0
        hits = 0
        seen = 0

        self.optimizer.zero_grad(set_to_none=True)

        progress = tqdm(self.train_loader, desc="Training")

        for batch_idx, batch in enumerate(progress):
            input_ids = batch["input_ids"].to(self.device, non_blocking=True)
            attention_mask = batch["attention_mask"].to(self.device, non_blocking=True)
            labels = batch["labels"].to(self.device, non_blocking=True)

            with autocast(device_type="cuda", enabled=self.use_amp):
                outputs = self.model(input_ids, attention_mask, labels)
                # Scale down so accumulated gradients average correctly.
                loss = outputs["loss"] / self.gradient_accumulation_steps

            self.scaler.scale(loss).backward()

            if (batch_idx + 1) % self.gradient_accumulation_steps == 0:
                # Unscale before clipping so the norm is in true units.
                self.scaler.unscale_(self.optimizer)
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

                self.scaler.step(self.optimizer)
                self.scaler.update()
                self.scheduler.step()
                self.optimizer.zero_grad(set_to_none=True)

            with torch.no_grad():
                preds = torch.argmax(outputs["logits"], dim=1)
                hits += (preds == labels).sum().item()
                seen += labels.size(0)

            running_loss += loss.item() * self.gradient_accumulation_steps

            gpu_gb = (
                torch.cuda.memory_allocated() / 1024 ** 3
                if torch.cuda.is_available()
                else 0
            )

            progress.set_postfix(
                {
                    "loss": f"{loss.item() * self.gradient_accumulation_steps:.4f}",
                    "acc": f"{100 * hits / max(1, seen):.2f}%",
                    "gpu": f"{gpu_gb:.2f}GB",
                }
            )

            # Drop per-batch tensors eagerly to keep peak memory flat.
            del input_ids, attention_mask, labels, outputs, loss

        self.clear_memory()

        return running_loss / len(self.train_loader), 100 * hits / seen

    def validate(self):
        """One pass over the validation set; returns (mean loss, accuracy %)."""
        self.model.eval()
        running_loss = 0.0
        hits = 0
        seen = 0

        with torch.no_grad():
            for batch in tqdm(self.valid_loader, desc="Validating"):
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                labels = batch["labels"].to(self.device)

                with autocast(device_type="cuda", enabled=self.use_amp):
                    outputs = self.model(input_ids, attention_mask, labels)
                    loss = outputs["loss"]

                preds = torch.argmax(outputs["logits"], dim=1)
                hits += (preds == labels).sum().item()
                seen += labels.size(0)
                running_loss += loss.item()

        self.clear_memory()
        return running_loss / len(self.valid_loader), 100 * hits / seen

    def train(self, save_dir="models/v2"):
        """Full training run; checkpoints the best model by validation accuracy."""
        print(f"Training samples: {len(self.train_loader.dataset)}")
        print(f"Validation samples: {len(self.valid_loader.dataset)}")
        if torch.cuda.is_available():
            print(f"GPU: {torch.cuda.get_device_name(0)}")

        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)

        rule = "=" * 60
        for epoch in range(self.num_epochs):
            print(f"\n{rule}")
            print(f"Epoch {epoch + 1}/{self.num_epochs}")
            print(f"{rule}")

            train_loss, train_acc = self.train_epoch()
            val_loss, val_acc = self.validate()

            print(
                f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%"
            )
            print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

            self.history["train_loss"].append(train_loss)
            self.history["train_acc"].append(train_acc)
            self.history["val_loss"].append(val_loss)
            self.history["val_acc"].append(val_acc)

            # Keep the best-so-far checkpoint (by validation accuracy).
            if val_acc > self.best_val_acc:
                self.best_val_acc = val_acc
                torch.save(
                    {
                        "model_state_dict": self.model.state_dict(),
                        "optimizer_state_dict": self.optimizer.state_dict(),
                        "val_acc": val_acc,
                    },
                    save_dir / "best_model.pt",
                )
                print("Saved best model")

        # Always persist the final weights plus the metric history.
        torch.save(
            {
                "model_state_dict": self.model.state_dict(),
                "history": self.history,
            },
            save_dir / "final_model.pt",
        )

        with open(save_dir / "training_history.json", "w") as f:
            json.dump(self.history, f, indent=2)

        print(f"\nTraining complete. Best Val Acc: {self.best_val_acc:.2f}%")
200
+
201
def main(args):
    """Wire up data, model and trainer from CLI arguments and run training."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The --use_sample flag switches to a small subset for quick runs.
    base = (
        Path("data/processed/sample") if args.use_sample else Path("data/processed")
    )

    tokenizer = load_tokenizer(args.model_name)

    train_loader, valid_loader, test_loader = create_dataloader(
        base / "train.jsonl",
        base / "valid.jsonl",
        base / "test.jsonl",
        tokenizer,
        batch_size=args.batch_size,
        max_length=args.max_length,
        num_workers=2,
    )

    model = VulnerabilityCodeT5(model_name=args.model_name, num_labels=2)
    print(f"Trainable parameters: {count_parameters(model):,}")

    trainer = Trainer(
        model,
        train_loader,
        valid_loader,
        device,
        learning_rate=args.learning_rate,
        num_epochs=args.epochs,
        gradient_accumulation_steps=args.gradient_accumulation,
    )

    trainer.train(args.output_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", default="Salesforce/codet5-base")
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--max_length", type=int, default=256)
    parser.add_argument("--learning_rate", type=float, default=2e-5)
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--gradient_accumulation", type=int, default=4)
    parser.add_argument("--output_dir", default="models/v2")
    parser.add_argument("--use_sample", action="store_true")

    main(parser.parse_args())