Spaces:

dad1909
/

CyberCode

Paused

App Files Files Community

CyberCode / app.py

dad1909

Update app.py

55fe000 verified over 1 year ago

raw

history blame contribute delete

4.28 kB

	import os
	import torch
	from unsloth import FastLanguageModel, is_bfloat16_supported
	from trl import SFTTrainer
	from transformers import TrainingArguments
	from datasets import load_dataset
	import gradio as gr
	import json
	from huggingface_hub import HfApi

	max_seq_length = 4096
	dtype = None
	load_in_4bit = True
	hf_token = os.getenv("HF_TOKEN")
	current_num = os.getenv("NUM")

	print(f"stage ${current_num}")

	api = HfApi(token=hf_token)
	# models = f"dad1909/cybersentinal-2.0-{current_num}"
	model_base = "unsloth/gemma-2-27b-bnb-4bit"

	print("Starting model and tokenizer loading...")

	# Load the model and tokenizer
	model, tokenizer = FastLanguageModel.from_pretrained(
	model_name=model_base,
	max_seq_length=max_seq_length,
	dtype=dtype,
	load_in_4bit=load_in_4bit,
	token=hf_token
	)

	print("Model and tokenizer loaded successfully.")

	print("Configuring PEFT model...")
	model = FastLanguageModel.get_peft_model(
	model,
	r=16,
	target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
	lora_alpha=16,
	lora_dropout=0,
	bias="none",
	use_gradient_checkpointing="unsloth",
	random_state=3407,
	use_rslora=False,
	loftq_config=None,
	)
	print("PEFT model configured.")

	# Updated alpaca_prompt for different types
	alpaca_prompt = {
	"learning_from": """Below is a CVE definition.
	### CVE definition:
	{}
	### detail CVE:
	{}""",
	"definition": """Below is a definition about software vulnerability. Explain it.
	### Definition:
	{}
	### Explanation:
	{}""",
	"code_vulnerability": """Below is a code snippet. Identify the line of code that is vulnerable and describe the type of software vulnerability.
	### Code Snippet:
	{}
	### Vulnerability solution:
	{}"""
	}

	EOS_TOKEN = tokenizer.eos_token

	def detect_prompt_type(instruction):
	if instruction.startswith("what is code vulnerable of this code:"):
	return "code_vulnerability"
	elif instruction.startswith("Learning from"):
	return "learning_from"
	elif instruction.startswith("what is"):
	return "definition"
	else:
	return "unknown"

	def formatting_prompts_func(examples):
	instructions = examples["instruction"]
	outputs = examples["output"]
	texts = []

	for instruction, output in zip(instructions, outputs):
	prompt_type = detect_prompt_type(instruction)
	if prompt_type in alpaca_prompt:
	prompt = alpaca_prompt[prompt_type].format(instruction, output)
	else:
	prompt = instruction + "\n\n" + output
	text = prompt + EOS_TOKEN
	texts.append(text)

	return {"text": texts}

	print("Loading dataset...")
	dataset = load_dataset("dad1909/DCSV", split="train")
	print("Dataset loaded successfully.")

	print("Applying formatting function to the dataset...")
	dataset = dataset.map(formatting_prompts_func, batched=True)
	print("Formatting function applied.")

	print("Initializing trainer...")
	trainer = SFTTrainer(
	model=model,
	tokenizer=tokenizer,
	train_dataset=dataset,
	dataset_text_field="text",
	max_seq_length=max_seq_length,
	dataset_num_proc=2,
	packing=False,
	args=TrainingArguments(
	per_device_train_batch_size=1,
	gradient_accumulation_steps=1,
	learning_rate=2e-4,
	fp16=not is_bfloat16_supported(),
	bf16=is_bfloat16_supported(),
	warmup_steps=5,
	logging_steps=10,
	max_steps=100,
	optim="adamw_8bit",
	weight_decay=0.01,
	lr_scheduler_type="linear",
	seed=3407,
	output_dir="outputs"
	),
	)
	print("Trainer initialized.")

	print("Starting training...")
	trainer_stats = trainer.train()
	print("Training completed.")

	num = int(current_num)
	num += 1

	uploads_models = f"cybersentinal-2.0-{str(num)}"

	up = "sentinal-3.1-70B"

	print("Saving the trained model...")
	model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
	print("Model saved successfully.")

	print("Pushing the model to the hub...")
	model.push_to_hub_merged(
	up,
	tokenizer,
	save_method="merged_16bit",
	token=hf_token
	)
	print("Model pushed to hub successfully.")

	api.delete_space_variable(repo_id="dad1909/CyberCode", key="NUM")
	api.add_space_variable(repo_id="dad1909/CyberCode", key="NUM", value=str(num))