# app.py — Streamlit UI for fine-tuning a CodeBERT-based code-security classifier.
# (Removed Hugging Face file-viewer artifacts that were pasted above the code.)
# NOTE(review): the original line read `import gradio as st`, but every call in
# this file (st.set_page_config, st.sidebar, st.tabs, st.session_state, ...) is
# the Streamlit API, not Gradio's — fixed to import streamlit.
import streamlit as st
import torch
import json
import requests
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset, DatasetDict
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
# Page configuration (must run before any other Streamlit call)
st.set_page_config(page_title="Code Security Trainer", page_icon="🎓", layout="wide")
st.title("🎓 Code Security Model Trainer")
st.markdown("Interaktif model eğitim arayüzü - Kontrol sende!")

# Session-state defaults: initialised once per browser session so values
# survive Streamlit's top-to-bottom reruns.
for _key, _default in (
    ('model', None),
    ('tokenizer', None),
    ('dataset', None),
    ('training_logs', []),
):
    if _key not in st.session_state:
        st.session_state[_key] = _default
# Sidebar: training hyper-parameter controls + model status indicator
with st.sidebar:
    st.header("⚙️ Eğitim Ayarları")
    epochs = st.slider("Epoch sayısı", 5, 50, 20)
    learning_rate = st.select_slider("Learning rate", options=[1e-5, 2e-5, 5e-5, 1e-4], value=2e-5)
    batch_size = st.selectbox("Batch size", [2, 4, 8], index=1)
    st.markdown("---")
    st.header("📊 Durum")
    if st.session_state.model:
        st.success("Model yüklendi")
    else:
        st.info("Model yüklenmedi")

# Main area: one tab per workflow stage (data → train → test → save)
tab1, tab2, tab3, tab4 = st.tabs(["📥 Veri Seti", "🏋️ Eğitim", "🧪 Test", "💾 Kaydet"])
# Tab 1: download the JSONL dataset from the Hub and browse its samples
with tab1:
    st.header("Veri Seti Yükle")
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("HF'den İndir")
        if st.button("📥 HF Veri Setini İndir"):
            with st.spinner("İndiriliyor..."):
                try:
                    url = "https://huggingface.co/datasets/omernet/code-security-dataset/resolve/main/python_sql_20.jsonl"
                    # timeout + raise_for_status: don't hang the UI forever,
                    # and fail loudly instead of parsing an HTTP error page
                    response = requests.get(url, timeout=30)
                    response.raise_for_status()
                    # Parse JSON Lines: one JSON object per non-empty line
                    data = [
                        json.loads(line)
                        for line in response.text.strip().split('\n')
                        if line.strip()
                    ]
                    st.session_state.raw_data = data
                    st.success(f"{len(data)} örnek yüklendi!")
                    st.write(f"- Zafiyetli: {sum(1 for d in data if d['label'] == 1)}")
                    st.write(f"- Güvenli: {sum(1 for d in data if d['label'] == 0)}")
                    # Fixed 14/3/3 split — assumes the JSONL has exactly 20 rows
                    # (the filename says "_20"); TODO confirm against the dataset.
                    train_data = data[:14]
                    val_data = data[14:17]
                    test_data = data[17:]

                    def create_dataset(examples):
                        # Wrap a list of {'code', 'label'} dicts as a HF Dataset.
                        return Dataset.from_dict({
                            'code': [e['code'] for e in examples],
                            'label': [e['label'] for e in examples]
                        })

                    st.session_state.dataset = DatasetDict({
                        'train': create_dataset(train_data),
                        'validation': create_dataset(val_data),
                        'test': create_dataset(test_data)
                    })
                    st.success("Veri seti hazır!")
                except Exception as e:
                    # Surface download/parse failures in the UI instead of crashing
                    st.error(f"Hata: {e}")
    with col2:
        st.subheader("Örnekleri Gör")
        if st.session_state.get('raw_data'):
            sample_type = st.radio("Tür", ["Zafiyetli", "Güvenli"])
            label = 1 if sample_type == "Zafiyetli" else 0
            samples = [d for d in st.session_state.raw_data if d['label'] == label]
            if samples:
                selected = st.selectbox("Örnek seç", range(len(samples)), format_func=lambda i: f"Örnek {i+1}")
                st.code(samples[selected]['code'], language='python')
# Tab 2: fine-tune CodeBERT on the prepared splits and show test metrics
with tab2:
    st.header("Model Eğitimi")
    col1, col2 = st.columns([1, 2])
    with col1:
        st.subheader("Başlat")
        if st.button("🚀 Eğitimi Başlat", type="primary"):
            if not st.session_state.get('dataset'):
                st.error("Önce veri setini indir!")
            else:
                with st.spinner("Model yükleniyor..."):
                    # Load CodeBERT with a fresh 2-class classification head
                    MODEL_NAME = "microsoft/codebert-base"
                    st.session_state.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
                    st.session_state.model = AutoModelForSequenceClassification.from_pretrained(
                        MODEL_NAME, num_labels=2
                    )

                    def tokenize_function(examples):
                        # Batched tokenization; fixed-length padding keeps tensors uniform.
                        return st.session_state.tokenizer(
                            examples['code'],
                            padding='max_length',
                            truncation=True,
                            max_length=512
                        )

                    tokenized = st.session_state.dataset.map(tokenize_function, batched=True)
                    tokenized = tokenized.remove_columns(['code'])
                    # Trainer expects the target column to be named 'labels'
                    tokenized = tokenized.rename_column('label', 'labels')
                    tokenized.set_format('torch')
                    st.session_state.tokenized_dataset = tokenized

                with st.spinner(f"Eğitim başlıyor ({epochs} epoch)..."):
                    def compute_metrics(eval_pred):
                        # Binary accuracy / precision / recall / F1 on the eval split.
                        logits, labels = eval_pred
                        predictions = np.argmax(logits, axis=-1)
                        precision, recall, f1, _ = precision_recall_fscore_support(
                            labels, predictions, average='binary'
                        )
                        acc = accuracy_score(labels, predictions)
                        return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

                    training_args = TrainingArguments(
                        output_dir="./results",
                        learning_rate=learning_rate,
                        per_device_train_batch_size=batch_size,
                        per_device_eval_batch_size=batch_size,
                        num_train_epochs=epochs,
                        weight_decay=0.01,
                        # NOTE(review): renamed to `eval_strategy` in newer
                        # transformers releases — confirm the pinned version.
                        evaluation_strategy="epoch",
                        save_strategy="epoch",
                        load_best_model_at_end=True,
                        metric_for_best_model="f1",
                        logging_dir='./logs',
                        logging_steps=1,
                        report_to="none"
                    )
                    trainer = Trainer(
                        model=st.session_state.model,
                        args=training_args,
                        train_dataset=tokenized['train'],
                        eval_dataset=tokenized['validation'],
                        tokenizer=st.session_state.tokenizer,
                        data_collator=DataCollatorWithPadding(st.session_state.tokenizer),
                        compute_metrics=compute_metrics,
                    )
                    trainer.train()
                    # Evaluate the best checkpoint on the held-out test split
                    results = trainer.evaluate(tokenized['test'])
                    st.session_state.test_results = results
                    st.success("Eğitim tamamlandı!")
    with col2:
        st.subheader("Sonuçlar")
        if st.session_state.get('test_results'):
            results = st.session_state.test_results
            col_m1, col_m2, col_m3, col_m4 = st.columns(4)
            with col_m1:
                st.metric("Accuracy", f"{results['eval_accuracy']:.2%}")
            with col_m2:
                st.metric("F1 Score", f"{results['eval_f1']:.2%}")
            with col_m3:
                st.metric("Precision", f"{results['eval_precision']:.2%}")
            with col_m4:
                st.metric("Recall", f"{results['eval_recall']:.2%}")
        else:
            st.info("Eğitim sonrası sonuçlar burada görünecek")
# Tab 3: run the fine-tuned model on a user-supplied code snippet
with tab3:
    st.header("Model Testi")
    if not st.session_state.get('model'):
        st.warning("Önce modeli eğit!")
    else:
        test_code = st.text_area("Test kodu", height=150, value="def login(u, p):\n query = f\"SELECT * FROM users WHERE name='{u}'\"\n return db.execute(query)")
        if st.button("🔍 Tahmin Et"):
            with st.spinner("Tahmin yapılıyor..."):
                inputs = st.session_state.tokenizer(
                    test_code,
                    return_tensors="pt",
                    truncation=True,
                    max_length=512
                )
                # Inference only — no gradients needed
                with torch.no_grad():
                    outputs = st.session_state.model(**inputs)
                # Softmax over the 2 logits → class probabilities; report the
                # winning class and its probability as "confidence"
                probabilities = torch.softmax(outputs.logits, dim=-1)
                prediction = torch.argmax(probabilities, dim=-1).item()
                confidence = probabilities[0][prediction].item()
                if prediction == 1:
                    st.error(f"🔴 ZAFİYET TESPİT EDİLDİ (Güven: {confidence:.2%})")
                else:
                    st.success(f"🟢 GÜVENLİ (Güven: {confidence:.2%})")
# Tab 4: persist the fine-tuned model locally and optionally push it to the Hub
with tab4:
    st.header("Modeli Kaydet")
    if not st.session_state.get('model'):
        st.warning("Önce modeli eğit!")
    else:
        if st.button("💾 Local Kaydet"):
            with st.spinner("Kaydediliyor..."):
                st.session_state.model.save_pretrained("./code-security-model")
                st.session_state.tokenizer.save_pretrained("./code-security-model")
                st.success("Model kaydedildi!")
        st.markdown("---")
        hf_token = st.text_input("HF Token (opsiyonel)", type="password")
        if st.button("☁️ Hugging Face'e Yükle"):
            if hf_token:
                with st.spinner("Yükleniyor..."):
                    # Deferred import: huggingface_hub is only needed here
                    from huggingface_hub import login, HfApi
                    login(token=hf_token)
                    api = HfApi()
                    api.create_repo(repo_id="omernet/code-security-trained", exist_ok=True)
                    # NOTE(review): upload assumes "💾 Local Kaydet" was clicked
                    # first — the folder does not exist otherwise; confirm flow.
                    api.upload_folder(
                        folder_path="./code-security-model",
                        repo_id="omernet/code-security-trained"
                    )
                    st.success("HF'e yüklendi!")
            else:
                st.error("HF token gerekli!")