"""Interactive Streamlit UI for fine-tuning CodeBERT as a code-security classifier.

Flow: download a small JSONL dataset from the Hugging Face Hub, fine-tune
microsoft/codebert-base as a binary (vulnerable / safe) sequence classifier,
try the trained model on ad-hoc code snippets, and save or upload the result.
"""

# BUG FIX: the original imported `gradio as st`, but every API used below
# (set_page_config, session_state, sidebar, tabs, columns, metric, spinner, ...)
# is Streamlit's API — the script cannot run under gradio.
import streamlit as st
import torch
import json
import requests
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from datasets import Dataset, DatasetDict
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

# Page settings
st.set_page_config(page_title="Code Security Trainer", page_icon="🎓", layout="wide")
st.title("🎓 Code Security Model Trainer")
st.markdown("Interaktif model eğitim arayüzü - Kontrol sende!")

# Session state: survives Streamlit's top-to-bottom script reruns between
# widget interactions, so the model/tokenizer/dataset persist across clicks.
if 'model' not in st.session_state:
    st.session_state.model = None
if 'tokenizer' not in st.session_state:
    st.session_state.tokenizer = None
if 'dataset' not in st.session_state:
    st.session_state.dataset = None
if 'training_logs' not in st.session_state:
    st.session_state.training_logs = []

# Sidebar controls: training hyperparameters and a simple status indicator.
with st.sidebar:
    st.header("⚙️ Eğitim Ayarları")
    epochs = st.slider("Epoch sayısı", 5, 50, 20)
    learning_rate = st.select_slider("Learning rate", options=[1e-5, 2e-5, 5e-5, 1e-4], value=2e-5)
    batch_size = st.selectbox("Batch size", [2, 4, 8], index=1)
    st.markdown("---")
    st.header("📊 Durum")
    if st.session_state.model:
        st.success("Model yüklendi")
    else:
        st.info("Model yüklenmedi")

# Main section
tab1, tab2, tab3, tab4 = st.tabs(["📥 Veri Seti", "🏋️ Eğitim", "🧪 Test", "💾 Kaydet"])

# Tab 1: dataset download + sample browser
with tab1:
    st.header("Veri Seti Yükle")
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("HF'den İndir")
        if st.button("📥 HF Veri Setini İndir"):
            with st.spinner("İndiriliyor..."):
                try:
                    url = "https://huggingface.co/datasets/omernet/code-security-dataset/resolve/main/python_sql_20.jsonl"
                    # timeout + raise_for_status so a bad download fails fast
                    # (and is reported via st.error below) instead of hanging
                    # the UI or feeding an HTML error page to json.loads.
                    response = requests.get(url, timeout=30)
                    response.raise_for_status()
                    # Parse JSONL: one JSON object per non-empty line.
                    data = [
                        json.loads(line)
                        for line in response.text.strip().split('\n')
                        if line.strip()
                    ]
                    st.session_state.raw_data = data

                    # Show summary counts (label 1 = vulnerable, 0 = safe).
                    st.success(f"{len(data)} örnek yüklendi!")
                    st.write(f"- Zafiyetli: {sum(1 for d in data if d['label'] == 1)}")
                    st.write(f"- Güvenli: {sum(1 for d in data if d['label'] == 0)}")

                    # Prepare the dataset: fixed 14/3/3 split of the 20 examples.
                    train_data = data[:14]
                    val_data = data[14:17]
                    test_data = data[17:]

                    def create_dataset(examples):
                        # Row-wise dicts -> column-wise dict -> HF Dataset.
                        return Dataset.from_dict({
                            'code': [e['code'] for e in examples],
                            'label': [e['label'] for e in examples]
                        })

                    st.session_state.dataset = DatasetDict({
                        'train': create_dataset(train_data),
                        'validation': create_dataset(val_data),
                        'test': create_dataset(test_data)
                    })
                    st.success("Veri seti hazır!")
                except Exception as e:
                    # Best-effort UI: surface any download/parse failure inline.
                    st.error(f"Hata: {e}")

    with col2:
        st.subheader("Örnekleri Gör")
        if st.session_state.get('raw_data'):
            sample_type = st.radio("Tür", ["Zafiyetli", "Güvenli"])
            label = 1 if sample_type == "Zafiyetli" else 0
            samples = [d for d in st.session_state.raw_data if d['label'] == label]
            if samples:
                selected = st.selectbox("Örnek seç", range(len(samples)),
                                        format_func=lambda i: f"Örnek {i+1}")
                st.code(samples[selected]['code'], language='python')

# Tab 2: training
with tab2:
    st.header("Model Eğitimi")
    col1, col2 = st.columns([1, 2])

    with col1:
        st.subheader("Başlat")
        if st.button("🚀 Eğitimi Başlat", type="primary"):
            if not st.session_state.get('dataset'):
                st.error("Önce veri setini indir!")
            else:
                with st.spinner("Model yükleniyor..."):
                    # Load pretrained encoder with a fresh 2-class head.
                    MODEL_NAME = "microsoft/codebert-base"
                    st.session_state.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
                    st.session_state.model = AutoModelForSequenceClassification.from_pretrained(
                        MODEL_NAME, num_labels=2
                    )

                    def tokenize_function(examples):
                        # Fixed-length padding keeps batch shapes uniform.
                        return st.session_state.tokenizer(
                            examples['code'],
                            padding='max_length',
                            truncation=True,
                            max_length=512
                        )

                    tokenized = st.session_state.dataset.map(tokenize_function, batched=True)
                    tokenized = tokenized.remove_columns(['code'])
                    # Trainer expects the target column to be named 'labels'.
                    tokenized = tokenized.rename_column('label', 'labels')
                    tokenized.set_format('torch')
                    st.session_state.tokenized_dataset = tokenized

                # Training
                with st.spinner(f"Eğitim başlıyor ({epochs} epoch)..."):
                    def compute_metrics(eval_pred):
                        # Binary-classification metrics for Trainer's eval loop.
                        logits, labels = eval_pred
                        predictions = np.argmax(logits, axis=-1)
                        precision, recall, f1, _ = precision_recall_fscore_support(
                            labels, predictions, average='binary'
                        )
                        acc = accuracy_score(labels, predictions)
                        return {'accuracy': acc, 'f1': f1,
                                'precision': precision, 'recall': recall}

                    training_args = TrainingArguments(
                        output_dir="./results",
                        learning_rate=learning_rate,
                        per_device_train_batch_size=batch_size,
                        per_device_eval_batch_size=batch_size,
                        num_train_epochs=epochs,
                        weight_decay=0.01,
                        evaluation_strategy="epoch",
                        save_strategy="epoch",
                        # Keep the checkpoint with the best validation F1.
                        load_best_model_at_end=True,
                        metric_for_best_model="f1",
                        logging_dir='./logs',
                        logging_steps=1,
                        report_to="none"
                    )
                    trainer = Trainer(
                        model=st.session_state.model,
                        args=training_args,
                        train_dataset=tokenized['train'],
                        eval_dataset=tokenized['validation'],
                        tokenizer=st.session_state.tokenizer,
                        data_collator=DataCollatorWithPadding(st.session_state.tokenizer),
                        compute_metrics=compute_metrics,
                    )
                    # Train
                    trainer.train()
                    # Final evaluation on the held-out test split.
                    results = trainer.evaluate(tokenized['test'])
                    st.session_state.test_results = results
                    st.success("Eğitim tamamlandı!")

    with col2:
        st.subheader("Sonuçlar")
        if st.session_state.get('test_results'):
            results = st.session_state.test_results
            col_m1, col_m2, col_m3, col_m4 = st.columns(4)
            with col_m1:
                st.metric("Accuracy", f"{results['eval_accuracy']:.2%}")
            with col_m2:
                st.metric("F1 Score", f"{results['eval_f1']:.2%}")
            with col_m3:
                st.metric("Precision", f"{results['eval_precision']:.2%}")
            with col_m4:
                st.metric("Recall", f"{results['eval_recall']:.2%}")
        else:
            st.info("Eğitim sonrası sonuçlar burada görünecek")

# Tab 3: ad-hoc prediction on user-supplied code
with tab3:
    st.header("Model Testi")
    if not st.session_state.get('model'):
        st.warning("Önce modeli eğit!")
    else:
        test_code = st.text_area(
            "Test kodu", height=150,
            value="def login(u, p):\n query = f\"SELECT * FROM users WHERE name='{u}'\"\n return db.execute(query)"
        )
        if st.button("🔍 Tahmin Et"):
            with st.spinner("Tahmin yapılıyor..."):
                inputs = st.session_state.tokenizer(
                    test_code, return_tensors="pt", truncation=True, max_length=512
                )
                # Inference only — no gradients needed.
                with torch.no_grad():
                    outputs = st.session_state.model(**inputs)
                probabilities = torch.softmax(outputs.logits, dim=-1)
                prediction = torch.argmax(probabilities, dim=-1).item()
                confidence = probabilities[0][prediction].item()
                if prediction == 1:
                    st.error(f"🔴 ZAFİYET TESPİT EDİLDİ (Güven: {confidence:.2%})")
                else:
                    st.success(f"🟢 GÜVENLİ (Güven: {confidence:.2%})")

# Tab 4: save locally and/or push to the Hugging Face Hub
with tab4:
    st.header("Modeli Kaydet")
    if not st.session_state.get('model'):
        st.warning("Önce modeli eğit!")
    else:
        if st.button("💾 Local Kaydet"):
            with st.spinner("Kaydediliyor..."):
                st.session_state.model.save_pretrained("./code-security-model")
                st.session_state.tokenizer.save_pretrained("./code-security-model")
                st.success("Model kaydedildi!")

        st.markdown("---")
        hf_token = st.text_input("HF Token (opsiyonel)", type="password")
        if st.button("☁️ Hugging Face'e Yükle"):
            if hf_token:
                with st.spinner("Yükleniyor..."):
                    # Imported lazily: only needed when the user uploads.
                    from huggingface_hub import login, HfApi
                    login(token=hf_token)
                    api = HfApi()
                    api.create_repo(repo_id="omernet/code-security-trained", exist_ok=True)
                    api.upload_folder(
                        folder_path="./code-security-model",
                        repo_id="omernet/code-security-trained"
                    )
                    st.success("HF'e yüklendi!")
            else:
                st.error("HF token gerekli!")