Spaces: the Space fails to build — the build log reports an error. The script imports `gradio as st` but then calls the Streamlit API (`st.set_page_config`, `st.sidebar`, `st.tabs`, `st.session_state`), so the app cannot start; the import should be `streamlit as st`.
# Standard library
import json
import os

# Third-party
import numpy as np
import requests
import torch
# FIX: was `import gradio as st` -- every call below uses the Streamlit API
# (st.set_page_config, st.sidebar, st.tabs, st.session_state, ...), so the
# gradio alias made the Space fail at startup.
import streamlit as st
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
# Page configuration and header.
st.set_page_config(page_title="Code Security Trainer", page_icon="🎓", layout="wide")
st.title("🎓 Code Security Model Trainer")
st.markdown("Interaktif model eğitim arayüzü - Kontrol sende!")

# Initialise session-state slots on first run only; reruns keep existing values.
_SESSION_DEFAULTS = (
    ("model", None),
    ("tokenizer", None),
    ("dataset", None),
    ("training_logs", []),
)
for _name, _initial in _SESSION_DEFAULTS:
    if _name not in st.session_state:
        st.session_state[_name] = _initial
# Sidebar: training hyper-parameter controls plus a model-status indicator.
with st.sidebar:
    st.header("⚙️ Eğitim Ayarları")
    # Number of passes over the training split (min 5, max 50, default 20).
    epochs = st.slider("Epoch sayısı", 5, 50, 20)
    # Discrete LR choices; 2e-5 is the conventional fine-tuning default.
    learning_rate = st.select_slider("Learning rate", options=[1e-5, 2e-5, 5e-5, 1e-4], value=2e-5)
    # index=1 selects 4 as the default batch size.
    batch_size = st.selectbox("Batch size", [2, 4, 8], index=1)
    st.markdown("---")
    st.header("📊 Durum")
    # Truthiness check: a loaded model object is truthy, the initial None is not.
    if st.session_state.model:
        st.success("Model yüklendi")
    else:
        st.info("Model yüklenmedi")
# Main area: dataset / training / testing / saving workflow tabs.
tab1, tab2, tab3, tab4 = st.tabs(["📥 Veri Seti", "🏋️ Eğitim", "🧪 Test", "💾 Kaydet"])
# Tab 1: download the JSONL dataset from the Hub and split it for training.
with tab1:
    st.header("Veri Seti Yükle")
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("HF'den İndir")
        if st.button("📥 HF Veri Setini İndir"):
            with st.spinner("İndiriliyor..."):
                try:
                    url = "https://huggingface.co/datasets/omernet/code-security-dataset/resolve/main/python_sql_20.jsonl"
                    # FIX: add a timeout and fail fast on HTTP errors -- the
                    # original would hang indefinitely or try to json-parse an
                    # HTML error page on a 404/500 response.
                    response = requests.get(url, timeout=30)
                    response.raise_for_status()
                    # One JSON object per non-empty line (JSONL format).
                    data = [
                        json.loads(line)
                        for line in response.text.strip().split('\n')
                        if line.strip()
                    ]
                    st.session_state.raw_data = data
                    st.success(f"{len(data)} örnek yüklendi!")
                    st.write(f"- Zafiyetli: {sum(1 for d in data if d['label'] == 1)}")
                    st.write(f"- Güvenli: {sum(1 for d in data if d['label'] == 0)}")
                    # FIX: proportional 70/15/15 split instead of hard-coded
                    # indices (14 / 17), which silently produce wrong splits if
                    # the dataset size ever changes. For the expected 20
                    # examples this still yields the same 14/3/3 partition.
                    n_train = int(len(data) * 0.7)
                    n_val = int(len(data) * 0.15)
                    train_data = data[:n_train]
                    val_data = data[n_train:n_train + n_val]
                    test_data = data[n_train + n_val:]

                    def create_dataset(examples):
                        """Wrap a list of {'code', 'label'} dicts as a HF Dataset."""
                        return Dataset.from_dict({
                            'code': [e['code'] for e in examples],
                            'label': [e['label'] for e in examples]
                        })

                    st.session_state.dataset = DatasetDict({
                        'train': create_dataset(train_data),
                        'validation': create_dataset(val_data),
                        'test': create_dataset(test_data)
                    })
                    st.success("Veri seti hazır!")
                except Exception as e:
                    # Surface any download/parse failure in the UI rather than crashing.
                    st.error(f"Hata: {e}")
    with col2:
        st.subheader("Örnekleri Gör")
        if st.session_state.get('raw_data'):
            # Browse raw examples filtered by label (1 = vulnerable, 0 = safe).
            sample_type = st.radio("Tür", ["Zafiyetli", "Güvenli"])
            label = 1 if sample_type == "Zafiyetli" else 0
            samples = [d for d in st.session_state.raw_data if d['label'] == label]
            if samples:
                selected = st.selectbox("Örnek seç", range(len(samples)), format_func=lambda i: f"Örnek {i+1}")
                st.code(samples[selected]['code'], language='python')
# Tab 2: fine-tune CodeBERT on the prepared splits and show test metrics.
with tab2:
    st.header("Model Eğitimi")
    col1, col2 = st.columns([1, 2])
    with col1:
        st.subheader("Başlat")
        if st.button("🚀 Eğitimi Başlat", type="primary"):
            if not st.session_state.get('dataset'):
                st.error("Önce veri setini indir!")
            else:
                with st.spinner("Model yükleniyor..."):
                    # Load the pretrained encoder with a fresh 2-class head.
                    MODEL_NAME = "microsoft/codebert-base"
                    st.session_state.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
                    st.session_state.model = AutoModelForSequenceClassification.from_pretrained(
                        MODEL_NAME, num_labels=2
                    )
                    # Tokenize each split to fixed 512-token inputs.
                    def tokenize_function(examples):
                        """Tokenize a batch of 'code' strings (pad/truncate to 512)."""
                        return st.session_state.tokenizer(
                            examples['code'],
                            padding='max_length',
                            truncation=True,
                            max_length=512
                        )
                    tokenized = st.session_state.dataset.map(tokenize_function, batched=True)
                    # Drop the raw text; Trainer expects the target column 'labels'.
                    tokenized = tokenized.remove_columns(['code'])
                    tokenized = tokenized.rename_column('label', 'labels')
                    tokenized.set_format('torch')
                    st.session_state.tokenized_dataset = tokenized
                # Training phase.
                with st.spinner(f"Eğitim başlıyor ({epochs} epoch)..."):
                    def compute_metrics(eval_pred):
                        """Return accuracy and binary precision/recall/F1 from (logits, labels)."""
                        logits, labels = eval_pred
                        predictions = np.argmax(logits, axis=-1)
                        precision, recall, f1, _ = precision_recall_fscore_support(
                            labels, predictions, average='binary'
                        )
                        acc = accuracy_score(labels, predictions)
                        return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}
                    # NOTE(review): `evaluation_strategy` was renamed `eval_strategy`
                    # in recent transformers releases -- confirm against the pinned
                    # transformers version; the wrong spelling raises a TypeError.
                    training_args = TrainingArguments(
                        output_dir="./results",
                        learning_rate=learning_rate,
                        per_device_train_batch_size=batch_size,
                        per_device_eval_batch_size=batch_size,
                        num_train_epochs=epochs,
                        weight_decay=0.01,
                        evaluation_strategy="epoch",
                        save_strategy="epoch",
                        # Reload the checkpoint with the best validation F1 at the end.
                        load_best_model_at_end=True,
                        metric_for_best_model="f1",
                        logging_dir='./logs',
                        logging_steps=1,
                        report_to="none"
                    )
                    trainer = Trainer(
                        model=st.session_state.model,
                        args=training_args,
                        train_dataset=tokenized['train'],
                        eval_dataset=tokenized['validation'],
                        tokenizer=st.session_state.tokenizer,
                        data_collator=DataCollatorWithPadding(st.session_state.tokenizer),
                        compute_metrics=compute_metrics,
                    )
                    # Train, then evaluate on the held-out test split.
                    trainer.train()
                    results = trainer.evaluate(tokenized['test'])
                    st.session_state.test_results = results
                    st.success("Eğitim tamamlandı!")
    with col2:
        st.subheader("Sonuçlar")
        if st.session_state.get('test_results'):
            # Render the four test metrics side by side.
            results = st.session_state.test_results
            col_m1, col_m2, col_m3, col_m4 = st.columns(4)
            with col_m1:
                st.metric("Accuracy", f"{results['eval_accuracy']:.2%}")
            with col_m2:
                st.metric("F1 Score", f"{results['eval_f1']:.2%}")
            with col_m3:
                st.metric("Precision", f"{results['eval_precision']:.2%}")
            with col_m4:
                st.metric("Recall", f"{results['eval_recall']:.2%}")
        else:
            st.info("Eğitim sonrası sonuçlar burada görünecek")
# Tab 3: run the fine-tuned model on an arbitrary code snippet.
with tab3:
    st.header("Model Testi")
    if not st.session_state.get('model'):
        st.warning("Önce modeli eğit!")
    else:
        test_code = st.text_area("Test kodu", height=150, value="def login(u, p):\n query = f\"SELECT * FROM users WHERE name='{u}'\"\n return db.execute(query)")
        if st.button("🔍 Tahmin Et"):
            with st.spinner("Tahmin yapılıyor..."):
                # FIX: switch to inference mode. After Trainer.train() the model
                # can be left in train mode, so dropout stays active and the
                # same snippet could get different predictions on each click.
                st.session_state.model.eval()
                inputs = st.session_state.tokenizer(
                    test_code,
                    return_tensors="pt",
                    truncation=True,
                    max_length=512
                )
                # No gradients needed for a single forward pass.
                with torch.no_grad():
                    outputs = st.session_state.model(**inputs)
                    probabilities = torch.softmax(outputs.logits, dim=-1)
                    prediction = torch.argmax(probabilities, dim=-1).item()
                    confidence = probabilities[0][prediction].item()
                # Label 1 = vulnerable, 0 = safe.
                if prediction == 1:
                    st.error(f"🔴 ZAFİYET TESPİT EDİLDİ (Güven: {confidence:.2%})")
                else:
                    st.success(f"🟢 GÜVENLİ (Güven: {confidence:.2%})")
# Tab 4: persist the fine-tuned model locally and optionally push it to the Hub.
with tab4:
    st.header("Modeli Kaydet")
    if not st.session_state.get('model'):
        st.warning("Önce modeli eğit!")
    else:
        save_dir = "./code-security-model"
        if st.button("💾 Local Kaydet"):
            with st.spinner("Kaydediliyor..."):
                # Write model weights/config and tokenizer files side by side.
                st.session_state.model.save_pretrained(save_dir)
                st.session_state.tokenizer.save_pretrained(save_dir)
                st.success("Model kaydedildi!")
        st.markdown("---")
        hf_token = st.text_input("HF Token (opsiyonel)", type="password")
        if st.button("☁️ Hugging Face'e Yükle"):
            # Guard clause: a token is mandatory for the upload path.
            if not hf_token:
                st.error("HF token gerekli!")
            else:
                with st.spinner("Yükleniyor..."):
                    from huggingface_hub import login, HfApi
                    login(token=hf_token)
                    repo_id = "omernet/code-security-trained"
                    api = HfApi()
                    api.create_repo(repo_id=repo_id, exist_ok=True)
                    api.upload_folder(
                        folder_path=save_dir,
                        repo_id=repo_id
                    )
                    st.success("HF'e yüklendi!")