| import os |
| import sys |
| import itertools |
| import numpy as np |
| import tensorflow as tf |
| from sklearn.model_selection import train_test_split |
|
|
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
|
| from src.model import MalConv |
| from src.utils import preprocess_dataset |
|
|
def hyperparameter_search(csv_path,
                          param_grid=None,
                          max_length=2**20,
                          epochs=5,
                          validation_split=0.2,
                          batch_size=16):
    """Grid-search hyperparameters for the MalConv model.

    Trains one model per combination in ``param_grid``, evaluates each on a
    stratified hold-out split, and ranks the combinations by validation
    accuracy.

    Args:
        csv_path: Path to the training-data CSV consumed by
            ``preprocess_dataset``.
        param_grid: Mapping of hyperparameter name -> list of candidate
            values. When ``None``, a small built-in grid is used.
        max_length: Maximum input length (in bytes) fed to the model.
        epochs: Number of training epochs per combination.
        validation_split: Fraction of the data held out for validation.
        batch_size: Mini-batch size for training. Defaults to 16, the value
            that was previously hard-coded, so existing callers see no change.

    Returns:
        Tuple ``(best_params, results)``: ``best_params`` is the dict of the
        best-scoring combination (``None`` if every run raised), and
        ``results`` is a list of ``{'params', 'val_accuracy', 'val_loss'}``
        dicts sorted by validation accuracy, descending.
    """
    if param_grid is None:
        param_grid = {
            'embedding_size': [8, 16],
            'num_filters': [64, 128],
            'fc_size': [64, 128],
            'learning_rate': [0.001, 0.0001]
        }

    print("๋ฐ์ดํฐ ๋ก๋ฉ ์ค...")
    X, y = preprocess_dataset(csv_path, max_length)
    # stratify=y keeps the malware/benign class ratio identical in both splits.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=validation_split, random_state=42, stratify=y
    )

    # Expand the grid into the full cartesian product of candidate values.
    param_names = list(param_grid.keys())
    param_values = list(param_grid.values())
    param_combinations = list(itertools.product(*param_values))

    best_score = 0
    best_params = None
    results = []

    print(f"์ด {len(param_combinations)}๊ฐ์ ์กฐํฉ์ ํ์คํธํฉ๋๋ค.")

    for i, params in enumerate(param_combinations):
        param_dict = dict(zip(param_names, params))
        print(f"\n[{i+1}/{len(param_combinations)}] ํ์คํธ ์ค: {param_dict}")

        # One combination failing (e.g. OOM on a large config) must not abort
        # the whole search, so each trial is isolated in its own try block.
        try:
            model = MalConv(
                max_input_length=max_length,
                embedding_size=param_dict['embedding_size'],
                num_filters=param_dict['num_filters'],
                fc_size=param_dict['fc_size']
            )

            model.compile(
                optimizer=tf.keras.optimizers.Adam(
                    learning_rate=param_dict['learning_rate']
                ),
                loss='binary_crossentropy',
                metrics=['accuracy']
            )

            # Run a dummy forward pass so the subclassed model builds its
            # weights before training starts.
            dummy_input = np.zeros((1, max_length), dtype=np.uint8)
            _ = model(dummy_input)

            # The History object was never used, so the return value of
            # fit() is intentionally discarded.
            model.fit(
                X_train, y_train,
                batch_size=batch_size,
                epochs=epochs,
                validation_data=(X_val, y_val),
                verbose=0
            )

            val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)

            results.append({
                'params': param_dict,
                'val_accuracy': val_acc,
                'val_loss': val_loss
            })

            print(f"๊ฒ์ฆ ์ ํ๋: {val_acc:.4f}")

            if val_acc > best_score:
                best_score = val_acc
                best_params = param_dict
                print(f"์๋ก์ด ์ต๊ณ  ์ฑ๋ฅ! ์ ํ๋: {best_score:.4f}")

        except Exception as e:
            # Best-effort search: report the failure and move on to the
            # next combination rather than aborting the sweep.
            print(f"์๋ฌ ๋ฐ์: {e}")
            continue

    print("\n" + "="*50)
    print("ํ์ดํผํ๋ผ๋ฏธํฐ ํ๋ ์๋ฃ")
    print("="*50)
    print(f"์ต๊ณ  ์ฑ๋ฅ: {best_score:.4f}")
    print(f"์ต์  ํ์ดํผํ๋ผ๋ฏธํฐ: {best_params}")

    # Present the leaderboard: best validation accuracy first.
    results.sort(key=lambda x: x['val_accuracy'], reverse=True)

    print("\n์์ 5๊ฐ ๊ฒฐ๊ณผ:")
    for i, result in enumerate(results[:5]):
        print(f"{i+1}. ์ ํ๋: {result['val_accuracy']:.4f}, "
              f"ํ๋ผ๋ฏธํฐ: {result['params']}")

    return best_params, results
|
|
def main():
    """Run a small demo grid search over the bundled sample dataset."""
    data_csv = "Input/sample_data.csv"

    # Compact search space: 2 candidates per hyperparameter, 16 runs total.
    search_space = {
        'embedding_size': [8, 16],
        'num_filters': [64, 128],
        'fc_size': [64, 128],
        'learning_rate': [0.001, 0.0001]
    }

    best_params, results = hyperparameter_search(
        csv_path=data_csv,
        param_grid=search_space,
        epochs=3
    )

    # Point the user at the full training entry point for a final model.
    print(f"\n์ต์  ํ์ดํผํ๋ผ๋ฏธํฐ๋ก ๋ชจ๋ธ์ ๋ค์ ํ๋ จํ์ธ์:")
    print(f"python src/train.py {data_csv} --epochs 10")
|
|
# Script entry point: run the grid-search demo only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|