| import os |
| from datasets import load_dataset |
| from sklearn.model_selection import train_test_split |
| import pandas as pd |
|
|
|
|
def prepare_data(
    dataset_name: str = "David-Egea/phishing-texts",
    output_dir: str = "data",
    label_column: str = "phishing",
    random_state: int = 42,
) -> None:
    """Load a dataset and write stratified train/val/test CSV splits.

    The dataset's ``"train"`` split is converted to a DataFrame and divided
    80% / 10% / 10% into train, validation and test sets. Both splits are
    stratified on ``label_column`` so each subset keeps the class proportions
    of the full data.

    Args:
        dataset_name: Identifier passed to ``load_dataset``.
        output_dir: Directory that receives ``train.csv``, ``val.csv`` and
            ``test.csv``. It is created if it does not exist.
        label_column: Name of the column holding the class label used for
            stratification.
        random_state: Seed forwarded to both ``train_test_split`` calls so
            the splits are reproducible.

    Raises:
        KeyError: If the loaded data has no ``"train"`` split or no
            ``label_column`` column.
    """
    print(f"Loading dataset: {dataset_name}...")

    ds = load_dataset(dataset_name)

    # Only the "train" split is used; it is re-split below.
    df: pd.DataFrame = ds["train"].to_pandas()

    # Fail early with an explicit message instead of an opaque KeyError
    # raised from inside the f-string or the splitter.
    if label_column not in df.columns:
        raise KeyError(
            f"Label column {label_column!r} not found in dataset columns: "
            f"{list(df.columns)}"
        )

    print(f"Total samples: {len(df)}")
    print(
        f"Class distribution:\n{df[label_column].value_counts(normalize=True)}"
    )

    # First cut: 80% train, 20% held out.
    train_df, temp_df = train_test_split(
        df,
        test_size=0.2,
        random_state=random_state,
        stratify=df[label_column],
    )

    # Second cut: halve the held-out part into validation and test (10% each).
    val_df, test_df = train_test_split(
        temp_df,
        test_size=0.5,
        random_state=random_state,
        stratify=temp_df[label_column],
    )

    print(f"Train samples: {len(train_df)}")
    print(f"Val samples: {len(val_df)}")
    print(f"Test samples: {len(test_df)}")

    os.makedirs(output_dir, exist_ok=True)

    # index=False keeps the shuffled pandas row index out of the CSV files.
    train_df.to_csv(os.path.join(output_dir, "train.csv"), index=False)
    val_df.to_csv(os.path.join(output_dir, "val.csv"), index=False)
    test_df.to_csv(os.path.join(output_dir, "test.csv"), index=False)
    print(f"Splits saved to {output_dir}/ folder.")
|
|
|
|
# Script entry point: when this file is executed directly, build the splits
# with the default arguments. Importing the module does not trigger this.
if __name__ == "__main__":
    prepare_data()
|
|