| import pandas as pd |
| import numpy as np |
| from sklearn.impute import SimpleImputer |
| from sklearn.preprocessing import StandardScaler |
|
|
class DataCleaner:
    """Clean the numeric columns of a pandas DataFrame.

    The pipeline run by :meth:`clean` is: mean-impute missing values, drop
    rows containing IQR outliers, then standardize with ``StandardScaler``.
    Non-numeric columns are carried through unchanged.

    Every step returns a new DataFrame; the frame passed in is never
    modified. The imputer and scaler are re-fitted on each call because the
    steps use ``fit_transform``.
    """

    def __init__(self):
        self.imputer = SimpleImputer(strategy='mean')
        self.scaler = StandardScaler()

    def clean(self, data):
        """Run imputation, outlier removal and normalization in that order.

        Args:
            data: DataFrame to clean.

        Returns:
            A new, cleaned DataFrame. Rows flagged as outliers are absent,
            so it may be shorter than ``data``.
        """
        data = self.handle_missing_values(data)
        data = self.remove_outliers(data)
        data = self.normalize_data(data)
        return data

    @staticmethod
    def _observed_numeric_columns(data):
        """Return labels of numeric columns holding at least one non-null value."""
        numeric = data.select_dtypes(include=[np.number])
        has_value = numeric.notna().any(axis=0)
        return [label for label, ok in zip(numeric.columns, has_value) if ok]

    def handle_missing_values(self, data):
        """Fill missing values in numeric columns using ``self.imputer``.

        Numeric columns without a single observed value are left as they
        are: a mean cannot be computed for them, and the imputer would drop
        them from its output, which breaks the assignment back into the
        frame.

        Args:
            data: DataFrame to impute.

        Returns:
            A copy of ``data`` with the numeric columns imputed.
        """
        result = data.copy()
        columns = self._observed_numeric_columns(result)
        if not columns:
            # Nothing to impute (no numeric data or no rows); scikit-learn
            # rejects empty input, so return early.
            return result
        result[columns] = self.imputer.fit_transform(result[columns])
        return result

    def remove_outliers(self, data, iqr_multiplier=1.5):
        """Drop rows whose numeric values fall outside the IQR fences.

        Columns are processed one after another, and the quartiles of each
        column are computed on the rows that survived the previous columns.
        A row is kept for a column when its value lies within
        ``[Q1 - k * IQR, Q3 + k * IQR]`` (inclusive), where ``k`` is
        ``iqr_multiplier``. Rows with a missing value in a column that has
        data fail that comparison and are removed. Columns with no observed
        values are skipped, since no fences can be computed for them.

        Args:
            data: DataFrame to filter.
            iqr_multiplier: Width of the fences in units of the IQR.

        Returns:
            A new DataFrame containing only the retained rows.
        """
        kept = data
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        for column in numeric_columns:
            values = kept[column]
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            if pd.isna(q1) or pd.isna(q3):
                # No observed values (left) in this column: NaN fences would
                # discard every remaining row.
                continue
            spread = q3 - q1
            lower_bound = q1 - iqr_multiplier * spread
            upper_bound = q3 + iqr_multiplier * spread
            kept = kept[values.between(lower_bound, upper_bound)]
        # Hand back an independent frame so later column assignments do not
        # hit pandas' chained-assignment warning.
        return kept.copy()

    def normalize_data(self, data):
        """Standardize numeric columns using ``self.scaler``.

        Numeric columns with no observed values are left untouched, and a
        frame with no numeric data or no rows is returned as a plain copy
        because the scaler cannot be fitted on empty input.

        Args:
            data: DataFrame to normalize.

        Returns:
            A copy of ``data`` with the numeric columns standardized.
        """
        result = data.copy()
        columns = self._observed_numeric_columns(result)
        if not columns:
            return result
        result[columns] = self.scaler.fit_transform(result[columns])
        return result