| import pandas as pd |
| |
| |
|
|
| |
| |
|
|
class GroupingRareValues():
    """Group infrequent labels of selected columns into a single 'rare' label.

    A label is "rare" when its relative frequency in the fitting data
    (``value_counts() / len(X)``) is strictly below ``threshold``.

    Parameters
    ----------
    mapping : list of dict, optional
        Pre-computed mapping. Each entry is a dict with at least the keys
        ``'col'`` (column name) and ``'mapping'`` (anything accepted by
        ``pandas.Series.map``). When given, ``fit`` reuses it instead of
        learning a new one.
    cols : list, optional
        Names of the columns to learn a mapping for. Required when no
        ``mapping`` is supplied.
    threshold : float, default 0.01
        Minimum relative frequency a label needs in order to be kept as is.

    Attributes
    ----------
    mapping : list of dict
        After ``fit``, one entry per column with the keys ``'col'``,
        ``'mapping'`` (a ``pandas.Series`` indexed by original label) and
        ``'data_type'`` (dtype of the column at fit time).
    """

    def __init__(self, mapping=None, cols=None, threshold=0.01):
        self.cols = cols
        self.mapping = mapping
        self._dim = None  # number of columns seen by fit(); None until fitted
        self.threshold = threshold

    def fit(self, X, y=None, **kwargs):
        """Learn, for every column in ``cols``, which labels are rare.

        Parameters
        ----------
        X : pandas.DataFrame, shape = [n_samples, n_features]
            Training data. It is not modified.
        y : ignored
            Present only for API compatibility.

        Returns
        -------
        self : encoder
            Returns self.

        Notes
        -----
        If ``self.mapping`` is already set (passed to the constructor or
        left by an earlier ``fit``), it is kept unchanged and nothing is
        re-learned.
        """
        self._dim = X.shape[1]

        _, categories = self.grouping(
            X,
            mapping=self.mapping,
            cols=self.cols,
            threshold=self.threshold,
        )
        self.mapping = categories
        return self

    def transform(self, X):
        """Apply the learned mapping to new data.

        Parameters
        ----------
        X : pandas.DataFrame, shape = [n_samples, n_features]

        Returns
        -------
        X : pandas.DataFrame
            A deep copy of the input with the mapped columns replaced.

        Raises
        ------
        ValueError
            If the encoder has not been fitted, or if ``X`` does not have
            the same number of columns as the data passed to ``fit``.
        """
        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        X, _ = self.grouping(
            X,
            mapping=self.mapping,
            cols=self.cols,
            threshold=self.threshold,
        )
        return X

    @staticmethod
    def _apply_mapping(values, label_map):
        """Map one column, sending labels absent from ``label_map`` to 'rare'."""
        mapped = values.map(label_map)
        # Series.map yields NaN for labels the mapper does not know. Such a
        # label never reached the frequency threshold during fit, so it is
        # rare by definition. Values that were already missing stay missing.
        unseen = mapped.isna() & values.notna()
        if unseen.any():
            # Cast to object first so that 'rare' can be stored whatever
            # dtype map() produced (float, categorical, ...).
            mapped = mapped.astype(object).where(~unseen, 'rare')
        return mapped

    def grouping(self, X_in, threshold, mapping=None, cols=None):
        """Apply an existing rare-label mapping, or learn one from ``X_in``.

        Parameters
        ----------
        X_in : pandas.DataFrame
            Input data; it is deep-copied and never modified.
        threshold : float
            Minimum relative frequency for a label to be kept (only used
            when learning).
        mapping : list of dict, optional
            If given, it is applied to the copy and returned unchanged.
        cols : list, optional
            Columns to learn a mapping for when ``mapping`` is None.

        Returns
        -------
        X : pandas.DataFrame
            The copy of ``X_in``. Columns are replaced only when a mapping
            was supplied; when learning, the copy is returned untouched.
        mapping_out : list of dict
            The supplied mapping, or the newly learned one.

        Raises
        ------
        TypeError
            If both ``mapping`` and ``cols`` are None.
        """
        X = X_in.copy(deep=True)

        if mapping is not None:
            for entry in mapping:
                column = entry.get('col')
                X[column] = self._apply_mapping(X[column], entry['mapping'])
            return X, mapping

        if cols is None:
            raise TypeError(
                "'cols' must be an iterable of column names when no mapping is supplied"
            )

        mapping_out = []
        for col in cols:
            frequencies = X[col].value_counts() / len(X)
            # Computed once per column instead of once per label.
            frequent = frequencies.index[frequencies >= threshold]
            label_map = pd.Series(
                {label: (label if label in frequent else 'rare')
                 for label in frequencies.index}
            )
            mapping_out.append(
                {'col': col, 'mapping': label_map, 'data_type': X[col].dtype}
            )

        return X, mapping_out
|
|
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
class ModeImputation():
    """Replace infrequent labels of selected columns by the most frequent label.

    A label is "rare" when its relative frequency in the fitting data
    (``value_counts() / len(X)``) is strictly below ``threshold``. Rare
    labels are replaced by the column's mode (first value returned by
    ``Series.mode()``).

    Parameters
    ----------
    mapping : list of dict, optional
        Pre-computed mapping. Each entry is a dict with at least the keys
        ``'col'`` (column name) and ``'mapping'`` (anything accepted by
        ``pandas.Series.map``); an optional ``'mode'`` key gives the value
        used for labels the mapping does not know. When given, ``fit``
        reuses it instead of learning a new one.
    cols : list, optional
        Names of the columns to learn a mapping for. Required when no
        ``mapping`` is supplied.
    threshold : float, default 0.01
        Minimum relative frequency a label needs in order to be kept as is.

    Attributes
    ----------
    mapping : list of dict
        After ``fit``, one entry per column with the keys ``'col'``,
        ``'mapping'`` (a ``pandas.Series`` indexed by original label),
        ``'data_type'`` (dtype of the column at fit time) and, when the
        column had at least one non-null value, ``'mode'``.
    """

    def __init__(self, mapping=None, cols=None, threshold=0.01):
        self.cols = cols
        self.mapping = mapping
        self._dim = None  # number of columns seen by fit(); None until fitted
        self.threshold = threshold

    def fit(self, X, y=None, **kwargs):
        """Learn, for every column in ``cols``, which labels are rare and the mode.

        Parameters
        ----------
        X : pandas.DataFrame, shape = [n_samples, n_features]
            Training data. It is not modified.
        y : ignored
            Present only for API compatibility.

        Returns
        -------
        self : encoder
            Returns self.

        Notes
        -----
        If ``self.mapping`` is already set (passed to the constructor or
        left by an earlier ``fit``), it is kept unchanged and nothing is
        re-learned.
        """
        self._dim = X.shape[1]

        _, categories = self.impute_with_mode(
            X,
            mapping=self.mapping,
            cols=self.cols,
            threshold=self.threshold,
        )
        self.mapping = categories
        return self

    def transform(self, X):
        """Apply the learned mapping to new data.

        Parameters
        ----------
        X : pandas.DataFrame, shape = [n_samples, n_features]

        Returns
        -------
        X : pandas.DataFrame
            A deep copy of the input with the mapped columns replaced.

        Raises
        ------
        ValueError
            If the encoder has not been fitted, or if ``X`` does not have
            the same number of columns as the data passed to ``fit``.
        """
        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        X, _ = self.impute_with_mode(
            X,
            mapping=self.mapping,
            cols=self.cols,
            threshold=self.threshold,
        )
        return X

    @staticmethod
    def _apply_mapping(values, entry):
        """Map one column; labels absent from the mapping get ``entry['mode']``."""
        mapped = values.map(entry['mapping'])
        # Entries without a stored mode (e.g. hand-written mappings) keep
        # the plain Series.map behaviour: unknown labels become NaN.
        if 'mode' in entry:
            # A label unknown to the mapping never reached the threshold
            # during fit, so it is treated like any other rare label.
            # Values that were already missing stay missing.
            unseen = mapped.isna() & values.notna()
            if unseen.any():
                mapped = mapped.where(~unseen, entry['mode'])
        return mapped

    def impute_with_mode(self, X_in, threshold, mapping=None, cols=None):
        """Apply an existing rare-to-mode mapping, or learn one from ``X_in``.

        Parameters
        ----------
        X_in : pandas.DataFrame
            Input data; it is deep-copied and never modified.
        threshold : float
            Minimum relative frequency for a label to be kept (only used
            when learning).
        mapping : list of dict, optional
            If given, it is applied to the copy and returned unchanged.
        cols : list, optional
            Columns to learn a mapping for when ``mapping`` is None.

        Returns
        -------
        X : pandas.DataFrame
            The copy of ``X_in``. Columns are replaced only when a mapping
            was supplied; when learning, the copy is returned untouched.
        mapping_out : list of dict
            The supplied mapping, or the newly learned one.

        Raises
        ------
        TypeError
            If both ``mapping`` and ``cols`` are None.
        """
        X = X_in.copy(deep=True)

        if mapping is not None:
            for entry in mapping:
                column = entry.get('col')
                X[column] = self._apply_mapping(X[column], entry)
            return X, mapping

        if cols is None:
            raise TypeError(
                "'cols' must be an iterable of column names when no mapping is supplied"
            )

        mapping_out = []
        for col in cols:
            frequencies = X[col].value_counts() / len(X)
            # Computed once per column instead of once per label.
            frequent = frequencies.index[frequencies >= threshold]

            # mode() is empty when the column has no non-null value; in that
            # case there are no labels to map either, so no mode is needed.
            modes = X[col].mode()
            has_mode = len(modes) > 0
            most_frequent = modes.iloc[0] if has_mode else None

            label_map = pd.Series(
                {label: (label if label in frequent else most_frequent)
                 for label in frequencies.index}
            )
            entry = {'col': col, 'mapping': label_map, 'data_type': X[col].dtype}
            if has_mode:
                entry['mode'] = most_frequent
            mapping_out.append(entry)

        return X, mapping_out
|
|