| | import pandas as pd |
| |
|
| | |
| |
|
class MeanEncoding():
    """Replace each category label with the mean of the target for that label.

    Parameters
    ----------
    mapping : list of dict, optional
        A previously learned encoding, one dict per encoded column. Each
        dict must provide ``'col'`` (the column name) and ``'mapping'``
        (a label -> mean lookup usable with ``Series.map``). When given,
        ``fit`` reuses it instead of learning a new one.
    cols : list, optional
        Names of the columns to encode. Required when ``mapping`` is not
        supplied.
    """

    def __init__(self, mapping=None, cols=None):
        self.cols = cols
        self.mapping = mapping
        # Number of columns seen by ``fit``; ``None`` means "not fitted yet".
        self._dim = None

    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        If a mapping is already set (passed to the constructor or learned by
        an earlier ``fit``), it is reused rather than recomputed.

        Parameters
        ----------
        X : pandas.DataFrame, shape = [n_samples, n_features]
            Training data. It may or may not contain the target column.
        y : pandas.Series or array-like, shape = [n_samples]
            Target values. Required unless a mapping was supplied.
        **kwargs
            Accepted for interface compatibility; not used.

        Returns
        -------
        self : encoder
            Returns self.

        Raises
        ------
        ValueError
            If no mapping is set and ``cols`` or ``y`` is missing.
        """
        _, categories = self.mean_encoding(
            X,
            y,
            mapping=self.mapping,
            cols=self.cols,
        )
        # Record the fitted state only once learning succeeded, so a failed
        # fit does not leave the encoder looking trained.
        self.mapping = categories
        self._dim = X.shape[1]
        return self

    def transform(self, X):
        """Perform the transformation to new categorical data.

        Uses the mapping stored by ``fit``. Labels that have no entry in the
        mapping come out as NaN (pandas ``Series.map`` behaviour).

        Parameters
        ----------
        X : pandas.DataFrame, shape = [n_samples, n_features]
            Must have the same number of columns as the data given to
            ``fit``.

        Returns
        -------
        X : pandas.DataFrame
            A copy of the input with the encoded columns replaced by the
            per-label target means.

        Raises
        ------
        ValueError
            If the encoder has not been fitted, or the number of columns
            differs from the one seen during ``fit``.
        """
        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        encoded, _ = self.mean_encoding(
            X,
            mapping=self.mapping,
            cols=self.cols,
        )
        return encoded

    def mean_encoding(self, X_in, y=None, mapping=None, cols=None):
        """Apply an existing mean encoding, or learn one from ``X_in`` and ``y``.

        Parameters
        ----------
        X_in : pandas.DataFrame
            Input data; it is deep-copied and never modified in place.
        y : pandas.Series or array-like, optional
            Target values; only used when ``mapping`` is None.
        mapping : list of dict, optional
            Existing encoding. When given, it is applied to the copy of
            ``X_in`` and returned unchanged.
        cols : list, optional
            Columns to learn an encoding for; only used when ``mapping`` is
            None.

        Returns
        -------
        X : pandas.DataFrame
            The encoded copy when ``mapping`` was given; otherwise an
            unmodified copy of ``X_in`` (learning does not transform).
        mapping_out : list of dict
            One ``{'col', 'mapping', 'data_type'}`` dict per learned column,
            or the ``mapping`` argument itself when it was supplied.

        Raises
        ------
        ValueError
            If ``mapping`` is None and ``cols`` or ``y`` is None.
        """
        X = X_in.copy(deep=True)

        if mapping is not None:
            # Apply mode: replace every label by its stored target mean.
            for entry in mapping:
                column = entry['col']
                X[column] = X[column].map(entry['mapping'])
            return X, mapping

        # Learn mode.
        if cols is None:
            raise ValueError('cols must be provided to learn a mean encoding.')
        if y is None:
            raise ValueError('y must be provided to learn a mean encoding.')

        target = self._target_series(X, y)
        mapping_out = []
        for col in cols:
            # Store the lookup as a Series built from a plain {label: mean} dict.
            label_means = target.groupby(X[col]).mean().to_dict()
            mapping_out.append({
                'col': col,
                'mapping': pd.Series(label_means),
                'data_type': X[col].dtype,
            })

        return X, mapping_out

    @staticmethod
    def _target_series(X, y):
        """Return the target as a Series that can be grouped by columns of X."""
        if isinstance(y, pd.Series):
            if y.name in X.columns:
                # Target column travels inside X: use that column.
                return X[y.name]
            # Target kept separate from X: pandas aligns it with X by index.
            return y
        # Plain array-like target: pair it positionally with the rows of X.
        return pd.Series(y, index=X.index)