import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score


class ChiMerge():
    """
    Supervised discretization using the ChiMerge method.

    Parameters
    ----------
    confidenceVal: number
        chi-square threshold below which adjacent intervals are merged;
        default=3.841 corresponds to p=0.05 at dof=1
    num_of_bins: int
        maximum number of bins after discretization
    col: str
        name of the column to discretize
    bins: list of float
        precomputed bin edges; normally left as None and learned by fit()
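
    Example
    -------
    A minimal usage sketch (assumes a DataFrame ``df`` that contains a
    numeric feature ``'age'`` and a binary target column ``'target'``;
    both names are illustrative). Note that ``fit`` takes the *name* of
    the target column, which must be present in X::

        cm = ChiMerge(col='age', num_of_bins=5)
        cm.fit(df, y='target')
        df_binned = cm.transform(df)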
| | """ |

    def __init__(self, col=None, bins=None, confidenceVal=3.841, num_of_bins=10):
        self.col = col
        self._dim = None
        self.confidenceVal = confidenceVal
        self.bins = bins
        self.num_of_bins = num_of_bins

    def fit(self, X, y, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------
        X : DataFrame, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : str
            Name of the binary target column in X. ChiMerge groups X by
            col and aggregates this column, so it must be present in X.

        Returns
        -------
        self : encoder
            Returns self.
        """

        self._dim = X.shape[1]

        _, bins = self.chimerge(
            X_in=X,
            y=y,
            confidenceVal=self.confidenceVal,
            col=self.col,
            num_of_bins=self.num_of_bins
        )
        self.bins = bins
        return self
    def transform(self, X):
        """Perform the transformation to new data.

        Will use the bin edges learned during fit to discretize the column.

        Parameters
        ----------
        X : DataFrame, shape = [n_samples, n_features]

        Returns
        -------
        X : new DataFrame with the discretized column appended.
        """

        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim))

        X, _ = self.chimerge(
            X_in=X,
            col=self.col,
            bins=self.bins
        )

        return X
|
    def chimerge(self, X_in, y=None, confidenceVal=None, num_of_bins=None, col=None, bins=None):
        """
        Discretize a variable using ChiMerge.

        If bins is given, cut the column with those edges; otherwise learn
        the edges by iteratively merging the pair of adjacent intervals
        with the smallest chi-square statistic.
        """

        X = X_in.copy(deep=True)

        if bins is not None:  # transform stage: apply the fitted bin edges
            try:
                X[col + '_chimerge'] = pd.cut(X[col], bins=bins, include_lowest=True)
            except Exception as e:
                print(e)

        else:  # fit stage: learn the bin edges from the data
            try:
                # count total and positive-class observations for each
                # distinct value of col (y is the name of the target column)
                total_num = X.groupby([col])[y].count()
                total_num = pd.DataFrame({'total_num': total_num})
                positive_class = X.groupby([col])[y].sum()
                positive_class = pd.DataFrame({'positive_class': positive_class})
                regroup = pd.merge(total_num, positive_class, left_index=True, right_index=True, how='inner')
                regroup.reset_index(inplace=True)
                regroup['negative_class'] = regroup['total_num'] - regroup['positive_class']
                regroup = regroup.drop('total_num', axis=1)
                # np_regroup columns: [value, positive_class, negative_class]
                np_regroup = np.array(regroup)

                # merge adjacent rows whose positive (or negative) counts are
                # both zero; the chi-square statistic is degenerate for them
                i = 0
                while (i <= np_regroup.shape[0] - 2):
                    if ((np_regroup[i, 1] == 0 and np_regroup[i + 1, 1] == 0) or (np_regroup[i, 2] == 0 and np_regroup[i + 1, 2] == 0)):
                        np_regroup[i, 1] = np_regroup[i, 1] + np_regroup[i + 1, 1]
                        np_regroup[i, 2] = np_regroup[i, 2] + np_regroup[i + 1, 2]
                        np_regroup[i, 0] = np_regroup[i + 1, 0]
                        np_regroup = np.delete(np_regroup, i + 1, 0)
                        i = i - 1
                    i = i + 1
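
                # For each pair of adjacent intervals, compute the chi-square
                # statistic of the 2x2 contingency table
                #     [[a, b], [c, d]] = [[pos_i, neg_i], [pos_i+1, neg_i+1]]
                # via the standard closed form
                #     chi2 = N * (a*d - b*c)**2 / ((a+b)*(c+d)*(a+c)*(b+d))
                # with N = a + b + c + d; the expression below spells this
                # out element by element.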
                chi_table = np.array([])
                for i in np.arange(np_regroup.shape[0] - 1):
                    chi = (np_regroup[i, 1] * np_regroup[i + 1, 2] - np_regroup[i, 2] * np_regroup[i + 1, 1]) ** 2 \
                        * (np_regroup[i, 1] + np_regroup[i, 2] + np_regroup[i + 1, 1] + np_regroup[i + 1, 2]) / \
                        ((np_regroup[i, 1] + np_regroup[i, 2]) * (np_regroup[i + 1, 1] + np_regroup[i + 1, 2])
                         * (np_regroup[i, 1] + np_regroup[i + 1, 1]) * (np_regroup[i, 2] + np_regroup[i + 1, 2]))
                    chi_table = np.append(chi_table, chi)

                # iteratively merge the pair with the smallest chi-square until
                # there are few enough intervals and every remaining adjacent
                # pair is significant at the confidenceVal threshold
                while (1):
                    if len(chi_table) == 0 or (len(chi_table) <= (num_of_bins - 1) and min(chi_table) >= confidenceVal):
                        break
                    chi_min_index = int(np.argmin(chi_table))  # first pair with the smallest chi-square
                    np_regroup[chi_min_index, 1] = np_regroup[chi_min_index, 1] + np_regroup[chi_min_index + 1, 1]
                    np_regroup[chi_min_index, 2] = np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 2]
                    np_regroup[chi_min_index, 0] = np_regroup[chi_min_index + 1, 0]
                    np_regroup = np.delete(np_regroup, chi_min_index + 1, 0)

                    # recompute the chi-square of the pairs touching the merged interval
                    if (chi_min_index == np_regroup.shape[0] - 1):  # merged interval is now the last one
                        if chi_min_index > 0:  # guard: no left neighbour when the first pair was merged
                            chi_table[chi_min_index - 1] = (np_regroup[chi_min_index - 1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index - 1, 2] * np_regroup[chi_min_index, 1]) ** 2 \
                                * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
                                ((np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 2]))
                        chi_table = np.delete(chi_table, chi_min_index, axis=0)

                    else:
                        if chi_min_index > 0:  # guard: no left neighbour when the first pair was merged
                            chi_table[chi_min_index - 1] = (np_regroup[chi_min_index - 1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index - 1, 2] * np_regroup[chi_min_index, 1]) ** 2 \
                                * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
                                ((np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 2]))
                        chi_table[chi_min_index] = (np_regroup[chi_min_index, 1] * np_regroup[chi_min_index + 1, 2] - np_regroup[chi_min_index, 2] * np_regroup[chi_min_index + 1, 1]) ** 2 \
                            * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 1] + np_regroup[chi_min_index + 1, 2]) / \
                            ((np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index + 1, 1] + np_regroup[chi_min_index + 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index + 1, 1]) * (np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 2]))
                        chi_table = np.delete(chi_table, chi_min_index + 1, axis=0)

                result_data = pd.DataFrame()
                result_data['variable'] = [col] * np_regroup.shape[0]
                bins = []
                tmp = []
                for i in np.arange(np_regroup.shape[0]):
                    if i == 0:
                        interval_label = '-inf' + ',' + str(np_regroup[i, 0])
                    elif i == np_regroup.shape[0] - 1:
                        interval_label = str(np_regroup[i - 1, 0]) + '+'
                    else:
                        interval_label = str(np_regroup[i - 1, 0]) + ',' + str(np_regroup[i, 0])
                    # np_regroup[i - 1, 0] wraps to the last row when i == 0, so
                    # after the loop bins holds every interval's upper edge
                    bins.append(np_regroup[i - 1, 0])
                    tmp.append(interval_label)

                # lower edge slightly below the minimum so the smallest value
                # falls inside the first bin when pd.cut is applied
                bins.append(X[col].min() - 0.1)

                result_data['interval'] = tmp
                result_data['flag_0'] = np_regroup[:, 2]
                result_data['flag_1'] = np_regroup[:, 1]
                bins.sort(reverse=False)
                print('Interval for variable %s' % col)
                print(result_data)

            except Exception as e:
                print(e)

        return X, bins


class DiscretizeByDecisionTree():
    """
    Discretization with decision trees consists of using a decision tree
    to identify the optimal splitting points that determine the bins or
    contiguous intervals:

    1. train a decision tree of limited depth (2, 3 or 4) using only the
       variable we want to discretize to predict the target.
    2. replace the original variable values with the probability returned
       by the tree.

    Parameters
    ----------
    col: str
        name of the column to discretize
    max_depth: int or list of int
        max depth of the tree. Can be an int, or a list of ints over
        which to search for the optimal depth by cross-validation.
    tree_model: fitted DecisionTreeClassifier
        normally left as None and learned by fit()
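
    Example
    -------
    A minimal usage sketch (assumes a DataFrame ``df`` that contains a
    numeric feature ``'age'`` and a binary target series ``y``; both names
    are illustrative). With a list of depths, fit() cross-validates each::

        disc = DiscretizeByDecisionTree(col='age', max_depth=[2, 3, 4])
        disc.fit(df, y)
        df_out = disc.transform(df)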
| | """ |

    def __init__(self, col=None, max_depth=None, tree_model=None):
        self.col = col
        self._dim = None
        self.max_depth = max_depth
        self.tree_model = tree_model

    def fit(self, X, y, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : encoder
            Returns self.
        """

        self._dim = X.shape[1]

        _, tree = self.discretize(
            X_in=X,
            y=y,
            max_depth=self.max_depth,
            col=self.col,
            tree_model=self.tree_model
        )
        self.tree_model = tree
        return self

    def transform(self, X):
        """Perform the transformation to new data.

        Will use the fitted tree model to discretize the column.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        X : new DataFrame with the discretized column appended.
        """
|
| | if self._dim is None: |
| | raise ValueError('Must train encoder before it can be used to transform data.') |
| |
|
| | |
| | if X.shape[1] != self._dim: |
| | raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,)) |
| |
|
| | X, _ = self.discretize( |
| | X_in=X, |
| | col=self.col, |
| | tree_model=self.tree_model |
| | ) |
| |
|
| | return X |
| |
|
| |
|
    def discretize(self, X_in, y=None, max_depth=None, tree_model=None, col=None):
        """
        Discretize a variable using a DecisionTreeClassifier.

        If tree_model is given, replace the column with the tree's predicted
        probability of the positive class; otherwise train the tree,
        searching over candidate depths when max_depth is a list.
        """

        X = X_in.copy(deep=True)

        if tree_model is not None:  # transform stage: apply the fitted tree
            X[col + '_tree_discret'] = tree_model.predict_proba(X[col].to_frame())[:, 1]

        else:  # fit stage: train the tree
            if isinstance(max_depth, int):
                tree_model = DecisionTreeClassifier(max_depth=max_depth)
                tree_model.fit(X[col].to_frame(), y)

            elif isinstance(max_depth, list):
                # search the candidate depths for the best cross-validated ROC-AUC
                score_ls = []
                score_std_ls = []
                for tree_depth in max_depth:
                    tree_model = DecisionTreeClassifier(max_depth=tree_depth)
                    scores = cross_val_score(tree_model, X[col].to_frame(), y, cv=3, scoring='roc_auc')
                    score_ls.append(np.mean(scores))
                    score_std_ls.append(np.std(scores))
                temp = pd.concat([pd.Series(max_depth), pd.Series(score_ls), pd.Series(score_std_ls)], axis=1)
                temp.columns = ['depth', 'roc_auc_mean', 'roc_auc_std']
                print('result ROC-AUC for each depth')
                print(temp)
                max_roc = temp.roc_auc_mean.max()
                # .values is an array (ties are possible), so take the first hit
                # and cast to int: DecisionTreeClassifier needs a scalar depth
                optimal_depth = int(temp[temp.roc_auc_mean == max_roc]['depth'].values[0])
                print('optimal_depth:', optimal_depth)
                tree_model = DecisionTreeClassifier(max_depth=optimal_depth)
                tree_model.fit(X[col].to_frame(), y)

            else:
                raise ValueError('max_depth of a tree must be an integer or a list')

        return X, tree_model
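

# ---------------------------------------------------------------------------
# Minimal usage sketch on synthetic data (an illustration, not part of the
# original module): builds a toy DataFrame with one numeric feature and a
# binary target, then runs both discretizers. The column names 'x' and
# 'target' are made up for the demo.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    df = pd.DataFrame({'x': rng.randint(0, 100, size=500)})
    df['target'] = ((df['x'] + rng.normal(0, 20, size=500)) > 50).astype(int)

    # ChiMerge takes y as the *name* of the target column inside X
    cm = ChiMerge(col='x', num_of_bins=5)
    cm.fit(df, y='target')
    print(cm.transform(df).head())

    # the tree-based discretizer takes y as the target values themselves
    dt = DiscretizeByDecisionTree(col='x', max_depth=[2, 3, 4])
    dt.fit(df[['x']], df['target'])
    print(dt.transform(df[['x']]).head())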