"""m1_exam_oral (4).ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/10X3y3B7p8yobOiRjxxNqGfV7bIykP31L
"""

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("./stack-overflow-developer-survey-2024/survey_results_public.csv")
def basic_info(column):
    """Print a quick overview of a column: head, tail, null count, and value counts."""
    print(df[column].head())
    print('-' * 40)
    print(df[column].tail())
    print('-' * 40)
    print(df[column].isnull().sum())
    print('-' * 40)
    print(df[column].value_counts())
    print('-' * 40)
    print('*' * 40)

value_maps = {}

def map_and_fill(column, fill_value=0):
    """Encode a categorical column as integer codes (0..n-1), fill NaNs with fill_value,
    and remember the mapping in value_maps.
    Note: codes start at 0, so a fill_value of 0 is indistinguishable from the first category."""
    value_map = {v: i for i, v in enumerate(df[column].dropna().unique())}
    df[column] = df[column].map(value_map)
    df[column] = df[column].fillna(fill_value)
    value_maps[column] = value_map
    return value_map


def reverse_map_and_fill(column):
    """Restore the original string labels for a column encoded with map_and_fill."""
    if column in value_maps:
        _reverse_map = {v: k for k, v in value_maps[column].items()}
        df[column] = df[column].map(_reverse_map)
    else:
        print(f"No mapping found for column: {column}")
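
# A minimal round-trip sketch of the two helpers above on a throwaway column.
# The 'Demo' column is hypothetical and is dropped again right away; it is not part of the survey.
df['Demo'] = ['a', 'b', 'a'] + [None] * (len(df) - 3)
print(map_and_fill('Demo', fill_value=-1))   # e.g. {'a': 0, 'b': 1}
reverse_map_and_fill('Demo')                 # codes map back to 'a'/'b'; the -1 fills become NaN again
df.drop(columns=['Demo'], inplace=True)
del value_maps['Demo']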
|
|
def bin_distribution(column, bins=10, rotate=0):
    """Plot a histogram of a column, optionally rotating the x-axis labels."""
    plt.figure(figsize=(12, 6))
    sns.histplot(df[column], bins=bins, kde=False)
    plt.title(f'{column} Distribution')
    plt.xlabel(column)
    plt.ylabel('Count')
    # Rotate x-axis labels if requested
    if rotate:
        plt.xticks(rotation=rotate)
    plt.show()

def fast_explode(target_dataframe, target_column, fillna='', split=';', prefix='worked with', tmp_column_name='tmp_c'):
    """One-hot encode a multi-select column whose answers are joined by `split`.
    Returns a copy of the DataFrame with the original column replaced by boolean indicator columns."""
    # Step 1: Split the answers into lists and prefix each entry
    _exploded = (
        target_dataframe.assign(
            **{tmp_column_name: target_dataframe[target_column]
               .fillna(fillna)
               .str.split(split)
               .apply(lambda lst: [f"{prefix} {item.strip()}" for item in lst if item])
               }
        )
        .explode(tmp_column_name)
    )

    # Step 2: One-hot encode the exploded entries (one row per respondent after the crosstab)
    _one_hot = pd.crosstab(index=_exploded.index, columns=_exploded[tmp_column_name]).astype(bool)

    # Step 3: Combine with the original DataFrame; respondents with no answer get all False
    print(target_column, _one_hot.shape[0], target_dataframe.shape[0])
    result = pd.concat([target_dataframe.drop(columns=[target_column]), _one_hot], axis=1)
    result[_one_hot.columns] = result[_one_hot.columns].fillna(False)

    return result
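
# Toy illustration of fast_explode on a tiny, hypothetical DataFrame (not the survey data):
# each ';'-separated entry becomes its own boolean column, and the row with a missing
# answer ends up False everywhere.
_demo = pd.DataFrame({'Langs': ['Python;SQL', 'SQL', None]})
print(fast_explode(_demo, target_column='Langs', prefix='worked with'))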
|
|
|
|
|
|
print(df.head())
print(df.describe())
print(df.info())

for c in df.columns:
    print(f"{c}")

"""There are null values throughout the data set; only a few columns are completely filled.

Next we look into the data, clean it up, and get it ready for analysis.
"""
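
# To quantify the missingness mentioned above: share of missing values per column, highest first.
missing_share = df.isnull().mean().sort_values(ascending=False)
print(missing_share.head(20))
print(f"Columns with no missing values: {(missing_share == 0).sum()}")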
|
|
basic_info("ResponseId")

df.set_index('ResponseId', inplace=True)  # ResponseId is unique, so use it as the index

basic_info("MainBranch")

print(map_and_fill("MainBranch"))

print(df['MainBranch'].value_counts())

basic_info("Age")

print(map_and_fill("Age"))

"""Age is categorical data (age brackets rather than exact ages), so we keep the encoded values as they are."""

basic_info("RemoteWork")

print(map_and_fill("RemoteWork"))

print(df['RemoteWork'].value_counts())

"""RemoteWork is categorical data, so we keep it encoded as it is; map_and_fill has already replaced the null values with 0 (no remote-work response)."""

basic_info("EdLevel")

print(map_and_fill("EdLevel"))

basic_info("Check")

df.drop(columns=['Check'], inplace=True)  # attention-check question, not useful for analysis

basic_info("CodingActivities")
df = fast_explode(df, target_column='CodingActivities', fillna='', split=';', prefix='coding_activities')

basic_info("Employment")

print(map_and_fill("Employment"))

bin_distribution("Employment", bins=len(df['Employment'].unique()) + 1)

basic_info('LearnCode')
basic_info('LearnCodeOnline')
basic_info('TechDoc')
for col in ['LearnCode', 'LearnCodeOnline', 'TechDoc']:
    df = fast_explode(df, target_column=col, fillna='', split=';', prefix=col)

basic_info('YearsCode')
# Two non-numeric values remain: 'Less than 1 year' and 'More than 50 years';
# convert them to 0 and 51 respectively
df['YearsCode'] = df['YearsCode'].replace({'Less than 1 year': 0, 'More than 50 years': 51})
basic_info("YearsCode")
# Handle the null values with a -1 sentinel, then cast to int
df['YearsCode'] = df['YearsCode'].fillna(-1)
df['YearsCode'] = df['YearsCode'].astype(int)
bin_distribution('YearsCode', bins=52)

basic_info('YearsCodePro')
df['YearsCodePro'] = df['YearsCodePro'].replace({'Less than 1 year': 0, 'More than 50 years': 51})
basic_info("YearsCodePro")
# Handle the null values with a -1 sentinel, then cast to int
df['YearsCodePro'] = df['YearsCodePro'].fillna(-1)
df['YearsCodePro'] = df['YearsCodePro'].astype(int)
bin_distribution('YearsCodePro', bins=52)

basic_info("DevType")
bin_distribution('DevType', bins=df['DevType'].nunique() + 1, rotate=90)
print(map_and_fill("DevType", fill_value=-1))  # -1 marks missing responses

basic_info("OrgSize")
bin_distribution('OrgSize', bins=df['OrgSize'].nunique() + 1, rotate=90)
# Categorical data with many nulls; fill nulls with -1, meaning no response / no organization
print(map_and_fill("OrgSize", fill_value=-1))

basic_info("PurchaseInfluence")

print(map_and_fill("PurchaseInfluence", fill_value=-1))

basic_info("BuyNewTool")  # another multi-select column that needs to be expanded
df = fast_explode(df, target_column='BuyNewTool', fillna='', split=';', prefix='buy_new_tool')

basic_info('BuildvsBuy')

print(map_and_fill("BuildvsBuy", fill_value=-1))

basic_info("LanguageHaveWorkedWith")
drop_columns = []
explode_columns = []
for col in df.columns:
    if 'HaveWorkedWith' in col:
        explode_columns.append(col)
    elif 'Admired' in col or 'WantTo' in col or 'SO' in col or 'AI' in col:
        drop_columns.append(col)

print(drop_columns)
print(explode_columns)

df.drop(columns=drop_columns, inplace=True)

for col in explode_columns:
    df = fast_explode(df, target_column=col, fillna='', split=';', prefix=col)
print(df.isnull().sum())

# fast_explode splits each string in LanguageHaveWorkedWith (and the other *HaveWorkedWith columns)
# into a list of entries and turns each entry into its own boolean indicator column, e.g.
# df = fast_explode(df, target_column='LanguageHaveWorkedWith', fillna='', split=';', prefix='worked_with')

basic_info('ICorPM')  # Individual Contributor or People Manager
# Categorical data with many nulls; fill nulls with -1
print(map_and_fill("ICorPM", fill_value=-1))

basic_info('WorkExp')
# Numerical data with many nulls; fill nulls with -1
df['WorkExp'] = df['WorkExp'].fillna(-1)

basic_info('Knowledge_1')
# Knowledge_1 ~ Knowledge_9 belong to one long survey question with several sub-statements
for i in range(1, 10):
    basic_info(f'Knowledge_{i}')
    print(map_and_fill(f'Knowledge_{i}', fill_value=-1))

# Frequency_1 ~ Frequency_3 follow the same pattern
for i in range(1, 4):
    basic_info(f'Frequency_{i}')
    print(map_and_fill(f'Frequency_{i}', fill_value=-1))

basic_info('TimeSearching')
print(map_and_fill("TimeSearching", fill_value=-1))

basic_info('TimeAnswering')
print(map_and_fill("TimeAnswering", fill_value=-1))

basic_info('Frustration')
# Rename to Challenge_Frustration to keep the question context in the column name
df.rename(columns={'Frustration': 'Challenge_Frustration'}, inplace=True)
print(map_and_fill("Challenge_Frustration", fill_value=-1))

basic_info('ProfessionalTech')

df.rename(columns={'ProfessionalTech': 'Company_ProfessionalTech'}, inplace=True)
print(map_and_fill("Company_ProfessionalTech", fill_value=-1))

basic_info('ProfessionalCloud')
print(map_and_fill("ProfessionalCloud", fill_value=-1))

basic_info('ProfessionalQuestion')

df.rename(columns={'ProfessionalQuestion': 'FirstAnswerer_ProfessionalQuestion'}, inplace=True)
print(map_and_fill("FirstAnswerer_ProfessionalQuestion", fill_value=-1))

basic_info('Industry')
print(map_and_fill("Industry", fill_value=-1))

basic_info("TechEndorse")
basic_info("Country")    # keep this for later analysis
basic_info("Currency")   # the salary is reported in this currency
basic_info("CompTotal")  # total yearly compensation, in the reported currency

df['CompTotal'] = df['CompTotal'].fillna(0)

print(df['Currency'].unique())

df['Currency'] = df['Currency'].str[:3]  # keep only the three-letter currency code
print(df['Currency'].value_counts())

# Average exchange rates for 2024; most entries are local-currency units per 1 USD
# (EUR and GBP look like USD per unit, so double-check before converting)
usd_avg_rates_2024 = {
    "AUD": 1.5158,     # Australian dollar
    "BRL": 5.3872,     # Brazilian real
    "CAD": 1.3699,     # Canadian dollar
    "CNY": 7.1957,     # Chinese yuan renminbi
    "DKK": 6.8955,     # Danish krone
    "EUR": 1.0820,     # Euro
    "HKD": 7.8030,     # Hong Kong dollar
    "INR": 83.6566,    # Indian rupee
    "JPY": 151.4551,   # Japanese yen
    "MYR": 4.5747,     # Malaysian ringgit
    "MXN": 18.3062,    # Mexican peso
    "NZD": 1.6529,     # New Zealand dollar
    "NOK": 10.7574,    # Norwegian krone
    "SGD": 1.3363,     # Singapore dollar
    "ZAR": 18.3346,    # South African rand
    "KRW": 1363.4381,  # South Korean won
    "LKR": 301.6752,   # Sri Lankan rupee
    "SEK": 10.5744,    # Swedish krona
    "CHF": 0.8808,     # Swiss franc
    "TWD": 32.1064,    # New Taiwan dollar
    "THB": 35.2845,    # Thai baht
    "GBP": 1.2781,     # Pound sterling
    "VES": 38.3314     # Venezuelan bolívar
}
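
# The rate table above is not applied anywhere below; as a sketch, compensation could be
# converted to USD like this, assuming every rate is quoted as local-currency units per 1 USD
# (double-check individual entries such as EUR and GBP before relying on the numbers).
_rates = df['Currency'].map(usd_avg_rates_2024)   # NaN for currencies missing from the table
_comp_usd = df['CompTotal'] / _rates
print(_comp_usd.describe())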
print(df[['CompTotal']].describe())
print(df['CompTotal'].max())

# Drop the extreme tails of the compensation distribution (keep the 1st-99th percentile range)
df = df.loc[(df['CompTotal'] < df['CompTotal'].quantile(0.99)) & (df['CompTotal'] > df['CompTotal'].quantile(0.01))]

salary_counts = df['CompTotal'].value_counts()
import plotly.express as px

df_salary = salary_counts.reset_index()
df_salary.columns = ['Salary', 'Count']
fig = px.histogram(
    df_salary,
    x='Salary',
    y='Count',
    nbins=100,  # adjust bins for granularity
    title='Salary Distribution Histogram',
    labels={'Salary': 'Salary Amount', 'Count': 'Number of People'},
    log_y=False
)

fig.show()

fig = px.histogram(
    df_salary,
    x='Salary',
    y='Count',
    nbins=100,  # adjust bins for granularity
    title='Salary Distribution Histogram (log scale)',
    labels={'Salary': 'Salary Amount', 'Count': 'Number of People'},
    log_y=True  # log scale makes the long tail visible
)

fig.show()

# Keep only plausible yearly compensation values
df = df.loc[(df['CompTotal'] >= 1000) & (df['CompTotal'] <= 500000)]
print(df[['CompTotal']].describe())

basic_info('JobSatPoints_1')

# Fill the JobSatPoints_* columns (skipping _2 and _3, which are not used here) with a -1 sentinel
for i in range(1, 12):
    if i in [2, 3]:
        continue
    df[f'JobSatPoints_{i}'] = df[f'JobSatPoints_{i}'].fillna(-1)

jobsat_cols = [f'JobSatPoints_{i}' for i in range(1, 12) if i not in [2, 3]]

total_score = df[jobsat_cols].sum(axis=1)
print(total_score.value_counts())

print((total_score > 100).sum())

# Treat the points as shares of 100: rescale rows that sum to more than 100 so that they sum
# to exactly 100, then round back to integers
df.loc[total_score > 100, jobsat_cols] = (
    df.loc[total_score > 100, jobsat_cols]
    .div(total_score[total_score > 100], axis=0)
    .multiply(100)
)
df[jobsat_cols] = df[jobsat_cols].round().astype(int)

total_score = df[jobsat_cols].sum(axis=1)
print(total_score.value_counts())

print((total_score > 100).sum())
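
# A small worked example of the rescaling above (hypothetical points, not survey rows):
# a row with points 60, 60, 30 sums to 150, so each value is multiplied by 100/150.
_row = pd.Series({'A': 60, 'B': 60, 'C': 30})
_rescaled = (_row / _row.sum() * 100).round().astype(int)
print(_rescaled.to_dict(), _rescaled.sum())   # {'A': 40, 'B': 40, 'C': 20} 100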
|
|
null_counts = df.isnull().sum()
print(null_counts[null_counts > 0])

drop_columns = ['TechEndorse', 'SurveyLength', 'SurveyEase', 'ConvertedCompYearly']
explode_columns = ['OpSysPersonal use', 'OpSysProfessional use']
for col in explode_columns:
    df = fast_explode(df, target_column=col, fillna='', split=';', prefix=col)
df.drop(columns=drop_columns, inplace=True)

df['TBranch'] = df['TBranch'].map({'Yes': 1, 'No': 0})
df['TBranch'] = df['TBranch'].fillna(-1)

print(map_and_fill('Country'))
print(map_and_fill('Currency'))

null_counts = df.isnull().sum()
print(null_counts[null_counts > 0])
df

df.replace({True: 1, False: 0}, inplace=True)  # convert the boolean indicator columns to 0/1 integers
# df
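
# Equivalent, more targeted alternative to the global replace above: cast only the boolean
# indicator columns to integers and leave every other dtype untouched (a no-op once the
# replace above has already run).
bool_cols = df.select_dtypes(include='bool').columns
df[bool_cols] = df[bool_cols].astype(int)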
# Start model prediction using XGBoost or LightGBM
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# --- Candidate models ---
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Note: numeric features are standardised with StandardScaler in the pipeline below.
# ========== 1. Data preparation ==========
# Filter and clean.
# The plan was to use country and currency as a grouping and separate the data into train_df;
# this first attempt (kept for reference) is commented out.

# train_df = df.loc[df["CompTotal"] >= 1000].copy()
# train_df.drop(columns=["JobSat"], errors="ignore", inplace=True)
#
# # Split features and target
# X = train_df.drop(columns=["CompTotal"])
# y = train_df["CompTotal"]
#
# # Split into training / testing
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )
#
# # ========== 2. Preprocessing pipeline ==========
# # Identify numeric and categorical columns
# num_cols = X_train.select_dtypes(include=["number"]).columns
# cat_cols = X_train.select_dtypes(exclude=["number"]).columns
#
# # Numeric pipeline
# numeric_pipeline = Pipeline(steps=[
#     ("scaler", StandardScaler())
# ])
#
# # Combine into column transformer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ("num", numeric_pipeline, num_cols),
#     ]
# )
#
# # ========== 3. Define models ==========
# models = {
#     "GradientBoosting": GradientBoostingRegressor(
#         n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42
#     ),
#     "RandomForest": RandomForestRegressor(
#         n_estimators=400, random_state=42, n_jobs=-1
#     ),
#     "HistGradientBoosting": HistGradientBoostingRegressor(
#         learning_rate=0.05, random_state=42
#     )
# }
#
# models["XGBoost"] = XGBRegressor(
#     n_estimators=600, learning_rate=0.05, max_depth=6,
#     subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42
# )
#
# models["LightGBM"] = LGBMRegressor(
#     n_estimators=600, learning_rate=0.05, num_leaves=31,
#     subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42
# )
#
# # ========== 4. Evaluate each model with cross-validation ==========
# cv = KFold(n_splits=5, shuffle=True, random_state=42)
# results = []
#
# for name, model in models.items():
#     pipe = Pipeline(steps=[("preprocessor", preprocessor),
#                            ("model", model)])
#     scores = cross_val_score(
#         pipe, X_train, y_train,
#         scoring="neg_root_mean_squared_error",
#         cv=cv, n_jobs=-1, error_score='raise'
#     )
#     rmse = -scores.mean()
#     print(f"{name:>20s} | CV RMSE: {rmse:.2f}")
#     results.append((name, rmse, scores.std()))
#
# # Sort and select best
# results_df = pd.DataFrame(results, columns=["Model", "CV_RMSE", "Std"]).sort_values("CV_RMSE")
# best_model_name = results_df.iloc[0]["Model"]
# print("\nBest model:", best_model_name)
#
# # ========== 5. Fit best model pipeline and evaluate on test set ==========
# best_model = models[best_model_name]
# best_pipeline = Pipeline(steps=[
#     ("preprocessor", preprocessor),
#     ("model", best_model)
# ])
#
# best_pipeline.fit(X_train, y_train)
# y_pred = best_pipeline.predict(X_test)
#
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# mae = mean_absolute_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)
#
# print(f"\n--- Test Performance ({best_model_name}) ---")
# print(f"RMSE: {rmse:.2f}")
# print(f"MAE : {mae:.2f}")
# print(f"R² : {r2:.4f}")
#
# # Optional: show model comparison table
# print("\nModel comparison (lower RMSE better):")
# print(results_df.to_string(index=False))

# LightGBM seems to be the best model here; check feature importance and plot the top 20 features

# feature_names_num = num_cols.tolist()
#
# feature_names = feature_names_num
#
# importances = best_model.feature_importances_
# feature_importance_df = pd.DataFrame({
#     'Feature': feature_names,
#     'Importance': importances
# }).sort_values(by='Importance', ascending=False)
#
# print("\nTop 20 Feature Importances:")
# print(feature_importance_df.head(20))
#
# # Plot top 20 features
# plt.figure(figsize=(10, 6))
# sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20))
# plt.title('Top 20 Feature Importances')
# plt.tight_layout()
# plt.show()

"""The first round of modelling performs poorly, so the data needs a deeper analysis.
Country and Currency show up as important features, so we group the data by country and currency and see what we can find.
"""

for train_df in df.groupby(['Country', 'Currency']):
    print(train_df[0], train_df[1].shape)

"""Some (country, currency) groups have very few rows, so we filter them out and keep only the groups with more than 1000 data points."""

# Find the (country, currency) groups with more than 1000 data points
for train_df in df.groupby(['Country', 'Currency']):
    if train_df[1].shape[0] > 1000:
        print(train_df[0], train_df[1].shape)
        for x in value_maps['Country'].keys():
            if value_maps['Country'][x] == train_df[0][0]:
                print(x)

        for x in value_maps['Currency'].keys():
            if value_maps['Currency'][x] == train_df[0][1]:
                print(x)
# These turn out to be USA/USD, France/EUR, UK/GBP, Canada/CAD and Germany/EUR
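
# The same lookup can be done more directly with groupby(...).size() plus the inverse of the
# value maps built earlier (a sketch; the output should match the loop above).
_country_rev = {v: k for k, v in value_maps['Country'].items()}
_currency_rev = {v: k for k, v in value_maps['Currency'].items()}
_group_sizes = df.groupby(['Country', 'Currency']).size()
for (c_code, cur_code), n in _group_sizes[_group_sizes > 1000].items():
    print(_country_rev.get(c_code), _currency_rev.get(cur_code), n)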
|
|
df_cleaned = df.loc[df["CompTotal"] >= 1000].copy()
df_cleaned.drop(columns=["JobSat"], errors="ignore", inplace=True)
df_cleaned = df_cleaned[
    (df_cleaned['Country'] == value_maps['Country']['United States of America'])
    & (df_cleaned['Currency'] == value_maps['Currency']['USD'])
]

from sklearn.decomposition import PCA

results = []
for group in df_cleaned.groupby(['Country', 'Currency']):
    train_df = group[1]
    if train_df.shape[0] < 1000:
        continue

    X = train_df.select_dtypes(include=['number'])

    # --- Standardize the features ---
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # --- Apply PCA for 1..99 components and record the cumulative explained variance ---
    for n in range(1, 100):
        pca = PCA(n_components=n)
        pca.fit(X_scaled)
        results.append({'Components': n, 'ExplainedVariance': pca.explained_variance_ratio_.sum()})

pca_summary = pd.DataFrame(results)
print(pca_summary)
plt.plot(pca_summary['Components'], pca_summary['ExplainedVariance'], marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Total Explained Variance')
plt.title('Explained Variance by Number of PCA Components')
plt.grid(True)
plt.show()
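
# Instead of refitting PCA once per component count, a single fit gives the whole curve, and a
# variance threshold can pick the component count directly (a sketch reusing X_scaled from the
# last group above).
_pca_full = PCA().fit(X_scaled)
_cumvar = np.cumsum(_pca_full.explained_variance_ratio_)
print("Components needed for 95% variance:", int(np.argmax(_cumvar >= 0.95)) + 1)
# Equivalently, PCA(n_components=0.95) keeps just enough components for 95% variance.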
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import clone
from sklearn.inspection import permutation_importance
import numpy as np

# --- Baseline models ---
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
# Kaggle-favourite gradient-boosting models
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# ========== 1. Data preparation ==========

df_cleaned = df.loc[df["CompTotal"] >= 1000].copy()
df_cleaned = df_cleaned.loc[df_cleaned["CompTotal"] <= 300000].copy()

df_cleaned.drop(columns=["JobSat"], errors="ignore", inplace=True)

for group in df_cleaned.groupby(['Country', 'Currency']):
    train_df = group[1]
    if train_df.shape[0] < 1000:
        continue

    # Split features and target
    X = train_df.drop(columns=["CompTotal"])
    y = train_df["CompTotal"]

    # Split into training / testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ========== 2. Preprocessing pipeline ==========
    # Identify numeric and categorical columns
    num_cols = X_train.select_dtypes(include=["number"]).columns
    cat_cols = X_train.select_dtypes(exclude=["number"]).columns

    # Numeric pipeline
    numeric_pipeline = Pipeline(steps=[
        ("scaler", StandardScaler())
    ])

    # Combine into a column transformer (only numeric features are passed through)
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, num_cols),
        ]
    )

    # ========== 3. Define models ==========
    models = {
        "GradientBoosting": GradientBoostingRegressor(
            n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42
        ),
        "RandomForest": RandomForestRegressor(
            n_estimators=400, random_state=42, n_jobs=-1
        ),
        "HistGradientBoosting": HistGradientBoostingRegressor(
            learning_rate=0.05, random_state=42
        )
    }

    models["XGBoost"] = XGBRegressor(
        n_estimators=600, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42
    )

    models["LightGBM"] = LGBMRegressor(
        n_estimators=600, learning_rate=0.05, num_leaves=31,
        subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42
    )

    # ========== 4. Evaluate each model with cross-validation ==========
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    results = []

    for name, model in models.items():
        pipe = Pipeline(steps=[("preprocessor", preprocessor),
                               ("model", model)])
        scores = cross_val_score(
            pipe, X_train, y_train,
            scoring="neg_root_mean_squared_error",
            cv=cv, n_jobs=-1, error_score='raise'
        )
        rmse = -scores.mean()
        print(f"{name:>20s} | CV RMSE: {rmse:.2f}")
        results.append((name, rmse, scores.std()))

    # Sort and select the best model
    results_df = pd.DataFrame(results, columns=["Model", "CV_RMSE", "Std"]).sort_values("CV_RMSE")
    best_model_name = results_df.iloc[0]["Model"]
    print("\nBest model:", best_model_name)

    # ========== 5. Fit best model pipeline and evaluate on test set ==========
    best_model = models[best_model_name]
    best_pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", best_model)
    ])

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\n--- Test Performance ({best_model_name}) ---")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE : {mae:.2f}")
    print(f"R² : {r2:.4f}")

    # Optional: show the model comparison table
    print("\nModel comparison (lower RMSE better):")
    print(results_df.to_string(index=False))
    # --------- Nice labels for Country/Currency ---------
    _country = ''
    for x in value_maps['Country'].keys():
        if value_maps['Country'][x] == group[0][0]:
            _country = x

    _currency = ''
    for x in value_maps['Currency'].keys():
        if value_maps['Currency'][x] == group[0][1]:
            _currency = x
    print(_currency)
    print('*' * 40)
    print('*' * 40)

    # ===================== TOP-50 FEATURE REFIT =====================
    # 1) Get feature importances (fall back to permutation importance if the model has none)
    feature_names = num_cols.tolist()  # only numeric features are used in the preprocessor
    if hasattr(best_model, "feature_importances_"):
        importances = best_model.feature_importances_
    else:
        # Fallback: permutation importance on the training set
        # (uses the full pipeline so it handles scaling correctly)
        perm = permutation_importance(
            best_pipeline, X_train, y_train,
            n_repeats=5,
            scoring="neg_root_mean_squared_error",
            random_state=42,
            n_jobs=-1
        )
        # Permutation importance is computed per input column; with a numeric-only X_train
        # this aligns with feature_names
        importances = perm.importances_mean

    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    print("\nTop 20 Feature Importances (all features):")
    print(feature_importance_df.head(20))

    # 2) Pick the top-K features
    TOP_K = min(50, len(feature_names))
    top_features = feature_importance_df.head(TOP_K)['Feature'].tolist()

    # 3) Build a new preprocessor that only uses the top-K features
    preprocessor_top = ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, top_features),
        ]
    )

    # 4) Clone the best model (clean, unfitted) and retrain on the top-K features
    best_model_top = clone(best_model)
    best_pipeline_top = Pipeline(steps=[
        ("preprocessor", preprocessor_top),
        ("model", best_model_top)
    ])
    best_pipeline_top.fit(X_train, y_train)
    y_pred_top = best_pipeline_top.predict(X_test)

    rmse_top = np.sqrt(mean_squared_error(y_test, y_pred_top))
    mae_top = mean_absolute_error(y_test, y_pred_top)
    r2_top = r2_score(y_test, y_pred_top)

    # 5) Report before/after
    print(f"\n=== Top-{TOP_K} Feature Refit ({best_model_name}) in {_country} [{_currency}] ===")
    print(f"BEFORE -> RMSE: {rmse:.2f} | MAE: {mae:.2f} | R²: {r2:.4f} (all {len(feature_names)} numeric features)")
    print(f"AFTER  -> RMSE: {rmse_top:.2f} | MAE: {mae_top:.2f} | R²: {r2_top:.4f} (top {TOP_K} features)")
    delta = rmse_top - rmse
    print(f"ΔRMSE (top-{TOP_K} - all): {delta:+.2f}")

    # Plot the top 20 features (from the original ranking)
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20))
    plt.title(f'Top 20 Feature Importances in {_country}')
    plt.tight_layout()
    plt.show()
# Follow-up: retrain using only the top 30 features (see the sketch below).
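
# A sketch of that follow-up, reusing objects left over from the last group iteration above
# (feature_importance_df, numeric_pipeline, best_model, X_train/X_test): narrow the refit to
# the 30 highest-ranked features and compare against the earlier RMSE.
top30 = feature_importance_df.head(30)['Feature'].tolist()
pipeline_top30 = Pipeline(steps=[
    ("preprocessor", ColumnTransformer(transformers=[("num", numeric_pipeline, top30)])),
    ("model", clone(best_model)),
])
pipeline_top30.fit(X_train, y_train)
pred_top30 = pipeline_top30.predict(X_test)
rmse_top30 = np.sqrt(mean_squared_error(y_test, pred_top30))
print(f"Top-30 refit RMSE: {rmse_top30:.2f} (vs. {rmse:.2f} with all numeric features)")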