# -*- coding: utf-8 -*-
"""m1_exam_oral (4).ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/10X3y3B7p8yobOiRjxxNqGfV7bIykP31L
"""
# load pandas (plus plotting libraries) and read the survey csv from the local extract
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# I don't know why it takes so long to load the data
df = pd.read_csv("./stack-overflow-developer-survey-2024/survey_results_public.csv", )
def basic_info(column):
    """Print head, tail, null count and value counts for one column."""
    print(df[column].head())
    print('-'*40)
    print(df[column].tail())
    print('-'*40)
    print(df[column].isnull().sum())
    print('-'*40)
    print(df[column].value_counts())
    print('-'*40)
    print('*'*40)
# create a value mapping for all categorical data
value_maps = {}
def map_and_fill(column, fill_value=0):
    """Encode a categorical column as integers (in order of first appearance),
    fill NaN with fill_value, and remember the mapping in value_maps.
    Note: the default fill_value 0 is the same code as the first category."""
    value_map = {v: i for i, v in enumerate(df[column].dropna().unique())}
    df[column] = df[column].map(value_map)
    df[column] = df[column].fillna(fill_value)
    value_maps[column] = value_map
    return value_map
def reverse_map_and_fill(column):
    """Map an encoded column back to its original labels using value_maps."""
    if column in value_maps:
        _reverse_map = {v: k for k, v in value_maps[column].items()}
        df[column] = df[column].map(_reverse_map)
    else:
        print(f"No mapping found for column: {column}")
def bin_distribution(column, bins=10, rotate=0):
    """Histogram of one column with optional x-label rotation."""
    plt.figure(figsize=(12, 6))
    sns.histplot(df[column], bins=bins, kde=False)
    plt.title(f'{column} Distribution')
    plt.xlabel(column)
    plt.ylabel('Count')
    # rotate x labels
    if rotate:
        plt.xticks(rotation=rotate)
    plt.show()
# don't overthink this function: it is just a fast way to explode a column whose values are delimiter-separated lists and one-hot encode the result.
# It is meant for machine-learning data cleaning, not for data analysis; a small toy demo follows the definition.
def fast_explode(target_dataframe, target_column, fillna='', split=';', prefix='worked with', tmp_column_name='tmp_c'):
    # Step 1: Create a temporary column with split and prefix
    _exploded = (
        target_dataframe.assign(
            **{tmp_column_name: target_dataframe[target_column]
                .fillna(fillna)
                .str.split(split)
                .apply(lambda lst: [f"{prefix} {lang.strip()}" for lang in lst if lang])
               }
        )
        .explode(tmp_column_name)
    )
    # Step 2: One-hot encode
    _one_hot = pd.crosstab(index=_exploded.index, columns=_exploded[tmp_column_name]).astype(bool)
    # Step 3: Combine with original DataFrame
    print(target_column, _one_hot.shape[0], target_dataframe.shape[0])
    result = pd.concat([target_dataframe.drop(columns=[target_column]), _one_hot], axis=1)
    result[_one_hot.columns] = result[_one_hot.columns].fillna(False)
    return result
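# a tiny toy demonstration (hypothetical data, not the survey) of what fast_explode produces:
# one boolean column per distinct value, aligned back onto the original row index
_toy = pd.DataFrame({'Langs': ['Python;SQL', 'SQL', None]})
print(fast_explode(_toy, target_column='Langs', prefix='worked with'))
# expected columns: 'worked with Python', 'worked with SQL'; rows with no value become False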
# let's take a look at the data
print(df.head())
print(df.describe())
print(df.info())
for c in df.columns:
    print(f"{c}")
"""we can see null values throughout all columns, few columns have filled with full values.
now we will look into the data and see what we can do to clean it up and make it ready for analysis.
"""
# start checking..
basic_info("ResponseId")
# so response_id can be used as index
df.set_index('ResponseId', inplace=True) # setting index
# start checking..
basic_info("MainBranch")
# MainBranch seems to be categorical data; assign its unique values integer codes and record them in value_maps
print(map_and_fill("MainBranch"))
# it seems like categorical data, let's see the unique values
print(df['MainBranch'].value_counts())
# ok, it's categorical data
basic_info("Age")
# 0 null values
# mapping the age column
print(map_and_fill("Age"))
"""age is categorical data, we can keep it as it is. But I assume it represented register age/ working experience, not the actual age of the respondents"""
basic_info("RemoteWork")
# mapping the remote_work column
print(map_and_fill("RemoteWork"))
df['RemoteWork'] = df['RemoteWork'].fillna(0)  # filling null values with 0
print(df['RemoteWork'].value_counts())
"""RemoteWork is categorical data, so we can keep it as it is and fill the null values with 0, meaning no remote work/experience."""
basic_info("EdLevel")
# don't know what it is yet, but it seems categorical data, filling null with 0
print(map_and_fill("EdLevel"))
df['EdLevel'] = df['EdLevel'].fillna(0)
basic_info("Check")
# only one distinct value, so drop this column
df.drop(columns=['Check'], inplace=True)
# CodingActivities
basic_info("CodingActivities")
df = fast_explode(df,target_column='CodingActivities', fillna='', split=';', prefix='coding_activities ')
basic_info("Employment")
# mapping the Employment column
print(map_and_fill("Employment"))
df['Employment'] = df['Employment'].fillna(0)
# plot distribution of Employment column
bin_distribution("Employment", bins=len(df['Employment'].unique())+1)
# categorical data with a lot of null values
basic_info('LearnCode')
basic_info('LearnCodeOnline')
basic_info('TechDoc')
for col in ['LearnCode', 'LearnCodeOnline', 'TechDoc']:
    df = fast_explode(df, target_column=col, fillna='', split=';', prefix=f'{col} ')
basic_info('YearsCode')
# 2 unique string values 'Less than 1 year' and 'More than 50 years'
# convert them to 0 and 51
df['YearsCode'] = df['YearsCode'].replace({'Less than 1 year': 0, 'More than 50 years': 51})
basic_info("YearsCode")
# need to handle the null values
df['YearsCode'] = df['YearsCode'].fillna(-1)
df['YearsCode'] = df['YearsCode'].astype(int)
bin_distribution('YearsCode', bins=52)
basic_info('YearsCodePro')
df['YearsCodePro'] = df['YearsCodePro'].replace({'Less than 1 year': 0, 'More than 50 years': 51})
basic_info("YearsCodePro")
# need to handle the null values
df['YearsCodePro'] = df['YearsCodePro'].fillna(-1)
df['YearsCodePro'] = df['YearsCodePro'].astype(int)
bin_distribution('YearsCodePro', bins=52)
basic_info("DevType")
bin_distribution('DevType', bins=df['DevType'].nunique()+1,rotate=90)
print(map_and_fill("DevType"))
df['DevType'] = df['DevType'].fillna(-1)
basic_info("OrgSize")
bin_distribution('OrgSize', bins=df['OrgSize'].nunique()+1,rotate=90)
# categorical data with a lot of null values; fill null with -1, meaning no response / no organization?
print(map_and_fill("OrgSize"))
df['OrgSize'] = df['OrgSize'].fillna(-1)
basic_info("PurchaseInfluence")
# categorical data with a lot of null values; fill null with -1, meaning no response / no influence
print(map_and_fill("PurchaseInfluence"))
df['PurchaseInfluence'] = df['PurchaseInfluence'].fillna(-1)
basic_info("BuyNewTool")# another need to expand categorial data
df = fast_explode(df,target_column='BuyNewTool', fillna='', split=';', prefix='buy_new_tool ')
basic_info('BuildvsBuy')
# don't plot this; the value names are too long
# mapping the BuildvsBuy column
print(map_and_fill("BuildvsBuy"))
# filling null values with -1, meaning?
#TODO meaning?
df['BuildvsBuy'] = df['BuildvsBuy'].fillna(-1)
basic_info("LanguageHaveWorkedWith")
drop_columns = []
explode_columns = []
for col in df.columns:
    if 'HaveWorkedWith' in col:
        explode_columns.append(col)
    elif 'Admired' in col or 'WantTo' in col or 'SO' in col or 'AI' in col:
        drop_columns.append(col)
print(drop_columns)
print(explode_columns)
# drop first
df.drop(columns=drop_columns, inplace=True)
# expand
for col in explode_columns:
    df = fast_explode(df, target_column=col, fillna='', split=';', prefix=f'{col} ')
print(df.isnull().sum())
# fast_explode splits each string in LanguageHaveWorkedWith into a list of languages, puts the lists in a temporary column, explodes them into multiple rows, and one-hot encodes the result (already handled by the loop above).
# df = fast_explode(df,target_column='LanguageHaveWorkedWith', fillna='', split=';', prefix='worked_with ')
basic_info('ICorPM') #Individual Contributor or People Manager
# categorical data with a lot of null values, fill null with -1
print(map_and_fill("ICorPM"))
df['ICorPM'] = df['ICorPM'].fillna(-1)
basic_info('WorkExp')
# numeric data with a lot of null values, fill null with -1
df['WorkExp'] = df['WorkExp'].fillna(-1)
basic_info('Knowledge_1')
# Knowledge_1 through Knowledge_9 belong to one long multi-part survey question about different topics
for i in range(1, 10):
    basic_info(f'Knowledge_{i}')
    print(map_and_fill(f'Knowledge_{i}'))
    df[f'Knowledge_{i}'] = df[f'Knowledge_{i}'].fillna(-1)
# Frequency_1 ~ 3 also
for i in range(1, 4):
    basic_info(f'Frequency_{i}')
    print(map_and_fill(f'Frequency_{i}'))
    df[f'Frequency_{i}'] = df[f'Frequency_{i}'].fillna(-1)
basic_info('TimeSearching')
print(map_and_fill("TimeSearching"))
df['TimeSearching'] = df['TimeSearching'].fillna(-1)
basic_info('TimeAnswering')
print(map_and_fill("TimeAnswering"))
df['TimeAnswering'] = df['TimeAnswering'].fillna(-1)
basic_info('Frustration')
# change its name to Challenge_Frustration
df.rename(columns={'Frustration': 'Challenge_Frustration'}, inplace=True)
print(map_and_fill("Challenge_Frustration"))
df['Challenge_Frustration'] = df['Challenge_Frustration'].fillna(-1)
basic_info('ProfessionalTech')
# change name to Company_ProfessionalTech
df.rename(columns={'ProfessionalTech': 'Company_ProfessionalTech'}, inplace=True)
print(map_and_fill("Company_ProfessionalTech"))
df['Company_ProfessionalTech'] = df['Company_ProfessionalTech'].fillna(-1)
basic_info('ProfessionalCloud')
print(map_and_fill("ProfessionalCloud"))
df['ProfessionalCloud'] = df['ProfessionalCloud'].fillna(-1)
basic_info('ProfessionalQuestion')
# change name to FirstAnswer(er)_ProfessionalQuestion
df.rename(columns={'ProfessionalQuestion': 'FirstAnswerer_ProfessionalQuestion'}, inplace=True)
print(map_and_fill("FirstAnswerer_ProfessionalQuestion"))
df['FirstAnswerer_ProfessionalQuestion'] = df['FirstAnswerer_ProfessionalQuestion'].fillna(-1)
basic_info('Industry')
print(map_and_fill("Industry"))
df['Industry'] = df['Industry'].fillna(-1)
basic_info("TechEndorse")
basic_info("Country") # do keep this for later analysis
basic_info("Currency") # their salary is in this currency
basic_info("CompTotal") # salary! for a year?!
# CompTotal means total compensation, including bonus, etc, not just base salary
# so it is a numerical data
df['CompTotal'] = df['CompTotal'].fillna(0)
print(df['Currency'].unique())
# keep first 3 letters of the currency that is not null
df['Currency'] = df['Currency'].str[:3]
print(df['Currency'].value_counts())
# using gpt to get the currency to usd exchange rate
usd_avg_rates_2024 = {
"AUD": 1.5158, # Australian dollar
"BRL": 5.3872, # Brazilian real
"CAD": 1.3699, # Canadian dollar
"CNY": 7.1957, # Chinese Yuan Renminbi
"DKK": 6.8955, # Danish krone
"EUR": 1.0820, # European Euro
"HKD": 7.8030, # Hong Kong dollar
"INR": 83.6566, # Indian rupee
"JPY": 151.4551, # Japanese yen
"MYR": 4.5747, # Malaysian ringgit
"MXN": 18.3062, # Mexican peso
"NZD": 1.6529, # New Zealand dollar
"NOK": 10.7574, # Norwegian krone
"SGD": 1.3363, # Singapore dollar
"ZAR": 18.3346, # South African rand
"KRW": 1363.4381, # South Korean won
"LKR": 301.6752, # Sri Lankan rupee
"SEK": 10.5744, # Swedish krona
"CHF": 0.8808, # Swiss franc
"TWD": 32.1064, # New Taiwan dollar
"THB": 35.2845, # Thai baht
"GBP": 1.2781, # Pound sterling
"VES": 38.3314 # Venezuelan bolivar
}
# convert all salary to usd
print(df[['CompTotal']].describe())
print(df['CompTotal'].max())
# there are some extreme outliers; trim the top and bottom 1% using quantiles
#TODO apply currency conversion (a sketch follows below)
df = df.loc[(df['CompTotal'] < df['CompTotal'].quantile(0.99)) & (df['CompTotal'] > df['CompTotal'].quantile(0.01))]
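# the currency conversion flagged in the TODO above is never applied in this notebook;
# a minimal sketch of how it could look, assuming usd_avg_rates_2024 is quoted as
# local-currency units per USD (some entries, e.g. EUR and GBP, look like USD per unit,
# so the table should be double-checked before relying on the result); at this point
# Currency still holds the 3-letter codes, so the string lookup works
# df['CompTotal'] = df['CompTotal'] / df['Currency'].map(usd_avg_rates_2024).fillna(1.0)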
# the salary range is limited further to a fixed band (1,000–500,000 USD per year) below
# count how many respondents report each exact salary value
salary_counts = df['CompTotal'].value_counts()
import plotly.express as px
# Convert to DataFrame
df_salary = salary_counts.reset_index()
df_salary.columns = ['Salary', 'Count']
fig = px.histogram(
    df_salary,
    x='Salary',
    y='Count',
    nbins=100,  # Adjust bins for granularity
    title='Salary Distribution Histogram',
    labels={'Salary': 'Salary Amount', 'Count': 'Number of People'},
    log_y=False  # linear scale; the log-scale version follows
)
fig.show()
# Plot histogram (weighted by Count)
fig = px.histogram(
    df_salary,
    x='Salary',
    y='Count',
    nbins=100,  # Adjust bins for granularity
    title='Salary Distribution Histogram (log scale)',
    labels={'Salary': 'Salary Amount', 'Count': 'Number of People'},
    log_y=True  # Log scale for better visibility
)
fig.show()
# limit salary to the range 1,000–500,000 USD per year
df = df.loc[(df['CompTotal'] >= 1000) & (df['CompTotal'] <= 500000)]
print(df[['CompTotal']].describe())
basic_info('JobSatPoints_1')
# JobSatPoints_1~11 need fillna
# these are job-satisfaction points allocated across items, so each respondent's total should be 100; let's check
# !! there are no JobSatPoints_2 and JobSatPoints_3 columns
# fillna with -1 first
for i in range(1, 12):
    if i in [2, 3]:
        continue
    df[f'JobSatPoints_{i}'] = df[f'JobSatPoints_{i}'].fillna(-1)
jsp_cols = [f'JobSatPoints_{i}' for i in range(1, 12) if i not in [2, 3]]
total_score = df[jsp_cols].sum(axis=1)
print(total_score.value_counts())
# some rows sum to more than 100, let's count them
print((total_score > 100).sum())
# let's try to normalize those rows back to a 100-point total and round to integers
# in case of floating-point accuracy problems
over_100 = total_score > 100
df.loc[over_100, jsp_cols] = df.loc[over_100, jsp_cols].div(total_score[over_100], axis=0).multiply(100)
df[jsp_cols] = df[jsp_cols].round().astype(int)
total_score = df[jsp_cols].sum(axis=1)
print(total_score.value_counts())
# check again how many rows still exceed 100
print((total_score > 100).sum())
# check null values again
null_counts = df.isnull().sum()
print(null_counts[null_counts > 0])
drop_columns = ['TechEndorse','SurveyLength','SurveyEase','ConvertedCompYearly']
explode_columns = ['OpSysPersonal use','OpSysProfessional use']
for col in explode_columns:
    df = fast_explode(df, target_column=col, fillna='', split=';', prefix=f'{col} ')
df.drop(columns=drop_columns, inplace=True)
df['TBranch'] = df['TBranch'].map({'Yes': 1, 'No': 0})
df['TBranch'] = df['TBranch'].fillna(-1)
print(map_and_fill('Country'))
print(map_and_fill('Currency'))
null_counts = df.isnull().sum()
print(null_counts[null_counts > 0])
df
df.replace({True: 1, False: 0}, inplace=True)
# df
# start model prediction using xgboost or lightgbm
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
# --- Candidate models ---
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# ahh.. standard scaler
# ========== 1. Data preparation ==========
# Filter and clean
# using country and currency as grouping then separate the data into train_df
# train_df = df.loc[df["CompTotal"] >= 1000].copy()
# train_df.drop(columns=["JobSat"], errors="ignore", inplace=True)
#
# # Split features and target
# X = train_df.drop(columns=["CompTotal"])
# y = train_df["CompTotal"]
#
# # Split into training / testing
# X_train, X_test, y_train, y_test = train_test_split(
# X, y, test_size=0.2, random_state=42
# )
#
# # ========== 2. Preprocessing pipeline ==========
# # Identify numeric and categorical columns
# num_cols = X_train.select_dtypes(include=["number"]).columns
# cat_cols = X_train.select_dtypes(exclude=["number"]).columns
#
# # Numeric pipeline
# numeric_pipeline = Pipeline(steps=[
#
# ("scaler", StandardScaler())
# ])
#
#
#
# # Combine into column transformer
# preprocessor = ColumnTransformer(
# transformers=[
# ("num", numeric_pipeline, num_cols),
#
# ]
# )
#
# # ========== 3. Define models ==========
# models = {
# "GradientBoosting": GradientBoostingRegressor(
# n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42
# ),
# "RandomForest": RandomForestRegressor(
# n_estimators=400, random_state=42, n_jobs=-1
# ),
# "HistGradientBoosting": HistGradientBoostingRegressor(
# learning_rate=0.05, random_state=42
# )
# }
#
#
# models["XGBoost"] = XGBRegressor(
# n_estimators=600, learning_rate=0.05, max_depth=6,
# subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42
# )
#
#
# models["LightGBM"] = LGBMRegressor(
# n_estimators=600, learning_rate=0.05, num_leaves=31,
# subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42
# )
#
# # ========== 4. Evaluate each model with cross-validation ==========
# cv = KFold(n_splits=5, shuffle=True, random_state=42)
# results = []
#
# for name, model in models.items():
# pipe = Pipeline(steps=[("preprocessor", preprocessor),
# ("model", model)])
# scores = cross_val_score(
# pipe, X_train, y_train,
# scoring="neg_root_mean_squared_error",
# cv=cv, n_jobs=-1,error_score='raise'
# )
# rmse = -scores.mean()
# print(f"{name:>20s} | CV RMSE: {rmse:.2f}")
# results.append((name, rmse, scores.std()))
#
# # Sort and select best
# results_df = pd.DataFrame(results, columns=["Model", "CV_RMSE", "Std"]).sort_values("CV_RMSE")
# best_model_name = results_df.iloc[0]["Model"]
# print("\nBest model:", best_model_name)
#
# # ========== 5. Fit best model pipeline and evaluate on test set ==========
# best_model = models[best_model_name]
# best_pipeline = Pipeline(steps=[
# ("preprocessor", preprocessor),
# ("model", best_model)
# ])
#
# best_pipeline.fit(X_train, y_train)
# y_pred = best_pipeline.predict(X_test)
#
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# mae = mean_absolute_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)
#
# print(f"\n--- Test Performance ({best_model_name}) ---")
# print(f"RMSE: {rmse:.2f}")
# print(f"MAE : {mae:.2f}")
# print(f"R² : {r2:.4f}")
#
# # Optional: show model comparison table
# print("\nModel comparison (lower RMSE better):")
# print(results_df.to_string(index=False))
# lightgbm seems to be the best model here; check feature importance and plot the top 20 features
# feature_names_num = num_cols.tolist()
#
# feature_names = feature_names_num
#
# importances = best_model.feature_importances_
# feature_importance_df = pd.DataFrame({
# 'Feature': feature_names,
# 'Importance': importances
# }).sort_values(by='Importance', ascending=False)
#
# print("\nTop 20 Feature Importances:")
# print(feature_importance_df.head(20))
#
# # Plot top 20 features
# plt.figure(figsize=(10, 6))
# sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20))
# plt.title('Top 20 Feature Importances')
# plt.tight_layout()
# plt.show()
"""![image.png](attachment:2f8352b5-06b3-4ccd-870c-c3ffc3318a1f.png)
# first modelling is terrible, we need a deeper analysis of the data.
# we can see that the country and currency are important features, we can try to group the data by country and currency and see what we can find out.
"""
for train_df in df.groupby(['Country', 'Currency']):
    print(train_df[0], train_df[1].shape)
"""# we can see that some country and currency have very few data, we can filter out these data and only keep the country and currency with more than 1000 data points.
"""
# finding the country and currency with more than 1000 data points
for train_df in df.groupby(['Country', 'Currency']):
    if train_df[1].shape[0] > 1000:
        print(train_df[0], train_df[1].shape)
        for x in value_maps['Country'].keys():
            if value_maps['Country'][x] == train_df[0][0]:
                print(x)
        for x in value_maps['Currency'].keys():
            if value_maps['Currency'][x] == train_df[0][1]:
                print(x)
# and it's USA/USD, France/EUR, UK/GBP, Canada/CAD and Germany/EUR
# select these country and currency pairs; below we keep only USA and USD (codes 3 and 1)
df_cleaned = df.loc[df["CompTotal"] >= 1000].copy()
df_cleaned.drop(columns=["JobSat"], errors="ignore", inplace=True)
df_cleaned = df_cleaned[(df_cleaned['Country'] == value_maps['Country']['United States of America'])
                        & (df_cleaned['Currency'] == value_maps['Currency']['USD'])]
from sklearn.decomposition import PCA
results = []
for group in df_cleaned.groupby(['Country', 'Currency']):
    train_df = group[1]
    if train_df.shape[0] < 1000:
        continue
    # --- 1️⃣ Prepare data ---
    # keep only the numeric columns of this group's data
    X = train_df.select_dtypes(include=['number'])
    # --- 2️⃣ Standardize the features ---
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # --- 3️⃣ Apply PCA ---
    # sweep 1–99 components and record the cumulative explained variance
    for n in range(1, 100):
        pca = PCA(n_components=n)
        pca.fit(X_scaled)
        results.append({'Components': n, 'ExplainedVariance': pca.explained_variance_ratio_.sum()})
pca_summary = pd.DataFrame(results)
print(pca_summary)
plt.plot(pca_summary['Components'], pca_summary['ExplainedVariance'], marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Total Explained Variance')
plt.title('Explained Variance by Number of PCA Components')
plt.grid(True)
plt.show()
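# a small follow-up sketch: pick the smallest component count that reaches a chosen
# explained-variance threshold (95% here is an assumption, not something the notebook fixes)
_enough = pca_summary[pca_summary['ExplainedVariance'] >= 0.95]
if not _enough.empty:
    print("components needed for >=95% variance:", int(_enough['Components'].iloc[0]))
else:
    print("no component count in the sweep reaches 95% explained variance")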
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import clone
from sklearn.inspection import permutation_importance
import numpy as np
# --- baseline models ---
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
# kaggle top models
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# ========== 1. Data preparation ==========
df_cleaned = df.loc[(df["CompTotal"] >= 1000) & (df["CompTotal"] <= 300000)].copy()
df_cleaned.drop(columns=["JobSat"], errors="ignore", inplace=True)
for group in df_cleaned.groupby(['Country', 'Currency']):
    train_df = group[1]
    if train_df.shape[0] < 1000:
        continue
    # Split features and target
    X = train_df.drop(columns=["CompTotal"])
    y = train_df["CompTotal"]
    # Split into training / testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # ========== 2. Preprocessing pipeline ==========
    # Identify numeric and categorical columns
    num_cols = X_train.select_dtypes(include=["number"]).columns
    cat_cols = X_train.select_dtypes(exclude=["number"]).columns
    # Numeric pipeline
    numeric_pipeline = Pipeline(steps=[
        ("scaler", StandardScaler())
    ])
    # Combine into column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, num_cols),
        ]
    )
    # ========== 3. Define models ==========
    models = {
        "GradientBoosting": GradientBoostingRegressor(
            n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42
        ),
        "RandomForest": RandomForestRegressor(
            n_estimators=400, random_state=42, n_jobs=-1
        ),
        "HistGradientBoosting": HistGradientBoostingRegressor(
            learning_rate=0.05, random_state=42
        )
    }
    models["XGBoost"] = XGBRegressor(
        n_estimators=600, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42
    )
    models["LightGBM"] = LGBMRegressor(
        n_estimators=600, learning_rate=0.05, num_leaves=31,
        subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42
    )
    # ========== 4. Evaluate each model with cross-validation ==========
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    results = []
    for name, model in models.items():
        pipe = Pipeline(steps=[("preprocessor", preprocessor),
                               ("model", model)])
        scores = cross_val_score(
            pipe, X_train, y_train,
            scoring="neg_root_mean_squared_error",
            cv=cv, n_jobs=-1, error_score='raise'
        )
        rmse = -scores.mean()
        print(f"{name:>20s} | CV RMSE: {rmse:.2f}")
        results.append((name, rmse, scores.std()))
    # Sort and select best
    results_df = pd.DataFrame(results, columns=["Model", "CV_RMSE", "Std"]).sort_values("CV_RMSE")
    best_model_name = results_df.iloc[0]["Model"]
    print("\nBest model:", best_model_name)
    # ========== 5. Fit best model pipeline and evaluate on test set ==========
    best_model = models[best_model_name]
    best_pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", best_model)
    ])
    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"\n--- Test Performance ({best_model_name}) ---")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE : {mae:.2f}")
    print(f"R² : {r2:.4f}")
    # Optional: show model comparison table
    print("\nModel comparison (lower RMSE better):")
    print(results_df.to_string(index=False))
    # look up the human-readable country / currency labels for this group
    _country = ''
    for x in value_maps['Country'].keys():
        if value_maps['Country'][x] == group[0][0]:
            _country = x
    for x in value_maps['Currency'].keys():
        if value_maps['Currency'][x] == group[0][1]:
            print(x)
    print('*'*40)
    print('*'*40)
    feature_names_num = num_cols.tolist()
    feature_names = feature_names_num
    importances = best_model.feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)
    print("\nTop 20 Feature Importances:")
    print(feature_importance_df.head(20))
    # ---------- refit and re-evaluate (repeats the first evaluation above) ----------
    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"\n--- Test Performance ({best_model_name}) ---")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE : {mae:.2f}")
    print(f"R² : {r2:.4f}")
    print("\nModel comparison (lower RMSE better):")
    print(results_df.to_string(index=False))
    # --------- Nice labels for Country/Currency (unchanged) ---------
    _country = ''
    for x in value_maps['Country'].keys():
        if value_maps['Country'][x] == group[0][0]:
            _country = x
    _currency = ''
    for x in value_maps['Currency'].keys():
        if value_maps['Currency'][x] == group[0][1]:
            _currency = x
    print(_currency)
    print('*'*40)
    print('*'*40)
    # ===================== TOP-50 FEATURE REFIT =====================
    # 1) Get feature importances (fallback to permutation importance if needed)
    feature_names = num_cols.tolist()  # only numeric used in the preprocessor
    if hasattr(best_model, "feature_importances_"):
        importances = best_model.feature_importances_
    else:
        # Fallback: permutation importance on training set
        # (uses the full pipeline so it handles scaling correctly)
        perm = permutation_importance(
            best_pipeline, X_train, y_train,
            n_repeats=5,
            scoring="neg_root_mean_squared_error",
            random_state=42,
            n_jobs=-1
        )
        # With the current preprocessor (only numeric), importances align to feature_names
        importances = perm.importances_mean
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)
    print("\nTop 20 Feature Importances (all features):")
    print(feature_importance_df.head(20))
    # 2) Pick top-K features
    TOP_K = min(50, len(feature_names))
    top_features = feature_importance_df.head(TOP_K)['Feature'].tolist()
    # 3) Build a new preprocessor that only uses the top-K features
    preprocessor_top = ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, top_features),
        ]
    )
    # 4) Clone the best model (clean, unfitted) and retrain on top-K
    best_model_top = clone(best_model)
    best_pipeline_top = Pipeline(steps=[
        ("preprocessor", preprocessor_top),
        ("model", best_model_top)
    ])
    best_pipeline_top.fit(X_train, y_train)
    y_pred_top = best_pipeline_top.predict(X_test)
    rmse_top = np.sqrt(mean_squared_error(y_test, y_pred_top))
    mae_top = mean_absolute_error(y_test, y_pred_top)
    r2_top = r2_score(y_test, y_pred_top)
    # 5) Report before/after
    print(f"\n=== Top-{TOP_K} Feature Refit ({best_model_name}) in {_country} [{_currency}] ===")
    print(f"BEFORE -> RMSE: {rmse:.2f} | MAE: {mae:.2f} | R²: {r2:.4f} (all {len(feature_names)} numeric features)")
    print(f"AFTER -> RMSE: {rmse_top:.2f} | MAE: {mae_top:.2f} | R²: {r2_top:.4f} (top {TOP_K} features)")
    delta = rmse_top - rmse
    print(f"ΔRMSE (top-{TOP_K} - all): {delta:+.2f}")
    # Optional: plot top 20 features among the selected set (from original ranking)
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20))
    plt.title(f'Top 20 Feature Importances in {_country}')
    plt.tight_layout()
    plt.show()
#
# TODO: retrain using only the top 30 features