# -*- coding: utf-8 -*-
"""m1_exam_oral (4).ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/10X3y3B7p8yobOiRjxxNqGfV7bIykP31L
"""
# load pandas (plus plotting libraries) and read the survey csv from the local extract
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# I don't know why it takes so long to load the data
df = pd.read_csv("./stack-overflow-developer-survey-2024/survey_results_public.csv", )
def basic_info(column):
    """Print head, tail, null count and value counts for one column."""
    print(df[column].head())
    print('-'*40)
    print(df[column].tail())
    print('-'*40)
    print(df[column].isnull().sum())
    print('-'*40)
    print(df[column].value_counts())
    print('-'*40)
    print('*'*40)
# create a value mapping for all categorical data
value_maps = {}
def map_and_fill(column, fill_value=0):
    """Encode a categorical column as integers (in order of first appearance),
    fill NaN with fill_value, and remember the mapping in value_maps.
    Note: the default fill_value 0 is the same code as the first category."""
    value_map = {v: i for i, v in enumerate(df[column].dropna().unique())}
    df[column] = df[column].map(value_map)
    df[column] = df[column].fillna(fill_value)
    value_maps[column] = value_map
    return value_map
def reverse_map_and_fill(column):
    """Map an encoded column back to its original labels using value_maps."""
    if column in value_maps:
        _reverse_map = {v: k for k, v in value_maps[column].items()}
        df[column] = df[column].map(_reverse_map)
    else:
        print(f"No mapping found for column: {column}")
def bin_distribution(column, bins=10, rotate=0):
    """Histogram of one column with optional x-label rotation."""
    plt.figure(figsize=(12, 6))
    sns.histplot(df[column], bins=bins, kde=False)
    plt.title(f'{column} Distribution')
    plt.xlabel(column)
    plt.ylabel('Count')
    # rotate x labels
    if rotate:
        plt.xticks(rotation=rotate)
    plt.show()
# don't overthink this function: it is just a fast way to explode a column whose values are delimiter-separated lists and one-hot encode the result.
# It is meant for machine-learning data cleaning, not for data analysis; a small toy demo follows the definition.
def fast_explode(target_dataframe, target_column, fillna='', split=';', prefix='worked with', tmp_column_name='tmp_c'):
    # Step 1: Create a temporary column with split and prefix
    _exploded = (
        target_dataframe.assign(
            **{tmp_column_name: target_dataframe[target_column]
                .fillna(fillna)
                .str.split(split)
                .apply(lambda lst: [f"{prefix} {lang.strip()}" for lang in lst if lang])
               }
        )
        .explode(tmp_column_name)
    )
    # Step 2: One-hot encode
    _one_hot = pd.crosstab(index=_exploded.index, columns=_exploded[tmp_column_name]).astype(bool)
    # Step 3: Combine with original DataFrame
    print(target_column, _one_hot.shape[0], target_dataframe.shape[0])
    result = pd.concat([target_dataframe.drop(columns=[target_column]), _one_hot], axis=1)
    result[_one_hot.columns] = result[_one_hot.columns].fillna(False)
    return result
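# a tiny toy demonstration (hypothetical data, not the survey) of what fast_explode produces:
# one boolean column per distinct value, aligned back onto the original row index
_toy = pd.DataFrame({'Langs': ['Python;SQL', 'SQL', None]})
print(fast_explode(_toy, target_column='Langs', prefix='worked with'))
# expected columns: 'worked with Python', 'worked with SQL'; rows with no value become False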
# let's take a look at the data
print(df.head())
print(df.describe())
print(df.info())
for c in df.columns:
    print(f"{c}")
"""we can see null values throughout all columns, few columns have filled with full values.
now we will look into the data and see what we can do to clean it up and make it ready for analysis.
"""
# start checking..
basic_info("ResponseId")
# so response_id can be used as index
df.set_index('ResponseId', inplace=True) # setting index
# start checking..
basic_info("MainBranch")
# MainBranch seems to be categorical data; assign its unique values integer codes and record them in value_maps
print(map_and_fill("MainBranch"))
# it seems like categorical data, let's see the unique values
print(df['MainBranch'].value_counts())
# ok, it's categorical data
basic_info("Age")
# 0 null values
# mapping the age column
print(map_and_fill("Age"))
"""age is categorical data, we can keep it as it is. But I assume it represented register age/ working experience, not the actual age of the respondents"""
basic_info("RemoteWork")
# mapping the remote_work column
print(map_and_fill("RemoteWork"))
df['RemoteWork'] = df['RemoteWork'].fillna(0)  # filling null values with 0
print(df['RemoteWork'].value_counts())
"""RemoteWork is categorical data, so we can keep it as it is and fill the null values with 0, meaning no remote work/experience."""
basic_info("EdLevel")
# don't know what it is yet, but it seems categorical data, filling null with 0
print(map_and_fill("EdLevel"))
df['EdLevel'] = df['EdLevel'].fillna(0)
basic_info("Check")
# only one distinct value, so drop this column
df.drop(columns=['Check'], inplace=True)
# CodingActivities
basic_info("CodingActivities")
df = fast_explode(df,target_column='CodingActivities', fillna='', split=';', prefix='coding_activities ')
basic_info("Employment")
# mapping the Employment column
print(map_and_fill("Employment"))
df['Employment'] = df['Employment'].fillna(0)
# plot distribution of Employment column
bin_distribution("Employment", bins=len(df['Employment'].unique())+1)
# categorical data with a lot of null values
basic_info('LearnCode')
basic_info('LearnCodeOnline')
basic_info('TechDoc')
for col in ['LearnCode', 'LearnCodeOnline', 'TechDoc']:
    df = fast_explode(df, target_column=col, fillna='', split=';', prefix=f'{col} ')
basic_info('YearsCode')
# 2 unique string values 'Less than 1 year' and 'More than 50 years'
# convert them to 0 and 51
df['YearsCode'] = df['YearsCode'].replace({'Less than 1 year': 0, 'More than 50 years': 51})
basic_info("YearsCode")
# need to handle the null values
df['YearsCode'] = df['YearsCode'].fillna(-1)
df['YearsCode'] = df['YearsCode'].astype(int)
bin_distribution('YearsCode', bins=52)
basic_info('YearsCodePro')
df['YearsCodePro'] = df['YearsCodePro'].replace({'Less than 1 year': 0, 'More than 50 years': 51})
basic_info("YearsCodePro")
# need to handle the null values
df['YearsCodePro'] = df['YearsCodePro'].fillna(-1)
df['YearsCodePro'] = df['YearsCodePro'].astype(int)
bin_distribution('YearsCodePro', bins=52)
basic_info("DevType")
bin_distribution('DevType', bins=df['DevType'].nunique()+1,rotate=90)
print(map_and_fill("DevType"))
df['DevType'] = df['DevType'].fillna(-1)
basic_info("OrgSize")
bin_distribution('OrgSize', bins=df['OrgSize'].nunique()+1,rotate=90)
# categorical data with a lot of null values; fill null with -1, meaning no response / no organization?
print(map_and_fill("OrgSize"))
df['OrgSize'] = df['OrgSize'].fillna(-1)
basic_info("PurchaseInfluence")
# categorical data with a lot of null values; fill null with -1, meaning no response / no influence
print(map_and_fill("PurchaseInfluence"))
df['PurchaseInfluence'] = df['PurchaseInfluence'].fillna(-1)
basic_info("BuyNewTool")# another need to expand categorial data
df = fast_explode(df,target_column='BuyNewTool', fillna='', split=';', prefix='buy_new_tool ')
basic_info('BuildvsBuy')
# don't plot this; the value names are too long
# mapping the BuildvsBuy column
print(map_and_fill("BuildvsBuy"))
# filling null values with -1, meaning?
#TODO meaning?
df['BuildvsBuy'] = df['BuildvsBuy'].fillna(-1)
basic_info("LanguageHaveWorkedWith")
drop_columns = []
explode_columns = []
for col in df.columns:
    if 'HaveWorkedWith' in col:
        explode_columns.append(col)
    elif 'Admired' in col or 'WantTo' in col or 'SO' in col or 'AI' in col:
        drop_columns.append(col)
print(drop_columns)
print(explode_columns)
# drop first
df.drop(columns=drop_columns, inplace=True)
# expand
for col in explode_columns:
    df = fast_explode(df, target_column=col, fillna='', split=';', prefix=f'{col} ')
print(df.isnull().sum())
# fast_explode splits each string in LanguageHaveWorkedWith into a list of languages, puts the lists in a temporary column, explodes them into multiple rows, and one-hot encodes the result (already handled by the loop above).
# df = fast_explode(df,target_column='LanguageHaveWorkedWith', fillna='', split=';', prefix='worked_with ')
basic_info('ICorPM') #Individual Contributor or People Manager
# categorical data with a lot of null values, fill null with -1
print(map_and_fill("ICorPM"))
df['ICorPM'] = df['ICorPM'].fillna(-1)
basic_info('WorkExp')
# numeric data with a lot of null values, fill null with -1
df['WorkExp'] = df['WorkExp'].fillna(-1)
basic_info('Knowledge_1')
# Knowledge_1 through Knowledge_9 belong to one long multi-part survey question about different topics
for i in range(1, 10):
    basic_info(f'Knowledge_{i}')
    print(map_and_fill(f'Knowledge_{i}'))
    df[f'Knowledge_{i}'] = df[f'Knowledge_{i}'].fillna(-1)
# Frequency_1 ~ 3 also
for i in range(1, 4):
    basic_info(f'Frequency_{i}')
    print(map_and_fill(f'Frequency_{i}'))
    df[f'Frequency_{i}'] = df[f'Frequency_{i}'].fillna(-1)
basic_info('TimeSearching')
print(map_and_fill("TimeSearching"))
df['TimeSearching'] = df['TimeSearching'].fillna(-1)
basic_info('TimeAnswering')
print(map_and_fill("TimeAnswering"))
df['TimeAnswering'] = df['TimeAnswering'].fillna(-1)
basic_info('Frustration')
# change its name to Challenge_Frustration
df.rename(columns={'Frustration': 'Challenge_Frustration'}, inplace=True)
print(map_and_fill("Challenge_Frustration"))
df['Challenge_Frustration'] = df['Challenge_Frustration'].fillna(-1)
basic_info('ProfessionalTech')
# change name to Company_ProfessionalTech
df.rename(columns={'ProfessionalTech': 'Company_ProfessionalTech'}, inplace=True)
print(map_and_fill("Company_ProfessionalTech"))
df['Company_ProfessionalTech'] = df['Company_ProfessionalTech'].fillna(-1)
basic_info('ProfessionalCloud')
print(map_and_fill("ProfessionalCloud"))
df['ProfessionalCloud'] = df['ProfessionalCloud'].fillna(-1)
basic_info('ProfessionalQuestion')
# change name to FirstAnswer(er)_ProfessionalQuestion
df.rename(columns={'ProfessionalQuestion': 'FirstAnswerer_ProfessionalQuestion'}, inplace=True)
print(map_and_fill("FirstAnswerer_ProfessionalQuestion"))
df['FirstAnswerer_ProfessionalQuestion'] = df['FirstAnswerer_ProfessionalQuestion'].fillna(-1)
basic_info('Industry')
print(map_and_fill("Industry"))
df['Industry'] = df['Industry'].fillna(-1)
basic_info("TechEndorse")
basic_info("Country") # do keep this for later analysis
basic_info("Currency") # their salary is in this currency
basic_info("CompTotal") # salary! for a year?!
# CompTotal means total compensation, including bonus, etc, not just base salary
# so it is a numerical data
df['CompTotal'] = df['CompTotal'].fillna(0)
print(df['Currency'].unique())
# keep first 3 letters of the currency that is not null
df['Currency'] = df['Currency'].str[:3]
print(df['Currency'].value_counts())
# using gpt to get the currency to usd exchange rate
usd_avg_rates_2024 = {
"AUD": 1.5158, # Australian dollar
"BRL": 5.3872, # Brazilian real
"CAD": 1.3699, # Canadian dollar
"CNY": 7.1957, # Chinese Yuan Renminbi
"DKK": 6.8955, # Danish krone
"EUR": 1.0820, # European Euro
"HKD": 7.8030, # Hong Kong dollar
"INR": 83.6566, # Indian rupee
"JPY": 151.4551, # Japanese yen
"MYR": 4.5747, # Malaysian ringgit
"MXN": 18.3062, # Mexican peso
"NZD": 1.6529, # New Zealand dollar
"NOK": 10.7574, # Norwegian krone
"SGD": 1.3363, # Singapore dollar
"ZAR": 18.3346, # South African rand
"KRW": 1363.4381, # South Korean won
"LKR": 301.6752, # Sri Lankan rupee
"SEK": 10.5744, # Swedish krona
"CHF": 0.8808, # Swiss franc
"TWD": 32.1064, # New Taiwan dollar
"THB": 35.2845, # Thai baht
"GBP": 1.2781, # Pound sterling
"VES": 38.3314 # Venezuelan bolivar
}
# convert all salary to usd
print(df[['CompTotal']].describe())
print(df['CompTotal'].max())
# there are some extreme outliers; trim the top and bottom 1% using quantiles
#TODO apply currency conversion (a sketch follows below)
df = df.loc[(df['CompTotal'] < df['CompTotal'].quantile(0.99)) & (df['CompTotal'] > df['CompTotal'].quantile(0.01))]
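# the currency conversion flagged in the TODO above is never applied in this notebook;
# a minimal sketch of how it could look, assuming usd_avg_rates_2024 is quoted as
# local-currency units per USD (some entries, e.g. EUR and GBP, look like USD per unit,
# so the table should be double-checked before relying on the result); at this point
# Currency still holds the 3-letter codes, so the string lookup works
# df['CompTotal'] = df['CompTotal'] / df['Currency'].map(usd_avg_rates_2024).fillna(1.0)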
# the salary range is limited further to a fixed band (1,000–500,000 USD per year) below
# count how many respondents report each exact salary value
salary_counts = df['CompTotal'].value_counts()
import plotly.express as px
# Convert to DataFrame
df_salary = salary_counts.reset_index()
df_salary.columns = ['Salary', 'Count']
fig = px.histogram(
    df_salary,
    x='Salary',
    y='Count',
    nbins=100,  # Adjust bins for granularity
    title='Salary Distribution Histogram',
    labels={'Salary': 'Salary Amount', 'Count': 'Number of People'},
    log_y=False  # linear scale; the log-scale version follows
)
fig.show()
# Plot histogram (weighted by Count)
fig = px.histogram(
    df_salary,
    x='Salary',
    y='Count',
    nbins=100,  # Adjust bins for granularity
    title='Salary Distribution Histogram (log scale)',
    labels={'Salary': 'Salary Amount', 'Count': 'Number of People'},
    log_y=True  # Log scale for better visibility
)
fig.show()
# limit salary to the range 1,000–500,000 USD per year
df = df.loc[(df['CompTotal'] >= 1000) & (df['CompTotal'] <= 500000)]
print(df[['CompTotal']].describe())
basic_info('JobSatPoints_1')
# JobSatPoints_1~11 need fillna
# these are job-satisfaction points allocated across items, so each respondent's total should be 100; let's check
# !! there are no JobSatPoints_2 and JobSatPoints_3 columns
# fillna with -1 first
for i in range(1, 12):
    if i in [2, 3]:
        continue
    df[f'JobSatPoints_{i}'] = df[f'JobSatPoints_{i}'].fillna(-1)
jsp_cols = [f'JobSatPoints_{i}' for i in range(1, 12) if i not in [2, 3]]
total_score = df[jsp_cols].sum(axis=1)
print(total_score.value_counts())
# some rows sum to more than 100, let's count them
print((total_score > 100).sum())
# let's try to normalize those rows back to a 100-point total and round to integers
# in case of floating-point accuracy problems
over_100 = total_score > 100
df.loc[over_100, jsp_cols] = df.loc[over_100, jsp_cols].div(total_score[over_100], axis=0).multiply(100)
df[jsp_cols] = df[jsp_cols].round().astype(int)
total_score = df[jsp_cols].sum(axis=1)
print(total_score.value_counts())
# check again how many rows still exceed 100
print((total_score > 100).sum())
# check null values again
null_counts = df.isnull().sum()
print(null_counts[null_counts > 0])
drop_columns = ['TechEndorse','SurveyLength','SurveyEase','ConvertedCompYearly']
explode_columns = ['OpSysPersonal use','OpSysProfessional use']
for col in explode_columns:
    df = fast_explode(df, target_column=col, fillna='', split=';', prefix=f'{col} ')
df.drop(columns=drop_columns, inplace=True)
df['TBranch'] = df['TBranch'].map({'Yes': 1, 'No': 0})
df['TBranch'] = df['TBranch'].fillna(-1)
print(map_and_fill('Country'))
print(map_and_fill('Currency'))
null_counts = df.isnull().sum()
print(null_counts[null_counts > 0])
df
df.replace({True: 1, False: 0}, inplace=True)
# df
# start model prediction using xgboost or lightgbm
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
# --- Candidate models ---
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# ahh.. standard scaler
# ========== 1. Data preparation ==========
# Filter and clean
# using country and currency as grouping then separate the data into train_df
# train_df = df.loc[df["CompTotal"] >= 1000].copy()
# train_df.drop(columns=["JobSat"], errors="ignore", inplace=True)
#
# # Split features and target
# X = train_df.drop(columns=["CompTotal"])
# y = train_df["CompTotal"]
#
# # Split into training / testing
# X_train, X_test, y_train, y_test = train_test_split(
# X, y, test_size=0.2, random_state=42
# )
#
# # ========== 2. Preprocessing pipeline ==========
# # Identify numeric and categorical columns
# num_cols = X_train.select_dtypes(include=["number"]).columns
# cat_cols = X_train.select_dtypes(exclude=["number"]).columns
#
# # Numeric pipeline
# numeric_pipeline = Pipeline(steps=[
#
# ("scaler", StandardScaler())
# ])
#
#
#
# # Combine into column transformer
# preprocessor = ColumnTransformer(
# transformers=[
# ("num", numeric_pipeline, num_cols),
#
# ]
# )
#
# # ========== 3. Define models ==========
# models = {
# "GradientBoosting": GradientBoostingRegressor(
# n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42
# ),
# "RandomForest": RandomForestRegressor(
# n_estimators=400, random_state=42, n_jobs=-1
# ),
# "HistGradientBoosting": HistGradientBoostingRegressor(
# learning_rate=0.05, random_state=42
# )
# }
#
#
# models["XGBoost"] = XGBRegressor(
# n_estimators=600, learning_rate=0.05, max_depth=6,
# subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42
# )
#
#
# models["LightGBM"] = LGBMRegressor(
# n_estimators=600, learning_rate=0.05, num_leaves=31,
# subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42
# )
#
# # ========== 4. Evaluate each model with cross-validation ==========
# cv = KFold(n_splits=5, shuffle=True, random_state=42)
# results = []
#
# for name, model in models.items():
# pipe = Pipeline(steps=[("preprocessor", preprocessor),
# ("model", model)])
# scores = cross_val_score(
# pipe, X_train, y_train,
# scoring="neg_root_mean_squared_error",
# cv=cv, n_jobs=-1,error_score='raise'
# )
# rmse = -scores.mean()
# print(f"{name:>20s} | CV RMSE: {rmse:.2f}")
# results.append((name, rmse, scores.std()))
#
# # Sort and select best
# results_df = pd.DataFrame(results, columns=["Model", "CV_RMSE", "Std"]).sort_values("CV_RMSE")
# best_model_name = results_df.iloc[0]["Model"]
# print("\nBest model:", best_model_name)
#
# # ========== 5. Fit best model pipeline and evaluate on test set ==========
# best_model = models[best_model_name]
# best_pipeline = Pipeline(steps=[
# ("preprocessor", preprocessor),
# ("model", best_model)
# ])
#
# best_pipeline.fit(X_train, y_train)
# y_pred = best_pipeline.predict(X_test)
#
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# mae = mean_absolute_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)
#
# print(f"\n--- Test Performance ({best_model_name}) ---")
# print(f"RMSE: {rmse:.2f}")
# print(f"MAE : {mae:.2f}")
# print(f"R² : {r2:.4f}")
#
# # Optional: show model comparison table
# print("\nModel comparison (lower RMSE better):")
# print(results_df.to_string(index=False))
# lightgbm seems to be the best model here; check feature importance and plot the top 20 features
# feature_names_num = num_cols.tolist()
#
# feature_names = feature_names_num
#
# importances = best_model.feature_importances_
# feature_importance_df = pd.DataFrame({
# 'Feature': feature_names,
# 'Importance': importances
# }).sort_values(by='Importance', ascending=False)
#
# print("\nTop 20 Feature Importances:")
# print(feature_importance_df.head(20))
#
# # Plot top 20 features
# plt.figure(figsize=(10, 6))
# sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20))
# plt.title('Top 20 Feature Importances')
# plt.tight_layout()
# plt.show()
"""![image.png](attachment:2f8352b5-06b3-4ccd-870c-c3ffc3318a1f.png)
# first modelling is terrible, we need a deeper analysis of the data.
# we can see that the country and currency are important features, we can try to group the data by country and currency and see what we can find out.
"""
for train_df in df.groupby(['Country', 'Currency']):
    print(train_df[0], train_df[1].shape)
"""# we can see that some country and currency have very few data, we can filter out these data and only keep the country and currency with more than 1000 data points.
"""
# finding the country and currency with more than 1000 data points
for train_df in df.groupby(['Country', 'Currency']):
    if train_df[1].shape[0] > 1000:
        print(train_df[0], train_df[1].shape)
        for x in value_maps['Country'].keys():
            if value_maps['Country'][x] == train_df[0][0]:
                print(x)
        for x in value_maps['Currency'].keys():
            if value_maps['Currency'][x] == train_df[0][1]:
                print(x)
# and it's USA/USD, France/EUR, UK/GBP, Canada/CAD and Germany/EUR
# select these country and currency pairs; below we keep only USA and USD (codes 3 and 1)
df_cleaned = df.loc[df["CompTotal"] >= 1000].copy()
df_cleaned.drop(columns=["JobSat"], errors="ignore", inplace=True)
df_cleaned = df_cleaned[(df_cleaned['Country'] == value_maps['Country']['United States of America'])
                        & (df_cleaned['Currency'] == value_maps['Currency']['USD'])]
from sklearn.decomposition import PCA
results = []
for group in df_cleaned.groupby(['Country', 'Currency']):
    train_df = group[1]
    if train_df.shape[0] < 1000:
        continue
    # --- 1️⃣ Prepare data ---
    # keep only the numeric columns of this group's data
    X = train_df.select_dtypes(include=['number'])
    # --- 2️⃣ Standardize the features ---
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # --- 3️⃣ Apply PCA ---
    # sweep 1–99 components and record the cumulative explained variance
    for n in range(1, 100):
        pca = PCA(n_components=n)
        pca.fit(X_scaled)
        results.append({'Components': n, 'ExplainedVariance': pca.explained_variance_ratio_.sum()})
pca_summary = pd.DataFrame(results)
print(pca_summary)
plt.plot(pca_summary['Components'], pca_summary['ExplainedVariance'], marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Total Explained Variance')
plt.title('Explained Variance by Number of PCA Components')
plt.grid(True)
plt.show()
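# a small follow-up sketch: pick the smallest component count that reaches a chosen
# explained-variance threshold (95% here is an assumption, not something the notebook fixes)
_enough = pca_summary[pca_summary['ExplainedVariance'] >= 0.95]
if not _enough.empty:
    print("components needed for >=95% variance:", int(_enough['Components'].iloc[0]))
else:
    print("no component count in the sweep reaches 95% explained variance")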
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import clone
from sklearn.inspection import permutation_importance
import numpy as np
# --- baseline models ---
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
# kaggle top models
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# ========== 1. Data preparation ==========
df_cleaned = df.loc[(df["CompTotal"] >= 1000) & (df["CompTotal"] <= 300000)].copy()
df_cleaned.drop(columns=["JobSat"], errors="ignore", inplace=True)
for group in df_cleaned.groupby(['Country', 'Currency']):
    train_df = group[1]
    if train_df.shape[0] < 1000:
        continue
    # Split features and target
    X = train_df.drop(columns=["CompTotal"])
    y = train_df["CompTotal"]
    # Split into training / testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # ========== 2. Preprocessing pipeline ==========
    # Identify numeric and categorical columns
    num_cols = X_train.select_dtypes(include=["number"]).columns
    cat_cols = X_train.select_dtypes(exclude=["number"]).columns
    # Numeric pipeline
    numeric_pipeline = Pipeline(steps=[
        ("scaler", StandardScaler())
    ])
    # Combine into column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, num_cols),
        ]
    )
    # ========== 3. Define models ==========
    models = {
        "GradientBoosting": GradientBoostingRegressor(
            n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42
        ),
        "RandomForest": RandomForestRegressor(
            n_estimators=400, random_state=42, n_jobs=-1
        ),
        "HistGradientBoosting": HistGradientBoostingRegressor(
            learning_rate=0.05, random_state=42
        )
    }
    models["XGBoost"] = XGBRegressor(
        n_estimators=600, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42
    )
    models["LightGBM"] = LGBMRegressor(
        n_estimators=600, learning_rate=0.05, num_leaves=31,
        subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42
    )
    # ========== 4. Evaluate each model with cross-validation ==========
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    results = []
    for name, model in models.items():
        pipe = Pipeline(steps=[("preprocessor", preprocessor),
                               ("model", model)])
        scores = cross_val_score(
            pipe, X_train, y_train,
            scoring="neg_root_mean_squared_error",
            cv=cv, n_jobs=-1, error_score='raise'
        )
        rmse = -scores.mean()
        print(f"{name:>20s} | CV RMSE: {rmse:.2f}")
        results.append((name, rmse, scores.std()))
    # Sort and select best
    results_df = pd.DataFrame(results, columns=["Model", "CV_RMSE", "Std"]).sort_values("CV_RMSE")
    best_model_name = results_df.iloc[0]["Model"]
    print("\nBest model:", best_model_name)
    # ========== 5. Fit best model pipeline and evaluate on test set ==========
    best_model = models[best_model_name]
    best_pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", best_model)
    ])
    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"\n--- Test Performance ({best_model_name}) ---")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE : {mae:.2f}")
    print(f"R² : {r2:.4f}")
    # Optional: show model comparison table
    print("\nModel comparison (lower RMSE better):")
    print(results_df.to_string(index=False))
    # look up the human-readable country / currency labels for this group
    _country = ''
    for x in value_maps['Country'].keys():
        if value_maps['Country'][x] == group[0][0]:
            _country = x
    for x in value_maps['Currency'].keys():
        if value_maps['Currency'][x] == group[0][1]:
            print(x)
    print('*'*40)
    print('*'*40)
    feature_names_num = num_cols.tolist()
    feature_names = feature_names_num
    importances = best_model.feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)
    print("\nTop 20 Feature Importances:")
    print(feature_importance_df.head(20))
    # ---------- refit and re-evaluate (repeats the first evaluation above) ----------
    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"\n--- Test Performance ({best_model_name}) ---")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE : {mae:.2f}")
    print(f"R² : {r2:.4f}")
    print("\nModel comparison (lower RMSE better):")
    print(results_df.to_string(index=False))
    # --------- Nice labels for Country/Currency (unchanged) ---------
    _country = ''
    for x in value_maps['Country'].keys():
        if value_maps['Country'][x] == group[0][0]:
            _country = x
    _currency = ''
    for x in value_maps['Currency'].keys():
        if value_maps['Currency'][x] == group[0][1]:
            _currency = x
    print(_currency)
    print('*'*40)
    print('*'*40)
    # ===================== TOP-50 FEATURE REFIT =====================
    # 1) Get feature importances (fallback to permutation importance if needed)
    feature_names = num_cols.tolist()  # only numeric used in the preprocessor
    if hasattr(best_model, "feature_importances_"):
        importances = best_model.feature_importances_
    else:
        # Fallback: permutation importance on training set
        # (uses the full pipeline so it handles scaling correctly)
        perm = permutation_importance(
            best_pipeline, X_train, y_train,
            n_repeats=5,
            scoring="neg_root_mean_squared_error",
            random_state=42,
            n_jobs=-1
        )
        # With the current preprocessor (only numeric), importances align to feature_names
        importances = perm.importances_mean
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)
    print("\nTop 20 Feature Importances (all features):")
    print(feature_importance_df.head(20))
    # 2) Pick top-K features
    TOP_K = min(50, len(feature_names))
    top_features = feature_importance_df.head(TOP_K)['Feature'].tolist()
    # 3) Build a new preprocessor that only uses the top-K features
    preprocessor_top = ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, top_features),
        ]
    )
    # 4) Clone the best model (clean, unfitted) and retrain on top-K
    best_model_top = clone(best_model)
    best_pipeline_top = Pipeline(steps=[
        ("preprocessor", preprocessor_top),
        ("model", best_model_top)
    ])
    best_pipeline_top.fit(X_train, y_train)
    y_pred_top = best_pipeline_top.predict(X_test)
    rmse_top = np.sqrt(mean_squared_error(y_test, y_pred_top))
    mae_top = mean_absolute_error(y_test, y_pred_top)
    r2_top = r2_score(y_test, y_pred_top)
    # 5) Report before/after
    print(f"\n=== Top-{TOP_K} Feature Refit ({best_model_name}) in {_country} [{_currency}] ===")
    print(f"BEFORE -> RMSE: {rmse:.2f} | MAE: {mae:.2f} | R²: {r2:.4f} (all {len(feature_names)} numeric features)")
    print(f"AFTER -> RMSE: {rmse_top:.2f} | MAE: {mae_top:.2f} | R²: {r2_top:.4f} (top {TOP_K} features)")
    delta = rmse_top - rmse
    print(f"ΔRMSE (top-{TOP_K} - all): {delta:+.2f}")
    # Optional: plot top 20 features among the selected set (from original ranking)
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20))
    plt.title(f'Top 20 Feature Importances in {_country}')
    plt.tight_layout()
    plt.show()
#
# TODO: retrain using only the top 30 features