How to properly use SMOTE for data balancing
I want to know whether SMOTE must be applied only after splitting the data into training and test sets. I applied SMOTE after train_test_split
for churn prediction, but I did not see any significant improvement between the models trained before and after SMOTE. My entire code using SMOTE is below. I am not sure where the issue is, and I would like to know whether I used SMOTE properly.
Below is the code
import pandas as pd
import numpy as np
from datetime import timedelta,datetime,date
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from numpy import percentile
# Load the Telco customer-churn dataset and take a first look at it.
tel_data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
tel_data.info()
tel_data.isnull().sum()

# Encode the target as 0/1.
num = {"No": 0, "Yes": 1}
tel_data = tel_data.replace({"Churn": num})

# TotalCharges is loaded as object, presumably because some rows hold text
# that is not a number. Converting with errors="coerce" turns those entries
# into NaN instead of raising ValueError (which the plain pd.to_numeric call
# would do), so they become visible to isnull() and removable with dropna().
tel_data['TotalCharges'] = pd.to_numeric(tel_data['TotalCharges'], errors="coerce")
tel_data.head(2)

# Class balance of the target.
tel_data['Churn'].value_counts()
plt.figure(figsize=(6, 5))
# Pass the column through x= explicitly: recent seaborn releases treat the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=tel_data['Churn'])
plt.show()

# After coercion the unparsable TotalCharges values show up as nulls here.
tel_data.isnull().sum()
# deleting the rows with null values
tel_data = tel_data.dropna(axis=0)
# One-hot encode the categorical features. drop_first=True drops the first
# level of each feature so the indicator columns are not redundant.
categorical_cols = [
    'gender', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]
tel_data = pd.get_dummies(tel_data, columns=categorical_cols, drop_first=True)

# Target vector and feature matrix. 'customerID' is left out of the features
# since it doesn't serve any purpose for prediction.
y = tel_data['Churn']
X = tel_data.drop(columns=['customerID', 'Churn'])
# Feature screening: chi-squared test of every feature against the target.
# NOTE(review): this runs on the full dataset, before the train/test split.
from sklearn.feature_selection import chi2

chi_scores = chi2(X, y)
chi_values, chi_p_values = chi_scores
print('chi_values:', chi_values, '\n')
print('p_values:', chi_p_values)

# Bar chart of the p-values, largest first.
p_values = pd.Series(chi_p_values, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(12, 8))
p_values.plot.bar()
plt.show()

# Remove the columns that are not kept as features (presumably chosen from
# the p-value chart above), together with the 'customerID' identifier.
columns_to_drop = [
    'PhoneService_Yes',
    'gender_Male',
    'MultipleLines_No phone service',
    'MultipleLines_Yes',
    'customerID',
]
tel_data = tel_data.drop(columns=columns_to_drop)
tel_data.head(2)

# Rebuild features and target from the reduced frame ('customerID' was
# already removed by the drop above).
y = tel_data['Churn']
X = tel_data.drop(columns=['Churn'])
# import sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score
# splitting into train and test data
from sklearn.model_selection import train_test_split
# 70/30 hold-out split; stratify=y keeps the churn ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Baseline model: XGBoost fitted on the original (not resampled) training data.
model_xgb_1 = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.3,
    max_depth=5,
    random_state=42,
)
xgbmod = model_xgb_1.fit(X_train, y_train)

# checking accuracy of training data
print('Accuracy of XGB classifier on training set: {:.2f}'
      .format(xgbmod.score(X_train, y_train)))

# Evaluate on the held-out test set. The fitted model is `xgbmod`; the
# previously used name `trn_xgbmod` was never defined and raised NameError.
y_xgb_pred = xgbmod.predict(X_test)
print(classification_report(y_test, y_xgb_pred))
# SMOTE variant: oversample the training split only, then fit the same
# XGBoost configuration on the resampled data. X_test / y_test are not
# resampled, so the report below is computed on original test data.
from imblearn.over_sampling import SMOTE

smote_preprocess = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote_preprocess.fit_resample(
    X_train, y_train
)

xgb_smote_params = {
    'n_estimators': 100,
    'learning_rate': 0.3,
    'max_depth': 5,
    'random_state': 42,
}
model_xgb_smote = xgb.XGBClassifier(**xgb_smote_params)
xgbmod_smote = model_xgb_smote.fit(X_train_resampled, y_train_resampled)

# Accuracy on the (resampled) training data.
train_accuracy_smote = xgbmod_smote.score(X_train_resampled, y_train_resampled)
print('Accuracy of XGB classifier on training set: {:.2f}'.format(train_accuracy_smote))

# Per-class precision / recall / F1 on the test set.
y_xgb_pred_smote = xgbmod_smote.predict(X_test)
print(classification_report(y_test, y_xgb_pred_smote))
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|