The objective of this project is to predict customer behavior in order to retain customers. Analysing all relevant customer data can help develop focused customer retention programs. The dataset was obtained from https://www.kaggle.com/mnassrib/telecom-churn-datasets. It contains information about each customer's state, area code, account length, type of plan, how much they spend on charges, when they make most of their calls, and more. These variables are explored to gain insights into customer behavior, and machine learning algorithms are used to predict whether a customer will keep their plan or churn.
#!pip install sweetviz
#!pip install missingno
import pandas as pd
import numpy as np
from numpy import loadtxt
import warnings
import sweetviz as sv
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as missing
import random
import re
import sys
import joblib
from sklearn import feature_selection
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score, fbeta_score
from sklearn.metrics import accuracy_score
import matplotlib.ticker as mtick
from IPython.display import display
%matplotlib inline
color = sns.color_palette()
warnings.simplefilter(action = 'ignore', category = FutureWarning)
df = pd.read_csv('projeto4_telecom_treino.csv')
df.dtypes
Unnamed: 0                         int64
state                             object
account_length                     int64
area_code                         object
international_plan                object
voice_mail_plan                   object
number_vmail_messages              int64
total_day_minutes                float64
total_day_calls                    int64
total_day_charge                 float64
total_eve_minutes                float64
total_eve_calls                    int64
total_eve_charge                 float64
total_night_minutes              float64
total_night_calls                  int64
total_night_charge               float64
total_intl_minutes               float64
total_intl_calls                   int64
total_intl_charge                float64
number_customer_service_calls      int64
churn                             object
dtype: object
df.head()
| | Unnamed: 0 | state | account_length | area_code | international_plan | voice_mail_plan | number_vmail_messages | total_day_minutes | total_day_calls | total_day_charge | ... | total_eve_calls | total_eve_charge | total_night_minutes | total_night_calls | total_night_charge | total_intl_minutes | total_intl_calls | total_intl_charge | number_customer_service_calls | churn |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | KS | 128 | area_code_415 | no | yes | 25 | 265.1 | 110 | 45.07 | ... | 99 | 16.78 | 244.7 | 91 | 11.01 | 10.0 | 3 | 2.70 | 1 | no |
| 1 | 2 | OH | 107 | area_code_415 | no | yes | 26 | 161.6 | 123 | 27.47 | ... | 103 | 16.62 | 254.4 | 103 | 11.45 | 13.7 | 3 | 3.70 | 1 | no |
| 2 | 3 | NJ | 137 | area_code_415 | no | no | 0 | 243.4 | 114 | 41.38 | ... | 110 | 10.30 | 162.6 | 104 | 7.32 | 12.2 | 5 | 3.29 | 0 | no |
| 3 | 4 | OH | 84 | area_code_408 | yes | no | 0 | 299.4 | 71 | 50.90 | ... | 88 | 5.26 | 196.9 | 89 | 8.86 | 6.6 | 7 | 1.78 | 2 | no |
| 4 | 5 | OK | 75 | area_code_415 | yes | no | 0 | 166.7 | 113 | 28.34 | ... | 122 | 12.61 | 186.9 | 121 | 8.41 | 10.1 | 3 | 2.73 | 3 | no |
5 rows × 21 columns
df['international_plan'].value_counts() # How many customers have an international plan
no     3010
yes     323
Name: international_plan, dtype: int64
df['voice_mail_plan'].value_counts()
no     2411
yes     922
Name: voice_mail_plan, dtype: int64
eda = sv.analyze(source = df,
target_feat = 'churn')
eda.show_notebook()
Churn does not vary with area code. It is higher among customers with an international plan. By state, it is higher in (22%) and TX (25%), against a mean rate of 14%. It is also higher among customers who do not use voice mail (17%) than among those who do (9%). Churn increases above 250 total day minutes, above 40 total day charge, above 250 total evening minutes, and above 20 total evening charge. Up to 50 total day calls, churn is high. It decreases with total night minutes, suggesting the plan is advantageous for customers who prefer to call at night.
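As a quick cross-check of these sweetviz insights, the churn rate per segment can be recomputed directly with pandas; a minimal sketch for the international plan split (churn is still the 'yes'/'no' string column at this point):

# Churn rate by international plan, computed directly from the raw dataframe
plan_rate = df['churn'].eq('yes').groupby(df['international_plan']).mean()
print(plan_rate)  # expected to be noticeably higher for plan == 'yes'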
pred_vars = ['number_vmail_messages', 'total_day_minutes', 'total_day_calls', 'total_day_charge',
'total_eve_calls', 'total_eve_charge', 'total_night_minutes', 'total_night_calls',
'total_night_charge', 'total_intl_minutes',
'total_intl_charge', 'number_customer_service_calls']
fig, ax = plt.subplots(2, 6, figsize=(22, 6));
df[df.churn == 'no'][pred_vars].hist( bins=30, color="blue", alpha=0.5, ax=ax);
df[df.churn == 'yes'][pred_vars].hist( bins=30, color="red", alpha=0.5, ax=ax);
Customers who spend more minutes during the day and accumulate a higher total day charge tend to churn more; perhaps they need a daytime offer. Customers who do not use voice messages also show high churn.
# Color palette
myred='#E74C3C'
myblue='#2E86C1'
mygreen='#28B463'
cols= [myblue, myred, mygreen]
# percentage of churn
df['churn'].replace(['yes', 'no'],[1,0]).sum()/df['churn'].count()
0.14491449144914492
The churn rate is 14.49%
sns.countplot(data=df,x='churn',palette=cols);
df_num=df[['total_day_minutes', 'total_day_calls', 'total_day_charge', 'total_eve_calls',
'total_eve_charge', 'total_night_minutes', 'total_night_calls', 'total_night_charge',
'total_intl_minutes', 'total_intl_charge']]
#df_num
plt.figure(figsize=(10, 6))
sns.heatmap(df_num.corr(),annot = True,
cmap='Blues');
Collinearity:
total_night_charge and total_night_minutes
total_day_charge and total_day_minutes
total_intl_charge and total_intl_minutes
This is expected: the charge is computed from the minutes used, so the more minutes a customer uses, the more they are charged.
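A quick numerical check confirms these near-perfect correlations; a minimal sketch over the minutes/charge pairs:

# Correlation between minutes and charge for each calling period; expected ~1.0
for period in ['day', 'eve', 'night', 'intl']:
    r = df[f'total_{period}_minutes'].corr(df[f'total_{period}_charge'])
    print(f'{period}: r = {r:.4f}')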
df['churn1']=df.churn.replace(['yes', 'no'],[1,0])
df_plot = df.groupby('number_customer_service_calls').churn1.mean().reset_index()
df.drop('churn1', axis=1, inplace = True)
df_plot
| | number_customer_service_calls | churn1 |
|---|---|---|
| 0 | 0 | 0.131994 |
| 1 | 1 | 0.103302 |
| 2 | 2 | 0.114625 |
| 3 | 3 | 0.102564 |
| 4 | 4 | 0.457831 |
| 5 | 5 | 0.606061 |
| 6 | 6 | 0.636364 |
| 7 | 7 | 0.555556 |
| 8 | 8 | 0.500000 |
| 9 | 9 | 1.000000 |
#Number of Service calls vs churn
x =df_plot['number_customer_service_calls']
y=df_plot['churn1']
plt.bar(x,y);
plt.title('Churn Rate VS Number of Customer Service Calls');
plt.xlabel('Num of Customer Service Calls');
plt.ylabel('Churn Rate');
plt.xticks(x, (0,1,2,3,4,5,6,7,8,9));
The churn rate increases sharply once a customer has made four or more customer service calls.
cols= [myblue, myred, mygreen]
sns.boxplot(x="churn", y="total_intl_charge", data=df, palette=cols);
plt.title('Total International Charge VS Churn');
plt.xlabel('Churn');
plt.ylabel('Total International Charge');
df.groupby(['churn'])['total_intl_charge'].mean()
churn
no     2.743404
yes    2.889545
Name: total_intl_charge, dtype: float64
No significant difference between the groups.
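Strictly speaking, "significant" calls for a test. A two-sample sketch, assuming scipy is available (it is not imported above); churn is still the 'yes'/'no' string column here:

from scipy import stats

# Welch's t-test on total_intl_charge for churners vs non-churners
churn_yes = df.loc[df['churn'] == 'yes', 'total_intl_charge']
churn_no = df.loc[df['churn'] == 'no', 'total_intl_charge']
t, p = stats.ttest_ind(churn_yes, churn_no, equal_var=False)
print(f't = {t:.3f}, p = {p:.4f}')  # prints the statistic and p-value for the comparison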
df["state"].nunique()
51
# Drop uninformative variables (the row index and the 51-level state column)
df.drop(['Unnamed: 0', 'state'], axis = 1, inplace = True)
#df.head()
df.isnull().sum()
df.isna().any()
account_length                   False
area_code                        False
international_plan               False
voice_mail_plan                  False
number_vmail_messages            False
total_day_minutes                False
total_day_calls                  False
total_day_charge                 False
total_eve_minutes                False
total_eve_calls                  False
total_eve_charge                 False
total_night_minutes              False
total_night_calls                False
total_night_charge               False
total_intl_minutes               False
total_intl_calls                 False
total_intl_charge                False
number_customer_service_calls    False
churn                            False
dtype: bool
missing.matrix(df, figsize = (8,4), color = (0, 0.1, 0.25), sparkline = False);
The dataset contains no missing values.
df['churn'].value_counts()
no     2850
yes     483
Name: churn, dtype: int64
## Encoding the yes/no variables as 0 and 1
le = LabelEncoder()
df['international_plan'] = le.fit_transform(df['international_plan'])
df['voice_mail_plan'] = le.fit_transform(df['voice_mail_plan'])
df['churn'] = le.fit_transform(df['churn'])
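An explicit mapping is an equivalent alternative to the three LabelEncoder calls above and makes the 0/1 assignment visible (LabelEncoder's alphabetical ordering happens to give 'no' → 0, 'yes' → 1 here); shown for reference only, since the columns are already encoded at this point:

# Equivalent to the LabelEncoder cells above, with the encoding stated explicitly
yes_no = {'no': 0, 'yes': 1}
for col in ['international_plan', 'voice_mail_plan', 'churn']:
    df[col] = df[col].map(yes_no)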
df.head(10)
| account_length | area_code | international_plan | voice_mail_plan | number_vmail_messages | total_day_minutes | total_day_calls | total_day_charge | total_eve_minutes | total_eve_calls | total_eve_charge | total_night_minutes | total_night_calls | total_night_charge | total_intl_minutes | total_intl_calls | total_intl_charge | number_customer_service_calls | churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 128 | area_code_415 | 0 | 1 | 25 | 265.1 | 110 | 45.07 | 197.4 | 99 | 16.78 | 244.7 | 91 | 11.01 | 10.0 | 3 | 2.70 | 1 | 0 |
| 1 | 107 | area_code_415 | 0 | 1 | 26 | 161.6 | 123 | 27.47 | 195.5 | 103 | 16.62 | 254.4 | 103 | 11.45 | 13.7 | 3 | 3.70 | 1 | 0 |
| 2 | 137 | area_code_415 | 0 | 0 | 0 | 243.4 | 114 | 41.38 | 121.2 | 110 | 10.30 | 162.6 | 104 | 7.32 | 12.2 | 5 | 3.29 | 0 | 0 |
| 3 | 84 | area_code_408 | 1 | 0 | 0 | 299.4 | 71 | 50.90 | 61.9 | 88 | 5.26 | 196.9 | 89 | 8.86 | 6.6 | 7 | 1.78 | 2 | 0 |
| 4 | 75 | area_code_415 | 1 | 0 | 0 | 166.7 | 113 | 28.34 | 148.3 | 122 | 12.61 | 186.9 | 121 | 8.41 | 10.1 | 3 | 2.73 | 3 | 0 |
| 5 | 118 | area_code_510 | 1 | 0 | 0 | 223.4 | 98 | 37.98 | 220.6 | 101 | 18.75 | 203.9 | 118 | 9.18 | 6.3 | 6 | 1.70 | 0 | 0 |
| 6 | 121 | area_code_510 | 0 | 1 | 24 | 218.2 | 88 | 37.09 | 348.5 | 108 | 29.62 | 212.6 | 118 | 9.57 | 7.5 | 7 | 2.03 | 3 | 0 |
| 7 | 147 | area_code_415 | 1 | 0 | 0 | 157.0 | 79 | 26.69 | 103.1 | 94 | 8.76 | 211.8 | 96 | 9.53 | 7.1 | 6 | 1.92 | 0 | 0 |
| 8 | 117 | area_code_408 | 0 | 0 | 0 | 184.5 | 97 | 31.37 | 351.6 | 80 | 29.89 | 215.8 | 90 | 9.71 | 8.7 | 4 | 2.35 | 1 | 0 |
| 9 | 141 | area_code_415 | 1 | 1 | 37 | 258.6 | 84 | 43.96 | 222.0 | 111 | 18.87 | 326.4 | 97 | 14.69 | 11.2 | 5 | 3.02 | 0 | 0 |
scaler = StandardScaler()
df_final = df.copy()  # explicit copy so the transformations below do not also mutate df
num_vars = [ 'total_day_calls', 'total_day_charge',
'total_eve_calls', 'total_eve_charge', 'total_night_calls',
'total_night_charge', 'total_intl_charge']
df_final[num_vars] = scaler.fit_transform(df_final[num_vars])
df_final[num_vars]
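One caveat: the scaler is fitted on the full dataset before the train/test split (defined further below), which leaks test-set statistics into the scaling. A leakage-free sketch of the same step, assuming the X_train/X_test split already exists:

# Leakage-free alternative: fit the scaler on the training partition only
scaler = StandardScaler()
X_train = X_train.copy()
X_test = X_test.copy()
X_train[num_vars] = scaler.fit_transform(X_train[num_vars])
X_test[num_vars] = scaler.transform(X_test[num_vars])  # reuse training statistics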
def groupcat(x):
    # bucket service-call counts into 0, 1-3, 4-6, and 7+
    if x == 0:
        return '0'
    elif 1 <= x <= 3:
        return '1'
    elif 4 <= x <= 6:
        return '2'
    else:
        return 'Other'
df_final['number_customer_service_calls'] = df_final['number_customer_service_calls'].apply(groupcat)
df_final['number_customer_service_calls'].value_counts().plot(kind = 'pie');
This binning step improves results only for the Decision Tree Classifier model.
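The same grouping can be expressed with pd.cut, which makes the intended bin edges (0, 1-3, 4-6, 7+) explicit; an equivalent sketch, shown as an alternative to the apply(groupcat) call above:

# Equivalent binning with explicit edges: {0}, [1, 3], [4, 6], [7, inf)
df_final['number_customer_service_calls'] = pd.cut(
    df_final['number_customer_service_calls'],
    bins=[-1, 0, 3, 6, np.inf], labels=['0', '1', '2', 'Other'])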
#df['number_vmail_messages'].value_counts()
df_final[df_final['number_vmail_messages'] > 0]['number_vmail_messages'].describe()
count    922.000000
mean      29.277657
std        7.559027
min        4.000000
25%       24.000000
50%       29.000000
75%       34.000000
max       51.000000
Name: number_vmail_messages, dtype: float64
# Categorizing variables with a well-defined concentration of values
df_final['number_vmail_messages'].plot(kind = 'hist', edgecolor = 'black');
def categorizing(x):
if x == 0:
return 0
elif x < 24:
return 1
elif x < 29:
return 2
else:
return 3
df_final['number_vmail_messages'] = df_final['number_vmail_messages'].apply(categorizing)
df_final.head()
| | account_length | area_code | international_plan | voice_mail_plan | number_vmail_messages | total_day_minutes | total_day_calls | total_day_charge | total_eve_minutes | total_eve_calls | total_eve_charge | total_night_minutes | total_night_calls | total_night_charge | total_intl_minutes | total_intl_calls | total_intl_charge | number_customer_service_calls | churn |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 128 | area_code_415 | 0 | 1 | 2 | 265.1 | 0.476643 | 1.567036 | 197.4 | -0.055940 | -0.070427 | 244.7 | -0.465494 | 0.866029 | 10.0 | 3 | -0.085690 | 1 | 0 |
| 1 | 107 | area_code_415 | 0 | 1 | 2 | 161.6 | 1.124503 | -0.334013 | 195.5 | 0.144867 | -0.107549 | 254.4 | 0.147825 | 1.059390 | 13.7 | 3 | 1.241169 | 1 | 0 |
| 2 | 137 | area_code_415 | 0 | 0 | 0 | 243.4 | 0.675985 | 1.168464 | 121.2 | 0.496279 | -1.573900 | 162.6 | 0.198935 | -0.755571 | 12.2 | 5 | 0.697156 | 0 | 0 |
| 3 | 84 | area_code_408 | 1 | 0 | 0 | 299.4 | -1.466936 | 2.196759 | 61.9 | -0.608159 | -2.743268 | 196.9 | -0.567714 | -0.078806 | 6.6 | 7 | -1.306401 | Other | 0 |
| 4 | 75 | area_code_415 | 1 | 0 | 0 | 166.7 | 0.626149 | -0.240041 | 148.3 | 1.098699 | -1.037939 | 186.9 | 1.067803 | -0.276562 | 10.1 | 3 | -0.045885 | 1 | 0 |
vars_cat = ['number_vmail_messages', 'number_customer_service_calls', 'area_code']
df_final = pd.get_dummies(df_final, columns = vars_cat, drop_first = True)
# Split train and test
X = df_final.loc[:, df_final.columns != 'churn']
y = df_final['churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1)
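Since only about 14.5% of customers churn, a stratified split keeps that ratio identical in both partitions; a variant worth considering:

# Stratified variant of the split above: preserves the churn ratio in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1, stratify=y)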
models = []
models.append(('Logistic Regression', LogisticRegression(solver='liblinear',
class_weight='balanced')))
models.append(('SVC', SVC(kernel = 'linear')))
models.append(('Kernel SVM', SVC(kernel = 'rbf')))
models.append(('KNN', KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)))
models.append(('Gaussian NB', GaussianNB()))
models.append(('Decision Tree Classifier',
DecisionTreeClassifier(criterion = 'entropy', random_state = 0)))
models.append(('Random Forest', RandomForestClassifier(
n_estimators=100, criterion = 'entropy')))
#Evaluating Model Results:
acc_results = []
auc_results = []
names = []
# set up a table to populate with performance results
col = ['Algorithm', 'ROC AUC Mean', 'ROC AUC STD',
'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns=col)
i = 0
# Evaluate each model using k-fold cross-validation:
for name, model in models:
kfold = model_selection.KFold(
    n_splits=10, shuffle=True, random_state=0)  # shuffle=True so random_state takes effect
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(
model, X_train, y_train, cv=kfold, scoring='accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(
model, X_train, y_train, cv=kfold, scoring='roc_auc')
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
model_results.sort_values(by=['ROC AUC Mean'], ascending=False)
| | Algorithm | ROC AUC Mean | ROC AUC STD | Accuracy Mean | Accuracy STD |
|---|---|---|---|---|---|
| 6 | Random Forest | 90.48 | 4.42 | 94.45 | 2.31 |
| 0 | Logistic Regression | 85.96 | 3.38 | 84.55 | 2.36 |
| 1 | SVC | 85.79 | 3.37 | 85.48 | 2.38 |
| 4 | Gaussian NB | 84.60 | 3.11 | 87.40 | 1.73 |
| 5 | Decision Tree Classifier | 83.58 | 4.83 | 91.33 | 2.13 |
| 2 | Kernel SVM | 73.58 | 4.49 | 85.56 | 2.66 |
| 3 | KNN | 65.36 | 5.35 | 86.46 | 2.21 |
fig = plt.figure(figsize=(15, 7))
ax = fig.add_subplot(111)
plt.boxplot(acc_results)
ax.set_xticklabels(names)
plt.title('Accuracy Score Comparison \n',
fontsize = "22", fontfamily = "sans-serif")
plt.xticks(rotation=0, horizontalalignment="center")
plt.yticks(rotation=0, horizontalalignment="right")
plt.show()
Random Forest presented the best performance of the cross-validated models.
fig = plt.figure(figsize=(15, 7))
ax = fig.add_subplot(111)
plt.boxplot(auc_results)
ax.set_xticklabels(names)
plt.title('ROC AUC Comparison \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22",
fontfamily = "sans-serif")
plt.xticks(rotation=0, horizontalalignment="center")
plt.yticks(rotation=0, horizontalalignment="right")
plt.show()
# create model using DecisionTreeClassifier (already imported above) and fit training data
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
DecisionTreeClassifier()
# create prediction
dt_pred = dt_model.predict(X_test)
dt_pred[0:5]
array([0, 0, 0, 0, 0])
# Evaluating the prediction model
metrics.accuracy_score(y_test, dt_pred)
0.9010494752623688
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
KNeighborsClassifier()
result_knn = knn.predict(X_test)
print (pd.crosstab(y_test,result_knn, rownames=['Real'], colnames=['Predicted'], margins=True))
Predicted    0   1  All
Real
0          546  23  569
1           64  34   98
All        610  57  667
error = []
# Calculating error for K values between 1 and 40
for i in range(1, 40):
knn = KNeighborsClassifier(n_neighbors=i)
knn.fit(X_train, y_train)
pred_i = knn.predict(X_test)
error.append(np.mean(pred_i != y_test))
# Plot of Mean Error vs Number of Neighbors
plt.figure(figsize=(12, 6))
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
markerfacecolor='blue', markersize=10)
plt.title('Error rate of K value')
plt.xlabel('K value')
plt.ylabel('Mean Error')
Text(0, 0.5, 'Mean Error')
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=15)
result_knn = knn.predict(X_test)
print (pd.crosstab(y_test,result_knn, rownames=['Real'], colnames=['Predicted'], margins=True))
Predicted    0   1  All
Real
0          566   3  569
1           75  23   98
All        641  26  667
print(classification_report(y_test, result_knn))
precision recall f1-score support
0 0.88 0.99 0.94 569
1 0.88 0.23 0.37 98
accuracy 0.88 667
macro avg 0.88 0.61 0.65 667
weighted avg 0.88 0.88 0.85 667
score_array = []
for each in range(1,100):
rf_loop = RandomForestClassifier(
n_estimators = each, random_state = 1)
rf_loop.fit(X_train,y_train)
score_array.append(rf_loop.score(X_test,y_test))
fig = plt.figure(figsize=(15, 7))
plt.plot(range(1,100),score_array, color = '#ec838a')
plt.ylabel('Score\n',horizontalalignment="center",
    fontstyle = "normal", fontsize = "large",
    fontfamily = "sans-serif")
plt.xlabel('Number of Trees\n',horizontalalignment="center",
    fontstyle = "normal", fontsize = "large",
    fontfamily = "sans-serif")
plt.title('Optimal Number of Trees for Random Forest Model \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
plt.xticks(rotation=0, horizontalalignment="center")
plt.yticks(rotation=0, horizontalalignment="right")
plt.show()
# Model with 81 decision trees
rf = RandomForestClassifier (n_estimators = 81, random_state = 42)
# Training model
rf.fit (X_train, y_train);
# Predictions with test data
y_rf = rf.predict(X_test)
print (pd.crosstab(y_test,y_rf, rownames=['Real'], colnames=['Predicted'], margins=True))
Predicted    0   1  All
Real
0          565   4  569
1           31  67   98
All        596  71  667
print(classification_report(y_test, y_rf))
precision recall f1-score support
0 0.95 0.99 0.97 569
1 0.94 0.68 0.79 98
accuracy 0.95 667
macro avg 0.95 0.84 0.88 667
weighted avg 0.95 0.95 0.94 667
# Feature Importance
rf.feature_importances_
feature_importances = pd.DataFrame(rf.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance',ascending=False)
feature_importances
| importance | |
|---|---|
| total_day_charge | 0.140958 |
| total_day_minutes | 0.126932 |
| number_customer_service_calls_2 | 0.091497 |
| international_plan | 0.076248 |
| total_eve_minutes | 0.068470 |
| total_eve_charge | 0.064455 |
| total_intl_charge | 0.048407 |
| total_intl_calls | 0.048220 |
| total_intl_minutes | 0.046710 |
| total_night_charge | 0.042940 |
| total_night_minutes | 0.040504 |
| total_day_calls | 0.038503 |
| total_eve_calls | 0.032753 |
| total_night_calls | 0.032265 |
| account_length | 0.031946 |
| voice_mail_plan | 0.028626 |
| number_customer_service_calls_1 | 0.009869 |
| number_vmail_messages_3 | 0.007155 |
| area_code_area_code_415 | 0.005570 |
| number_customer_service_calls_Other | 0.005253 |
| area_code_area_code_510 | 0.005033 |
| number_vmail_messages_2 | 0.004082 |
| number_vmail_messages_1 | 0.003601 |
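A horizontal bar chart makes this ranking easier to scan; a minimal sketch using the dataframe above:

# Plot the ten most important features, largest on top
feature_importances.head(10).sort_values('importance').plot(
    kind='barh', legend=False, figsize=(8, 5))
plt.xlabel('Importance')
plt.title('Top 10 Feature Importances (Random Forest)')
plt.show()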
# Fitting the model to the training data
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
/home/natalia/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
[19:03:58] WARNING: ../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.300000012, max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)
# Predictions with test set
predict_xgb = xgb.predict(X_test)
print (pd.crosstab(y_test,predict_xgb, rownames=['Real'], colnames=['Predicted'], margins=True))
Predicted    0   1  All
Real
0          564   5  569
1           26  72   98
All        590  77  667
print(classification_report(y_test, predict_xgb))
precision recall f1-score support
0 0.96 0.99 0.97 569
1 0.94 0.73 0.82 98
accuracy 0.95 667
macro avg 0.95 0.86 0.90 667
weighted avg 0.95 0.95 0.95 667
# Voting Classifier with soft voting
vot = VotingClassifier(estimators=[('rf', rf),('xgb',xgb)], voting='soft')
vot = vot.fit(X_train,y_train)
[19:04:12] WARNING: ../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
/home/natalia/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
y_predicted = vot.predict(X_test)
print (pd.crosstab(y_test,y_predicted, rownames=['Real'], colnames=['Predicted'], margins=True))
Predicted    0   1  All
Real
0          564   5  569
1           25  73   98
All        589  78  667
print(classification_report(y_test, y_predicted))
precision recall f1-score support
0 0.96 0.99 0.97 569
1 0.94 0.74 0.83 98
accuracy 0.96 667
macro avg 0.95 0.87 0.90 667
weighted avg 0.95 0.96 0.95 667
# Churn Probability
df['proba'] = vot.predict_proba(df_final[X_train.columns])[:,1]
df[['proba']]
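These probabilities feed directly into a retention program, e.g. by surfacing the accounts most at risk; a sketch:

# Ten customers with the highest predicted churn probability
at_risk = df.sort_values('proba', ascending=False).head(10)
print(at_risk[['account_length', 'international_plan',
               'number_customer_service_calls', 'proba']])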
# Parameter grid for the search
paramGrid = dict(
missing = [np.nan],
booster = ['gbtree'],#, 'gblinear', 'dart'],
max_depth = [4, 5],
n_estimators = [300, 350],
learning_rate = [0.025, 0.03],
nthread = [4],
subsample = [0.95, 1],
colsample_bytree = [0.95, 1],
seed = [100]
)
model = XGBClassifier()
# Creating Grid Search
grid = GridSearchCV(estimator = model, param_grid = paramGrid, cv = 10, verbose = True, n_jobs = -1)
# Searching best params
grid.fit(X_train, y_train)
# Print best parameters
print("\n" + "Best Model Parameters:" + "\n\n", grid.best_estimator_)
Fitting 10 folds for each of 32 candidates, totalling 320 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 45.4min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed: 78.8min finished
/home/natalia/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
[20:24:24] WARNING: ../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Best Model Parameters:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=0.95, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.03, max_delta_step=0, max_depth=4,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=350, n_jobs=4, nthread=4, num_parallel_tree=1,
random_state=100, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
seed=100, subsample=1, tree_method='exact', validate_parameters=1,
verbosity=None)
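For a more compact summary than the full estimator repr, the fitted GridSearchCV also exposes the winning parameter dictionary and its mean cross-validated score, which can be printed directly:

# Compact view of the tuned configuration and its mean CV accuracy
print(grid.best_params_)
print(f'Best CV score: {grid.best_score_:.4f}')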
classifierXGB = XGBClassifier(
booster='gbtree',
colsample_bylevel=1,
colsample_bynode=1,
colsample_bytree=0.95,
importance_type='gain',
learning_rate=0.03,
max_depth=4,
min_child_weight=1,
n_estimators=350,
n_jobs=4, nthread=4,
num_parallel_tree=1,
random_state=100,
seed=100,
subsample=1)
classifierXGB.fit(X = X_train, y = y_train)
[20:26:04] WARNING: ../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=0.95, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.03, max_delta_step=0, max_depth=4,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=350, n_jobs=4, nthread=4, num_parallel_tree=1,
random_state=100, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
seed=100, subsample=1, tree_method='exact', validate_parameters=1,
verbosity=None)
# Predictions with test set
predict_xgb = classifierXGB.predict(X_test)
print (pd.crosstab(y_test,predict_xgb, rownames=['Real'], colnames=['Predicted'], margins=True))
Predicted    0   1  All
Real
0          563   6  569
1           23  75   98
All        586  81  667
print(classification_report(y_test, predict_xgb))
precision recall f1-score support
0 0.96 0.99 0.97 569
1 0.93 0.77 0.84 98
accuracy 0.96 667
macro avg 0.94 0.88 0.91 667
weighted avg 0.96 0.96 0.95 667
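Finally, joblib (imported at the top but not used so far) can persist the tuned model for later scoring; a minimal sketch with an illustrative filename:

# Save the tuned XGBoost model and verify the reloaded copy (filename is illustrative)
joblib.dump(classifierXGB, 'churn_xgb_model.joblib')
loaded_model = joblib.load('churn_xgb_model.joblib')
assert (loaded_model.predict(X_test) == predict_xgb).all()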