#!pip install sweetviz
#!pip install missingno

import pandas as pd
import numpy as np
from numpy import loadtxt

import warnings
import sweetviz as sv
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as missing
import random
import re
import sys
import joblib

from sklearn import feature_selection

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score, fbeta_score
from sklearn.metrics import accuracy_score

import matplotlib.ticker as mtick
from IPython.display import display

%matplotlib inline
color = sns.color_palette()
warnings.simplefilter(action = 'ignore', category = FutureWarning)


df = pd.read_csv('projeto4_telecom_treino.csv')


df.dtypes

Unnamed: 0                         int64
state                             object
account_length                     int64
area_code                         object
international_plan                object
voice_mail_plan                   object
number_vmail_messages              int64
total_day_minutes                float64
total_day_calls                    int64
total_day_charge                 float64
total_eve_minutes                float64
total_eve_calls                    int64
total_eve_charge                 float64
total_night_minutes              float64
total_night_calls                  int64
total_night_charge               float64
total_intl_minutes               float64
total_intl_calls                   int64
total_intl_charge                float64
number_customer_service_calls      int64
churn                             object
dtype: object


df.head()


df['international_plan'].value_counts() # How many customers have an international plan

no     3010
yes     323
Name: international_plan, dtype: int64


df['voice_mail_plan'].value_counts()

no     2411
yes     922
Name: voice_mail_plan, dtype: int64


eda = sv.analyze(source = df,
                target_feat = 'churn')


eda.show_notebook()


pred_vars = ['number_vmail_messages', 'total_day_minutes', 'total_day_calls', 'total_day_charge', 
             'total_eve_calls', 'total_eve_charge', 'total_night_minutes', 'total_night_calls',
             'total_night_charge', 'total_intl_minutes',
             'total_intl_charge', 'number_customer_service_calls']

fig, ax = plt.subplots(2, 6, figsize=(22, 6));
df[df.churn == 'no'][pred_vars].hist( bins=30, color="blue", alpha=0.5, ax=ax);
df[df.churn == 'yes'][pred_vars].hist( bins=30, color="red", alpha=0.5, ax=ax);


# Color palette
myred='#E74C3C'
myblue='#2E86C1'
mygreen='#28B463'
cols= [myblue, myred, mygreen]


# percentage of churn
df['churn'].replace(['yes', 'no'],[1,0]).sum()/df['churn'].count()

0.14491449144914492


sns.countplot(data=df,x='churn',palette=cols);


df_num=df[['total_day_minutes', 'total_day_calls', 'total_day_charge', 'total_eve_calls',
          'total_eve_charge', 'total_night_minutes', 'total_night_calls', 'total_night_charge',
          'total_intl_minutes', 'total_intl_charge']]
#df_num


plt.figure(figsize=(10, 6))
sns.heatmap(df_num.corr(),annot = True,
            cmap='Blues');


df['churn1']=df.churn.replace(['yes', 'no'],[1,0])
df_plot = df.groupby('number_customer_service_calls').churn1.mean().reset_index() 
df.drop('churn1', axis=1, inplace = True)
df_plot


#Number of Service calls vs churn
x =df_plot['number_customer_service_calls']
y=df_plot['churn1']
plt.bar(x,y);
plt.title('Churn Rate VS Number of Customer Service Calls');
plt.xlabel('Num of Customer Service Calls');
plt.ylabel('Churn Rate');
plt.xticks(x, (0,1,2,3,4,5,6,7,8,9));


cols= [myblue, myred, mygreen]
sns.boxplot(x="churn", y="total_intl_charge", data=df, palette=cols);
plt.title('Total Charge VS Churn');
plt.xlabel('Churn');
plt.ylabel('Total Charge');
df.groupby(['churn'])['total_intl_charge'].mean()

churn
no     2.743404
yes    2.889545
Name: total_intl_charge, dtype: float64


df["state"].nunique()

51


# Drop not important vars
df.drop(['Unnamed: 0', 'state'], axis = 1, inplace = True)


#df.head()


df.isnull().sum()
df.isna().any()

account_length                   False
area_code                        False
international_plan               False
voice_mail_plan                  False
number_vmail_messages            False
total_day_minutes                False
total_day_calls                  False
total_day_charge                 False
total_eve_minutes                False
total_eve_calls                  False
total_eve_charge                 False
total_night_minutes              False
total_night_calls                False
total_night_charge               False
total_intl_minutes               False
total_intl_calls                 False
total_intl_charge                False
number_customer_service_calls    False
churn                            False
dtype: bool


missing.matrix(df, figsize = (8,4), color = (0, 0.1, 0.25), sparkline = False);


df['churn'].value_counts()

no     2850
yes     483
Name: churn, dtype: int64


##Substituindo a variável sexo para 0 e 1

le = LabelEncoder()

df['international_plan'] = le.fit_transform(df['international_plan'])
df['voice_mail_plan'] = le.fit_transform(df['voice_mail_plan'])
df['churn'] = le.fit_transform(df['churn'])


df.head(10)


scaler = StandardScaler()
df_final = df

num_vars = [ 'total_day_calls', 'total_day_charge', 
             'total_eve_calls', 'total_eve_charge', 'total_night_calls',
             'total_night_charge', 'total_intl_charge']
df_final[num_vars] = scaler.fit_transform(df_final[num_vars])
df_final[num_vars]


def groupcat(x):
    if x == 0:
        return '0'
    elif x >= 1 | x <= 3:
        return '1'
    elif x >= 4 | x <= 6:
        return '2'
    else:
        return 'Other'
    
df_final['number_customer_service_calls'] = df_final['number_customer_service_calls'].apply(groupcat)    
df_final['number_customer_service_calls'].value_counts().plot(kind = 'pie');


#df['number_vmail_messages'].value_counts()


df_final[df_final['number_vmail_messages'] > 0]['number_vmail_messages'].describe()

count    922.000000
mean      29.277657
std        7.559027
min        4.000000
25%       24.000000
50%       29.000000
75%       34.000000
max       51.000000
Name: number_vmail_messages, dtype: float64


# Categorizing vars with well defined concentration of numbers
df_final['number_vmail_messages'].plot(kind = 'hist', edgecolor = 'black');


def categorizing(x):
    if x == 0:
        return 0
    elif x < 24:
        return 1
    elif x < 29:
        return 2
    else:
        return 3

df_final['number_vmail_messages'] = df_final['number_vmail_messages'].apply(categorizing)
df_final.head()


vars_cat = ['number_vmail_messages', 'number_customer_service_calls', 'area_code']
df_final = pd.get_dummies(df_final, columns = vars_cat, drop_first = True)


# Split train and test
X = df_final.loc[:, df_final.columns != 'churn']
y = df_final['churn']
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size = 0.20, random_state = 1)


models = []
models.append(('Logistic Regression', LogisticRegression(solver='liblinear',
                                                         class_weight='balanced')))
models.append(('SVC', SVC(kernel = 'linear')))
models.append(('Kernel SVM', SVC(kernel = 'rbf')))
models.append(('KNN', KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)))
models.append(('Gaussian NB', GaussianNB()))
models.append(('Decision Tree Classifier',
               DecisionTreeClassifier(criterion = 'entropy', random_state = 0)))
models.append(('Random Forest', RandomForestClassifier(
    n_estimators=100, criterion = 'entropy')))
#Evaluating Model Results:
acc_results = []
auc_results = []
names = []
# set table to table to populate with performance results
col = ['Algorithm', 'ROC AUC Mean', 'ROC AUC STD', 
       'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns=col)
i = 0
# Evaluate each model using k-fold cross-validation:
for name, model in models:
    kfold = model_selection.KFold(
        n_splits=10, random_state=0)

    # accuracy scoring:
    cv_acc_results = model_selection.cross_val_score(  
    model, X_train, y_train, cv=kfold, scoring='accuracy')
    
    # roc_auc scoring:
    cv_auc_results = model_selection.cross_val_score(  
    model, X_train, y_train, cv=kfold, scoring='roc_auc')
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                            round(cv_auc_results.mean()*100, 2),
                            round(cv_auc_results.std()*100, 2),
                            round(cv_acc_results.mean()*100, 2),
                            round(cv_acc_results.std()*100, 2)
                            ]
    i += 1
    
model_results.sort_values(by=['ROC AUC Mean'], ascending=False)


fig = plt.figure(figsize=(15, 7))
ax = fig.add_subplot(111)
plt.boxplot(acc_results)
ax.set_xticklabels(names)
plt.title('Accuracy Score Comparison \n',
fontsize = "22", fontfamily = "sans-serif")
plt.xticks(rotation=0, horizontalalignment="center")
plt.yticks(rotation=0, horizontalalignment="right")
plt.show()


fig = plt.figure(figsize=(15, 7))
ax = fig.add_subplot(111)
plt.boxplot(auc_results)
ax.set_xticklabels(names)
plt.title('ROC AUC Comparison \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", 
fontfamily = "sans-serif")
plt.xticks(rotation=0, horizontalalignment="center")
plt.yticks(rotation=0, horizontalalignment="right")
plt.show()


# create model using DecisionTree Classifier and fit training data
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

DecisionTreeClassifier()


# create prediction
dt_pred = dt_model.predict(X_test)
dt_pred[0:5]

array([0, 0, 0, 0, 0])


# Evaluating the prediction model
metrics.accuracy_score(y_test, dt_pred)

0.9010494752623688


knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier()


result_knn = knn.predict(X_test)
print (pd.crosstab(y_test,result_knn, rownames=['Real'], colnames=['Predicted'], margins=True))

Predicted    0   1  All
Real                   
0          546  23  569
1           64  34   98
All        610  57  667


error = []

# Calculating error for K values between 1 and 40
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append(np.mean(pred_i != y_test))


# Plot of Mean Error vs Number of Neighbors
plt.figure(figsize=(12, 6))
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Error rate of K value')
plt.xlabel('K value')
plt.ylabel('Mean Error')

Text(0, 0.5, 'Mean Error')


knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=15)


result_knn = knn.predict(X_test)
print (pd.crosstab(y_test,result_knn, rownames=['Real'], colnames=['Predicted'], margins=True))

Predicted    0   1  All
Real                   
0          566   3  569
1           75  23   98
All        641  26  667


print(classification_report(y_test, result_knn))

              precision    recall  f1-score   support

           0       0.88      0.99      0.94       569
           1       0.88      0.23      0.37        98

    accuracy                           0.88       667
   macro avg       0.88      0.61      0.65       667
weighted avg       0.88      0.88      0.85       667


score_array = []
for each in range(1,100):
    rf_loop = RandomForestClassifier(
n_estimators = each, random_state = 1) 
    rf_loop.fit(X_train,y_train)
    score_array.append(rf_loop.score(X_test,y_test))
 
fig = plt.figure(figsize=(15, 7))
plt.plot(range(1,100),score_array, color = '#ec838a')
plt.ylabel('Range\n',horizontalalignment="center",
fontstyle = "normal", fontsize = "large", 
fontfamily = "sans-serif")
plt.xlabel('Score\n',horizontalalignment="center",
fontstyle = "normal", fontsize = "large", 
fontfamily = "sans-serif")
plt.title('Optimal Number of Trees for Random Forest Model \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
plt.xticks(rotation=0, horizontalalignment="center")
plt.yticks(rotation=0, horizontalalignment="right")
plt.show()


# Model with 81 decision trees
rf = RandomForestClassifier (n_estimators = 81, random_state = 42) 
# Training model
rf.fit (X_train, y_train);


# Predictions with test data
y_rf = rf.predict(X_test)
print (pd.crosstab(y_test,y_rf, rownames=['Real'], colnames=['Predicted'], margins=True))

Predicted    0   1  All
Real                   
0          565   4  569
1           31  67   98
All        596  71  667


print(classification_report(y_test, y_rf))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       569
           1       0.94      0.68      0.79        98

    accuracy                           0.95       667
   macro avg       0.95      0.84      0.88       667
weighted avg       0.95      0.95      0.94       667


# Feature Importance
rf.feature_importances_

feature_importances = pd.DataFrame(rf.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance',ascending=False)
feature_importances


# Adjusting model with training data
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

/home/natalia/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)

[19:03:58] WARNING: ../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


# Predictions with test set
predict_xgb = xgb.predict(X_test)
print (pd.crosstab(y_test,predict_xgb, rownames=['Real'], colnames=['Predicted'], margins=True))

Predicted    0   1  All
Real                   
0          564   5  569
1           26  72   98
All        590  77  667


print(classification_report(y_test, predict_xgb))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97       569
           1       0.94      0.73      0.82        98

    accuracy                           0.95       667
   macro avg       0.95      0.86      0.90       667
weighted avg       0.95      0.95      0.95       667


# Voting Classifier with soft voting 
vot = VotingClassifier(estimators=[('rf', rf),('xgb',xgb)], voting='soft')
vot = vot.fit(X_train,y_train)

[19:04:12] WARNING: ../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

/home/natalia/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


y_predicted = vot.predict(X_test)
print (pd.crosstab(y_test,y_predicted, rownames=['Real'], colnames=['Predicted'], margins=True))

Predicted    0   1  All
Real                   
0          564   5  569
1           25  73   98
All        589  78  667


print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97       569
           1       0.94      0.74      0.83        98

    accuracy                           0.96       667
   macro avg       0.95      0.87      0.90       667
weighted avg       0.95      0.96      0.95       667


# Churn Probability
df['proba'] = vot.predict_proba(df_final[X_train.columns])[:,1]
df[['proba']]


# Grid of test
paramGrid = dict (
    missing          = [np.nan],
    booster          = ['gbtree'],#, 'gblinear', 'dart'],
    max_depth        = [4, 5], 
    n_estimators     = [300, 350], 
    learning_rate    = [0.025, 0.03], 
    nthread          = [4], 
    subsample        = [0.95, 1], 
    colsample_bytree = [0.95, 1], 
    seed             = [100]
)

model = XGBClassifier()

# Creating Grid Search
grid = GridSearchCV(estimator = model, param_grid = paramGrid, cv = 10, verbose = True, n_jobs = -1)

# Searching best params
grid.fit(X_train, y_train)

# Print best parametes
print("\n" + "Best Model Parameters:" + "\n\n", grid.best_estimator_)

Fitting 10 folds for each of 32 candidates, totalling 320 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 45.4min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed: 78.8min finished
/home/natalia/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)

[20:24:24] WARNING: ../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

Best Model Parameters:

 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.95, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.03, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=350, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=100, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=100, subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)


classifierXGB = XGBClassifier(
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.95,
    importance_type='gain',
    learning_rate=0.03,
    max_depth=4,
    min_child_weight=1, 
    n_estimators=350,
    n_jobs=4, nthread=4,
    num_parallel_tree=1,
    random_state=100,
    seed=100,
    subsample=1)

classifierXGB.fit(X = X_train, y = y_train)

[20:26:04] WARNING: ../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.95, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.03, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=350, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=100, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=100, subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)


# Predictions with test set
predict_xgb = classifierXGB.predict(X_test)
print (pd.crosstab(y_test,predict_xgb, rownames=['Real'], colnames=['Predicted'], margins=True))

Predicted    0   1  All
Real                   
0          563   6  569
1           23  75   98
All        586  81  667


print(classification_report(y_test, predict_xgb))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97       569
           1       0.93      0.77      0.84        98

    accuracy                           0.96       667
   macro avg       0.94      0.88      0.91       667
weighted avg       0.96      0.96      0.95       667

	Unnamed: 0	state	account_length	area_code	international_plan	voice_mail_plan	number_vmail_messages	total_day_minutes	total_day_calls	total_day_charge	...	total_eve_calls	total_eve_charge	total_night_minutes	total_night_calls	total_night_charge	total_intl_minutes	total_intl_calls	total_intl_charge	number_customer_service_calls	churn
0	1	KS	128	area_code_415	no	yes	25	265.1	110	45.07	...	99	16.78	244.7	91	11.01	10.0	3	2.70	1	no
1	2	OH	107	area_code_415	no	yes	26	161.6	123	27.47	...	103	16.62	254.4	103	11.45	13.7	3	3.70	1	no
2	3	NJ	137	area_code_415	no	no	0	243.4	114	41.38	...	110	10.30	162.6	104	7.32	12.2	5	3.29	0	no
3	4	OH	84	area_code_408	yes	no	0	299.4	71	50.90	...	88	5.26	196.9	89	8.86	6.6	7	1.78	2	no
4	5	OK	75	area_code_415	yes	no	0	166.7	113	28.34	...	122	12.61	186.9	121	8.41	10.1	3	2.73	3	no

	number_customer_service_calls	churn1
0	0	0.131994
1	1	0.103302
2	2	0.114625
3	3	0.102564
4	4	0.457831
5	5	0.606061
6	6	0.636364
7	7	0.555556
8	8	0.500000
9	9	1.000000

	account_length	area_code	international_plan	voice_mail_plan	number_vmail_messages	total_day_minutes	total_day_calls	total_day_charge	total_eve_minutes	total_eve_calls	total_eve_charge	total_night_minutes	total_night_calls	total_night_charge	total_intl_minutes	total_intl_calls	total_intl_charge	number_customer_service_calls
0	128	area_code_415	0	1	25	265.1	110	45.07	197.4	99	16.78	244.7	91	11.01	10.0	3	2.70	1
1	107	area_code_415	0	1	26	161.6	123	27.47	195.5	103	16.62	254.4	103	11.45	13.7	3	3.70	1
2	137	area_code_415	0	0	0	243.4	114	41.38	121.2	110	10.30	162.6	104	7.32	12.2	5	3.29	0
3	84	area_code_408	1	0	0	299.4	71	50.90	61.9	88	5.26	196.9	89	8.86	6.6	7	1.78	2
4	75	area_code_415	1	0	0	166.7	113	28.34	148.3	122	12.61	186.9	121	8.41	10.1	3	2.73	3
5	118	area_code_510	1	0	0	223.4	98	37.98	220.6	101	18.75	203.9	118	9.18	6.3	6	1.70	0
6	121	area_code_510	0	1	24	218.2	88	37.09	348.5	108	29.62	212.6	118	9.57	7.5	7	2.03	3
7	147	area_code_415	1	0	0	157.0	79	26.69	103.1	94	8.76	211.8	96	9.53	7.1	6	1.92	0
8	117	area_code_408	0	0	0	184.5	97	31.37	351.6	80	29.89	215.8	90	9.71	8.7	4	2.35	1
9	141	area_code_415	1	1	37	258.6	84	43.96	222.0	111	18.87	326.4	97	14.69	11.2	5	3.02	0

	account_length	area_code	international_plan	voice_mail_plan	number_vmail_messages	total_day_minutes	total_day_calls	total_day_charge	total_eve_minutes	total_eve_calls	total_eve_charge	total_night_minutes	total_night_calls	total_night_charge	total_intl_minutes	total_intl_calls	total_intl_charge	number_customer_service_calls
0	128	area_code_415	0	1	2	265.1	0.476643	1.567036	197.4	-0.055940	-0.070427	244.7	-0.465494	0.866029	10.0	3	-0.085690	1
1	107	area_code_415	0	1	2	161.6	1.124503	-0.334013	195.5	0.144867	-0.107549	254.4	0.147825	1.059390	13.7	3	1.241169	1
2	137	area_code_415	0	0	0	243.4	0.675985	1.168464	121.2	0.496279	-1.573900	162.6	0.198935	-0.755571	12.2	5	0.697156	0
3	84	area_code_408	1	0	0	299.4	-1.466936	2.196759	61.9	-0.608159	-2.743268	196.9	-0.567714	-0.078806	6.6	7	-1.306401	Other
4	75	area_code_415	1	0	0	166.7	0.626149	-0.240041	148.3	1.098699	-1.037939	186.9	1.067803	-0.276562	10.1	3	-0.045885	1

	Algorithm	ROC AUC Mean	ROC AUC STD	Accuracy Mean	Accuracy STD
6	Random Forest	90.48	4.42	94.45	2.31
0	Logistic Regression	85.96	3.38	84.55	2.36
1	SVC	85.79	3.37	85.48	2.38
4	Gaussian NB	84.60	3.11	87.40	1.73
5	Decision Tree Classifier	83.58	4.83	91.33	2.13
2	Kernel SVM	73.58	4.49	85.56	2.66
3	KNN	65.36	5.35	86.46	2.21

Telecom Churn Prediction¶

Natália Faraj Murad¶

Importing libraries¶

Reading dataset¶

Data exploration¶

Distribution of Numeric Variables & Churn¶

People that cancel profile¶

Churn Rate¶

Numeric Variables Colinearity¶

Churn vs number of service calls¶

Total intl charge vs Churn¶

Data preprocessing¶

Droppping not important variables¶

Grouping Categories¶

Changing yes and no to 0 and 1¶

Controlling the scale of continue variables - padronization¶

Categorize continue variables¶

Dummy Variables¶

Split train & test datasets¶

Training Several Baseline Models¶

Decision Tree Classifier¶

Decision Tree Accuracy: 90%¶

KNN¶

KNN Model Accuracy: 88%¶

Random Forest¶

Optimal Number of Trees¶

Random Forest Accuracy: 95%¶

XGBoost¶

XGBoost Accuracy: 95%¶

Voting Classifier¶

Voting Accuracy 96%¶

Searching Best Parameters¶

Conclusion¶

The best models were Random Forest and XGBoost with 95% of Accuracy and Voting with 96% of Accuracy.¶

	importance
total_day_charge	0.140958
total_day_minutes	0.126932
number_customer_service_calls_2	0.091497
international_plan	0.076248
total_eve_minutes	0.068470
total_eve_charge	0.064455
total_intl_charge	0.048407
total_intl_calls	0.048220
total_intl_minutes	0.046710
total_night_charge	0.042940
total_night_minutes	0.040504
total_day_calls	0.038503
total_eve_calls	0.032753
total_night_calls	0.032265
account_length	0.031946
voice_mail_plan	0.028626
number_customer_service_calls_1	0.009869
number_vmail_messages_3	0.007155
area_code_area_code_415	0.005570
number_customer_service_calls_Other	0.005253
area_code_area_code_510	0.005033
number_vmail_messages_2	0.004082
number_vmail_messages_1	0.003601