#Remove variables not needed and recode if needed
Data.Full <- Data %>% dplyr::select(-c(Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,
Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2,
CLIENTNUM))
#recode gender variable
Data.Full$Gender[Data.Full$Gender == 'M'] = "Male"
Data.Full$Gender[Data.Full$Gender == 'F'] = "Female"
#recode outcome variable
Data.Full$Attrition_Flag[Data.Full$Attrition_Flag == 'Existing Customer'] = "not_churned"
Data.Full$Attrition_Flag[Data.Full$Attrition_Flag == 'Attrited Customer'] = "churned"
#set factors
Data.Full$Attrition_Flag = as.factor(Data.Full$Attrition_Flag)
Data.Full$Gender = as.factor(Data.Full$Gender)
Data.Full$Education_Level= as.factor(Data.Full$Education_Level)
Data.Full$Marital_Status = as.factor(Data.Full$Marital_Status)
Data.Full$Income_Category = as.factor(Data.Full$Income_Category)
Data.Full$Card_Category = as.factor(Data.Full$Card_Category)
str(Data.Full)
## 'data.frame': 10127 obs. of 20 variables:
## $ Attrition_Flag : Factor w/ 2 levels "churned","not_churned": 2 2 2 2 2 2 2 2 2 2 ...
## $ Customer_Age : int 45 49 51 40 40 44 51 32 37 48 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 2 1 2 2 2 2 2 2 ...
## $ Dependent_count : int 3 5 3 4 3 2 4 0 3 2 ...
## $ Education_Level : Factor w/ 7 levels "College","Doctorate",..: 4 3 3 4 6 3 7 4 6 3 ...
## $ Marital_Status : Factor w/ 4 levels "Divorced","Married",..: 2 3 2 4 2 2 2 4 3 3 ...
## $ Income_Category : Factor w/ 6 levels "$120K +","$40K - $60K",..: 3 5 4 5 3 2 1 3 3 4 ...
## $ Card_Category : Factor w/ 4 levels "Blue","Gold",..: 1 1 1 1 1 1 2 4 1 1 ...
## $ Months_on_book : int 39 44 36 34 21 36 46 27 36 36 ...
## $ Total_Relationship_Count: int 5 6 4 3 5 3 6 2 5 6 ...
## $ Months_Inactive_12_mon : int 1 1 1 4 1 1 1 2 2 3 ...
## $ Contacts_Count_12_mon : int 3 2 0 1 0 2 3 2 0 3 ...
## $ Credit_Limit : num 12691 8256 3418 3313 4716 ...
## $ Total_Revolving_Bal : int 777 864 0 2517 0 1247 2264 1396 2517 1677 ...
## $ Avg_Open_To_Buy : num 11914 7392 3418 796 4716 ...
## $ Total_Amt_Chng_Q4_Q1 : num 1.33 1.54 2.59 1.4 2.17 ...
## $ Total_Trans_Amt : int 1144 1291 1887 1171 816 1088 1330 1538 1350 1441 ...
## $ Total_Trans_Ct : int 42 33 20 20 28 24 31 36 24 32 ...
## $ Total_Ct_Chng_Q4_Q1 : num 1.62 3.71 2.33 2.33 2.5 ...
## $ Avg_Utilization_Ratio : num 0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...
#check for missing values
sum(is.na(Data.Full))
## [1] 0
#plot of missing values
plot_missing(Data.Full,
group = c("Excellent" = 0.0, "Good" = .01, "Ok" = .05, "Bad" = .1),
missing_only = F,
title= "Missing on Churn Data",
ggtheme = theme_bw(),
theme_config = list(legend.position = c("below")) )
Plot of outcome variable
ggplot(Data.Full, aes(x = Attrition_Flag)) +
geom_bar(aes(fill = Attrition_Flag),width = .8, stat = "count", position = "dodge") +
geom_text(stat='count', aes(label=..count..), vjust= 1, colour = "black") +
labs(title= "Distribution of Attrition", y = "Count", x = "Attrition Flag") +
scale_fill_discrete(name = "Attrition Flag") +
scale_fill_manual("Attrition Flag", values = c("churned" = "#cc6666", "not_churned" = "#999999"))+
theme(axis.title.y=element_blank(),
axis.title.x=element_blank(),
axis.ticks = element_blank(),
legend.position = "none",
panel.grid = element_blank())
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
Histograms of all numeric variables
Data.Full %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram(aes(y=..density..), colour="black", fill="white")+
geom_density(alpha=.2, fill="#3399CC")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Boxplots of all numeric variables
par(mar=c(2,2,2,2))
par(mfrow=c(5,3))
boxplot(Data.Full$Customer_Age, col="#FF9999" , main="Customer's Age in Years" )
boxplot(Data.Full$Dependent_count,col="#FF9999" , main="Number of dependents")
boxplot(Data.Full$Months_on_book, col="#FF9999" , main="Period of relationship with bank")
boxplot(Data.Full$Total_Relationship_Count,col="#FF9999" , main="Total no. of products held by the customer")
boxplot(Data.Full$Months_Inactive_12_mon,col="#FF9999" , main="No. of months inactive in the last 12 months")
boxplot(Data.Full$Contacts_Count_12_mon,col="#FF9999" , main="No. of Contacts in the last 12 months")
boxplot(Data.Full$Credit_Limit,col="#FF9999" , main="Credit Limit on the Credit Card")
boxplot(Data.Full$Total_Revolving_Bal,col="#FF9999" , main="Total Revolving Balance on the Credit Card")
boxplot(Data.Full$Avg_Open_To_Buy,col="#FF9999" , main="Credit Line Average of last 12 months")
boxplot(Data.Full$Total_Amt_Chng_Q4_Q1,col="#FF9999" , main="Change in Transaction Amount (Q4 over Q1)")
boxplot(Data.Full$Total_Trans_Amt,col="#FF9999" , main="Total Transaction Amount (Last 12 months)")
boxplot(Data.Full$Total_Trans_Ct, col="#FF9999" , main="Total Transaction Count (Last 12 months)" )
boxplot(Data.Full$Total_Ct_Chng_Q4_Q1, col="#FF9999" , main="Change in Transaction Count (Q4 over Q1)" )
boxplot(Data.Full$Avg_Utilization_Ratio, col="#FF9999" , main="Average Card Utilization Ratio" )
Plots of all factor variables to the outcome variables
GENDER
ggplot(Data.Full, aes(x = Gender)) +
geom_bar(aes(fill = Attrition_Flag),width = .8, stat = "count", position = "dodge") +
labs(title= "Distribution of Attrition by Gender", y = "Count", x = "Attrition Flag") +
scale_fill_discrete(name = "Attrition Flag") +
scale_fill_manual("Attrition Flag", values = c("churned" = "#cc6666", "not_churned" = "#999999"))+
theme(axis.title.y=element_blank(),
axis.title.x=element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank())
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
EDUCATION LEVEL
ggplot(Data.Full, aes(x = Education_Level)) +
geom_bar(aes(fill = Attrition_Flag),width = .8, stat = "count", position = "dodge") +
labs(title= "Distribution of Attrition by Education Level", y = "Count", x = "Attrition Flag") +
scale_fill_discrete(name = "Attrition Flag") +
scale_fill_manual("Attrition Flag", values = c("churned" = "#cc6666", "not_churned" = "#999999"))+
theme(axis.title.y=element_blank(),
axis.title.x=element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank())
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
MARITAL STATUS
ggplot(Data.Full, aes(x = Marital_Status)) +
geom_bar(aes(fill = Attrition_Flag),width = .8, stat = "count", position = "dodge") +
labs(title= "Distribution of Attrition by Marital Status", y = "Count", x = "Attrition Flag") +
scale_fill_discrete(name = "Attrition Flag") +
scale_fill_manual("Attrition Flag", values = c("churned" = "#cc6666", "not_churned" = "#999999"))+
theme(axis.title.y=element_blank(),
axis.title.x=element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank())
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
INCOME CATEGORY
ggplot(Data.Full, aes(x = Income_Category)) +
geom_bar(aes(fill = Attrition_Flag),width = .8, stat = "count", position = "dodge") +
labs(title= "Distribution of Attrition by Income Category", y = "Count", x = "Attrition Flag") +
scale_fill_discrete(name = "Attrition Flag") +
scale_fill_manual("Attrition Flag", values = c("churned" = "#cc6666", "not_churned" = "#999999"))+
theme(axis.title.y=element_blank(),
axis.title.x=element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank())
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
CARD CATEGORY
ggplot(Data.Full, aes(x = Card_Category)) +
geom_bar(aes(fill = Attrition_Flag),width = .8, stat = "count", position = "dodge") +
labs(title= "Distribution of Attrition by Card Category", y = "Count", x = "Attrition Flag") +
scale_fill_discrete(name = "Attrition Flag") +
scale_fill_manual("Attrition Flag", values = c("churned" = "#cc6666", "not_churned" = "#999999"))+
theme(axis.title.y=element_blank(),
axis.title.x=element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank())
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
Correlation
#Correlation of numeric variables
NumVar1 <- Data.Full[, sapply(Data.Full, class) %in% c('integer', 'numeric')]
cormatrix1 <- cor(NumVar1)
corrplot(cormatrix1, method = "number", tl.col = "black", tl.cex = .80, number.cex = .80)
#get name of the suggested variable to be remove due to being highly correlated with other variables
NumVar1.High.Cor <- findCorrelation(cormatrix1, .75, names = T)
NumVar1.High.Cor
## [1] "Avg_Open_To_Buy" "Total_Trans_Amt" "Customer_Age"
#Remove highly correlated variable "Average open to buy " first
NumVar2 <- select(NumVar1, -Avg_Open_To_Buy)
cormatrix2 <- cor(NumVar2)
corrplot(cormatrix2, method = "number", tl.col = "black", tl.cex = .70, number.cex = .80)
#get name of the suggested variable to be remove due to being highly correlated with other variables
NumVar2.High.Cor <- findCorrelation(cormatrix2, .75, names = T)
NumVar2.High.Cor
## [1] "Total_Trans_Amt" "Customer_Age"
#Remove highly correlated variable "Total Trans Amount" because it is correlated with "Total Trans Amt" at 0.81
NumVar3 <- select(NumVar2, -Total_Trans_Amt)
cormatrix3 <- cor(NumVar3)
corrplot(cormatrix3, method = "number", tl.col = "black", tl.cex = .70, number.cex = .80)
#get name of the suggested variable to be remove due to being highly correlated with other variables
NumVar3.High.Cor <- findCorrelation(cormatrix3, .75, names = T)
NumVar3.High.Cor
## [1] "Customer_Age"
#Remove highly correlated variable "Customer Age"
NumVar4 <- select(NumVar3, -Customer_Age)
cormatrix4 <- cor(NumVar4)
corrplot(cormatrix4, method = "number", tl.col = "black", tl.cex = .90, number.cex = .90)
#get name of the suggested variable to be remove due to being highly correlated with other variables
NumVar4.High.Cor <- findCorrelation(cormatrix4, .75, names = T)
NumVar4.High.Cor
## character(0)
Remove highly correlated variables and create REDUCED dataset
#Produce reduced dataset without the highly correlated data, the data should have 17 variables
Data.Reduced <- dplyr::select(Data.Full, -c(Avg_Open_To_Buy, Total_Trans_Amt, Customer_Age))
str(Data.Reduced)
## 'data.frame': 10127 obs. of 17 variables:
## $ Attrition_Flag : Factor w/ 2 levels "churned","not_churned": 2 2 2 2 2 2 2 2 2 2 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 2 1 2 2 2 2 2 2 ...
## $ Dependent_count : int 3 5 3 4 3 2 4 0 3 2 ...
## $ Education_Level : Factor w/ 7 levels "College","Doctorate",..: 4 3 3 4 6 3 7 4 6 3 ...
## $ Marital_Status : Factor w/ 4 levels "Divorced","Married",..: 2 3 2 4 2 2 2 4 3 3 ...
## $ Income_Category : Factor w/ 6 levels "$120K +","$40K - $60K",..: 3 5 4 5 3 2 1 3 3 4 ...
## $ Card_Category : Factor w/ 4 levels "Blue","Gold",..: 1 1 1 1 1 1 2 4 1 1 ...
## $ Months_on_book : int 39 44 36 34 21 36 46 27 36 36 ...
## $ Total_Relationship_Count: int 5 6 4 3 5 3 6 2 5 6 ...
## $ Months_Inactive_12_mon : int 1 1 1 4 1 1 1 2 2 3 ...
## $ Contacts_Count_12_mon : int 3 2 0 1 0 2 3 2 0 3 ...
## $ Credit_Limit : num 12691 8256 3418 3313 4716 ...
## $ Total_Revolving_Bal : int 777 864 0 2517 0 1247 2264 1396 2517 1677 ...
## $ Total_Amt_Chng_Q4_Q1 : num 1.33 1.54 2.59 1.4 2.17 ...
## $ Total_Trans_Ct : int 42 33 20 20 28 24 31 36 24 32 ...
## $ Total_Ct_Chng_Q4_Q1 : num 1.62 3.71 2.33 2.33 2.5 ...
## $ Avg_Utilization_Ratio : num 0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...
Create test and train set from the full dataset
set.seed(1)
full.inTrain <- createDataPartition(Data.Full$Attrition_Flag, p = .8)[[1]]
full.train <- Data.Full[full.inTrain,]
full.test <- Data.Full[-full.inTrain,]
Create test and train set from the reduced dataset
set.seed(1)
reduced.inTrain <- createDataPartition(Data.Reduced$Attrition_Flag, p = .8)[[1]]
reduced.train <- Data.Reduced[reduced.inTrain,]
reduced.test <- Data.Reduced[-reduced.inTrain,]
#Remove unneeded objects from the environment
rm(cormatrix1, cormatrix2, cormatrix3, cormatrix4, NumVar1, NumVar2, NumVar3, NumVar4, NumVar1.High.Cor, NumVar2.High.Cor, NumVar3.High.Cor, NumVar4.High.Cor)
Set control
set.seed(1)
ctrl <- trainControl(method = "repeatedcv",
number =10,
repeats=3,
summaryFunction=twoClassSummary,
classProbs = TRUE)
Fit the full logistic model **Removed Avg_Open_To_Buy because it is perfectly correlated with credit limit and VIF will not run if there are variables that are perfectly correlated
set.seed(1)
glm.full <- train(Attrition_Flag~ . -Avg_Open_To_Buy , data = full.train, method = "glm", metric = "ROC", trControl = ctrl)
summary(glm.full)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5268 0.0688 0.1736 0.3644 2.8020
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.404e+00 5.294e-01 -12.098 < 2e-16 ***
## Customer_Age 6.893e-03 8.650e-03 0.797 0.425508
## GenderMale 8.497e-01 1.609e-01 5.279 1.30e-07 ***
## Dependent_count -1.321e-01 3.351e-02 -3.940 8.14e-05 ***
## Education_LevelDoctorate -4.023e-01 2.279e-01 -1.765 0.077486 .
## Education_LevelGraduate 7.764e-03 1.558e-01 0.050 0.960255
## `Education_LevelHigh School` -8.308e-02 1.654e-01 -0.502 0.615529
## `Education_LevelPost-Graduate` -2.086e-01 2.325e-01 -0.897 0.369529
## Education_LevelUneducated -6.068e-02 1.754e-01 -0.346 0.729331
## Education_LevelUnknown -1.608e-01 1.726e-01 -0.932 0.351313
## Marital_StatusMarried 5.246e-01 1.737e-01 3.021 0.002520 **
## Marital_StatusSingle -3.970e-02 1.739e-01 -0.228 0.819399
## Marital_StatusUnknown 9.720e-03 2.220e-01 0.044 0.965083
## `Income_Category$40K - $60K` 8.409e-01 2.275e-01 3.696 0.000219 ***
## `Income_Category$60K - $80K` 5.339e-01 2.022e-01 2.640 0.008284 **
## `Income_Category$80K - $120K` 2.595e-01 1.882e-01 1.379 0.167930
## `Income_CategoryLess than $40K` 6.359e-01 2.458e-01 2.587 0.009668 **
## Income_CategoryUnknown 6.689e-01 2.573e-01 2.599 0.009336 **
## Card_CategoryGold -5.885e-01 4.322e-01 -1.362 0.173297
## Card_CategoryPlatinum -5.265e-01 8.126e-01 -0.648 0.517067
## Card_CategorySilver -3.139e-01 2.212e-01 -1.419 0.155977
## Months_on_book 3.871e-03 8.579e-03 0.451 0.651803
## Total_Relationship_Count 4.467e-01 3.073e-02 14.537 < 2e-16 ***
## Months_Inactive_12_mon -5.118e-01 4.217e-02 -12.137 < 2e-16 ***
## Contacts_Count_12_mon -5.356e-01 4.061e-02 -13.189 < 2e-16 ***
## Credit_Limit 1.919e-05 7.711e-06 2.488 0.012845 *
## Total_Revolving_Bal 9.075e-04 8.048e-05 11.276 < 2e-16 ***
## Total_Amt_Chng_Q4_Q1 3.603e-01 2.081e-01 1.731 0.083425 .
## Total_Trans_Amt -4.755e-04 2.531e-05 -18.788 < 2e-16 ***
## Total_Trans_Ct 1.164e-01 4.115e-03 28.292 < 2e-16 ***
## Total_Ct_Chng_Q4_Q1 2.787e+00 2.116e-01 13.169 < 2e-16 ***
## Avg_Utilization_Ratio 2.390e-01 2.794e-01 0.855 0.392348
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7143.2 on 8101 degrees of freedom
## Residual deviance: 3795.7 on 8070 degrees of freedom
## AIC: 3859.7
##
## Number of Fisher Scoring iterations: 6
Check VIF of the full logistic regression
car::vif(glm.full$finalModel)
## Customer_Age GenderMale
## 2.880910 3.655510
## Dependent_count Education_LevelDoctorate
## 1.057423 1.475645
## Education_LevelGraduate `Education_LevelHigh School`
## 2.911585 2.468186
## `Education_LevelPost-Graduate` Education_LevelUneducated
## 1.449436 2.137589
## Education_LevelUnknown Marital_StatusMarried
## 2.220887 4.247873
## Marital_StatusSingle Marital_StatusUnknown
## 4.101392 1.961592
## `Income_Category$40K - $60K` `Income_Category$60K - $80K`
## 3.973161 2.627034
## `Income_Category$80K - $120K` `Income_CategoryLess than $40K`
## 2.650383 7.863067
## Income_CategoryUnknown Card_CategoryGold
## 3.922451 1.124715
## Card_CategoryPlatinum Card_CategorySilver
## 1.037942 1.397398
## Months_on_book Total_Relationship_Count
## 2.877328 1.191303
## Months_Inactive_12_mon Contacts_Count_12_mon
## 1.059515 1.048592
## Credit_Limit Total_Revolving_Bal
## 2.689645 2.552419
## Total_Amt_Chng_Q4_Q1 Total_Trans_Amt
## 1.161880 4.343041
## Total_Trans_Ct Total_Ct_Chng_Q4_Q1
## 4.486725 1.186426
## Avg_Utilization_Ratio
## 2.976799
Confusion matrix of the full logistic model
set.seed(1)
glm.probs.full <- predict(glm.full, newdata = full.test, type = "raw")
glm.cm.full<- caret::confusionMatrix(as.factor(glm.probs.full),as.factor(full.test$Attrition_Flag), positive = "churned")
glm.cm.full
## Confusion Matrix and Statistics
##
## Reference
## Prediction churned not_churned
## churned 195 54
## not_churned 130 1646
##
## Accuracy : 0.9091
## 95% CI : (0.8958, 0.9213)
## No Information Rate : 0.8395
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6276
##
## Mcnemar's Test P-Value : 3.219e-08
##
## Sensitivity : 0.6000
## Specificity : 0.9682
## Pos Pred Value : 0.7831
## Neg Pred Value : 0.9268
## Prevalence : 0.1605
## Detection Rate : 0.0963
## Detection Prevalence : 0.1230
## Balanced Accuracy : 0.7841
##
## 'Positive' Class : churned
##
Plot of the full logistic model confusion matrix
draw_confusion_matrix <- function(glm.cm.full) {
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('Full Logistic Regression Confusion Matrix', cex.main=2)
# create the matrix
rect(150, 430, 240, 370, col='#72a376')
text(195, 435, 'churned', cex=1.2)
rect(250, 430, 340, 370, col='#c0504d')
text(295, 435, 'not churned', cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col='#c0504d')
rect(250, 305, 340, 365, col='#72a376')
text(140, 400, 'churned', cex=1.2, srt=90)
text(140, 335, 'not churned', cex=1.2, srt=90)
# add in the glm.cm results
res <- as.numeric(glm.cm.full$table)
text(195, 400, res[1], cex=2, font=2, col='white')
text(195, 335, res[2], cex=2, font=2, col='white')
text(295, 400, res[3], cex=2, font=2, col='white')
text(295, 335, res[4], cex=2, font=2, col='white')
# add in the specifics
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(glm.cm.full$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(glm.cm.full$byClass[1]), 3), cex=1.2)
text(30, 85, names(glm.cm.full$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(glm.cm.full$byClass[2]), 3), cex=1.2)
text(50, 85, names(glm.cm.full$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(glm.cm.full$byClass[5]), 3), cex=1.2)
text(70, 85, names(glm.cm.full$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(glm.cm.full$byClass[6]), 3), cex=1.2)
text(90, 85, names(glm.cm.full$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(glm.cm.full$byClass[7]), 3), cex=1.2)
# add in the accuracy information
text(30, 35, names(glm.cm.full$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(glm.cm.full$overall[1]), 3), cex=1.4)
text(70, 35, names(glm.cm.full$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(glm.cm.full$overall[2]), 3), cex=1.4)
}
draw_confusion_matrix(glm.cm.full)
Variable importance of the full logistic model
varImp(glm.full, scale = FALSE)
## glm variable importance
##
## only 20 most important variables shown (out of 31)
##
## Overall
## Total_Trans_Ct 28.292
## Total_Trans_Amt 18.788
## Total_Relationship_Count 14.537
## Contacts_Count_12_mon 13.189
## Total_Ct_Chng_Q4_Q1 13.169
## Months_Inactive_12_mon 12.137
## Total_Revolving_Bal 11.276
## GenderMale 5.279
## Dependent_count 3.940
## `Income_Category$40K - $60K` 3.696
## Marital_StatusMarried 3.021
## `Income_Category$60K - $80K` 2.640
## Income_CategoryUnknown 2.599
## `Income_CategoryLess than $40K` 2.587
## Credit_Limit 2.488
## Education_LevelDoctorate 1.765
## Total_Amt_Chng_Q4_Q1 1.731
## Card_CategorySilver 1.419
## `Income_Category$80K - $120K` 1.379
## Card_CategoryGold 1.362
plot(varImp(glm.full, scale = FALSE))
Fit the reduced logistic model
set.seed(1)
glm.reduced <- train(Attrition_Flag~., data = reduced.train, method = "glm", metric = "ROC", trControl = ctrl)
summary(glm.reduced)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7187 0.0890 0.2066 0.4129 2.5978
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.992e+00 4.649e-01 -10.738 < 2e-16 ***
## GenderMale 6.849e-01 1.549e-01 4.422 9.76e-06 ***
## Dependent_count -1.149e-01 3.209e-02 -3.582 0.000342 ***
## Education_LevelDoctorate -4.373e-01 2.164e-01 -2.021 0.043284 *
## Education_LevelGraduate -8.449e-02 1.491e-01 -0.567 0.570934
## `Education_LevelHigh School` -1.835e-01 1.586e-01 -1.157 0.247228
## `Education_LevelPost-Graduate` -3.051e-01 2.221e-01 -1.374 0.169568
## Education_LevelUneducated -1.435e-01 1.682e-01 -0.853 0.393423
## Education_LevelUnknown -2.486e-01 1.655e-01 -1.502 0.133106
## Marital_StatusMarried 3.951e-01 1.653e-01 2.389 0.016873 *
## Marital_StatusSingle -7.506e-02 1.657e-01 -0.453 0.650481
## Marital_StatusUnknown -1.304e-01 2.122e-01 -0.615 0.538808
## `Income_Category$40K - $60K` 6.552e-01 2.202e-01 2.976 0.002923 **
## `Income_Category$60K - $80K` 4.442e-01 1.970e-01 2.254 0.024167 *
## `Income_Category$80K - $120K` 2.006e-01 1.824e-01 1.100 0.271384
## `Income_CategoryLess than $40K` 4.372e-01 2.382e-01 1.836 0.066386 .
## Income_CategoryUnknown 5.739e-01 2.482e-01 2.312 0.020777 *
## Card_CategoryGold -7.216e-01 4.313e-01 -1.673 0.094289 .
## Card_CategoryPlatinum -8.979e-01 8.418e-01 -1.067 0.286127
## Card_CategorySilver -5.461e-01 2.143e-01 -2.549 0.010816 *
## Months_on_book 9.645e-03 5.021e-03 1.921 0.054707 .
## Total_Relationship_Count 5.273e-01 2.958e-02 17.825 < 2e-16 ***
## Months_Inactive_12_mon -4.791e-01 4.019e-02 -11.921 < 2e-16 ***
## Contacts_Count_12_mon -4.949e-01 3.848e-02 -12.860 < 2e-16 ***
## Credit_Limit 1.484e-05 7.490e-06 1.981 0.047610 *
## Total_Revolving_Bal 8.028e-04 7.807e-05 10.284 < 2e-16 ***
## Total_Amt_Chng_Q4_Q1 6.491e-03 1.930e-01 0.034 0.973175
## Total_Trans_Ct 6.526e-02 2.415e-03 27.025 < 2e-16 ***
## Total_Ct_Chng_Q4_Q1 2.682e+00 2.056e-01 13.046 < 2e-16 ***
## Avg_Utilization_Ratio 5.259e-01 2.673e-01 1.968 0.049116 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7143.2 on 8101 degrees of freedom
## Residual deviance: 4134.5 on 8072 degrees of freedom
## AIC: 4194.5
##
## Number of Fisher Scoring iterations: 6
Check VIF of the reduced logistic regression
car::vif(glm.reduced$finalModel)
## GenderMale Dependent_count
## 3.676797 1.055243
## Education_LevelDoctorate Education_LevelGraduate
## 1.484708 2.915320
## `Education_LevelHigh School` `Education_LevelPost-Graduate`
## 2.464830 1.450609
## Education_LevelUneducated Education_LevelUnknown
## 2.130946 2.215503
## Marital_StatusMarried Marital_StatusSingle
## 4.193971 4.068851
## Marital_StatusUnknown `Income_Category$40K - $60K`
## 1.938676 4.114081
## `Income_Category$60K - $80K` `Income_Category$80K - $120K`
## 2.637744 2.682099
## `Income_CategoryLess than $40K` Income_CategoryUnknown
## 8.096989 4.023133
## Card_CategoryGold Card_CategoryPlatinum
## 1.121417 1.035684
## Card_CategorySilver Months_on_book
## 1.412281 1.048936
## Total_Relationship_Count Months_Inactive_12_mon
## 1.119741 1.047173
## Contacts_Count_12_mon Credit_Limit
## 1.035410 2.756137
## Total_Revolving_Bal Total_Amt_Chng_Q4_Q1
## 2.566631 1.150400
## Total_Trans_Ct Total_Ct_Chng_Q4_Q1
## 1.295491 1.188908
## Avg_Utilization_Ratio
## 2.975631
Confusion matrix of the reduced logistic model
set.seed(1)
glm.probs.reduced <- predict(glm.reduced, newdata = reduced.test, type = "raw")
glm.cm.reduced<- caret::confusionMatrix(as.factor(glm.probs.reduced),as.factor(reduced.test$Attrition_Flag), positive = "churned")
glm.cm.reduced
## Confusion Matrix and Statistics
##
## Reference
## Prediction churned not_churned
## churned 175 55
## not_churned 150 1645
##
## Accuracy : 0.8988
## 95% CI : (0.8848, 0.9116)
## No Information Rate : 0.8395
## P-Value [Acc > NIR] : 9.185e-15
##
## Kappa : 0.574
##
## Mcnemar's Test P-Value : 5.195e-11
##
## Sensitivity : 0.53846
## Specificity : 0.96765
## Pos Pred Value : 0.76087
## Neg Pred Value : 0.91643
## Prevalence : 0.16049
## Detection Rate : 0.08642
## Detection Prevalence : 0.11358
## Balanced Accuracy : 0.75305
##
## 'Positive' Class : churned
##
Plot of the reduced logistic model confusion matrix
draw_confusion_matrix <- function(glm.cm.reduced) {
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('Reduced Logistic Regression Confusion Matrix', cex.main=2)
# create the matrix
rect(150, 430, 240, 370, col='#72a376')
text(195, 435, 'churned', cex=1.2)
rect(250, 430, 340, 370, col='#c0504d')
text(295, 435, 'not churned', cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col='#c0504d')
rect(250, 305, 340, 365, col='#72a376')
text(140, 400, 'churned', cex=1.2, srt=90)
text(140, 335, 'not churned', cex=1.2, srt=90)
# add in the glm.cm.reduced results
res <- as.numeric(glm.cm.reduced$table)
text(195, 400, res[1], cex=2, font=2, col='white')
text(195, 335, res[2], cex=2, font=2, col='white')
text(295, 400, res[3], cex=2, font=2, col='white')
text(295, 335, res[4], cex=2, font=2, col='white')
# add in the specifics
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(glm.cm.reduced$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(glm.cm.reduced$byClass[1]), 3), cex=1.2)
text(30, 85, names(glm.cm.reduced$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(glm.cm.reduced$byClass[2]), 3), cex=1.2)
text(50, 85, names(glm.cm.reduced$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(glm.cm.reduced$byClass[5]), 3), cex=1.2)
text(70, 85, names(glm.cm.reduced$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(glm.cm.reduced$byClass[6]), 3), cex=1.2)
text(90, 85, names(glm.cm.reduced$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(glm.cm.reduced$byClass[7]), 3), cex=1.2)
# add in the accuracy information
text(30, 35, names(glm.cm.reduced$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(glm.cm.reduced$overall[1]), 3), cex=1.4)
text(70, 35, names(glm.cm.reduced$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(glm.cm.reduced$overall[2]), 3), cex=1.4)
}
draw_confusion_matrix(glm.cm.reduced)
Variable importance of the reduced logistic model
varImp(glm.reduced, scale = FALSE)
## glm variable importance
##
## only 20 most important variables shown (out of 29)
##
## Overall
## Total_Trans_Ct 27.025
## Total_Relationship_Count 17.825
## Total_Ct_Chng_Q4_Q1 13.046
## Contacts_Count_12_mon 12.860
## Months_Inactive_12_mon 11.921
## Total_Revolving_Bal 10.284
## GenderMale 4.422
## Dependent_count 3.582
## `Income_Category$40K - $60K` 2.976
## Card_CategorySilver 2.549
## Marital_StatusMarried 2.389
## Income_CategoryUnknown 2.312
## `Income_Category$60K - $80K` 2.254
## Education_LevelDoctorate 2.021
## Credit_Limit 1.981
## Avg_Utilization_Ratio 1.968
## Months_on_book 1.921
## `Income_CategoryLess than $40K` 1.836
## Card_CategoryGold 1.673
## Education_LevelUnknown 1.502
plot(varImp(glm.reduced, scale = FALSE))
Fit the full random forest model
#fit full random forest
# set.seed(1)
# rf.full <- train(Attrition_Flag~., data = full.train, method = "rf", metric= "Accuracy", trControl = ctrl , tuneGrid = NULL)
# print(rf.full)
#save the rf model to an object to avoid running it again for results
# saveRDS(rf.full, "rf.full.rds")
rf.full <- readRDS("rf.full.rds")
print(rf.full)
## Random Forest
##
## 8102 samples
## 19 predictor
## 2 classes: 'churned', 'not_churned'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 7292, 7292, 7292, 7291, 7292, 7292, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9193600 0.6370257
## 17 0.9638362 0.8620252
## 32 0.9610387 0.8522121
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 17.
Confusion matrix of the full random forest model
set.seed(1)
rf.probs.full <- predict(rf.full, newdata = full.test, type = "raw")
rf.cm.full<- caret::confusionMatrix(as.factor(rf.probs.full),as.factor(full.test$Attrition_Flag), positive = "churned")
rf.cm.full
## Confusion Matrix and Statistics
##
## Reference
## Prediction churned not_churned
## churned 286 19
## not_churned 39 1681
##
## Accuracy : 0.9714
## 95% CI : (0.9631, 0.9782)
## No Information Rate : 0.8395
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.891
##
## Mcnemar's Test P-Value : 0.0126
##
## Sensitivity : 0.8800
## Specificity : 0.9888
## Pos Pred Value : 0.9377
## Neg Pred Value : 0.9773
## Prevalence : 0.1605
## Detection Rate : 0.1412
## Detection Prevalence : 0.1506
## Balanced Accuracy : 0.9344
##
## 'Positive' Class : churned
##
Plot of the full random forest model confusion matrix
draw_confusion_matrix <- function(rf.cm.full) {
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('Full Random Forest Confusion Matrix', cex.main=2)
# create the matrix
rect(150, 430, 240, 370, col='#72a376')
text(195, 435, 'churned', cex=1.2)
rect(250, 430, 340, 370, col='#c0504d')
text(295, 435, 'not churned', cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col='#c0504d')
rect(250, 305, 340, 365, col='#72a376')
text(140, 400, 'churned', cex=1.2, srt=90)
text(140, 335, 'not churned', cex=1.2, srt=90)
# add in the glm.cm results
res <- as.numeric(rf.cm.full$table)
text(195, 400, res[1], cex=2, font=2, col='white')
text(195, 335, res[2], cex=2, font=2, col='white')
text(295, 400, res[3], cex=2, font=2, col='white')
text(295, 335, res[4], cex=2, font=2, col='white')
# add in the specifics
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(rf.cm.full$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(rf.cm.full$byClass[1]), 3), cex=1.2)
text(30, 85, names(rf.cm.full$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(rf.cm.full$byClass[2]), 3), cex=1.2)
text(50, 85, names(rf.cm.full$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(rf.cm.full$byClass[5]), 3), cex=1.2)
text(70, 85, names(rf.cm.full$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(rf.cm.full$byClass[6]), 3), cex=1.2)
text(90, 85, names(rf.cm.full$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(rf.cm.full$byClass[7]), 3), cex=1.2)
# add in the accuracy information
text(30, 35, names(rf.cm.full$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(rf.cm.full$overall[1]), 3), cex=1.4)
text(70, 35, names(rf.cm.full$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(rf.cm.full$overall[2]), 3), cex=1.4)
}
draw_confusion_matrix(rf.cm.full)
Fit the reduced random forest model
#fit reduced random forest
# set.seed(1)
# rf.reduced <- train(Attrition_Flag~., data = reduced.train, method = "rf", metric= "Accuracy", trControl = ctrl , tuneGrid = NULL)
# print(rf.reduced)
#save the rf model to an object to avoid running it again for results
# saveRDS(rf.reduced, "rf.reduced.rds")
rf.reduced <- readRDS("rf.reduced.rds")
print(rf.reduced)
## Random Forest
##
## 8102 samples
## 16 predictor
## 2 classes: 'churned', 'not_churned'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 7292, 7292, 7292, 7291, 7292, 7292, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9036037 0.5432447
## 15 0.9308395 0.7238281
## 29 0.9293996 0.7205106
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 15.
Confusion matrix of the reduced random forest model
set.seed(1)
rf.probs.reduced <- predict(rf.reduced, newdata = reduced.test, type = "raw")
rf.cm.reduced<- caret::confusionMatrix(as.factor(rf.probs.reduced),as.factor(reduced.test$Attrition_Flag), positive = "churned")
rf.cm.reduced
## Confusion Matrix and Statistics
##
## Reference
## Prediction churned not_churned
## churned 231 40
## not_churned 94 1660
##
## Accuracy : 0.9338
## 95% CI : (0.9221, 0.9443)
## No Information Rate : 0.8395
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7367
##
## Mcnemar's Test P-Value : 4.683e-06
##
## Sensitivity : 0.7108
## Specificity : 0.9765
## Pos Pred Value : 0.8524
## Neg Pred Value : 0.9464
## Prevalence : 0.1605
## Detection Rate : 0.1141
## Detection Prevalence : 0.1338
## Balanced Accuracy : 0.8436
##
## 'Positive' Class : churned
##
Plot of the reduced random forest model confusion matrix
draw_confusion_matrix <- function(rf.cm.reduced) {
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('Reduced Random Forest Confusion Matrix', cex.main=2)
# create the matrix
rect(150, 430, 240, 370, col='#72a376')
text(195, 435, 'churned', cex=1.2)
rect(250, 430, 340, 370, col='#c0504d')
text(295, 435, 'not churned', cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col='#c0504d')
rect(250, 305, 340, 365, col='#72a376')
text(140, 400, 'churned', cex=1.2, srt=90)
text(140, 335, 'not churned', cex=1.2, srt=90)
# add in the glm.cm results
res <- as.numeric(rf.cm.reduced$table)
text(195, 400, res[1], cex=2, font=2, col='white')
text(195, 335, res[2], cex=2, font=2, col='white')
text(295, 400, res[3], cex=2, font=2, col='white')
text(295, 335, res[4], cex=2, font=2, col='white')
# add in the specifics
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(rf.cm.reduced$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(rf.cm.reduced$byClass[1]), 3), cex=1.2)
text(30, 85, names(rf.cm.reduced$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(rf.cm.reduced$byClass[2]), 3), cex=1.2)
text(50, 85, names(rf.cm.reduced$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(rf.cm.reduced$byClass[5]), 3), cex=1.2)
text(70, 85, names(rf.cm.reduced$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(rf.cm.reduced$byClass[6]), 3), cex=1.2)
text(90, 85, names(rf.cm.reduced$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(rf.cm.reduced$byClass[7]), 3), cex=1.2)
# add in the accuracy information
text(30, 35, names(rf.cm.reduced$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(rf.cm.reduced$overall[1]), 3), cex=1.4)
text(70, 35, names(rf.cm.reduced$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(rf.cm.reduced$overall[2]), 3), cex=1.4)
}
draw_confusion_matrix(rf.cm.reduced)
Fit the full support vector machine model
#fit full support vector machine model
# set.seed(1)
# svm.full <- train(Attrition_Flag~., data = full.train, method = "svmRadial", tuneLength = 5, preProc = c("center", "scale"), metric = "ROC", trControl = ctrl)
# print(svm.full)
#save the rf model to an object to avoid running it again for results
# saveRDS(svm.full, "svm.full.rds")
svm.full <- readRDS("svm.full.rds")
print(svm.full)
## Support Vector Machines with Radial Basis Function Kernel
##
## 8102 samples
## 19 predictor
## 2 classes: 'churned', 'not_churned'
##
## Pre-processing: centered (32), scaled (32)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 7292, 7292, 7292, 7291, 7292, 7292, ...
## Resampling results across tuning parameters:
##
## C ROC Sens Spec
## 0.25 0.9281944 0.6041632 0.9682353
## 0.50 0.9396694 0.6241358 0.9700490
## 1.00 0.9480545 0.6538188 0.9710294
## 2.00 0.9522997 0.6640517 0.9715686
## 4.00 0.9545427 0.6773693 0.9711275
##
## Tuning parameter 'sigma' was held constant at a value of 0.01978265
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01978265 and C = 4.
Confusion matrix of the full support vector machine model
set.seed(1)
svm.probs.full <- predict(svm.full, newdata = full.test, type = "raw")
svm.cm.full<- caret::confusionMatrix(as.factor(svm.probs.full),as.factor(full.test$Attrition_Flag), positive = "churned")
svm.cm.full
## Confusion Matrix and Statistics
##
## Reference
## Prediction churned not_churned
## churned 230 30
## not_churned 95 1670
##
## Accuracy : 0.9383
## 95% CI : (0.9269, 0.9484)
## No Information Rate : 0.8395
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7508
##
## Mcnemar's Test P-Value : 1.038e-08
##
## Sensitivity : 0.7077
## Specificity : 0.9824
## Pos Pred Value : 0.8846
## Neg Pred Value : 0.9462
## Prevalence : 0.1605
## Detection Rate : 0.1136
## Detection Prevalence : 0.1284
## Balanced Accuracy : 0.8450
##
## 'Positive' Class : churned
##
Plot of the full support vector machine confusion matrix
draw_confusion_matrix <- function(svm.cm.full) {
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('Full Support Vector Machine Matrix', cex.main=2)
# create the matrix
rect(150, 430, 240, 370, col='#72a376')
text(195, 435, 'churned', cex=1.2)
rect(250, 430, 340, 370, col='#c0504d')
text(295, 435, 'not churned', cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col='#c0504d')
rect(250, 305, 340, 365, col='#72a376')
text(140, 400, 'churned', cex=1.2, srt=90)
text(140, 335, 'not churned', cex=1.2, srt=90)
# add in the glm.cm results
res <- as.numeric(svm.cm.full$table)
text(195, 400, res[1], cex=2, font=2, col='white')
text(195, 335, res[2], cex=2, font=2, col='white')
text(295, 400, res[3], cex=2, font=2, col='white')
text(295, 335, res[4], cex=2, font=2, col='white')
# add in the specifics
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(svm.cm.full$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(svm.cm.full$byClass[1]), 3), cex=1.2)
text(30, 85, names(svm.cm.full$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(svm.cm.full$byClass[2]), 3), cex=1.2)
text(50, 85, names(svm.cm.full$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(svm.cm.full$byClass[5]), 3), cex=1.2)
text(70, 85, names(svm.cm.full$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(svm.cm.full$byClass[6]), 3), cex=1.2)
text(90, 85, names(svm.cm.full$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(svm.cm.full$byClass[7]), 3), cex=1.2)
# add in the accuracy information
text(30, 35, names(svm.cm.full$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(svm.cm.full$overall[1]), 3), cex=1.4)
text(70, 35, names(svm.cm.full$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(svm.cm.full$overall[2]), 3), cex=1.4)
}
draw_confusion_matrix(svm.cm.full)
Fit the reduced support vector machine model
#fit reduced support vector machine
# set.seed(1)
# svm.reduced <- train(Attrition_Flag~., data = reduced.train, method = "svmRadial", tuneLength = 5, preProc = c("center", "scale"), metric = "ROC", trControl = ctrl)
# print(svm.reduced)
#save the rf model to an object to avoid running it again for results
# saveRDS(svm.reduced, "svm.reduced.rds")
svm.reduced <- readRDS("svm.reduced.rds")
print(svm.reduced)
## Support Vector Machines with Radial Basis Function Kernel
##
## 8102 samples
## 16 predictor
## 2 classes: 'churned', 'not_churned'
##
## Pre-processing: centered (29), scaled (29)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 7292, 7292, 7292, 7291, 7292, 7292, ...
## Resampling results across tuning parameters:
##
## C ROC Sens Spec
## 0.25 0.9146633 0.5798532 0.9692157
## 0.50 0.9211653 0.5900783 0.9705882
## 1.00 0.9270046 0.5913701 0.9718137
## 2.00 0.9315260 0.6013525 0.9718627
## 4.00 0.9342141 0.6016109 0.9716176
##
## Tuning parameter 'sigma' was held constant at a value of 0.02197041
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.02197041 and C = 4.
Confusion matrix of the reduced support vector machine model
set.seed(1)
svm.probs.reduced <- predict(svm.reduced, newdata = reduced.test, type = "raw")
svm.cm.reduced<- caret::confusionMatrix(as.factor(svm.probs.reduced),as.factor(reduced.test$Attrition_Flag), positive = "churned")
svm.cm.reduced
## Confusion Matrix and Statistics
##
## Reference
## Prediction churned not_churned
## churned 205 36
## not_churned 120 1664
##
## Accuracy : 0.923
## 95% CI : (0.9105, 0.9342)
## No Information Rate : 0.8395
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6807
##
## Mcnemar's Test P-Value : 3.026e-11
##
## Sensitivity : 0.6308
## Specificity : 0.9788
## Pos Pred Value : 0.8506
## Neg Pred Value : 0.9327
## Prevalence : 0.1605
## Detection Rate : 0.1012
## Detection Prevalence : 0.1190
## Balanced Accuracy : 0.8048
##
## 'Positive' Class : churned
##
Plot of the reduced support vector machine confusion matrix
draw_confusion_matrix <- function(svm.cm.reduced) {
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('Reduced Support Vector Machine Confusion Matrix', cex.main=2)
# create the matrix
rect(150, 430, 240, 370, col='#72a376')
text(195, 435, 'churned', cex=1.2)
rect(250, 430, 340, 370, col='#c0504d')
text(295, 435, 'not churned', cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col='#c0504d')
rect(250, 305, 340, 365, col='#72a376')
text(140, 400, 'churned', cex=1.2, srt=90)
text(140, 335, 'not churned', cex=1.2, srt=90)
# add in the glm.cm results
res <- as.numeric(svm.cm.reduced$table)
text(195, 400, res[1], cex=2, font=2, col='white')
text(195, 335, res[2], cex=2, font=2, col='white')
text(295, 400, res[3], cex=2, font=2, col='white')
text(295, 335, res[4], cex=2, font=2, col='white')
# add in the specifics
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(svm.cm.reduced$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(svm.cm.reduced$byClass[1]), 3), cex=1.2)
text(30, 85, names(svm.cm.reduced$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(svm.cm.reduced$byClass[2]), 3), cex=1.2)
text(50, 85, names(svm.cm.reduced$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(svm.cm.reduced$byClass[5]), 3), cex=1.2)
text(70, 85, names(svm.cm.reduced$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(svm.cm.reduced$byClass[6]), 3), cex=1.2)
text(90, 85, names(svm.cm.reduced$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(svm.cm.reduced$byClass[7]), 3), cex=1.2)
# add in the accuracy information
text(30, 35, names(svm.cm.reduced$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(svm.cm.reduced$overall[1]), 3), cex=1.4)
text(70, 35, names(svm.cm.reduced$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(svm.cm.reduced$overall[2]), 3), cex=1.4)
}
draw_confusion_matrix(svm.cm.reduced)