Bank_Data <- read.csv(file.choose()) # Choose the claimants Data set
EDU_Data <- read.csv(file.choose())
# sum(is.na(claimants))
# claimants <- na.omit(claimants) # Omitting NA values from the Data
# na.omit => will omit the rows which has atleast 1 NA value
attach(Bank_Data)
bank_data1 <- cbind(age,default,balance,housing,loan,duration,campaign,pdays,
previous,poutfailure,poutother,poutsuccess,poutunknown,con_cellular,con_telephone
,con_unknown,divorced,married,single,joadmin.,joblue.collar,joentrepreneur,
johousemaid,jomanagement,joretired,joself.employed,joservices,jostudent,jotechnician
,jounemployed,HasFD =y,EDU_Unknown =EDU_Data$Edu_Unknown,
EDU_Primary=EDU_Data$Edu_Primary,
EDU_Secondary=EDU_Data$Edu_Secondary,EDU_Tertiary=EDU_Data$Edu_Tertiary)
colnames(bank_data1)
## [1] "age" "default" "balance"
## [4] "housing" "loan" "duration"
## [7] "campaign" "pdays" "previous"
## [10] "poutfailure" "poutother" "poutsuccess"
## [13] "poutunknown" "con_cellular" "con_telephone"
## [16] "con_unknown" "divorced" "married"
## [19] "single" "joadmin." "joblue.collar"
## [22] "joentrepreneur" "johousemaid" "jomanagement"
## [25] "joretired" "joself.employed" "joservices"
## [28] "jostudent" "jotechnician" "jounemployed"
## [31] "HasFD" "EDU_Unknown" "EDU_Primary"
## [34] "EDU_Secondary" "EDU_Tertiary"
bank_data1 <- as.data.frame(bank_data1)
class(bank_data1)
## [1] "data.frame"
# Preparing a linear regression
mod_lm <- lm(HasFD~.,data=bank_data1)
pred1 <- predict(mod_lm,bank_data1)
## Warning in predict.lm(mod_lm, bank_data1): prediction from a rank-deficient
## fit may be misleading
# pred1
plot(age,pred1)

# We can no way use the linear regression technique to classify the data
plot(pred1)

# We can also include NA values but where ever it finds NA value
# probability values obtained using the glm will also be NA
# So they can be either filled using imputation technique or
# exlclude those values
# GLM function use sigmoid curve to produce desirable results
# The output of sigmoid function lies in between 0-1
model <- glm(HasFD~.,data=bank_data1,family = "binomial")
# To calculate the odds ratio manually we going r going to take exp of coef(model)
exp(coef(model))
## (Intercept) age default balance
## 0.02369092 1.00155122 0.84714306 1.00001762
## housing loan duration campaign
## 0.46350089 0.56933386 1.00406236 0.89625882
## pdays previous poutfailure poutother
## 1.00019439 1.01070294 1.27441904 1.63469841
## poutsuccess poutunknown con_cellular con_telephone
## 12.69020856 NA 3.17606995 2.92283064
## con_unknown divorced married single
## NA 0.85116181 0.72629828 NA
## joadmin. joblue.collar joentrepreneur johousemaid
## 1.40165761 0.94648506 0.86261661 0.82790055
## jomanagement joretired joself.employed joservices
## 1.11510436 1.97125931 0.95626510 1.02612093
## jostudent jotechnician jounemployed EDU_Unknown
## 2.32116818 1.06672276 1.12545759 0.87578233
## EDU_Primary EDU_Secondary EDU_Tertiary
## 0.66783084 0.78871718 NA
# Confusion matrix table
prob <- predict(model,bank_data1,type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
summary(model)
##
## Call:
## glm(formula = HasFD ~ ., family = "binomial", data = bank_data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.7169 -0.4054 -0.2718 -0.1616 3.5115
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.743e+00 2.488e-01 -15.044 < 2e-16 ***
## age 1.550e-03 2.148e-03 0.722 0.470430
## default -1.659e-01 1.614e-01 -1.028 0.304021
## balance 1.762e-05 4.859e-06 3.626 0.000287 ***
## housing -7.689e-01 3.956e-02 -19.435 < 2e-16 ***
## loan -5.633e-01 5.821e-02 -9.678 < 2e-16 ***
## duration 4.054e-03 6.274e-05 64.618 < 2e-16 ***
## campaign -1.095e-01 9.909e-03 -11.053 < 2e-16 ***
## pdays 1.944e-04 3.000e-04 0.648 0.517053
## previous 1.065e-02 6.550e-03 1.625 0.104068
## poutfailure 2.425e-01 9.037e-02 2.683 0.007290 **
## poutother 4.915e-01 1.029e-01 4.777 1.78e-06 ***
## poutsuccess 2.541e+00 8.328e-02 30.510 < 2e-16 ***
## poutunknown NA NA NA NA
## con_cellular 1.156e+00 5.778e-02 20.001 < 2e-16 ***
## con_telephone 1.073e+00 8.834e-02 12.141 < 2e-16 ***
## con_unknown NA NA NA NA
## divorced -1.612e-01 6.541e-02 -2.464 0.013751 *
## married -3.198e-01 4.461e-02 -7.168 7.59e-13 ***
## single NA NA NA NA
## joadmin. 3.377e-01 2.294e-01 1.472 0.141054
## joblue.collar -5.500e-02 2.287e-01 -0.240 0.809983
## joentrepreneur -1.478e-01 2.490e-01 -0.593 0.552861
## johousemaid -1.889e-01 2.528e-01 -0.747 0.454970
## jomanagement 1.089e-01 2.278e-01 0.478 0.632431
## joretired 6.787e-01 2.332e-01 2.910 0.003616 **
## joself.employed -4.472e-02 2.426e-01 -0.184 0.853734
## joservices 2.579e-02 2.332e-01 0.111 0.911958
## jostudent 8.421e-01 2.405e-01 3.501 0.000464 ***
## jotechnician 6.459e-02 2.277e-01 0.284 0.776688
## jounemployed 1.182e-01 2.426e-01 0.487 0.626115
## EDU_Unknown -1.326e-01 9.330e-02 -1.422 0.155130
## EDU_Primary -4.037e-01 7.348e-02 -5.494 3.93e-08 ***
## EDU_Secondary -2.373e-01 5.031e-02 -4.718 2.38e-06 ***
## EDU_Tertiary NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 32631 on 45210 degrees of freedom
## Residual deviance: 22604 on 45180 degrees of freedom
## AIC: 22666
##
## Number of Fisher Scoring iterations: 6
# We are going to use NULL and Residual Deviance to compare the between different models
# Confusion matrix and considering the threshold value as 0.5
confusion<-table(prob>0.5,bank_data1$HasFD)
confusion
##
## 0 1
## FALSE 39004 3584
## TRUE 918 1705
# Model Accuracy
Accuracy<-sum(diag(confusion)/sum(confusion))
Accuracy # 90.04
## [1] 0.9004225
# Creating empty vectors to store predicted classes based on threshold value
pred_values <- NULL
yes_no <- NULL
pred_values <- ifelse(prob>=0.5,1,0)
yes_no <- ifelse(prob>=0.5,"yes","no")
# Creating new column to store the above values
bank_data1[,"prob"] <- prob
bank_data1[,"pred_values"] <- pred_values
bank_data1[,"yes_no"] <- yes_no
# View(bank_data1[,c(1,31,36:38)])
table(bank_data1$HasFD,bank_data1$pred_values)
##
## 0 1
## 0 39004 918
## 1 3584 1705
# Calculate the below metrics
# precision | recall | True Positive Rate | False Positive Rate | Specificity | Sensitivity
# from the above table - 59
# ROC Curve => used to evaluate the betterness of the logistic model
# more area under ROC curve better is the model
# We will use ROC curve for any classification technique not only for logistic
# install.packages("ROCR")
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
rocrpred<-prediction(prob,bank_data1$HasFD)
rocrperf<-performance(rocrpred,'tpr','fpr')
str(rocrperf)
## Formal class 'performance' [package "ROCR"] with 6 slots
## ..@ x.name : chr "False positive rate"
## ..@ y.name : chr "True positive rate"
## ..@ alpha.name : chr "Cutoff"
## ..@ x.values :List of 1
## .. ..$ : num [1:45211] 0.00 2.50e-05 5.01e-05 5.01e-05 7.51e-05 ...
## ..@ y.values :List of 1
## .. ..$ : num [1:45211] 0 0 0 0.000189 0.000189 ...
## ..@ alpha.values:List of 1
## .. ..$ : num [1:45211] Inf 1 1 1 1 ...
plot(rocrperf,colorize=T,text.adj=c(-0.2,1.7))

# More area under the ROC Curve better is the logistic regression model obtained
## Getting cutt off or threshold value along with true positive and false positive rates in a data frame
str(rocrperf)
## Formal class 'performance' [package "ROCR"] with 6 slots
## ..@ x.name : chr "False positive rate"
## ..@ y.name : chr "True positive rate"
## ..@ alpha.name : chr "Cutoff"
## ..@ x.values :List of 1
## .. ..$ : num [1:45211] 0.00 2.50e-05 5.01e-05 5.01e-05 7.51e-05 ...
## ..@ y.values :List of 1
## .. ..$ : num [1:45211] 0 0 0 0.000189 0.000189 ...
## ..@ alpha.values:List of 1
## .. ..$ : num [1:45211] Inf 1 1 1 1 ...
rocr_cutoff <- data.frame(cut_off = rocrperf@alpha.values[[1]],fpr=rocrperf@x.values,tpr=rocrperf@y.values)
colnames(rocr_cutoff) <- c("cut_off","FPR","TPR")
View(rocr_cutoff)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
rocr_cutoff$cut_off <- round(rocr_cutoff$cut_off,6)
# Sorting data frame with respect to tpr in decreasing order
rocr_cutoff <- arrange(rocr_cutoff,desc(TPR))
View(rocr_cutoff)