Bank_Data <- read.csv(file.choose()) # Choose the claimants Data set
EDU_Data <- read.csv(file.choose())
# sum(is.na(claimants))
# claimants <- na.omit(claimants) # Omitting NA values from the Data 
# na.omit => will omit the rows which has atleast 1 NA value
attach(Bank_Data)

bank_data1 <- cbind(age,default,balance,housing,loan,duration,campaign,pdays,
                    previous,poutfailure,poutother,poutsuccess,poutunknown,con_cellular,con_telephone
                    ,con_unknown,divorced,married,single,joadmin.,joblue.collar,joentrepreneur,
                    johousemaid,jomanagement,joretired,joself.employed,joservices,jostudent,jotechnician
                    ,jounemployed,HasFD =y,EDU_Unknown =EDU_Data$Edu_Unknown,
                    EDU_Primary=EDU_Data$Edu_Primary,
                    EDU_Secondary=EDU_Data$Edu_Secondary,EDU_Tertiary=EDU_Data$Edu_Tertiary)

colnames(bank_data1)

##  [1] "age"             "default"         "balance"        
##  [4] "housing"         "loan"            "duration"       
##  [7] "campaign"        "pdays"           "previous"       
## [10] "poutfailure"     "poutother"       "poutsuccess"    
## [13] "poutunknown"     "con_cellular"    "con_telephone"  
## [16] "con_unknown"     "divorced"        "married"        
## [19] "single"          "joadmin."        "joblue.collar"  
## [22] "joentrepreneur"  "johousemaid"     "jomanagement"   
## [25] "joretired"       "joself.employed" "joservices"     
## [28] "jostudent"       "jotechnician"    "jounemployed"   
## [31] "HasFD"           "EDU_Unknown"     "EDU_Primary"    
## [34] "EDU_Secondary"   "EDU_Tertiary"

bank_data1 <- as.data.frame(bank_data1)
class(bank_data1)

## [1] "data.frame"

# Preparing a linear regression 
mod_lm <- lm(HasFD~.,data=bank_data1)
pred1 <- predict(mod_lm,bank_data1)

## Warning in predict.lm(mod_lm, bank_data1): prediction from a rank-deficient
## fit may be misleading

# pred1
plot(age,pred1)

# We can no way use the linear regression technique to classify the data
plot(pred1)

# We can also include NA values but where ever it finds NA value
# probability values obtained using the glm will also be NA 
# So they can be either filled using imputation technique or
# exlclude those values 


# GLM function use sigmoid curve to produce desirable results 
# The output of sigmoid function lies in between 0-1
model <- glm(HasFD~.,data=bank_data1,family = "binomial")

# To calculate the odds ratio manually we going r going to take exp of coef(model)
exp(coef(model))

##     (Intercept)             age         default         balance 
##      0.02369092      1.00155122      0.84714306      1.00001762 
##         housing            loan        duration        campaign 
##      0.46350089      0.56933386      1.00406236      0.89625882 
##           pdays        previous     poutfailure       poutother 
##      1.00019439      1.01070294      1.27441904      1.63469841 
##     poutsuccess     poutunknown    con_cellular   con_telephone 
##     12.69020856              NA      3.17606995      2.92283064 
##     con_unknown        divorced         married          single 
##              NA      0.85116181      0.72629828              NA 
##        joadmin.   joblue.collar  joentrepreneur     johousemaid 
##      1.40165761      0.94648506      0.86261661      0.82790055 
##    jomanagement       joretired joself.employed      joservices 
##      1.11510436      1.97125931      0.95626510      1.02612093 
##       jostudent    jotechnician    jounemployed     EDU_Unknown 
##      2.32116818      1.06672276      1.12545759      0.87578233 
##     EDU_Primary   EDU_Secondary    EDU_Tertiary 
##      0.66783084      0.78871718              NA

# Confusion matrix table 
prob <- predict(model,bank_data1,type="response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

summary(model)

## 
## Call:
## glm(formula = HasFD ~ ., family = "binomial", data = bank_data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.7169  -0.4054  -0.2718  -0.1616   3.5115  
## 
## Coefficients: (4 not defined because of singularities)
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -3.743e+00  2.488e-01 -15.044  < 2e-16 ***
## age              1.550e-03  2.148e-03   0.722 0.470430    
## default         -1.659e-01  1.614e-01  -1.028 0.304021    
## balance          1.762e-05  4.859e-06   3.626 0.000287 ***
## housing         -7.689e-01  3.956e-02 -19.435  < 2e-16 ***
## loan            -5.633e-01  5.821e-02  -9.678  < 2e-16 ***
## duration         4.054e-03  6.274e-05  64.618  < 2e-16 ***
## campaign        -1.095e-01  9.909e-03 -11.053  < 2e-16 ***
## pdays            1.944e-04  3.000e-04   0.648 0.517053    
## previous         1.065e-02  6.550e-03   1.625 0.104068    
## poutfailure      2.425e-01  9.037e-02   2.683 0.007290 ** 
## poutother        4.915e-01  1.029e-01   4.777 1.78e-06 ***
## poutsuccess      2.541e+00  8.328e-02  30.510  < 2e-16 ***
## poutunknown             NA         NA      NA       NA    
## con_cellular     1.156e+00  5.778e-02  20.001  < 2e-16 ***
## con_telephone    1.073e+00  8.834e-02  12.141  < 2e-16 ***
## con_unknown             NA         NA      NA       NA    
## divorced        -1.612e-01  6.541e-02  -2.464 0.013751 *  
## married         -3.198e-01  4.461e-02  -7.168 7.59e-13 ***
## single                  NA         NA      NA       NA    
## joadmin.         3.377e-01  2.294e-01   1.472 0.141054    
## joblue.collar   -5.500e-02  2.287e-01  -0.240 0.809983    
## joentrepreneur  -1.478e-01  2.490e-01  -0.593 0.552861    
## johousemaid     -1.889e-01  2.528e-01  -0.747 0.454970    
## jomanagement     1.089e-01  2.278e-01   0.478 0.632431    
## joretired        6.787e-01  2.332e-01   2.910 0.003616 ** 
## joself.employed -4.472e-02  2.426e-01  -0.184 0.853734    
## joservices       2.579e-02  2.332e-01   0.111 0.911958    
## jostudent        8.421e-01  2.405e-01   3.501 0.000464 ***
## jotechnician     6.459e-02  2.277e-01   0.284 0.776688    
## jounemployed     1.182e-01  2.426e-01   0.487 0.626115    
## EDU_Unknown     -1.326e-01  9.330e-02  -1.422 0.155130    
## EDU_Primary     -4.037e-01  7.348e-02  -5.494 3.93e-08 ***
## EDU_Secondary   -2.373e-01  5.031e-02  -4.718 2.38e-06 ***
## EDU_Tertiary            NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 32631  on 45210  degrees of freedom
## Residual deviance: 22604  on 45180  degrees of freedom
## AIC: 22666
## 
## Number of Fisher Scoring iterations: 6

# We are going to use NULL and Residual Deviance to compare the between different models

# Confusion matrix and considering the threshold value as 0.5 
confusion<-table(prob>0.5,bank_data1$HasFD)
confusion

##        
##             0     1
##   FALSE 39004  3584
##   TRUE    918  1705

# Model Accuracy 
Accuracy<-sum(diag(confusion)/sum(confusion))
Accuracy # 90.04

## [1] 0.9004225

# Creating empty vectors to store predicted classes based on threshold value
pred_values <- NULL
yes_no <- NULL

pred_values <- ifelse(prob>=0.5,1,0)
yes_no <- ifelse(prob>=0.5,"yes","no")

 # Creating new column to store the above values
bank_data1[,"prob"] <- prob
bank_data1[,"pred_values"] <- pred_values
bank_data1[,"yes_no"] <- yes_no

# View(bank_data1[,c(1,31,36:38)])

table(bank_data1$HasFD,bank_data1$pred_values)

##    
##         0     1
##   0 39004   918
##   1  3584  1705

# Calculate the below metrics
# precision | recall | True Positive Rate | False Positive Rate | Specificity | Sensitivity
# from the above table - 59


# ROC Curve => used to evaluate the betterness of the logistic model
# more area under ROC curve better is the model 
# We will use ROC curve for any classification technique not only for logistic
# install.packages("ROCR")
library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

rocrpred<-prediction(prob,bank_data1$HasFD)
rocrperf<-performance(rocrpred,'tpr','fpr')

str(rocrperf)

## Formal class 'performance' [package "ROCR"] with 6 slots
##   ..@ x.name      : chr "False positive rate"
##   ..@ y.name      : chr "True positive rate"
##   ..@ alpha.name  : chr "Cutoff"
##   ..@ x.values    :List of 1
##   .. ..$ : num [1:45211] 0.00 2.50e-05 5.01e-05 5.01e-05 7.51e-05 ...
##   ..@ y.values    :List of 1
##   .. ..$ : num [1:45211] 0 0 0 0.000189 0.000189 ...
##   ..@ alpha.values:List of 1
##   .. ..$ : num [1:45211] Inf 1 1 1 1 ...

plot(rocrperf,colorize=T,text.adj=c(-0.2,1.7))

# More area under the ROC Curve better is the logistic regression model obtained

## Getting cutt off or threshold value along with true positive and false positive rates in a data frame 
str(rocrperf)

## Formal class 'performance' [package "ROCR"] with 6 slots
##   ..@ x.name      : chr "False positive rate"
##   ..@ y.name      : chr "True positive rate"
##   ..@ alpha.name  : chr "Cutoff"
##   ..@ x.values    :List of 1
##   .. ..$ : num [1:45211] 0.00 2.50e-05 5.01e-05 5.01e-05 7.51e-05 ...
##   ..@ y.values    :List of 1
##   .. ..$ : num [1:45211] 0 0 0 0.000189 0.000189 ...
##   ..@ alpha.values:List of 1
##   .. ..$ : num [1:45211] Inf 1 1 1 1 ...

rocr_cutoff <- data.frame(cut_off = rocrperf@alpha.values[[1]],fpr=rocrperf@x.values,tpr=rocrperf@y.values)
colnames(rocr_cutoff) <- c("cut_off","FPR","TPR")
View(rocr_cutoff)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

rocr_cutoff$cut_off <- round(rocr_cutoff$cut_off,6)
# Sorting data frame with respect to tpr in decreasing order 
rocr_cutoff <- arrange(rocr_cutoff,desc(TPR))
View(rocr_cutoff)