#Download the UCI bank marketing dataset, unzip it, and read it into bank.df
bank_data <- tempfile()
download.file("http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip", bank_data, mode = "wb")
unzip(bank_data, "bank-full.csv")
unlink(bank_data)
#stringsAsFactors = TRUE reproduces the factor columns shown below
#(R >= 4.0 reads character columns as character by default)
bank.df <- read.table("bank-full.csv", sep = ";", header = TRUE, stringsAsFactors = TRUE)
str(bank.df)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
## $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
#The data relate to direct marketing campaigns of a Portuguese banking
#institution. The campaigns were conducted by phone; often more than one
#contact with the same client was required to assess whether the client
#would subscribe ('yes') or not ('no') to a bank term deposit.
#2. Clean your data set and perform exploratory data analysis
any(is.na(bank.df))
## [1] FALSE
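#is.na() finds nothing because this dataset encodes missing information as
#the factor level "unknown" rather than NA (see job, education, contact and
#poutcome in str() above). A minimal sketch to tally those entries:
sapply(bank.df[sapply(bank.df, is.factor)],
       function(col) sum(col == "unknown"))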
#Load ggplot2 for exploratory data analysis
library(ggplot2)
#Boxplots of the decision (y) by age, with points coloured by marital status.
#Older married clients mostly say 'no', while older divorced clients are
#relatively more likely to say 'yes'.
ggplot(bank.df, aes(y, age)) + geom_boxplot() +
  geom_point(aes(color = marital), position = position_dodge(width = 0.5))

#Boxplots of the decision by balance, with points coloured by job profile.
#Management clients with higher balances tend to say 'no' to the term
#deposit, while management, technician and self-employed clients also
#appear among the 'yes' responses.
ggplot(bank.df, aes(y, balance)) + geom_boxplot() +
  geom_point(aes(color = job), position = position_dodge(width = 0.5))

#Boxplots of the decision by call duration, with points coloured by contact
#type. Duration is clearly higher for 'yes': the longer the call, the more
#likely the client subscribed. Telephone contacts appear more often among
#the 'yes' responses.
ggplot(bank.df, aes(y, duration)) + geom_boxplot() +
  geom_point(aes(color = contact), position = position_dodge(width = 0.5))

#Boxplots of the decision by campaign (the number of contacts made with this
#client during the current campaign). Clients who ended up saying 'no' were
#contacted more often than those who said 'yes'.
ggplot(bank.df, aes(y, campaign)) + geom_boxplot() +
  geom_point(aes(color = campaign), position = position_dodge(width = 0.5))

#Boxplots of the decision by previous (the number of contacts made with this
#client before the current campaign), coloured by the previous outcome.
#There is a slight increase in 'yes' responses even where the previous
#campaign was a failure.
ggplot(bank.df, aes(y, previous)) + geom_boxplot() +
  geom_point(aes(color = poutcome), position = position_dodge(width = 0.5))

#3. Create research question(s).
#Problem statement: the classification goal is to predict whether a client
#will subscribe to a term deposit (yes/no) based on age, balance, duration,
#campaign and previous contacts.
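#Before modelling, a quick numeric check of the research question: the
#median of each candidate predictor by outcome (a minimal base-R sketch).
aggregate(cbind(age, balance, duration, campaign, previous) ~ y,
          data = bank.df, FUN = median)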
#4. Apply techniques
#Split the dataset into training (80%) and validation (20%) sets
library(caret)
## Loading required package: lattice
set.seed(7)  #make the 80/20 split reproducible
validation_index <- createDataPartition(bank.df$y, p = 0.80, list = FALSE)
#select 20% of the data for validation
validation <- bank.df[-validation_index,]
#use the remaining 80% for training and testing the models
bank.dataset <- bank.df[validation_index,]
#list the type of each attribute
sapply(bank.dataset,class)
## age job marital education default balance housing
## "integer" "factor" "factor" "factor" "factor" "integer" "factor"
## loan contact day month duration campaign pdays
## "factor" "factor" "integer" "factor" "integer" "integer" "integer"
## previous poutcome y
## "integer" "factor" "factor"
#list the levels of the class
levels(bank.dataset$y)
## [1] "no" "yes"
#summarize the class distribution; note the strong imbalance (~88% 'no')
percentage <- prop.table(table(bank.dataset$y)) * 100
cbind(freq=table(bank.dataset$y),percentage=percentage)
## freq percentage
## no 31938 88.2997
## yes 4232 11.7003
#summarize attribute distribution
summary(bank.dataset)
## age job marital education
## Min. :18.00 blue-collar:7807 divorced: 4195 primary : 5530
## 1st Qu.:33.00 management :7626 married :21710 secondary:18504
## Median :39.00 technician :6022 single :10265 tertiary :10647
## Mean :40.93 admin. :4106 unknown : 1489
## 3rd Qu.:48.00 services :3348
## Max. :95.00 retired :1822
## (Other) :5439
## default balance housing loan contact
## no :35519 Min. :-8019.0 no :16064 no :30379 cellular :23484
## yes: 651 1st Qu.: 74.0 yes:20106 yes: 5791 telephone: 2340
## Median : 451.5 unknown :10346
## Mean : 1358.2
## 3rd Qu.: 1431.0
## Max. :81204.0
##
## day month duration campaign
## Min. : 1.00 may :11014 Min. : 0.0 Min. : 1.000
## 1st Qu.: 8.00 jul : 5509 1st Qu.: 103.0 1st Qu.: 1.000
## Median :16.00 aug : 4990 Median : 181.0 Median : 2.000
## Mean :15.81 jun : 4253 Mean : 258.5 Mean : 2.763
## 3rd Qu.:21.00 nov : 3243 3rd Qu.: 320.0 3rd Qu.: 3.000
## Max. :31.00 apr : 2374 Max. :3881.0 Max. :63.000
## (Other): 4787
## pdays previous poutcome y
## Min. : -1.00 Min. : 0.0000 failure: 3939 no :31938
## 1st Qu.: -1.00 1st Qu.: 0.0000 other : 1464 yes: 4232
## Median : -1.00 Median : 0.0000 success: 1235
## Mean : 40.58 Mean : 0.5711 unknown:29532
## 3rd Qu.: -1.00 3rd Qu.: 0.0000
## Max. :871.00 Max. :58.0000
##
#run the algorithms using 10-fold cross-validation
control <- trainControl(method = 'cv',number = 10)
metric <- 'Accuracy'
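#With the ~88/12 class imbalance seen above, accuracy mostly reflects the
#majority class. An optional alternative setup (an assumption, not part of
#the original analysis and not used in the fits below; control_roc and
#metric_roc are illustrative names) would optimize ROC instead:
control_roc <- trainControl(method = 'cv', number = 10,
                            classProbs = TRUE, summaryFunction = twoClassSummary)
metric_roc <- 'ROC'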
#Build three models
#Linear algorithm: Linear Discriminant Analysis (LDA)
set.seed(7)
fit.lda <- train(y ~ age + balance + duration + campaign + previous,
                 data = bank.dataset, method = 'lda',
                 metric = metric, trControl = control)
#Advanced algorithm: Random Forest
set.seed(7)
fit.rf <- train(y ~ age + balance + duration + campaign + previous,
                data = bank.dataset, method = 'rf',
                metric = metric, trControl = control)
#Nonlinear algorithm: CART
set.seed(7)
fit.part <- train(y ~ age + balance + duration + campaign + previous,
                  data = bank.dataset, method = 'rpart',
                  metric = metric, trControl = control)
#summarize the accuracy of the models
results <- resamples(list(lda=fit.lda,rf=fit.rf,part=fit.part))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, rf, part
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.8819790 0.8831214 0.8858012 0.8865912 0.8882438 0.8974288 0
## rf 0.8786622 0.8845729 0.8855248 0.8863701 0.8892576 0.8935582 0
## part 0.8844666 0.8866386 0.8888582 0.8892177 0.8909542 0.8968759 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.2326377 0.2764205 0.2881094 0.2890214 0.3048011 0.3349281 0
## rf 0.2544991 0.2717317 0.2983122 0.2933651 0.3109348 0.3336254 0
## part 0.2142689 0.2463039 0.2603344 0.2582621 0.2797856 0.2956773 0
#compare the accuracy of the models
dotplot(results)

#summarize each fitted model
print(fit.lda)
## Linear Discriminant Analysis
##
## 36170 samples
## 5 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 32553, 32552, 32553, 32553, 32553, 32552, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8865912 0.2890214
print(fit.rf)
## Random Forest
##
## 36170 samples
## 5 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 32553, 32552, 32553, 32553, 32553, 32552, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8863701 0.2933651
## 3 0.8835500 0.3011095
## 5 0.8821123 0.2991449
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
print(fit.part)
## CART
##
## 36170 samples
## 5 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 32553, 32552, 32553, 32553, 32553, 32552, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.003898866 0.8892177 0.2582621
## 0.004017013 0.8891071 0.2579565
## 0.028473535 0.8847388 0.1148863
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.003898866.
#estimate skill of LDA on the validation dataset
prediction1 <- predict(fit.lda,validation)
confusionMatrix(prediction1,validation$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7727 787
## yes 257 270
##
## Accuracy : 0.8845
## 95% CI : (0.8778, 0.891)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 0.3425
##
## Kappa : 0.2853
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9678
## Specificity : 0.2554
## Pos Pred Value : 0.9076
## Neg Pred Value : 0.5123
## Prevalence : 0.8831
## Detection Rate : 0.8547
## Detection Prevalence : 0.9417
## Balanced Accuracy : 0.6116
##
## 'Positive' Class : no
##
#estimate skill of random forest on the validation dataset
prediction2 <- predict(fit.rf,validation)
confusionMatrix(prediction2,validation$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7757 806
## yes 227 251
##
## Accuracy : 0.8857
## 95% CI : (0.879, 0.8922)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 0.2214
##
## Kappa : 0.2742
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9716
## Specificity : 0.2375
## Pos Pred Value : 0.9059
## Neg Pred Value : 0.5251
## Prevalence : 0.8831
## Detection Rate : 0.8580
## Detection Prevalence : 0.9471
## Balanced Accuracy : 0.6045
##
## 'Positive' Class : no
##
#estimate skill of rpart on the validation dataset
prediction3 <- predict(fit.part,validation)
confusionMatrix(prediction3,validation$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7805 820
## yes 179 237
##
## Accuracy : 0.8895
## 95% CI : (0.8829, 0.8959)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 0.02918
##
## Kappa : 0.2738
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.9776
## Specificity : 0.2242
## Pos Pred Value : 0.9049
## Neg Pred Value : 0.5697
## Prevalence : 0.8831
## Detection Rate : 0.8633
## Detection Prevalence : 0.9540
## Balanced Accuracy : 0.6009
##
## 'Positive' Class : no
##
#5. Analyze and interpret results
#Returning to the research question (predicting whether a client will
#subscribe to a term deposit based on age, balance, duration, campaign and
#previous contacts), I applied three supervised learning techniques: LDA
#(Linear Discriminant Analysis), Random Forest and rpart (Recursive
#Partitioning And Regression Trees).
#On the validation set, rpart gives the highest accuracy (89.0%), followed
#by Random Forest (88.6%) and LDA (88.5%), though the differences are small.
#LDA correctly predicts 7727 'no' and 270 'yes' cases, misclassifying 1044
#of the 9041 validation cases.
#Random Forest correctly predicts 7757 'no' and 251 'yes' cases,
#misclassifying 1033 cases, slightly fewer than LDA.
#rpart correctly predicts 7805 'no' and 237 'yes' cases, misclassifying 999
#cases, the fewest of the three models.
#Note, however, that all three accuracies are close to the No Information
#Rate (88.3%) and specificity is low (22-26%), so the models add only modest
#skill at identifying the clients who actually subscribe.
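#A minimal sketch to recompute those misclassification counts directly from
#the prediction objects above:
miscls <- sapply(list(lda = prediction1, rf = prediction2, rpart = prediction3),
                 function(p) sum(p != validation$y))
miscls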