#Download the UCI bank marketing dataset, unzip it, and read it into bank.df
bank_data <- tempfile()
download.file("http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip", bank_data, mode = "wb")
unzip(bank_data, "bank-full.csv")
unlink(bank_data)
#stringsAsFactors = TRUE reproduces the factor columns shown below
#(R >= 4.0 reads character columns as character by default)
bank.df <- read.table("bank-full.csv", sep = ";", header = TRUE, stringsAsFactors = TRUE)
str(bank.df)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
## $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
#The data relate to direct marketing campaigns of a Portuguese banking
#institution. The campaigns were conducted by phone; often more than one
#contact with the same client was required to assess whether the client
#would subscribe ('yes') or not ('no') to a bank term deposit.
#2. Clean your data set and perform exploratory data analysis
any(is.na(bank.df))
## [1] FALSE
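#is.na() finds nothing because this dataset encodes missing information as
#the factor level "unknown" rather than NA (see job, education, contact and
#poutcome in str() above). A minimal sketch to tally those entries:
sapply(bank.df[sapply(bank.df, is.factor)],
       function(col) sum(col == "unknown"))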
#Load ggplot2 for exploratory data analysis
library(ggplot2)
#Boxplots of the decision (y) by age, with points coloured by marital status.
#Older married clients mostly say 'no', while older divorced clients are
#relatively more likely to say 'yes'.
ggplot(bank.df, aes(y, age)) + geom_boxplot() +
  geom_point(aes(color = marital), position = position_dodge(width = 0.5))

#Boxplots of the decision by balance, with points coloured by job profile.
#Management clients with higher balances tend to say 'no' to the term
#deposit, while management, technician and self-employed clients also
#appear among the 'yes' responses.
ggplot(bank.df, aes(y, balance)) + geom_boxplot() +
  geom_point(aes(color = job), position = position_dodge(width = 0.5))

#Boxplots of the decision by call duration, with points coloured by contact
#type. Duration is clearly higher for 'yes': the longer the call, the more
#likely the client subscribed. Telephone contacts appear more often among
#the 'yes' responses.
ggplot(bank.df, aes(y, duration)) + geom_boxplot() +
  geom_point(aes(color = contact), position = position_dodge(width = 0.5))

#Boxplots of the decision by campaign (the number of contacts made with this
#client during the current campaign). Clients who ended up saying 'no' were
#contacted more often than those who said 'yes'.
ggplot(bank.df, aes(y, campaign)) + geom_boxplot() +
  geom_point(aes(color = campaign), position = position_dodge(width = 0.5))

#Boxplots of the decision by previous (the number of contacts made with this
#client before the current campaign), coloured by the previous outcome.
#There is a slight increase in 'yes' responses even where the previous
#campaign was a failure.
ggplot(bank.df, aes(y, previous)) + geom_boxplot() +
  geom_point(aes(color = poutcome), position = position_dodge(width = 0.5))

#3. Create research question(s).
#Problem statement: the classification goal is to predict whether a client
#will subscribe to a term deposit (yes/no) based on age, balance, duration,
#campaign and previous contacts.
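#Before modelling, a quick numeric check of the research question: the
#median of each candidate predictor by outcome (a minimal base-R sketch).
aggregate(cbind(age, balance, duration, campaign, previous) ~ y,
          data = bank.df, FUN = median)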
#4. Apply techniques
#Split the dataset into training (80%) and validation (20%) sets
library(caret)
## Loading required package: lattice
set.seed(7)  #make the 80/20 split reproducible
validation_index <- createDataPartition(bank.df$y, p = 0.80, list = FALSE)
#select 20% of the data for validation
validation <- bank.df[-validation_index,]
#use the remaining 80% for training and testing the models
bank.dataset <- bank.df[validation_index,]
#list the type of each attribute
sapply(bank.dataset,class)
## age job marital education default balance housing
## "integer" "factor" "factor" "factor" "factor" "integer" "factor"
## loan contact day month duration campaign pdays
## "factor" "factor" "integer" "factor" "integer" "integer" "integer"
## previous poutcome y
## "integer" "factor" "factor"
#list the levels of the class
levels(bank.dataset$y)
## [1] "no" "yes"
#summarize the class distribution; note the strong imbalance (~88% 'no')
percentage <- prop.table(table(bank.dataset$y)) * 100
cbind(freq=table(bank.dataset$y),percentage=percentage)
## freq percentage
## no 31938 88.2997
## yes 4232 11.7003
#summarize attribute distribution
summary(bank.dataset)
## age job marital education
## Min. :18.00 blue-collar:7807 divorced: 4195 primary : 5530
## 1st Qu.:33.00 management :7626 married :21710 secondary:18504
## Median :39.00 technician :6022 single :10265 tertiary :10647
## Mean :40.93 admin. :4106 unknown : 1489
## 3rd Qu.:48.00 services :3348
## Max. :95.00 retired :1822
## (Other) :5439
## default balance housing loan contact
## no :35519 Min. :-8019.0 no :16064 no :30379 cellular :23484
## yes: 651 1st Qu.: 74.0 yes:20106 yes: 5791 telephone: 2340
## Median : 451.5 unknown :10346
## Mean : 1358.2
## 3rd Qu.: 1431.0
## Max. :81204.0
##
## day month duration campaign
## Min. : 1.00 may :11014 Min. : 0.0 Min. : 1.000
## 1st Qu.: 8.00 jul : 5509 1st Qu.: 103.0 1st Qu.: 1.000
## Median :16.00 aug : 4990 Median : 181.0 Median : 2.000
## Mean :15.81 jun : 4253 Mean : 258.5 Mean : 2.763
## 3rd Qu.:21.00 nov : 3243 3rd Qu.: 320.0 3rd Qu.: 3.000
## Max. :31.00 apr : 2374 Max. :3881.0 Max. :63.000
## (Other): 4787
## pdays previous poutcome y
## Min. : -1.00 Min. : 0.0000 failure: 3939 no :31938
## 1st Qu.: -1.00 1st Qu.: 0.0000 other : 1464 yes: 4232
## Median : -1.00 Median : 0.0000 success: 1235
## Mean : 40.58 Mean : 0.5711 unknown:29532
## 3rd Qu.: -1.00 3rd Qu.: 0.0000
## Max. :871.00 Max. :58.0000
##
#run the algorithms using 10-fold cross-validation
control <- trainControl(method = 'cv',number = 10)
metric <- 'Accuracy'
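#With the ~88/12 class imbalance seen above, accuracy mostly reflects the
#majority class. An optional alternative setup (an assumption, not part of
#the original analysis and not used in the fits below; control_roc and
#metric_roc are illustrative names) would optimize ROC instead:
control_roc <- trainControl(method = 'cv', number = 10,
                            classProbs = TRUE, summaryFunction = twoClassSummary)
metric_roc <- 'ROC'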
#Build three models
#Linear algorithm: Linear Discriminant Analysis (LDA)
set.seed(7)
fit.lda <- train(y ~ age + balance + duration + campaign + previous,
                 data = bank.dataset, method = 'lda',
                 metric = metric, trControl = control)
#Advanced algorithm: Random Forest
set.seed(7)
fit.rf <- train(y ~ age + balance + duration + campaign + previous,
                data = bank.dataset, method = 'rf',
                metric = metric, trControl = control)
#Nonlinear algorithm: CART
set.seed(7)
fit.part <- train(y ~ age + balance + duration + campaign + previous,
                  data = bank.dataset, method = 'rpart',
                  metric = metric, trControl = control)
#summarize the accuracy of the models
results <- resamples(list(lda=fit.lda,rf=fit.rf,part=fit.part))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, rf, part
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.8819790 0.8831214 0.8858012 0.8865912 0.8882438 0.8974288 0
## rf 0.8786622 0.8845729 0.8855248 0.8863701 0.8892576 0.8935582 0
## part 0.8844666 0.8866386 0.8888582 0.8892177 0.8909542 0.8968759 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.2326377 0.2764205 0.2881094 0.2890214 0.3048011 0.3349281 0
## rf 0.2544991 0.2717317 0.2983122 0.2933651 0.3109348 0.3336254 0
## part 0.2142689 0.2463039 0.2603344 0.2582621 0.2797856 0.2956773 0
#compare the accuracy of the models
dotplot(results)

#summarize each fitted model
print(fit.lda)
## Linear Discriminant Analysis
##
## 36170 samples
## 5 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 32553, 32552, 32553, 32553, 32553, 32552, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8865912 0.2890214
print(fit.rf)
## Random Forest
##
## 36170 samples
## 5 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 32553, 32552, 32553, 32553, 32553, 32552, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8863701 0.2933651
## 3 0.8835500 0.3011095
## 5 0.8821123 0.2991449
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
print(fit.part)
## CART
##
## 36170 samples
## 5 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 32553, 32552, 32553, 32553, 32553, 32552, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.003898866 0.8892177 0.2582621
## 0.004017013 0.8891071 0.2579565
## 0.028473535 0.8847388 0.1148863
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.003898866.
#estimate skill of LDA on the validation dataset
prediction1 <- predict(fit.lda,validation)
confusionMatrix(prediction1,validation$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7727 787
## yes 257 270
##
## Accuracy : 0.8845
## 95% CI : (0.8778, 0.891)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 0.3425
##
## Kappa : 0.2853
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9678
## Specificity : 0.2554
## Pos Pred Value : 0.9076
## Neg Pred Value : 0.5123
## Prevalence : 0.8831
## Detection Rate : 0.8547
## Detection Prevalence : 0.9417
## Balanced Accuracy : 0.6116
##
## 'Positive' Class : no
##
#estimate skill of random forest on the validation dataset
prediction2 <- predict(fit.rf,validation)
confusionMatrix(prediction2,validation$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7757 806
## yes 227 251
##
## Accuracy : 0.8857
## 95% CI : (0.879, 0.8922)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 0.2214
##
## Kappa : 0.2742
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9716
## Specificity : 0.2375
## Pos Pred Value : 0.9059
## Neg Pred Value : 0.5251
## Prevalence : 0.8831
## Detection Rate : 0.8580
## Detection Prevalence : 0.9471
## Balanced Accuracy : 0.6045
##
## 'Positive' Class : no
##
#estimate skill of rpart on the validation dataset
prediction3 <- predict(fit.part,validation)
confusionMatrix(prediction3,validation$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7805 820
## yes 179 237
##
## Accuracy : 0.8895
## 95% CI : (0.8829, 0.8959)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 0.02918
##
## Kappa : 0.2738
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.9776
## Specificity : 0.2242
## Pos Pred Value : 0.9049
## Neg Pred Value : 0.5697
## Prevalence : 0.8831
## Detection Rate : 0.8633
## Detection Prevalence : 0.9540
## Balanced Accuracy : 0.6009
##
## 'Positive' Class : no
##
#5. Analyze and interpret results
#Returning to the research question (predicting whether a client will
#subscribe to a term deposit based on age, balance, duration, campaign and
#previous contacts), I applied three supervised learning techniques: LDA
#(Linear Discriminant Analysis), Random Forest and rpart (Recursive
#Partitioning And Regression Trees).
#On the validation set, rpart gives the highest accuracy (89.0%), followed
#by Random Forest (88.6%) and LDA (88.5%), though the differences are small.
#LDA correctly predicts 7727 'no' and 270 'yes' cases, misclassifying 1044
#of the 9041 validation cases.
#Random Forest correctly predicts 7757 'no' and 251 'yes' cases,
#misclassifying 1033 cases, slightly fewer than LDA.
#rpart correctly predicts 7805 'no' and 237 'yes' cases, misclassifying 999
#cases, the fewest of the three models.
#Note, however, that all three accuracies are close to the No Information
#Rate (88.3%) and specificity is low (22-26%), so the models add only modest
#skill at identifying the clients who actually subscribe.
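#A minimal sketch to recompute those misclassification counts directly from
#the prediction objects above:
miscls <- sapply(list(lda = prediction1, rf = prediction2, rpart = prediction3),
                 function(p) sum(p != validation$y))
miscls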