title: “Machine Learning” author: “Anand” date: “March 25, 2018” output: html_document: default
Importing the libraries
library(car)
library(caret)
library(class)
library(devtools)
library(e1071)
library(ggplot2)
library(Hmisc)
library(MASS)
library(klaR)
library(nnet)
library(plyr)
library(pROC)
library(psych)
library(scatterplot3d)
library(SDMTools)
library(dplyr)
library(ElemStatLearn)
library(rpart)
library(rpart.plot)
library(randomForest)
library(neuralnet)
library(psych)
library(VIM)
library(corrplot)
library(SDMTools)
library(pROC)
library(Hmisc)
library(MASS)
library(ISLR)
library(DMwR)
library(DiscriMiner)
library(lmtest)
library(pscl)
library(Deducer)
library(glmnet)
Reading the file
Loan<-read.csv(file.choose(), header=T)
Define some dummies
Loan$car<-ifelse(Loan$Transport=="Car",1,0)
Loan$Gender<-ifelse(Loan$Gender=="Female",0,1)
Loan <- Loan[,-9]
Loan <- na.omit(Loan)
univariate analysis
summary(Loan)
## Age Gender Engineer MBA
## Min. :18.00 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:25.00 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :27.00 Median :1.0000 Median :1.0000 Median :0.0000
## Mean :27.75 Mean :0.7133 Mean :0.7562 Mean :0.2528
## 3rd Qu.:30.00 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :43.00 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Work.Exp Salary Distance license
## Min. : 0.0 Min. : 6.50 Min. : 3.20 Min. :0.0000
## 1st Qu.: 3.0 1st Qu.: 9.80 1st Qu.: 8.80 1st Qu.:0.0000
## Median : 5.0 Median :13.60 Median :11.00 Median :0.0000
## Mean : 6.3 Mean :16.24 Mean :11.33 Mean :0.2348
## 3rd Qu.: 8.0 3rd Qu.:15.75 3rd Qu.:13.45 3rd Qu.:0.0000
## Max. :24.0 Max. :57.00 Max. :23.40 Max. :1.0000
## car
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1377
## 3rd Qu.:0.0000
## Max. :1.0000
str(Loan)
## 'data.frame': 443 obs. of 9 variables:
## $ Age : int 28 23 29 28 27 26 28 26 22 27 ...
## $ Gender : num 1 0 1 0 1 1 1 0 1 1 ...
## $ Engineer: int 0 1 1 1 1 1 1 1 1 1 ...
## $ MBA : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Work.Exp: int 4 4 7 5 4 4 5 3 1 4 ...
## $ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...
## $ Distance: num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...
## $ license : int 0 0 0 0 0 1 0 0 0 0 ...
## $ car : num 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "na.action")=Class 'omit' Named int 145
## .. ..- attr(*, "names")= chr "145"
describe(Loan)
## vars n mean sd median trimmed mad min max range skew
## Age 1 443 27.75 4.42 27.0 27.29 4.45 18.0 43.0 25.0 0.95
## Gender 2 443 0.71 0.45 1.0 0.77 0.00 0.0 1.0 1.0 -0.94
## Engineer 3 443 0.76 0.43 1.0 0.82 0.00 0.0 1.0 1.0 -1.19
## MBA 4 443 0.25 0.44 0.0 0.19 0.00 0.0 1.0 1.0 1.13
## Work.Exp 5 443 6.30 5.12 5.0 5.52 4.45 0.0 24.0 24.0 1.34
## Salary 6 443 16.24 10.46 13.6 13.93 4.74 6.5 57.0 50.5 2.03
## Distance 7 443 11.33 3.61 11.0 11.13 3.56 3.2 23.4 20.2 0.53
## license 8 443 0.23 0.42 0.0 0.17 0.00 0.0 1.0 1.0 1.25
## car 9 443 0.14 0.34 0.0 0.05 0.00 0.0 1.0 1.0 2.10
## kurtosis se
## Age 0.89 0.21
## Gender -1.12 0.02
## Engineer -0.59 0.02
## MBA -0.72 0.02
## Work.Exp 1.42 0.24
## Salary 3.38 0.50
## Distance 0.16 0.17
## license -0.45 0.02
## car 2.40 0.02
Missing value Imputation
aggr(Loan,prop = F,cex.axis = 0.7,numbers=T)
Univariate analysis
par(mfrow=c(2, 2), oma=c(0,0,3,0))
boxplot(Loan$Age,main ="Age")
boxplot(Loan$Gender,main ="Gender")
boxplot(Loan$Engineer,main ="Engineer")
boxplot(Loan$MBA,main ="MBA")
boxplot(Loan$Work.Exp,main ="Work Exp")
boxplot(Loan$license,main ="License")
boxplot(Loan$Salary,main ="Salary")
boxplot(Loan$Distance,main ="Distance")
hist(Loan$Age,main ="Age", xlab=NA, ylab=NA)
hist(Loan$Gender,main ="Gender", xlab=NA, ylab=NA)
hist(Loan$Engineer,main ="Engineer", xlab=NA, ylab=NA)
hist(Loan$MBA,main ="MBA", xlab=NA, ylab=NA)
hist(Loan$Work.Exp,main ="Work Exp", xlab=NA, ylab=NA)
hist(Loan$license,main ="License", xlab=NA, ylab=NA)
hist(Loan$Salary,main ="Salary", xlab=NA, ylab=NA)
hist(Loan$Distance,main ="Distance", xlab=NA, ylab=NA)
Bivariate analysis
corx <- cor(Loan)
corrplot(corx)
Response rate
resp <- sum(Loan$car/nrow(Loan))
resp
## [1] 0.1376975
Creating factor variables
Loan$car <- as.factor(Loan$car)
Loan$Gender <- as.factor(Loan$Gender)
Loan$Engineer <- as.factor(Loan$Engineer)
Loan$MBA <- as.factor(Loan$MBA)
Loan$license <- as.factor(Loan$license)
Running K FOLD VALIDATIONS to identify which model to use
Loantrim<-Loan
set.seed(123)
folds<-createFolds(Loantrim$car,k=10)
str(folds)
## List of 10
## $ Fold01: int [1:44] 15 17 26 29 37 52 60 63 72 84 ...
## $ Fold02: int [1:44] 8 20 21 41 43 55 57 58 71 86 ...
## $ Fold03: int [1:45] 6 11 12 14 27 28 40 67 80 83 ...
## $ Fold04: int [1:44] 19 31 35 61 64 78 85 97 107 112 ...
## $ Fold05: int [1:44] 24 25 79 94 100 119 120 131 136 140 ...
## $ Fold06: int [1:45] 16 36 42 51 53 59 95 98 103 117 ...
## $ Fold07: int [1:44] 1 2 23 34 49 56 70 75 81 96 ...
## $ Fold08: int [1:45] 3 4 7 44 46 48 50 54 65 66 ...
## $ Fold09: int [1:44] 13 18 22 30 33 45 62 68 73 74 ...
## $ Fold10: int [1:44] 5 9 10 32 38 39 47 69 88 89 ...
Eq.2<- car~ Age + Gender + Engineer+MBA++Work.Exp + Salary+Distance+license
Running Logistic regression
cv_logit<-lapply(folds,function(x){
test<-Loantrim[x,]
train<-Loantrim[-x,]
Logit.1<-glm(Eq.2, train, family = binomial)
pred.1 <- predict.glm(Logit.1, newdata=test, type="response")
actual<-test$car
tab.logit.1<-confusion.matrix(actual,pred.1,threshold = 0.5)
sum(diag(tab.logit.1))/sum(tab.logit.1)
})
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
str(cv_logit)
## List of 10
## $ Fold01: num 0.977
## $ Fold02: num 0.977
## $ Fold03: num 0.978
## $ Fold04: num 0.909
## $ Fold05: num 0.909
## $ Fold06: num 0.978
## $ Fold07: num 0.886
## $ Fold08: num 0.956
## $ Fold09: num 0.955
## $ Fold10: num 1
fit.logit<-mean(unlist(cv_logit))
fit.logit
## [1] 0.9524747
The accuracy is 95.2%
Running NB
cv_NB<-lapply(folds,function(x){
train.NB.kval<-Loantrim[x,]
test.NB.kval<-Loantrim[x,]
NB.kval<-naiveBayes(x=train.NB.kval[-1], y=train.NB.kval$car)
y_pred.NB.kval<-predict( NB.kval,newdata=test.NB.kval[-1])
cm.NB.kval=table(test.NB.kval[,1],y_pred.NB.kval)
sum(diag(cm.NB.kval))/sum(cm.NB.kval)
})
str(cv_NB)
## List of 10
## $ Fold01: num 0.0227
## $ Fold02: num 0.0227
## $ Fold03: num 0.0222
## $ Fold04: num 0.0227
## $ Fold05: num 0.0227
## $ Fold06: num 0.0444
## $ Fold07: num 0.0227
## $ Fold08: num 0.0222
## $ Fold09: num 0.0227
## $ Fold10: num 0.0455
fit.NB<-mean(unlist(cv_NB))
fit.NB
## [1] 0.02707071
The accuracy is 2.7%
Running Linear Discriminannt analysis
cv_LDA<-lapply(folds,function(x){
train<-Loantrim[-x,]
test<-Loantrim[x,]
lda_1<-lda(Eq.2 , train)
lda1.pred<-predict(lda_1, newdata=test)
ldapredclass<-lda1.pred$class
tab.LDA<-table(ldapredclass,test$car)
sum(diag(tab.LDA))/sum(tab.LDA)
})
str(cv_LDA)
## List of 10
## $ Fold01: num 1
## $ Fold02: num 0.909
## $ Fold03: num 0.978
## $ Fold04: num 0.909
## $ Fold05: num 0.932
## $ Fold06: num 0.956
## $ Fold07: num 0.886
## $ Fold08: num 0.978
## $ Fold09: num 0.932
## $ Fold10: num 0.977
fit.LDA<-mean(unlist(cv_LDA))
fit.LDA
## [1] 0.9456566
The accuracy is 94.5%
Running Decision Tree
cv_DT<-lapply(folds,function(x){
train<-Loantrim[-x,]
test<-Loantrim[x,]
DT<-rpart(Eq.2, method="class",train)
pred = predict(DT, type="class",newdata=test)
tabDT<-table( pred,test$car)
sum(diag(tabDT))/sum(tabDT)
})
str(cv_DT)
## List of 10
## $ Fold01: num 0.977
## $ Fold02: num 0.955
## $ Fold03: num 0.978
## $ Fold04: num 0.955
## $ Fold05: num 0.955
## $ Fold06: num 1
## $ Fold07: num 0.955
## $ Fold08: num 0.978
## $ Fold09: num 0.909
## $ Fold10: num 1
fit.DT<-mean(unlist(cv_DT))
fit.DT
## [1] 0.9660101
The accuracy is 96.6%
By running the algorithms we found that decision tree's algorithm accuracy is the highest with 96.6 % but with decision tree we cannot explain variable importance as our problem statement is to build a model that explains best the employee's decision to use car as means of transport.
Hence going for the second highest accuracy of 95.5% from logistic algorithm.
Splitting the data into 70 and 30 %
index1 <- createDataPartition(Loan$car, p=0.70, list=FALSE)
train1 <- Loan[ index1,]
test1 <- Loan[-index1,]
Logit.final <- glm(Eq.2 , train1, family = binomial)
summary(Logit.final)
##
## Call:
## glm(formula = Eq.2, family = binomial, data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.81084 -0.04406 -0.00947 -0.00109 2.06335
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -67.25779 17.48674 -3.846 0.000120 ***
## Age 2.18565 0.62155 3.516 0.000437 ***
## Gender1 -1.96151 1.09489 -1.792 0.073213 .
## Engineer1 0.29510 0.99855 0.296 0.767593
## MBA1 -1.29675 0.94434 -1.373 0.169694
## Work.Exp -1.20176 0.47987 -2.504 0.012268 *
## Salary 0.16250 0.08998 1.806 0.070925 .
## Distance 0.43426 0.15118 2.873 0.004071 **
## license1 3.21791 1.10470 2.913 0.003581 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 249.919 on 310 degrees of freedom
## Residual deviance: 47.418 on 302 degrees of freedom
## AIC: 65.418
##
## Number of Fisher Scoring iterations: 9
vif(Logit.final)
## Age Gender Engineer MBA Work.Exp Salary Distance
## 15.457249 1.705767 1.144465 1.378596 23.483694 5.074497 1.399514
## license
## 2.200031
varImp(Logit.final)
## Overall
## Age 3.5164654
## Gender1 1.7915017
## Engineer1 0.2955243
## MBA1 1.3731875
## Work.Exp 2.5043291
## Salary 1.8059558
## Distance 2.8725700
## license1 2.9129268
pred.logit.final <- predict.glm(Logit.final, newdata=test1, type="response")
#Classification
tab.logit<-table(test1$car,pred.logit.final>0.5)
tab.logit
##
## FALSE TRUE
## 0 114 0
## 1 4 14
#Logit
accuracy.logit<-sum(diag(tab.logit))/sum(tab.logit)
accuracy.logit
## [1] 0.969697
The VIF shows that there is multicollinearity occurs, ass the variables Age and Work.Exp shows high value Need to keep one variable alone as one variable influence the other, as Age increases Work.exp will increases, henceforth removing work.exp variable.
Creating a model by removing Work.Exp variable
Eq.2<- car~ Age + Gender + Engineer+MBA+Salary+Distance+license
Logit.final <- glm(Eq.2 , train1, family = binomial)
summary(Logit.final)
##
## Call:
## glm(formula = Eq.2, family = binomial, data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.00573 -0.08627 -0.03033 -0.00734 2.44071
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -33.85506 6.87762 -4.922 8.54e-07 ***
## Age 0.88463 0.19602 4.513 6.39e-06 ***
## Gender1 -1.39271 0.98564 -1.413 0.15766
## Engineer1 0.50772 0.91259 0.556 0.57797
## MBA1 -1.42950 0.89793 -1.592 0.11139
## Salary -0.03100 0.04805 -0.645 0.51883
## Distance 0.40259 0.13308 3.025 0.00249 **
## license1 2.89334 0.96426 3.001 0.00269 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 249.919 on 310 degrees of freedom
## Residual deviance: 55.444 on 303 degrees of freedom
## AIC: 71.444
##
## Number of Fisher Scoring iterations: 8
vif(Logit.final)
## Age Gender Engineer MBA Salary Distance license
## 1.869234 1.531864 1.118493 1.307975 1.635326 1.331920 1.877613
varImp(Logit.final)
## Overall
## Age 4.5130470
## Gender1 1.4129930
## Engineer1 0.5563559
## MBA1 1.5919940
## Salary 0.6451503
## Distance 3.0250439
## license1 3.0005762
pred.logit.final <- predict.glm(Logit.final, newdata=test1, type="response")
#Classification
tab.logit<-table(test1$car,pred.logit.final>0.5)
tab.logit
##
## FALSE TRUE
## 0 111 3
## 1 2 16
#Logit
accuracy.logit<-sum(diag(tab.logit))/sum(tab.logit)
accuracy.logit
## [1] 0.9621212
Now the accuracy is 96.2%
Due to unbalanced dataset the model is not predicting 1's accurately, hence using SMOTE technique to over sample the data.
train_SMOTE<-Loan
table(train_SMOTE$car)
##
## 0 1
## 382 61
trainSplit <- SMOTE(Eq.2, train_SMOTE, perc.over = 200, perc.under=300)
index1 <- createDataPartition(trainSplit$car, p=0.70, list=FALSE)
train1 <- Loan[ index1,]
test1 <- Loan[-index1,]
table(trainSplit$car)
##
## 0 1
## 366 183
table(test1$car)
##
## 0 1
## 116 19
Running Logistic regression after using SMOTE technique
Logit.final1 <- glm(Eq.2 , train1, family = binomial)
vif(Logit.final1)
## Age Gender Engineer MBA Salary Distance license
## 2.432835 1.113273 1.251640 1.635857 1.633143 1.349120 1.266904
varImp(Logit.final1)
## Overall
## Age 3.9380132
## Gender1 0.6165907
## Engineer1 1.1049803
## MBA1 2.4994597
## Salary 0.9782194
## Distance 2.7695832
## license1 2.5669102
pred.logit.final1 <- predict.glm(Logit.final1, newdata=test1, type="response")
#Classification
tab.logit1<-table(test1$car,pred.logit.final1>0.5)
tab.logit1
##
## FALSE TRUE
## 0 113 3
## 1 5 14
#Logit
accuracy.logit1<-sum(diag(tab.logit))/sum(tab.logit)
accuracy.logit1
## [1] 0.9621212
Loglikelihood Test
lrtest(Logit.final1)
## Likelihood ratio test
##
## Model 1: car ~ Age + Gender + Engineer + MBA + Salary + Distance + license
## Model 2: car ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 8 -20.513
## 2 1 -122.679 -7 204.33 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Overall p-value less than 0.05, hence the overall the model is valid
McFadden R2
pR2(Logit.final1)
## llh llhNull G2 McFadden r2ML
## -20.5128951 -122.6785911 204.3313918 0.8327916 0.4849100
## r2CU
## 0.8830238
The McFadden R2 is 0.75 which means that 75% of the uncertainity produced by the intercept model has been explained by the full model. If we use only intercept model there is uncertainity in prediction but if we use the full model it will capture 85% of the uncertainities produced by the uncertainity. Thus goodness of fit is robust.
summary(Logit.final1)
##
## Call:
## glm(formula = Eq.2, family = binomial, data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.08978 -0.04155 -0.00776 -0.00140 2.88964
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -46.73020 11.09142 -4.213 2.52e-05 ***
## Age 1.25231 0.31801 3.938 8.22e-05 ***
## Gender1 -0.63529 1.03033 -0.617 0.53750
## Engineer1 1.49455 1.35256 1.105 0.26917
## MBA1 -2.75917 1.10391 -2.499 0.01244 *
## Salary -0.05321 0.05439 -0.978 0.32797
## Distance 0.46143 0.16661 2.770 0.00561 **
## license1 2.33907 0.91124 2.567 0.01026 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 245.357 on 307 degrees of freedom
## Residual deviance: 41.026 on 300 degrees of freedom
## (78 observations deleted due to missingness)
## AIC: 57.026
##
## Number of Fisher Scoring iterations: 9
Gender1,salary,Engineer and MBA doesnot influence the employee's decision to use car as main means of transport. Therefore our focus to be on the below variables Age Distance License
rocplot(Logit.final1)
PROBLEM STATEMENT 2
Running Support vector Machines to identify the means of transport for the test data
Reading the file
Loan<-read.csv(file.choose(), header=T)
Loan <- na.omit(Loan)
svm.2<-svm(Loan$Transport~., data=Loan[,-9], kernel="linear")
summary(svm.2)
##
## Call:
## svm(formula = Loan$Transport ~ ., data = Loan[, -9], kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
## gamma: 0.1111111
##
## Number of Support Vectors: 199
##
## ( 95 83 21 )
##
##
## Number of Classes: 3
##
## Levels:
## 2Wheeler Car Public Transport
plot(svm.2,data=Loan, Age~Salary)
plot(svm.2,data=Loan, Age~Work.Exp)
plot(svm.2,data=Loan,Age ~ license)
test<-read.csv(file.choose(), header=T)
test <- test[,-9]
str(test)
## 'data.frame': 2 obs. of 8 variables:
## $ Age : int 25 25
## $ Gender : int 0 1
## $ Engineer: int 0 1
## $ MBA : int 0 0
## $ Work.Exp: int 4 4
## $ Salary : num 14.3 8.3
## $ Distance: num 3.2 3.3
## $ license : int 1 0
test$Gender <- as.factor(test$Gender)
test$Engineer <- as.factor(test$Engineer)
test$MBA <- as.numeric(test$MBA)
test$license <- as.factor(test$license)
y_pred.svm<-predict(svm.2,newdata = test[-9])
test$Transport <- y_pred.svm
test
## Age Gender Engineer MBA Work.Exp Salary Distance license
## 1 25 0 0 0 4 14.3 3.2 1
## 2 25 1 1 0 4 8.3 3.3 0
## Transport
## 1 Public Transport
## 2 Public Transport
Running KNN Algorithm
Reading the file
Loan<-read.csv(file.choose(), header=T)
Loan <- na.omit(Loan)
sum(is.na(Loan))
## [1] 0
Reading the test data
test1<-read.csv(file.choose(), header=T)
converting to factor variables
Loan$Gender <- as.numeric(Loan$Gender)
test1$Gender <- as.numeric(test1$Gender)
test1$Engineer <- as.numeric(test1$Engineer)
test1$MBA <- as.numeric(test1$MBA)
test1$license <- as.numeric(test1$license)
#knn3
y_pred.4<-knn(train=Loan[,-9],test=test1[,-9], cl=Loan[,9],k=3)
test1$Transport <- y_pred.4
test1
## Age Gender Engineer MBA Work.Exp Salary Distance license
## 1 25 0 0 0 4 14.3 3.2 1
## 2 25 1 1 0 4 8.3 3.3 0
## Transport
## 1 Public Transport
## 2 Public Transport