word_document: default

title: “Machine Learning” author: “Anand” date: “March 25, 2018” output: html_document: default

word_document: default

Importing the libraries

library(car)
library(caret)
library(class)
library(devtools)
library(e1071)
library(ggplot2)
library(Hmisc)
library(MASS)
library(klaR)
library(nnet)
library(plyr)
library(pROC)
library(psych)
library(scatterplot3d)
library(SDMTools)
library(dplyr)
library(ElemStatLearn)
library(rpart)
library(rpart.plot)
library(randomForest)
library(neuralnet)
library(psych)
library(VIM)
library(corrplot)
library(SDMTools)
library(pROC)
library(Hmisc)
library(MASS)
library(ISLR)
library(DMwR)
library(DiscriMiner)
library(lmtest)
library(pscl)
library(Deducer)
library(glmnet)

Reading the file

Loan<-read.csv(file.choose(), header=T)

Define some dummies

Loan$car<-ifelse(Loan$Transport=="Car",1,0)
Loan$Gender<-ifelse(Loan$Gender=="Female",0,1)
Loan <- Loan[,-9]
Loan <- na.omit(Loan)

univariate analysis

summary(Loan)

##       Age            Gender          Engineer           MBA        
##  Min.   :18.00   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:25.00   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :27.00   Median :1.0000   Median :1.0000   Median :0.0000  
##  Mean   :27.75   Mean   :0.7133   Mean   :0.7562   Mean   :0.2528  
##  3rd Qu.:30.00   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :43.00   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     Work.Exp        Salary         Distance        license      
##  Min.   : 0.0   Min.   : 6.50   Min.   : 3.20   Min.   :0.0000  
##  1st Qu.: 3.0   1st Qu.: 9.80   1st Qu.: 8.80   1st Qu.:0.0000  
##  Median : 5.0   Median :13.60   Median :11.00   Median :0.0000  
##  Mean   : 6.3   Mean   :16.24   Mean   :11.33   Mean   :0.2348  
##  3rd Qu.: 8.0   3rd Qu.:15.75   3rd Qu.:13.45   3rd Qu.:0.0000  
##  Max.   :24.0   Max.   :57.00   Max.   :23.40   Max.   :1.0000  
##       car        
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.1377  
##  3rd Qu.:0.0000  
##  Max.   :1.0000

str(Loan)

## 'data.frame':    443 obs. of  9 variables:
##  $ Age     : int  28 23 29 28 27 26 28 26 22 27 ...
##  $ Gender  : num  1 0 1 0 1 1 1 0 1 1 ...
##  $ Engineer: int  0 1 1 1 1 1 1 1 1 1 ...
##  $ MBA     : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ Work.Exp: int  4 4 7 5 4 4 5 3 1 4 ...
##  $ Salary  : num  14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...
##  $ Distance: num  3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...
##  $ license : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ car     : num  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "na.action")=Class 'omit'  Named int 145
##   .. ..- attr(*, "names")= chr "145"

describe(Loan)

##          vars   n  mean    sd median trimmed  mad  min  max range  skew
## Age         1 443 27.75  4.42   27.0   27.29 4.45 18.0 43.0  25.0  0.95
## Gender      2 443  0.71  0.45    1.0    0.77 0.00  0.0  1.0   1.0 -0.94
## Engineer    3 443  0.76  0.43    1.0    0.82 0.00  0.0  1.0   1.0 -1.19
## MBA         4 443  0.25  0.44    0.0    0.19 0.00  0.0  1.0   1.0  1.13
## Work.Exp    5 443  6.30  5.12    5.0    5.52 4.45  0.0 24.0  24.0  1.34
## Salary      6 443 16.24 10.46   13.6   13.93 4.74  6.5 57.0  50.5  2.03
## Distance    7 443 11.33  3.61   11.0   11.13 3.56  3.2 23.4  20.2  0.53
## license     8 443  0.23  0.42    0.0    0.17 0.00  0.0  1.0   1.0  1.25
## car         9 443  0.14  0.34    0.0    0.05 0.00  0.0  1.0   1.0  2.10
##          kurtosis   se
## Age          0.89 0.21
## Gender      -1.12 0.02
## Engineer    -0.59 0.02
## MBA         -0.72 0.02
## Work.Exp     1.42 0.24
## Salary       3.38 0.50
## Distance     0.16 0.17
## license     -0.45 0.02
## car          2.40 0.02

Missing value Imputation

aggr(Loan,prop = F,cex.axis = 0.7,numbers=T)

plot of chunk unnamed-chunk-5

Univariate analysis

par(mfrow=c(2, 2), oma=c(0,0,3,0))
boxplot(Loan$Age,main ="Age")
boxplot(Loan$Gender,main ="Gender")
boxplot(Loan$Engineer,main ="Engineer")
boxplot(Loan$MBA,main ="MBA")

plot of chunk unnamed-chunk-6

boxplot(Loan$Work.Exp,main ="Work Exp")
boxplot(Loan$license,main ="License")
boxplot(Loan$Salary,main ="Salary")
boxplot(Loan$Distance,main ="Distance")

plot of chunk unnamed-chunk-6

hist(Loan$Age,main ="Age", xlab=NA, ylab=NA)
hist(Loan$Gender,main ="Gender", xlab=NA, ylab=NA)
hist(Loan$Engineer,main ="Engineer", xlab=NA, ylab=NA)
hist(Loan$MBA,main ="MBA", xlab=NA, ylab=NA)

plot of chunk unnamed-chunk-6

hist(Loan$Work.Exp,main ="Work Exp", xlab=NA, ylab=NA)
hist(Loan$license,main ="License", xlab=NA, ylab=NA)
hist(Loan$Salary,main ="Salary", xlab=NA, ylab=NA)
hist(Loan$Distance,main ="Distance", xlab=NA, ylab=NA)

plot of chunk unnamed-chunk-6

Bivariate analysis

corx <- cor(Loan)
corrplot(corx)

plot of chunk unnamed-chunk-7

Response rate

resp <- sum(Loan$car/nrow(Loan))
resp

## [1] 0.1376975

Creating factor variables

Loan$car <- as.factor(Loan$car)
Loan$Gender <- as.factor(Loan$Gender)
Loan$Engineer <- as.factor(Loan$Engineer)
Loan$MBA <- as.factor(Loan$MBA)
Loan$license <- as.factor(Loan$license)

Running K FOLD VALIDATIONS to identify which model to use

Loantrim<-Loan
set.seed(123)
folds<-createFolds(Loantrim$car,k=10)
str(folds)

## List of 10
##  $ Fold01: int [1:44] 15 17 26 29 37 52 60 63 72 84 ...
##  $ Fold02: int [1:44] 8 20 21 41 43 55 57 58 71 86 ...
##  $ Fold03: int [1:45] 6 11 12 14 27 28 40 67 80 83 ...
##  $ Fold04: int [1:44] 19 31 35 61 64 78 85 97 107 112 ...
##  $ Fold05: int [1:44] 24 25 79 94 100 119 120 131 136 140 ...
##  $ Fold06: int [1:45] 16 36 42 51 53 59 95 98 103 117 ...
##  $ Fold07: int [1:44] 1 2 23 34 49 56 70 75 81 96 ...
##  $ Fold08: int [1:45] 3 4 7 44 46 48 50 54 65 66 ...
##  $ Fold09: int [1:44] 13 18 22 30 33 45 62 68 73 74 ...
##  $ Fold10: int [1:44] 5 9 10 32 38 39 47 69 88 89 ...

Eq.2<- car~ Age + Gender + Engineer+MBA++Work.Exp + Salary+Distance+license

Running Logistic regression

cv_logit<-lapply(folds,function(x){
  test<-Loantrim[x,]
  train<-Loantrim[-x,]
  Logit.1<-glm(Eq.2, train, family = binomial)
  pred.1 <- predict.glm(Logit.1, newdata=test, type="response")
  actual<-test$car
  tab.logit.1<-confusion.matrix(actual,pred.1,threshold = 0.5)
  sum(diag(tab.logit.1))/sum(tab.logit.1)
})

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

str(cv_logit)

## List of 10
##  $ Fold01: num 0.977
##  $ Fold02: num 0.977
##  $ Fold03: num 0.978
##  $ Fold04: num 0.909
##  $ Fold05: num 0.909
##  $ Fold06: num 0.978
##  $ Fold07: num 0.886
##  $ Fold08: num 0.956
##  $ Fold09: num 0.955
##  $ Fold10: num 1

fit.logit<-mean(unlist(cv_logit))
fit.logit

## [1] 0.9524747

The accuracy is 95.2%

Running NB

cv_NB<-lapply(folds,function(x){
  train.NB.kval<-Loantrim[x,]
  test.NB.kval<-Loantrim[x,]
  NB.kval<-naiveBayes(x=train.NB.kval[-1], y=train.NB.kval$car)
  y_pred.NB.kval<-predict( NB.kval,newdata=test.NB.kval[-1])
  cm.NB.kval=table(test.NB.kval[,1],y_pred.NB.kval)
  sum(diag(cm.NB.kval))/sum(cm.NB.kval)
})

str(cv_NB)

## List of 10
##  $ Fold01: num 0.0227
##  $ Fold02: num 0.0227
##  $ Fold03: num 0.0222
##  $ Fold04: num 0.0227
##  $ Fold05: num 0.0227
##  $ Fold06: num 0.0444
##  $ Fold07: num 0.0227
##  $ Fold08: num 0.0222
##  $ Fold09: num 0.0227
##  $ Fold10: num 0.0455

fit.NB<-mean(unlist(cv_NB))
fit.NB

## [1] 0.02707071

The accuracy is 2.7%

Running Linear Discriminannt analysis

cv_LDA<-lapply(folds,function(x){
  train<-Loantrim[-x,]
  test<-Loantrim[x,]
  lda_1<-lda(Eq.2   , train)
  lda1.pred<-predict(lda_1, newdata=test)
  ldapredclass<-lda1.pred$class
  tab.LDA<-table(ldapredclass,test$car)
  sum(diag(tab.LDA))/sum(tab.LDA)
})

str(cv_LDA)

## List of 10
##  $ Fold01: num 1
##  $ Fold02: num 0.909
##  $ Fold03: num 0.978
##  $ Fold04: num 0.909
##  $ Fold05: num 0.932
##  $ Fold06: num 0.956
##  $ Fold07: num 0.886
##  $ Fold08: num 0.978
##  $ Fold09: num 0.932
##  $ Fold10: num 0.977

fit.LDA<-mean(unlist(cv_LDA))
fit.LDA

## [1] 0.9456566

The accuracy is 94.5%

Running Decision Tree

cv_DT<-lapply(folds,function(x){
  train<-Loantrim[-x,]
  test<-Loantrim[x,]
  DT<-rpart(Eq.2, method="class",train)
  pred = predict(DT, type="class",newdata=test)
  tabDT<-table( pred,test$car)
  sum(diag(tabDT))/sum(tabDT)
})

str(cv_DT)

## List of 10
##  $ Fold01: num 0.977
##  $ Fold02: num 0.955
##  $ Fold03: num 0.978
##  $ Fold04: num 0.955
##  $ Fold05: num 0.955
##  $ Fold06: num 1
##  $ Fold07: num 0.955
##  $ Fold08: num 0.978
##  $ Fold09: num 0.909
##  $ Fold10: num 1

fit.DT<-mean(unlist(cv_DT))
fit.DT

## [1] 0.9660101

The accuracy is 96.6%

By running the algorithms we found that decision tree's algorithm accuracy is the highest with 96.6 % but with decision tree we cannot explain variable importance as our problem statement is to build a model that explains best the employee's decision to use car as means of transport.

Hence going for the second highest accuracy of 95.5% from logistic algorithm.

Splitting the data into 70 and 30 %

index1 <- createDataPartition(Loan$car, p=0.70, list=FALSE)
train1 <- Loan[ index1,]
test1 <- Loan[-index1,]

Logit.final <- glm(Eq.2   , train1, family = binomial)
summary(Logit.final)

## 
## Call:
## glm(formula = Eq.2, family = binomial, data = train1)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.81084  -0.04406  -0.00947  -0.00109   2.06335  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -67.25779   17.48674  -3.846 0.000120 ***
## Age           2.18565    0.62155   3.516 0.000437 ***
## Gender1      -1.96151    1.09489  -1.792 0.073213 .  
## Engineer1     0.29510    0.99855   0.296 0.767593    
## MBA1         -1.29675    0.94434  -1.373 0.169694    
## Work.Exp     -1.20176    0.47987  -2.504 0.012268 *  
## Salary        0.16250    0.08998   1.806 0.070925 .  
## Distance      0.43426    0.15118   2.873 0.004071 ** 
## license1      3.21791    1.10470   2.913 0.003581 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 249.919  on 310  degrees of freedom
## Residual deviance:  47.418  on 302  degrees of freedom
## AIC: 65.418
## 
## Number of Fisher Scoring iterations: 9

vif(Logit.final)

##       Age    Gender  Engineer       MBA  Work.Exp    Salary  Distance 
## 15.457249  1.705767  1.144465  1.378596 23.483694  5.074497  1.399514 
##   license 
##  2.200031

varImp(Logit.final)

##             Overall
## Age       3.5164654
## Gender1   1.7915017
## Engineer1 0.2955243
## MBA1      1.3731875
## Work.Exp  2.5043291
## Salary    1.8059558
## Distance  2.8725700
## license1  2.9129268

pred.logit.final <- predict.glm(Logit.final, newdata=test1, type="response")

#Classification
tab.logit<-table(test1$car,pred.logit.final>0.5)
tab.logit

##    
##     FALSE TRUE
##   0   114    0
##   1     4   14

#Logit
accuracy.logit<-sum(diag(tab.logit))/sum(tab.logit)
accuracy.logit

## [1] 0.969697

The VIF shows that there is multicollinearity occurs, ass the variables Age and Work.Exp shows high value Need to keep one variable alone as one variable influence the other, as Age increases Work.exp will increases, henceforth removing work.exp variable.

Creating a model by removing Work.Exp variable

Eq.2<- car~ Age + Gender + Engineer+MBA+Salary+Distance+license
Logit.final <- glm(Eq.2   , train1, family = binomial)
summary(Logit.final)

## 
## Call:
## glm(formula = Eq.2, family = binomial, data = train1)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.00573  -0.08627  -0.03033  -0.00734   2.44071  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -33.85506    6.87762  -4.922 8.54e-07 ***
## Age           0.88463    0.19602   4.513 6.39e-06 ***
## Gender1      -1.39271    0.98564  -1.413  0.15766    
## Engineer1     0.50772    0.91259   0.556  0.57797    
## MBA1         -1.42950    0.89793  -1.592  0.11139    
## Salary       -0.03100    0.04805  -0.645  0.51883    
## Distance      0.40259    0.13308   3.025  0.00249 ** 
## license1      2.89334    0.96426   3.001  0.00269 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 249.919  on 310  degrees of freedom
## Residual deviance:  55.444  on 303  degrees of freedom
## AIC: 71.444
## 
## Number of Fisher Scoring iterations: 8

vif(Logit.final)

##      Age   Gender Engineer      MBA   Salary Distance  license 
## 1.869234 1.531864 1.118493 1.307975 1.635326 1.331920 1.877613

varImp(Logit.final)

##             Overall
## Age       4.5130470
## Gender1   1.4129930
## Engineer1 0.5563559
## MBA1      1.5919940
## Salary    0.6451503
## Distance  3.0250439
## license1  3.0005762

pred.logit.final <- predict.glm(Logit.final, newdata=test1, type="response")

#Classification
tab.logit<-table(test1$car,pred.logit.final>0.5)
tab.logit

##    
##     FALSE TRUE
##   0   111    3
##   1     2   16

#Logit
accuracy.logit<-sum(diag(tab.logit))/sum(tab.logit)
accuracy.logit

## [1] 0.9621212

Now the accuracy is 96.2%

Due to unbalanced dataset the model is not predicting 1's accurately, hence using SMOTE technique to over sample the data.

train_SMOTE<-Loan

table(train_SMOTE$car)

## 
##   0   1 
## 382  61

trainSplit <- SMOTE(Eq.2, train_SMOTE, perc.over = 200, perc.under=300)

index1 <- createDataPartition(trainSplit$car, p=0.70, list=FALSE)
train1 <- Loan[ index1,]
test1 <- Loan[-index1,]
table(trainSplit$car)

## 
##   0   1 
## 366 183

table(test1$car)

## 
##   0   1 
## 116  19

Running Logistic regression after using SMOTE technique

Logit.final1 <- glm(Eq.2   , train1, family = binomial)

vif(Logit.final1)

##      Age   Gender Engineer      MBA   Salary Distance  license 
## 2.432835 1.113273 1.251640 1.635857 1.633143 1.349120 1.266904

varImp(Logit.final1)

##             Overall
## Age       3.9380132
## Gender1   0.6165907
## Engineer1 1.1049803
## MBA1      2.4994597
## Salary    0.9782194
## Distance  2.7695832
## license1  2.5669102

pred.logit.final1 <- predict.glm(Logit.final1, newdata=test1, type="response")

#Classification
tab.logit1<-table(test1$car,pred.logit.final1>0.5)
tab.logit1

##    
##     FALSE TRUE
##   0   113    3
##   1     5   14

#Logit
accuracy.logit1<-sum(diag(tab.logit))/sum(tab.logit)
accuracy.logit1

## [1] 0.9621212

Loglikelihood Test

lrtest(Logit.final1)

## Likelihood ratio test
## 
## Model 1: car ~ Age + Gender + Engineer + MBA + Salary + Distance + license
## Model 2: car ~ 1
##   #Df   LogLik Df  Chisq Pr(>Chisq)    
## 1   8  -20.513                         
## 2   1 -122.679 -7 204.33  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Overall p-value less than 0.05, hence the overall the model is valid

McFadden R2

pR2(Logit.final1)

##          llh      llhNull           G2     McFadden         r2ML 
##  -20.5128951 -122.6785911  204.3313918    0.8327916    0.4849100 
##         r2CU 
##    0.8830238

The McFadden R2 is 0.75 which means that 75% of the uncertainity produced by the intercept model has been explained by the full model. If we use only intercept model there is uncertainity in prediction but if we use the full model it will capture 85% of the uncertainities produced by the uncertainity. Thus goodness of fit is robust.

summary(Logit.final1)

## 
## Call:
## glm(formula = Eq.2, family = binomial, data = train1)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.08978  -0.04155  -0.00776  -0.00140   2.88964  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -46.73020   11.09142  -4.213 2.52e-05 ***
## Age           1.25231    0.31801   3.938 8.22e-05 ***
## Gender1      -0.63529    1.03033  -0.617  0.53750    
## Engineer1     1.49455    1.35256   1.105  0.26917    
## MBA1         -2.75917    1.10391  -2.499  0.01244 *  
## Salary       -0.05321    0.05439  -0.978  0.32797    
## Distance      0.46143    0.16661   2.770  0.00561 ** 
## license1      2.33907    0.91124   2.567  0.01026 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 245.357  on 307  degrees of freedom
## Residual deviance:  41.026  on 300  degrees of freedom
##   (78 observations deleted due to missingness)
## AIC: 57.026
## 
## Number of Fisher Scoring iterations: 9

Gender1,salary,Engineer and MBA doesnot influence the employee's decision to use car as main means of transport. Therefore our focus to be on the below variables Age Distance License

rocplot(Logit.final1)

plot of chunk unnamed-chunk-23

                PROBLEM STATEMENT 2

Running Support vector Machines to identify the means of transport for the test data

Reading the file

Loan<-read.csv(file.choose(), header=T)
Loan <- na.omit(Loan)

svm.2<-svm(Loan$Transport~., data=Loan[,-9], kernel="linear")
summary(svm.2)

## 
## Call:
## svm(formula = Loan$Transport ~ ., data = Loan[, -9], kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.1111111 
## 
## Number of Support Vectors:  199
## 
##  ( 95 83 21 )
## 
## 
## Number of Classes:  3 
## 
## Levels: 
##  2Wheeler Car Public Transport

plot(svm.2,data=Loan, Age~Salary)

plot of chunk unnamed-chunk-25

plot(svm.2,data=Loan, Age~Work.Exp)

plot of chunk unnamed-chunk-25

plot(svm.2,data=Loan,Age ~ license)

plot of chunk unnamed-chunk-25

test<-read.csv(file.choose(), header=T)

test <- test[,-9]
str(test)

## 'data.frame':    2 obs. of  8 variables:
##  $ Age     : int  25 25
##  $ Gender  : int  0 1
##  $ Engineer: int  0 1
##  $ MBA     : int  0 0
##  $ Work.Exp: int  4 4
##  $ Salary  : num  14.3 8.3
##  $ Distance: num  3.2 3.3
##  $ license : int  1 0

test$Gender <- as.factor(test$Gender)
test$Engineer <- as.factor(test$Engineer)
test$MBA <- as.numeric(test$MBA)
test$license <- as.factor(test$license)

y_pred.svm<-predict(svm.2,newdata = test[-9])
test$Transport <- y_pred.svm
test

##   Age Gender Engineer MBA Work.Exp Salary Distance license
## 1  25      0        0   0        4   14.3      3.2       1
## 2  25      1        1   0        4    8.3      3.3       0
##          Transport
## 1 Public Transport
## 2 Public Transport

Running KNN Algorithm

Reading the file

Loan<-read.csv(file.choose(), header=T)
Loan <- na.omit(Loan)
sum(is.na(Loan))

## [1] 0

Reading the test data

test1<-read.csv(file.choose(), header=T)

converting to factor variables

Loan$Gender  <- as.numeric(Loan$Gender)
test1$Gender <- as.numeric(test1$Gender)
test1$Engineer <- as.numeric(test1$Engineer)
test1$MBA <- as.numeric(test1$MBA)
test1$license <- as.numeric(test1$license)

#knn3
y_pred.4<-knn(train=Loan[,-9],test=test1[,-9], cl=Loan[,9],k=3)
test1$Transport <- y_pred.4
test1

##   Age Gender Engineer MBA Work.Exp Salary Distance license
## 1  25      0        0   0        4   14.3      3.2       1
## 2  25      1        1   0        4    8.3      3.3       0
##          Transport
## 1 Public Transport
## 2 Public Transport