This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
#Load the libraries used below
rm(list=ls())
library(car)
library(caret)
## Warning: package 'caret' was built under R version 3.4.3
## Loading required package: lattice
## Loading required package: ggplot2
library(class)
library(devtools)
library(e1071)
library(ggplot2)
library(klaR)
## Warning: package 'klaR' was built under R version 3.4.3
## Loading required package: MASS
library(MASS)
library(nnet)
library(plyr)
library(pROC)
## Warning: package 'pROC' was built under R version 3.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:car':
##
## logit
library(scatterplot3d)
library(SDMTools)
## Warning: package 'SDMTools' was built under R version 3.4.3
##
## Attaching package: 'SDMTools'
## The following object is masked from 'package:pROC':
##
## auc
## The following objects are masked from 'package:caret':
##
## sensitivity, specificity
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ElemStatLearn)
## Warning: package 'ElemStatLearn' was built under R version 3.4.3
##
## Attaching package: 'ElemStatLearn'
## The following object is masked from 'package:plyr':
##
## ozone
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
##Set working directory
setwd("D:/GREAT LAKES/MACHINE LEARNING")
getwd()
## [1] "D:/GREAT LAKES/MACHINE LEARNING"
Cars<-read.csv("Actual_Cars_data.csv", header=T)
sapply(Cars, function(y) sum(length(which(is.na(y)))))
## Age Gender Engineer MBA Work.Exp Salary Distance
## 0 0 0 1 0 0 0
## license Transport
## 0 0
Cars[is.na(Cars$MBA),4]<-0
sapply(Cars, function(y) sum(length(which(is.na(y)))))
## Age Gender Engineer MBA Work.Exp Salary Distance
## 0 0 0 0 0 0 0
## license Transport
## 0 0
View(Cars)
Cars<-na.omit(Cars)   #assign the result back; a bare na.omit(Cars) changes nothing
summary(Cars)
## Age Gender Engineer MBA
## Min. :18.00 Female:128 Min. :0.0000 Min. :0.0000
## 1st Qu.:25.00 Male :316 1st Qu.:1.0000 1st Qu.:0.0000
## Median :27.00 Median :1.0000 Median :0.0000
## Mean :27.75 Mean :0.7545 Mean :0.2523
## 3rd Qu.:30.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :43.00 Max. :1.0000 Max. :1.0000
## Work.Exp Salary Distance license
## Min. : 0.0 Min. : 6.50 Min. : 3.20 Min. :0.0000
## 1st Qu.: 3.0 1st Qu.: 9.80 1st Qu.: 8.80 1st Qu.:0.0000
## Median : 5.0 Median :13.60 Median :11.00 Median :0.0000
## Mean : 6.3 Mean :16.24 Mean :11.32 Mean :0.2342
## 3rd Qu.: 8.0 3rd Qu.:15.72 3rd Qu.:13.43 3rd Qu.:0.0000
## Max. :24.0 Max. :57.00 Max. :23.40 Max. :1.0000
## Transport
## 2Wheeler : 83
## Car : 61
## Public Transport:300
##
##
##
str(Cars)
## 'data.frame': 444 obs. of 9 variables:
## $ Age : int 28 23 29 28 27 26 28 26 22 27 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 2 1 2 2 2 1 2 2 ...
## $ Engineer : int 0 1 1 1 1 1 1 1 1 1 ...
## $ MBA : num 0 0 0 1 0 0 0 0 0 0 ...
## $ Work.Exp : int 4 4 7 5 4 4 5 3 1 4 ...
## $ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...
## $ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...
## $ license : int 0 0 0 0 0 1 0 0 0 0 ...
## $ Transport: Factor w/ 3 levels "2Wheeler","Car",..: 3 3 3 3 3 3 1 3 3 3 ...
## Determine Levels
levels(Cars$Gender)
## [1] "Female" "Male"
levels(Cars$Transport)
## [1] "2Wheeler" "Car" "Public Transport"
#Define some dummies
Cars$Female<-ifelse(Cars$Gender=="Female",1,0)
Cars$Male<-ifelse(Cars$Gender=="Male",1,0)
#Partitioning Data Sets
#Partition train and val
#We will use this throughout so that samples are comparable
set.seed(7)
pd<-sample(2,nrow(Cars),replace=TRUE, prob=c(0.7,0.3))
train<-Cars[pd==1,]
val<-Cars[pd==2,]
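#A hedged alternative (using caret, already attached): createDataPartition()
#stratifies on the response, so train and val keep the same Transport mix.
set.seed(7)
idx<-createDataPartition(Cars$Transport, p=0.7, list=FALSE)
train.strat<-Cars[idx,]
val.strat<-Cars[-idx,]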
#K FOLD VALIDATIONS
names(Cars)
## [1] "Age" "Gender" "Engineer" "MBA" "Work.Exp"
## [6] "Salary" "Distance" "license" "Transport" "Female"
## [11] "Male"
Carstrim<-Cars[,-2]
Carstrim<-na.omit(Carstrim)   #assign back so the NA drop takes effect
set.seed(7)
pd<-sample(2,nrow(Carstrim),replace=TRUE, prob=c(0.7,0.3))
traintrim<-Carstrim[pd==1,]
valtrim<-Carstrim[pd==2,]
set.seed(7)
folds<-createFolds(Cars$Transport,k=10)
str(folds)
## List of 10
## $ Fold01: int [1:46] 29 33 45 56 69 90 104 109 119 120 ...
## $ Fold02: int [1:44] 30 78 92 94 99 100 103 121 130 135 ...
## $ Fold03: int [1:44] 14 15 47 52 59 71 75 87 88 95 ...
## $ Fold04: int [1:45] 9 16 36 42 49 50 54 57 64 72 ...
## $ Fold05: int [1:44] 8 11 13 23 24 26 35 37 41 68 ...
## $ Fold06: int [1:44] 1 5 7 18 32 34 44 51 63 65 ...
## $ Fold07: int [1:44] 38 58 84 85 86 113 127 136 141 161 ...
## $ Fold08: int [1:44] 3 12 20 21 27 28 31 48 60 61 ...
## $ Fold09: int [1:44] 4 10 19 25 40 46 53 67 77 80 ...
## $ Fold10: int [1:45] 2 6 17 22 39 43 55 70 73 97 ...
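#Note: these fold indices run over all 444 rows of Cars, but the loops below
#apply them to the 299-row traintrim (as the training set) and the 145-row
#valtrim (as everything outside the fold), so each model is fit on roughly 45
#rows and out-of-range indices yield NA rows. A hedged sketch of the
#conventional pattern on the unpartitioned Carstrim: train on the complement
#of the held-out fold, then score on the fold itself.
cv_sketch<-lapply(folds,function(x){
  train.cv<-Carstrim[-x,]   #everything except the held-out fold
  test.cv<-Carstrim[x,]     #the held-out fold
  fit<-lda(Transport~., data=train.cv)
  mean(predict(fit, newdata=test.cv)$class==test.cv$Transport)
})
mean(unlist(cv_sketch))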
######
Eq.2 <- Transport ~ .
#10 Fold validation with LPM
cv_LPM<-lapply(folds,function(x){
train<-traintrim[x,]
test<-valtrim[-x,]
LPM.1<-lm(Eq.2, train)
LPM1.pred<-predict(LPM.1, test)
tab.LPM<-table(test$Transport, LPM1.pred>0.5)
sum(diag(tab.LPM))/sum(tab.LPM)
})
## Warning in model.response(mf, "numeric"): using type = "numeric" with a
## factor response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
## Warning in predict.lm(LPM.1, test): prediction from a rank-deficient fit
## may be misleading
## (the three warnings above repeat once per fold, 10 times in all)
str(cv_LPM)
## List of 10
## $ Fold01: num 0.145
## $ Fold02: num 0.133
## $ Fold03: num 0.145
## $ Fold04: num 0.116
## $ Fold05: num 0.145
## $ Fold06: num 0.142
## $ Fold07: num 0.147
## $ Fold08: num 0.12
## $ Fold09: num 0.159
## $ Fold10: num 0.148
fit.LPM<-mean(unlist(cv_LPM))
fit.LPM
## [1] 0.1401046
##########
## The LPM "accuracy" is 14.01%, but as the warnings show, lm() ignores a
## 3-level factor response, so this figure should not be trusted.
#########
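#With a 3-class response, a multinomial logit is the natural linear baseline.
#A hedged sketch using nnet::multinom (attached above) under the corrected
#fold pattern sketched earlier:
cv_MNL<-lapply(folds,function(x){
  train.cv<-Carstrim[-x,]
  test.cv<-Carstrim[x,]
  MNL<-multinom(Transport~., data=train.cv, trace=FALSE)
  mean(predict(MNL, newdata=test.cv)==test.cv$Transport)
})
mean(unlist(cv_MNL))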
#10 Fold Validation with NB
cv_NB<-lapply(folds,function(x){
train.NB.kval<-traintrim[x,]
test.NB.kval<-valtrim[-x,]
NB.kval<-naiveBayes(x=train.NB.kval[-1], y=train.NB.kval$Transport)
y_pred.NB.kval<-predict( NB.kval,newdata=test.NB.kval[-1])
cm.NB.kval=table(test.NB.kval[,1],y_pred.NB.kval)
sum(diag(cm.NB.kval))/sum(cm.NB.kval)
})
str(cv_NB)
## List of 10
## $ Fold01: num 0.0229
## $ Fold02: num 0.00741
## $ Fold03: num 0.0305
## $ Fold04: num 0.0155
## $ Fold05: num 0.0382
## $ Fold06: num 0.0157
## $ Fold07: num 0.0368
## $ Fold08: num 0.032
## $ Fold09: num 0.0152
## $ Fold10: num 0.0156
fit.NB<-mean(unlist(cv_NB))
fit.NB
## [1] 0.02298036
#####################
## The reported NB accuracy is 2.3%, which is spurious: the confusion matrix
## tabulates column 1 of the test frame (Age) against the predictions instead
## of Transport, and Transport itself is left in the predictor set.
#####################
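#A hedged corrected sketch: fitting NB via the formula interface keeps
#Transport out of the predictor set, and predictions are scored against
#Transport rather than Age.
cv_NB2<-lapply(folds,function(x){
  train.cv<-Carstrim[-x,]
  test.cv<-Carstrim[x,]
  NB2<-naiveBayes(Transport~., data=train.cv)
  mean(predict(NB2, newdata=test.cv)==test.cv$Transport)
})
mean(unlist(cv_NB2))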
#10 Fold with LDA
library(MASS)
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.4.3
cv_LDA<-lapply(folds,function(x){
train<-traintrim[x,]
test<-valtrim[-x,]
lda_1<-lda(Eq.2 , train)
lda1.pred<-predict(lda_1, newdata=test)
ldapredclass<-lda1.pred$class
tab.LDA<-table(ldapredclass,test$Transport)
sum(diag(tab.LDA))/sum(tab.LDA)
})
## Warning in lda.default(x, grouping, ...): variables are collinear
## (repeated once per fold, 10 times in all)
str(cv_LDA)
## List of 10
## $ Fold01: num 0.626
## $ Fold02: num 0.733
## $ Fold03: num 0.779
## $ Fold04: num 0.775
## $ Fold05: num 0.763
## $ Fold06: num 0.78
## $ Fold07: num 0.721
## $ Fold08: num 0.768
## $ Fold09: num 0.742
## $ Fold10: num 0.695
fit.LDA<-mean(unlist(cv_LDA))
fit.LDA
## [1] 0.7382319
#########
##Accuracy of LDA is 73.82%
#########
#10 Fold on Decision Trees
cv_DT<-lapply(folds,function(x){
train<-traintrim[x,]
test<-valtrim[-x,]
DT<-rpart(Eq.2, method="class",train)
pred = predict(DT, type="class",newdata=test)
tabDT<-table( pred,test$Transport)
sum(diag(tabDT))/sum(tabDT)
})
str(cv_DT)
## List of 10
## $ Fold01: num 0.718
## $ Fold02: num 0.778
## $ Fold03: num 0.702
## $ Fold04: num 0.543
## $ Fold05: num 0.756
## $ Fold06: num 0.606
## $ Fold07: num 0.647
## $ Fold08: num 0.584
## $ Fold09: num 0.523
## $ Fold10: num 0.734
fit.DT<-mean(unlist(cv_DT))
fit.DT
## [1] 0.6590446
########
##Accuracy of Decision trees is 65.90%
########
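#A hedged refinement: prune at the cp value that minimizes the
#cross-validated error in rpart's complexity table, then replot.
DT.full<-rpart(Eq.2, method="class", data=traintrim)
best.cp<-DT.full$cptable[which.min(DT.full$cptable[,"xerror"]),"CP"]
DT.pruned<-prune(DT.full, cp=best.cp)
rpart.plot(DT.pruned)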
#####SMOTE
library(DMwR)
## Warning: package 'DMwR' was built under R version 3.4.3
## Loading required package: grid
##
## Attaching package: 'DMwR'
## The following object is masked from 'package:plyr':
##
## join
train_SMOTE<-traintrim[,-2]
qplot(Distance,Salary,color=Transport, data=traintrim)
table(train_SMOTE$Transport)
##
## 2Wheeler Car Public Transport
## 63 39 197
#SMOTE
#Copy the response into a factor column named 'target' for the SMOTE call
train_SMOTE$target <- as.factor(traintrim$Transport)
table(train_SMOTE$target)
##
## 2Wheeler Car Public Transport
## 63 39 197
trainSplit <- SMOTE(target ~ ., train_SMOTE, perc.over = 200, perc.under=300)
print(prop.table(table(trainSplit$target)))
##
## 2Wheeler Car Public Transport
## 0.1509972 0.3333333 0.5156695
table(trainSplit$target)
##
## 2Wheeler Car Public Transport
## 53 117 181
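#How these counts arise (DMwR's SMOTE arithmetic, as a check): perc.over=200
#adds 2 synthetic cases per original minority ("Car") case, 39*2=78, hence
#39+78=117 Cars; perc.under=300 then samples 3 cases from the other classes
#per synthetic case, 78*3=234, split here as 53 2Wheeler + 181 Public Transport.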
qplot(Distance,Salary,color=Transport, data=trainSplit)
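#The block below re-applies SMOTE twice and then binarizes target
#(1 = 2Wheeler, 0 = otherwise); its result is only inspected with
#prop.table() and is not used afterwards.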
train_SMOTE_new<-trainSplit
train_SMOTE_new <- SMOTE(Transport ~ ., train_SMOTE_new, perc.over = 100, perc.under=100)
train_SMOTE_new$target <- as.factor(train_SMOTE_new$Transport)
trainSplit <- SMOTE(target ~ ., train_SMOTE_new, perc.over = 100, perc.under=100)
trainSplit$target <- as.numeric(trainSplit$target)
trainSplit$target<-ifelse(trainSplit$target==1,1,0)
print(prop.table(table(trainSplit$target)))
##
## 0 1
## 0.7708333 0.2291667
write.csv(train_SMOTE, "SMOTE.csv", row.names = FALSE)
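#Note: this writes train_SMOTE, the pre-SMOTE training partition, not the
#resampled trainSplit, so the class counts below are still the original
#63/39/197. Write trainSplit instead to carry the balanced data forward.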
#####################
Cars1<-read.csv("SMOTE.csv", header=T)
dim(Cars1)
## [1] 299 10
names(Cars1)
## [1] "Age" "MBA" "Work.Exp" "Salary" "Distance"
## [6] "license" "Transport" "Female" "Male" "target"
Cars1<-Cars1[,-7]
Cars1<-na.omit(Cars1)   #assign back so the NA drop takes effect
summary(Cars1)
## Age MBA Work.Exp Salary
## Min. :18.00 Min. :0.0000 Min. : 0.000 Min. : 6.50
## 1st Qu.:25.00 1st Qu.:0.0000 1st Qu.: 3.000 1st Qu.: 9.55
## Median :27.00 Median :0.0000 Median : 5.000 Median :12.90
## Mean :27.49 Mean :0.2408 Mean : 5.967 Mean :15.61
## 3rd Qu.:30.00 3rd Qu.:0.0000 3rd Qu.: 8.000 3rd Qu.:15.60
## Max. :43.00 Max. :1.0000 Max. :24.000 Max. :57.00
## Distance license Female Male
## Min. : 3.30 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 8.60 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :11.30 Median :0.000 Median :0.0000 Median :1.0000
## Mean :11.44 Mean :0.214 Mean :0.3144 Mean :0.6856
## 3rd Qu.:13.60 3rd Qu.:0.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :23.40 Max. :1.000 Max. :1.0000 Max. :1.0000
## target
## 2Wheeler : 63
## Car : 39
## Public Transport:197
##
##
##
str(Cars1)
## 'data.frame': 299 obs. of 9 variables:
## $ Age : int 23 29 28 27 28 22 27 25 27 27 ...
## $ MBA : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Work.Exp: int 4 7 5 4 5 1 4 4 4 4 ...
## $ Salary : num 8.3 13.4 13.4 13.4 14.4 7.5 13.5 11.5 13.5 13.4 ...
## $ Distance: num 3.3 4.1 4.5 4.6 5.1 5.1 5.2 5.2 5.3 5.5 ...
## $ license : int 0 0 0 0 0 0 0 0 1 1 ...
## $ Female : int 1 0 1 0 0 0 0 1 0 0 ...
## $ Male : int 0 1 0 1 1 1 1 0 1 1 ...
## $ target : Factor w/ 3 levels "2Wheeler","Car",..: 3 3 3 3 1 3 3 3 3 3 ...
attach(Cars1)
set.seed(777)
pd<-sample(2,nrow(Cars1),replace=TRUE, prob=c(0.7,0.3))
train1<-Cars1[pd==1,]
val1<-Cars1[pd==2,]
attach(train1)
## The following objects are masked from Cars1:
##
## Age, Distance, Female, license, Male, MBA, Salary, target,
## Work.Exp
attach(val1)
## The following objects are masked from train1:
##
## Age, Distance, Female, license, Male, MBA, Salary, target,
## Work.Exp
## The following objects are masked from Cars1:
##
## Age, Distance, Female, license, Male, MBA, Salary, target,
## Work.Exp
#10 Fold with LDA
library(MASS)
library(ISLR)
Eq.3 <- target~.
cv_LDA<-lapply(folds,function(x){
train<-train1[x,]
test<-val1[-x,]
lda_1<-lda(Eq.3 , train1)
lda1.pred<-predict(lda_1, newdata=test)
ldapredclass<-lda1.pred$class
tab.LDA<-table(ldapredclass,test$target)
sum(diag(tab.LDA))/sum(tab.LDA)
})
## Warning in lda.default(x, grouping, ...): variables are collinear
## (repeated once per fold, 10 times in all)
str(cv_LDA)
## List of 10
## $ Fold01: num 0.84
## $ Fold02: num 0.833
## $ Fold03: num 0.835
## $ Fold04: num 0.842
## $ Fold05: num 0.827
## $ Fold06: num 0.808
## $ Fold07: num 0.84
## $ Fold08: num 0.847
## $ Fold09: num 0.855
## $ Fold10: num 0.844
fit.LDA<-mean(unlist(cv_LDA))
fit.LDA
## [1] 0.8371421
#########
## LDA accuracy here is 83.71%, with two caveats: the data is not actually
## SMOTE-balanced (see the write.csv note above), and inside the loop lda()
## is fit on all of train1 (the fold subset `train` is never used), while the
## folds still index the original 444-row Cars frame.
#########
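#A hedged corrected sketch: rebuild the folds on the 299-row Cars1 frame and
#fit on the complement of each held-out fold (the same caveat applies to the
#DT and NB loops below).
set.seed(7)
folds1<-createFolds(Cars1$target, k=10)
cv_LDA2<-lapply(folds1,function(x){
  fit<-lda(target~., data=Cars1[-x,])
  mean(predict(fit, newdata=Cars1[x,])$class==Cars1$target[x])
})
mean(unlist(cv_LDA2))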
#10 Fold on Decision Trees post SMOTE
cv_DT<-lapply(folds,function(x){
train<-train1[x,]
test<-val1[-x,]
DT<-rpart(Eq.3, method="class",train)
pred = predict(DT, type="class",newdata=test)
tabDT<-table( pred,test$target)
sum(diag(tabDT))/sum(tabDT)
})
str(cv_DT)
## List of 10
## $ Fold01: num 0.593
## $ Fold02: num 0.631
## $ Fold03: num 0.633
## $ Fold04: num 0.526
## $ Fold05: num 0.507
## $ Fold06: num 0.616
## $ Fold07: num 0.654
## $ Fold08: num 0.639
## $ Fold09: num 0.763
## $ Fold10: num 0.636
fit.DT<-mean(unlist(cv_DT))
fit.DT
## [1] 0.6198609
########
##Accuracy of Decision trees is 61.98%
########
#10 Fold Validation with NB
cv_NB<-lapply(folds,function(x){
train.NB.kval<-train1[x,]
test.NB.kval<-val1[-x,]
NB.kval<-naiveBayes(x=train.NB.kval[-1], y=train.NB.kval$target)
y_pred.NB.kval<-predict( NB.kval,newdata=test.NB.kval[-1])
cm.NB.kval=table(test.NB.kval[,1],y_pred.NB.kval)
sum(diag(cm.NB.kval))/sum(cm.NB.kval)
})
str(cv_NB)
## List of 10
## $ Fold01: num 0.0247
## $ Fold02: num 0.0357
## $ Fold03: num 0.0253
## $ Fold04: num 0.0263
## $ Fold05: num 0.0133
## $ Fold06: num 0.0411
## $ Fold07: num 0.0123
## $ Fold08: num 0.111
## $ Fold09: num 0.0132
## $ Fold10: num 0.026
fit.NB<-mean(unlist(cv_NB))
fit.NB
## [1] 0.03290558
#####################
## The reported NB accuracy is 3.3%, again spurious for the same reason: the
## confusion matrix tabulates Age (column 1) against the predictions rather
## than target.
#####################
###SVM
library(e1071)   #provides svm(); already attached above
svm.2<-svm(target~., data=train1, kernel="linear")
summary(svm.2)
##
## Call:
## svm(formula = target ~ ., data = train1, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
## gamma: 0.125
##
## Number of Support Vectors: 109
##
## ( 52 46 11 )
##
##
## Number of Classes: 3
##
## Levels:
## 2Wheeler Car Public Transport
##Radial-kernel SVM on the full feature set
svm.full<-svm(target~., data=train1, kernel="radial")
summary(svm.full)
##
## Call:
## svm(formula = target ~ ., data = train1, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.125
##
## Number of Support Vectors: 126
##
## ( 63 45 18 )
##
##
## Number of Classes: 3
##
## Levels:
## 2Wheeler Car Public Transport
y_pred.svm.full<-predict(svm.full,newdata=val1[,-9])
y_pred.svm.full
## 4 9 11 15
## Public Transport Public Transport Public Transport Public Transport
## 18 26 29 30
## Public Transport Public Transport Public Transport Public Transport
## 31 32 33 37
## Public Transport Public Transport Public Transport Public Transport
## 46 47 48 49
## Public Transport Public Transport Public Transport Public Transport
## 58 61 63 65
## Public Transport Public Transport Public Transport Public Transport
## 69 72 73 76
## Public Transport Public Transport Public Transport Public Transport
## 89 90 91 94
## Public Transport Public Transport Public Transport Public Transport
## 96 99 100 101
## Public Transport Public Transport Public Transport Public Transport
## 105 106 112 118
## Public Transport Public Transport Public Transport Car
## 119 120 126 129
## Public Transport Public Transport Public Transport Public Transport
## 132 141 142 151
## Public Transport Public Transport Car Car
## 154 161 164 165
## 2Wheeler Public Transport Public Transport Public Transport
## 174 176 177 185
## Public Transport Public Transport Public Transport Public Transport
## 188 189 190 192
## Public Transport Public Transport Public Transport Public Transport
## 195 201 209 210
## Public Transport Public Transport Car Public Transport
## 211 214 221 223
## Public Transport 2Wheeler Car Public Transport
## 224 230 234 244
## Public Transport Public Transport Public Transport Public Transport
## 246 250 251 255
## Public Transport Public Transport 2Wheeler Public Transport
## 257 259 265 267
## 2Wheeler Public Transport Public Transport Car
## 270 273 275 276
## Public Transport Public Transport 2Wheeler Car
## 277 286 287 290
## 2Wheeler Car 2Wheeler Car
## 291 294
## Car Car
## Levels: 2Wheeler Car Public Transport
#Confusion matrix
head(val1)
head(train1)
cm.SVMB.full=table(val1[,9],y_pred.svm.full)
cm.SVMB.full
## y_pred.svm.full
## 2Wheeler Car Public Transport
## 2Wheeler 6 1 10
## Car 0 10 5
## Public Transport 1 0 53
accuracy.svm.full<-sum(diag(cm.SVMB.full))/sum(cm.SVMB.full)
accuracy.svm.full
## [1] 0.8023256
##
##Accuracy of the radial-kernel SVM is 80.23% (nominally "post SMOTE", but
##see the write.csv note above)
###
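#Overall accuracy hides weak minority-class recall (only 6 of 17 2Wheeler
#cases are recovered above). A hedged per-class summary via caret's
#confusionMatrix():
confusionMatrix(y_pred.svm.full, val1$target)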
#Tune
set.seed(77)
tune.svm<-tune(svm, target~.,data=train1,ranges=list(epsilon=seq(0,1,0.1), cost=2^(2:9)))
summary(tune.svm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## epsilon cost
## 0 64
##
## - best performance: 0.2010823
##
## - Detailed performance results:
## epsilon cost error dispersion
## 1 0.0 4 0.2064935 0.05921759
## 2 0.1 4 0.2064935 0.05921759
## 3 0.2 4 0.2064935 0.05921759
## 4 0.3 4 0.2064935 0.05921759
## 5 0.4 4 0.2064935 0.05921759
## 6 0.5 4 0.2064935 0.05921759
## 7 0.6 4 0.2064935 0.05921759
## 8 0.7 4 0.2064935 0.05921759
## 9 0.8 4 0.2064935 0.05921759
## 10 0.9 4 0.2064935 0.05921759
## 11 1.0 4 0.2064935 0.05921759
## 12 0.0 8 0.2153680 0.08001342
## 13 0.1 8 0.2153680 0.08001342
## 14 0.2 8 0.2153680 0.08001342
## 15 0.3 8 0.2153680 0.08001342
## 16 0.4 8 0.2153680 0.08001342
## 17 0.5 8 0.2153680 0.08001342
## 18 0.6 8 0.2153680 0.08001342
## 19 0.7 8 0.2153680 0.08001342
## 20 0.8 8 0.2153680 0.08001342
## 21 0.9 8 0.2153680 0.08001342
## 22 1.0 8 0.2153680 0.08001342
## 23 0.0 16 0.2058442 0.08064332
## 24 0.1 16 0.2058442 0.08064332
## 25 0.2 16 0.2058442 0.08064332
## 26 0.3 16 0.2058442 0.08064332
## 27 0.4 16 0.2058442 0.08064332
## 28 0.5 16 0.2058442 0.08064332
## 29 0.6 16 0.2058442 0.08064332
## 30 0.7 16 0.2058442 0.08064332
## 31 0.8 16 0.2058442 0.08064332
## 32 0.9 16 0.2058442 0.08064332
## 33 1.0 16 0.2058442 0.08064332
## 34 0.0 32 0.2012987 0.10149338
## 35 0.1 32 0.2012987 0.10149338
## 36 0.2 32 0.2012987 0.10149338
## 37 0.3 32 0.2012987 0.10149338
## 38 0.4 32 0.2012987 0.10149338
## 39 0.5 32 0.2012987 0.10149338
## 40 0.6 32 0.2012987 0.10149338
## 41 0.7 32 0.2012987 0.10149338
## 42 0.8 32 0.2012987 0.10149338
## 43 0.9 32 0.2012987 0.10149338
## 44 1.0 32 0.2012987 0.10149338
## 45 0.0 64 0.2010823 0.09343058
## 46 0.1 64 0.2010823 0.09343058
## 47 0.2 64 0.2010823 0.09343058
## 48 0.3 64 0.2010823 0.09343058
## 49 0.4 64 0.2010823 0.09343058
## 50 0.5 64 0.2010823 0.09343058
## 51 0.6 64 0.2010823 0.09343058
## 52 0.7 64 0.2010823 0.09343058
## 53 0.8 64 0.2010823 0.09343058
## 54 0.9 64 0.2010823 0.09343058
## 55 1.0 64 0.2010823 0.09343058
## 56 0.0 128 0.2194805 0.09422258
## 57 0.1 128 0.2194805 0.09422258
## 58 0.2 128 0.2194805 0.09422258
## 59 0.3 128 0.2194805 0.09422258
## 60 0.4 128 0.2194805 0.09422258
## 61 0.5 128 0.2194805 0.09422258
## 62 0.6 128 0.2194805 0.09422258
## 63 0.7 128 0.2194805 0.09422258
## 64 0.8 128 0.2194805 0.09422258
## 65 0.9 128 0.2194805 0.09422258
## 66 1.0 128 0.2194805 0.09422258
## 67 0.0 256 0.2142857 0.11591712
## 68 0.1 256 0.2142857 0.11591712
## 69 0.2 256 0.2142857 0.11591712
## 70 0.3 256 0.2142857 0.11591712
## 71 0.4 256 0.2142857 0.11591712
## 72 0.5 256 0.2142857 0.11591712
## 73 0.6 256 0.2142857 0.11591712
## 74 0.7 256 0.2142857 0.11591712
## 75 0.8 256 0.2142857 0.11591712
## 76 0.9 256 0.2142857 0.11591712
## 77 1.0 256 0.2142857 0.11591712
## 78 0.0 512 0.2285714 0.10580360
## 79 0.1 512 0.2285714 0.10580360
## 80 0.2 512 0.2285714 0.10580360
## 81 0.3 512 0.2285714 0.10580360
## 82 0.4 512 0.2285714 0.10580360
## 83 0.5 512 0.2285714 0.10580360
## 84 0.6 512 0.2285714 0.10580360
## 85 0.7 512 0.2285714 0.10580360
## 86 0.8 512 0.2285714 0.10580360
## 87 0.9 512 0.2285714 0.10580360
## 88 1.0 512 0.2285714 0.10580360
best.svm<-tune.svm$best.model
summary(best.svm)
##
## Call:
## best.tune(method = svm, train.x = target ~ ., data = train1,
## ranges = list(epsilon = seq(0, 1, 0.1), cost = 2^(2:9)))
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 64
## gamma: 0.125
##
## Number of Support Vectors: 96
##
## ( 47 34 15 )
##
##
## Number of Classes: 3
##
## Levels:
## 2Wheeler Car Public Transport
best.par<-tune.svm$best.parameters
summary(best.par)
## epsilon cost
## Min. :0 Min. :64
## 1st Qu.:0 1st Qu.:64
## Median :0 Median :64
## Mean :0 Mean :64
## 3rd Qu.:0 3rd Qu.:64
## Max. :0 Max. :64
y_pred.svm.best<-predict(best.svm,newdata=val1[-9])
#Confusion matrix
cm.SVMB.best=table(val1[,9],y_pred.svm.best)
cm.SVMB.best
## y_pred.svm.best
## 2Wheeler Car Public Transport
## 2Wheeler 6 2 9
## Car 0 10 5
## Public Transport 1 0 53
accuracy.svm.best<-sum(diag(cm.SVMB.best))/sum(cm.SVMB.best)
accuracy.svm.best
## [1] 0.8023256
#################
## Accuracy after tuning the SVM is unchanged at 80.23%; note that epsilon,
## the parameter swept above, applies only to regression SVMs.
################
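#epsilon is ignored for C-classification, which is why every epsilon row
#within a cost block shows identical error above. A hedged re-tune over the
#parameters that do matter here:
set.seed(77)
tune.svm2<-tune(svm, target~., data=train1,
                ranges=list(cost=2^(2:9), gamma=c(0.05,0.125,0.25,0.5)))
tune.svm2$best.parameters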
#K-nearest neighbour
names(Cars1)
## [1] "Age" "MBA" "Work.Exp" "Salary" "Distance" "license"
## [7] "Female" "Male" "target"
kneig = knn(train = Cars1[,-9],test = Cars1[,-9],cl= Cars1$target)
cm.knn= table(Actual=Cars1$target, Predicted = kneig)
accuracy.knn<-sum(diag(cm.knn))/sum(cm.knn)
accuracy.knn
## [1] 1
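#This 100% figure is not a generalization estimate: knn() classified the very
#rows it was trained on (and with the default k=1, each point is its own
#nearest neighbour). A hedged leave-one-out sketch on standardized features:
kneig.loo<-knn.cv(train=scale(Cars1[,-9]), cl=Cars1$target, k=3)
mean(kneig.loo==Cars1$target)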
#Predicting the 2 test rows using various methods
Cars2=read.csv("Actual_Cars_data.csv", header=T)
Cars2 = na.omit(Cars2)
predictdata = read.csv("cars.test.csv",header = T)
Cars2$Gender <- as.numeric(Cars2$Gender)
predictdata$Gender <- as.numeric(predictdata$Gender)
predictdata$Engineer <- as.numeric(predictdata$Engineer)
predictdata$MBA <- as.numeric(predictdata$MBA)
predictdata$license <- as.numeric(predictdata$license)
names(Cars2)
## [1] "Age" "Gender" "Engineer" "MBA" "Work.Exp" "Salary"
## [7] "Distance" "license" "Transport"
names(predictdata)
## [1] "Age" "Gender" "Engineer" "MBA" "Work.Exp" "Salary"
## [7] "Distance" "license"
#Using KNN
knn.test = knn(train= Cars2[,-9], test= predictdata[,-9], cl=Cars2[,9], k=3)
predictdata$Transport.KNN = knn.test
predictdata
##KNN predicts Public Transport
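#KNN distances above are dominated by the widest-range features (Salary, Age).
#A hedged variant that standardizes both frames on the training means/SDs:
ctr<-colMeans(Cars2[,-9]); csd<-apply(Cars2[,-9], 2, sd)
knn.scaled<-knn(train=scale(Cars2[,-9], center=ctr, scale=csd),
                test=scale(predictdata[,1:8], center=ctr, scale=csd),
                cl=Cars2[,9], k=3)
knn.scaled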
names(predictdata)
## [1] "Age" "Gender" "Engineer" "MBA"
## [5] "Work.Exp" "Salary" "Distance" "license"
## [9] "Transport.KNN"
#Let us use LDA
lda.predict = lda(Cars2$Transport~., data=Cars2[,-9])
a = predict(lda.predict, newdata = predictdata[,-9])
predictdata$Transport.LDA = a$class
predictdata
#Even LDA predicts Public Transport
# Let us use SVM
predict.svm = svm(Cars2$Transport~., data=Cars2[,-9], kernel= "polynomial")
p.svm = predict(predict.svm, newdata = predictdata[,-c(9,10)])
predictdata$Transport.SVM = p.svm
predictdata[,c(9,10,11)]
#All three methods predict Public Transport
####################################
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.