This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
#Load the libraries used below
rm(list=ls())
library(car)
library(caret)
## Warning: package 'caret' was built under R version 3.4.3
## Loading required package: lattice
## Loading required package: ggplot2
library(class)
library(devtools)
library(e1071)
library(ggplot2)
library(klaR)
## Warning: package 'klaR' was built under R version 3.4.3
## Loading required package: MASS
library(MASS)
library(nnet)
library(plyr)
library(pROC)
## Warning: package 'pROC' was built under R version 3.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:car':
##
## logit
library(scatterplot3d)
library(SDMTools)
## Warning: package 'SDMTools' was built under R version 3.4.3
##
## Attaching package: 'SDMTools'
## The following object is masked from 'package:pROC':
##
## auc
## The following objects are masked from 'package:caret':
##
## sensitivity, specificity
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ElemStatLearn)
## Warning: package 'ElemStatLearn' was built under R version 3.4.3
##
## Attaching package: 'ElemStatLearn'
## The following object is masked from 'package:plyr':
##
## ozone
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
##Set working directory
setwd("D:/GREAT LAKES/MACHINE LEARNING")
getwd()
## [1] "D:/GREAT LAKES/MACHINE LEARNING"
Cars<-read.csv("Actual_Cars_data.csv", header=T)
sapply(Cars, function(y) sum(length(which(is.na(y)))))
## Age Gender Engineer MBA Work.Exp Salary Distance
## 0 0 0 1 0 0 0
## license Transport
## 0 0
Cars[is.na(Cars$MBA),4]<-0
sapply(Cars, function(y) sum(length(which(is.na(y)))))
## Age Gender Engineer MBA Work.Exp Salary Distance
## 0 0 0 0 0 0 0
## license Transport
## 0 0
View(Cars)
Cars<-na.omit(Cars)   #assign the result back; a bare na.omit(Cars) changes nothing
summary(Cars)
## Age Gender Engineer MBA
## Min. :18.00 Female:128 Min. :0.0000 Min. :0.0000
## 1st Qu.:25.00 Male :316 1st Qu.:1.0000 1st Qu.:0.0000
## Median :27.00 Median :1.0000 Median :0.0000
## Mean :27.75 Mean :0.7545 Mean :0.2523
## 3rd Qu.:30.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :43.00 Max. :1.0000 Max. :1.0000
## Work.Exp Salary Distance license
## Min. : 0.0 Min. : 6.50 Min. : 3.20 Min. :0.0000
## 1st Qu.: 3.0 1st Qu.: 9.80 1st Qu.: 8.80 1st Qu.:0.0000
## Median : 5.0 Median :13.60 Median :11.00 Median :0.0000
## Mean : 6.3 Mean :16.24 Mean :11.32 Mean :0.2342
## 3rd Qu.: 8.0 3rd Qu.:15.72 3rd Qu.:13.43 3rd Qu.:0.0000
## Max. :24.0 Max. :57.00 Max. :23.40 Max. :1.0000
## Transport
## 2Wheeler : 83
## Car : 61
## Public Transport:300
##
##
##
str(Cars)
## 'data.frame': 444 obs. of 9 variables:
## $ Age : int 28 23 29 28 27 26 28 26 22 27 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 2 1 2 2 2 1 2 2 ...
## $ Engineer : int 0 1 1 1 1 1 1 1 1 1 ...
## $ MBA : num 0 0 0 1 0 0 0 0 0 0 ...
## $ Work.Exp : int 4 4 7 5 4 4 5 3 1 4 ...
## $ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...
## $ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...
## $ license : int 0 0 0 0 0 1 0 0 0 0 ...
## $ Transport: Factor w/ 3 levels "2Wheeler","Car",..: 3 3 3 3 3 3 1 3 3 3 ...
## Determine Levels
levels(Cars$Gender)
## [1] "Female" "Male"
levels(Cars$Transport)
## [1] "2Wheeler" "Car" "Public Transport"
#Define some dummies
Cars$Female<-ifelse(Cars$Gender=="Female",1,0)
Cars$Male<-ifelse(Cars$Gender=="Male",1,0)
#Partitioning Data Sets
#Partition train and val
#We will use this throughout so that samples are comparable
set.seed(7)
pd<-sample(2,nrow(Cars),replace=TRUE, prob=c(0.7,0.3))
train<-Cars[pd==1,]
val<-Cars[pd==2,]
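#A hedged alternative (using caret, already attached): createDataPartition()
#stratifies on the response, so train and val keep the same Transport mix.
set.seed(7)
idx<-createDataPartition(Cars$Transport, p=0.7, list=FALSE)
train.strat<-Cars[idx,]
val.strat<-Cars[-idx,]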
#K FOLD VALIDATIONS
names(Cars)
## [1] "Age" "Gender" "Engineer" "MBA" "Work.Exp"
## [6] "Salary" "Distance" "license" "Transport" "Female"
## [11] "Male"
Carstrim<-Cars[,-2]
Carstrim<-na.omit(Carstrim)   #assign back so the NA drop takes effect
set.seed(7)
pd<-sample(2,nrow(Carstrim),replace=TRUE, prob=c(0.7,0.3))
traintrim<-Carstrim[pd==1,]
valtrim<-Carstrim[pd==2,]
set.seed(7)
folds<-createFolds(Cars$Transport,k=10)
str(folds)
## List of 10
## $ Fold01: int [1:46] 29 33 45 56 69 90 104 109 119 120 ...
## $ Fold02: int [1:44] 30 78 92 94 99 100 103 121 130 135 ...
## $ Fold03: int [1:44] 14 15 47 52 59 71 75 87 88 95 ...
## $ Fold04: int [1:45] 9 16 36 42 49 50 54 57 64 72 ...
## $ Fold05: int [1:44] 8 11 13 23 24 26 35 37 41 68 ...
## $ Fold06: int [1:44] 1 5 7 18 32 34 44 51 63 65 ...
## $ Fold07: int [1:44] 38 58 84 85 86 113 127 136 141 161 ...
## $ Fold08: int [1:44] 3 12 20 21 27 28 31 48 60 61 ...
## $ Fold09: int [1:44] 4 10 19 25 40 46 53 67 77 80 ...
## $ Fold10: int [1:45] 2 6 17 22 39 43 55 70 73 97 ...
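#Note: these fold indices run over all 444 rows of Cars, but the loops below
#apply them to the 299-row traintrim (as the training set) and the 145-row
#valtrim (as everything outside the fold), so each model is fit on roughly 45
#rows and out-of-range indices yield NA rows. A hedged sketch of the
#conventional pattern on the unpartitioned Carstrim: train on the complement
#of the held-out fold, then score on the fold itself.
cv_sketch<-lapply(folds,function(x){
  train.cv<-Carstrim[-x,]   #everything except the held-out fold
  test.cv<-Carstrim[x,]     #the held-out fold
  fit<-lda(Transport~., data=train.cv)
  mean(predict(fit, newdata=test.cv)$class==test.cv$Transport)
})
mean(unlist(cv_sketch))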
######
Eq.2 <- Transport ~ .
#10 Fold validation with LPM
cv_LPM<-lapply(folds,function(x){
train<-traintrim[x,]
test<-valtrim[-x,]
LPM.1<-lm(Eq.2, train)
LPM1.pred<-predict(LPM.1, test)
tab.LPM<-table(test$Transport, LPM1.pred>0.5)
sum(diag(tab.LPM))/sum(tab.LPM)
})
## Warning in model.response(mf, "numeric"): using type = "numeric" with a
## factor response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
## Warning in predict.lm(LPM.1, test): prediction from a rank-deficient fit
## may be misleading
## (the three warnings above repeat once per fold, 10 times in all)
str(cv_LPM)
## List of 10
## $ Fold01: num 0.145
## $ Fold02: num 0.133
## $ Fold03: num 0.145
## $ Fold04: num 0.116
## $ Fold05: num 0.145
## $ Fold06: num 0.142
## $ Fold07: num 0.147
## $ Fold08: num 0.12
## $ Fold09: num 0.159
## $ Fold10: num 0.148
fit.LPM<-mean(unlist(cv_LPM))
fit.LPM
## [1] 0.1401046
##########
## The LPM "accuracy" is 14.01%, but as the warnings show, lm() ignores a
## 3-level factor response, so this figure should not be trusted.
#########
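#With a 3-class response, a multinomial logit is the natural linear baseline.
#A hedged sketch using nnet::multinom (attached above) under the corrected
#fold pattern sketched earlier:
cv_MNL<-lapply(folds,function(x){
  train.cv<-Carstrim[-x,]
  test.cv<-Carstrim[x,]
  MNL<-multinom(Transport~., data=train.cv, trace=FALSE)
  mean(predict(MNL, newdata=test.cv)==test.cv$Transport)
})
mean(unlist(cv_MNL))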
#10 Fold Validation with NB
cv_NB<-lapply(folds,function(x){
train.NB.kval<-traintrim[x,]
test.NB.kval<-valtrim[-x,]
NB.kval<-naiveBayes(x=train.NB.kval[-1], y=train.NB.kval$Transport)
y_pred.NB.kval<-predict( NB.kval,newdata=test.NB.kval[-1])
cm.NB.kval=table(test.NB.kval[,1],y_pred.NB.kval)
sum(diag(cm.NB.kval))/sum(cm.NB.kval)
})
str(cv_NB)
## List of 10
## $ Fold01: num 0.0229
## $ Fold02: num 0.00741
## $ Fold03: num 0.0305
## $ Fold04: num 0.0155
## $ Fold05: num 0.0382
## $ Fold06: num 0.0157
## $ Fold07: num 0.0368
## $ Fold08: num 0.032
## $ Fold09: num 0.0152
## $ Fold10: num 0.0156
fit.NB<-mean(unlist(cv_NB))
fit.NB
## [1] 0.02298036
#####################
## The reported NB accuracy is 2.3%, which is spurious: the confusion matrix
## tabulates column 1 of the test frame (Age) against the predictions instead
## of Transport, and Transport itself is left in the predictor set.
#####################
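#A hedged corrected sketch: fitting NB via the formula interface keeps
#Transport out of the predictor set, and predictions are scored against
#Transport rather than Age.
cv_NB2<-lapply(folds,function(x){
  train.cv<-Carstrim[-x,]
  test.cv<-Carstrim[x,]
  NB2<-naiveBayes(Transport~., data=train.cv)
  mean(predict(NB2, newdata=test.cv)==test.cv$Transport)
})
mean(unlist(cv_NB2))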
#10 Fold with LDA
library(MASS)
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.4.3
cv_LDA<-lapply(folds,function(x){
train<-traintrim[x,]
test<-valtrim[-x,]
lda_1<-lda(Eq.2 , train)
lda1.pred<-predict(lda_1, newdata=test)
ldapredclass<-lda1.pred$class
tab.LDA<-table(ldapredclass,test$Transport)
sum(diag(tab.LDA))/sum(tab.LDA)
})
## Warning in lda.default(x, grouping, ...): variables are collinear
## (repeated once per fold, 10 times in all)
str(cv_LDA)
## List of 10
## $ Fold01: num 0.626
## $ Fold02: num 0.733
## $ Fold03: num 0.779
## $ Fold04: num 0.775
## $ Fold05: num 0.763
## $ Fold06: num 0.78
## $ Fold07: num 0.721
## $ Fold08: num 0.768
## $ Fold09: num 0.742
## $ Fold10: num 0.695
fit.LDA<-mean(unlist(cv_LDA))
fit.LDA
## [1] 0.7382319
#########
##Accuracy of LDA is 73.82%
#########
#10 Fold on Decision Trees
cv_DT<-lapply(folds,function(x){
train<-traintrim[x,]
test<-valtrim[-x,]
DT<-rpart(Eq.2, method="class",train)
pred = predict(DT, type="class",newdata=test)
tabDT<-table( pred,test$Transport)
sum(diag(tabDT))/sum(tabDT)
})
str(cv_DT)
## List of 10
## $ Fold01: num 0.718
## $ Fold02: num 0.778
## $ Fold03: num 0.702
## $ Fold04: num 0.543
## $ Fold05: num 0.756
## $ Fold06: num 0.606
## $ Fold07: num 0.647
## $ Fold08: num 0.584
## $ Fold09: num 0.523
## $ Fold10: num 0.734
fit.DT<-mean(unlist(cv_DT))
fit.DT
## [1] 0.6590446
########
##Accuracy of Decision trees is 65.90%
########
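#A hedged refinement: prune at the cp value that minimizes the
#cross-validated error in rpart's complexity table, then replot.
DT.full<-rpart(Eq.2, method="class", data=traintrim)
best.cp<-DT.full$cptable[which.min(DT.full$cptable[,"xerror"]),"CP"]
DT.pruned<-prune(DT.full, cp=best.cp)
rpart.plot(DT.pruned)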
#####SMOTE
library(DMwR)
## Warning: package 'DMwR' was built under R version 3.4.3
## Loading required package: grid
##
## Attaching package: 'DMwR'
## The following object is masked from 'package:plyr':
##
## join
train_SMOTE<-traintrim[,-2]
qplot(Distance,Salary,color=Transport, data=traintrim)
table(train_SMOTE$Transport)
##
## 2Wheeler Car Public Transport
## 63 39 197
#SMOTE
#Copy the response into a factor column named 'target' for the SMOTE call
train_SMOTE$target <- as.factor(traintrim$Transport)
table(train_SMOTE$target)
##
## 2Wheeler Car Public Transport
## 63 39 197
trainSplit <- SMOTE(target ~ ., train_SMOTE, perc.over = 200, perc.under=300)
print(prop.table(table(trainSplit$target)))
##
## 2Wheeler Car Public Transport
## 0.1509972 0.3333333 0.5156695
table(trainSplit$target)
##
## 2Wheeler Car Public Transport
## 53 117 181
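#How these counts arise (DMwR's SMOTE arithmetic, as a check): perc.over=200
#adds 2 synthetic cases per original minority ("Car") case, 39*2=78, hence
#39+78=117 Cars; perc.under=300 then samples 3 cases from the other classes
#per synthetic case, 78*3=234, split here as 53 2Wheeler + 181 Public Transport.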
qplot(Distance,Salary,color=Transport, data=trainSplit)
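#The block below re-applies SMOTE twice and then binarizes target
#(1 = 2Wheeler, 0 = otherwise); its result is only inspected with
#prop.table() and is not used afterwards.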
train_SMOTE_new<-trainSplit
train_SMOTE_new <- SMOTE(Transport ~ ., train_SMOTE_new, perc.over = 100, perc.under=100)
train_SMOTE_new$target <- as.factor(train_SMOTE_new$Transport)
trainSplit <- SMOTE(target ~ ., train_SMOTE_new, perc.over = 100, perc.under=100)
trainSplit$target <- as.numeric(trainSplit$target)
trainSplit$target<-ifelse(trainSplit$target==1,1,0)
print(prop.table(table(trainSplit$target)))
##
## 0 1
## 0.7708333 0.2291667
write.csv(train_SMOTE, "SMOTE.csv", row.names = FALSE)
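#Note: this writes train_SMOTE, the pre-SMOTE training partition, not the
#resampled trainSplit, so the class counts below are still the original
#63/39/197. Write trainSplit instead to carry the balanced data forward.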
#####################
Cars1<-read.csv("SMOTE.csv", header=T)
dim(Cars1)
## [1] 299 10
names(Cars1)
## [1] "Age" "MBA" "Work.Exp" "Salary" "Distance"
## [6] "license" "Transport" "Female" "Male" "target"
Cars1<-Cars1[,-7]
Cars1<-na.omit(Cars1)   #assign back so the NA drop takes effect
summary(Cars1)
## Age MBA Work.Exp Salary
## Min. :18.00 Min. :0.0000 Min. : 0.000 Min. : 6.50
## 1st Qu.:25.00 1st Qu.:0.0000 1st Qu.: 3.000 1st Qu.: 9.55
## Median :27.00 Median :0.0000 Median : 5.000 Median :12.90
## Mean :27.49 Mean :0.2408 Mean : 5.967 Mean :15.61
## 3rd Qu.:30.00 3rd Qu.:0.0000 3rd Qu.: 8.000 3rd Qu.:15.60
## Max. :43.00 Max. :1.0000 Max. :24.000 Max. :57.00
## Distance license Female Male
## Min. : 3.30 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 8.60 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :11.30 Median :0.000 Median :0.0000 Median :1.0000
## Mean :11.44 Mean :0.214 Mean :0.3144 Mean :0.6856
## 3rd Qu.:13.60 3rd Qu.:0.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :23.40 Max. :1.000 Max. :1.0000 Max. :1.0000
## target
## 2Wheeler : 63
## Car : 39
## Public Transport:197
##
##
##
str(Cars1)
## 'data.frame': 299 obs. of 9 variables:
## $ Age : int 23 29 28 27 28 22 27 25 27 27 ...
## $ MBA : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Work.Exp: int 4 7 5 4 5 1 4 4 4 4 ...
## $ Salary : num 8.3 13.4 13.4 13.4 14.4 7.5 13.5 11.5 13.5 13.4 ...
## $ Distance: num 3.3 4.1 4.5 4.6 5.1 5.1 5.2 5.2 5.3 5.5 ...
## $ license : int 0 0 0 0 0 0 0 0 1 1 ...
## $ Female : int 1 0 1 0 0 0 0 1 0 0 ...
## $ Male : int 0 1 0 1 1 1 1 0 1 1 ...
## $ target : Factor w/ 3 levels "2Wheeler","Car",..: 3 3 3 3 1 3 3 3 3 3 ...
attach(Cars1)
set.seed(777)
pd<-sample(2,nrow(Cars1),replace=TRUE, prob=c(0.7,0.3))
train1<-Cars1[pd==1,]
val1<-Cars1[pd==2,]
attach(train1)
## The following objects are masked from Cars1:
##
## Age, Distance, Female, license, Male, MBA, Salary, target,
## Work.Exp
attach(val1)
## The following objects are masked from train1:
##
## Age, Distance, Female, license, Male, MBA, Salary, target,
## Work.Exp
## The following objects are masked from Cars1:
##
## Age, Distance, Female, license, Male, MBA, Salary, target,
## Work.Exp
#10 Fold with LDA
library(MASS)
library(ISLR)
Eq.3 <- target~.
cv_LDA<-lapply(folds,function(x){
train<-train1[x,]
test<-val1[-x,]
lda_1<-lda(Eq.3 , train1)
lda1.pred<-predict(lda_1, newdata=test)
ldapredclass<-lda1.pred$class
tab.LDA<-table(ldapredclass,test$target)
sum(diag(tab.LDA))/sum(tab.LDA)
})
## Warning in lda.default(x, grouping, ...): variables are collinear
## (repeated once per fold, 10 times in all)
str(cv_LDA)
## List of 10
## $ Fold01: num 0.84
## $ Fold02: num 0.833
## $ Fold03: num 0.835
## $ Fold04: num 0.842
## $ Fold05: num 0.827
## $ Fold06: num 0.808
## $ Fold07: num 0.84
## $ Fold08: num 0.847
## $ Fold09: num 0.855
## $ Fold10: num 0.844
fit.LDA<-mean(unlist(cv_LDA))
fit.LDA
## [1] 0.8371421
#########
## LDA accuracy here is 83.71%, with two caveats: the data is not actually
## SMOTE-balanced (see the write.csv note above), and inside the loop lda()
## is fit on all of train1 (the fold subset `train` is never used), while the
## folds still index the original 444-row Cars frame.
#########
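#A hedged corrected sketch: rebuild the folds on the 299-row Cars1 frame and
#fit on the complement of each held-out fold (the same caveat applies to the
#DT and NB loops below).
set.seed(7)
folds1<-createFolds(Cars1$target, k=10)
cv_LDA2<-lapply(folds1,function(x){
  fit<-lda(target~., data=Cars1[-x,])
  mean(predict(fit, newdata=Cars1[x,])$class==Cars1$target[x])
})
mean(unlist(cv_LDA2))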
#10 Fold on Decision Trees post SMOTE
cv_DT<-lapply(folds,function(x){
train<-train1[x,]
test<-val1[-x,]
DT<-rpart(Eq.3, method="class",train)
pred = predict(DT, type="class",newdata=test)
tabDT<-table( pred,test$target)
sum(diag(tabDT))/sum(tabDT)
})
str(cv_DT)
## List of 10
## $ Fold01: num 0.593
## $ Fold02: num 0.631
## $ Fold03: num 0.633
## $ Fold04: num 0.526
## $ Fold05: num 0.507
## $ Fold06: num 0.616
## $ Fold07: num 0.654
## $ Fold08: num 0.639
## $ Fold09: num 0.763
## $ Fold10: num 0.636
fit.DT<-mean(unlist(cv_DT))
fit.DT
## [1] 0.6198609
########
##Accuracy of Decision trees is 61.98%
########
#10 Fold Validation with NB
cv_NB<-lapply(folds,function(x){
train.NB.kval<-train1[x,]
test.NB.kval<-val1[-x,]
NB.kval<-naiveBayes(x=train.NB.kval[-1], y=train.NB.kval$target)
y_pred.NB.kval<-predict( NB.kval,newdata=test.NB.kval[-1])
cm.NB.kval=table(test.NB.kval[,1],y_pred.NB.kval)
sum(diag(cm.NB.kval))/sum(cm.NB.kval)
})
str(cv_NB)
## List of 10
## $ Fold01: num 0.0247
## $ Fold02: num 0.0357
## $ Fold03: num 0.0253
## $ Fold04: num 0.0263
## $ Fold05: num 0.0133
## $ Fold06: num 0.0411
## $ Fold07: num 0.0123
## $ Fold08: num 0.111
## $ Fold09: num 0.0132
## $ Fold10: num 0.026
fit.NB<-mean(unlist(cv_NB))
fit.NB
## [1] 0.03290558
#####################
## The reported NB accuracy is 3.3%, again spurious for the same reason: the
## confusion matrix tabulates Age (column 1) against the predictions rather
## than target.
#####################
###SVM
library(e1071)   #provides svm(); already attached above
svm.2<-svm(target~., data=train1, kernel="linear")
summary(svm.2)
##
## Call:
## svm(formula = target ~ ., data = train1, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
## gamma: 0.125
##
## Number of Support Vectors: 109
##
## ( 52 46 11 )
##
##
## Number of Classes: 3
##
## Levels:
## 2Wheeler Car Public Transport
##Radial-kernel SVM on the full feature set
svm.full<-svm(target~., data=train1, kernel="radial")
summary(svm.full)
##
## Call:
## svm(formula = target ~ ., data = train1, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.125
##
## Number of Support Vectors: 126
##
## ( 63 45 18 )
##
##
## Number of Classes: 3
##
## Levels:
## 2Wheeler Car Public Transport
y_pred.svm.full<-predict(svm.full,newdata=val1[,-9])
y_pred.svm.full
## 4 9 11 15
## Public Transport Public Transport Public Transport Public Transport
## 18 26 29 30
## Public Transport Public Transport Public Transport Public Transport
## 31 32 33 37
## Public Transport Public Transport Public Transport Public Transport
## 46 47 48 49
## Public Transport Public Transport Public Transport Public Transport
## 58 61 63 65
## Public Transport Public Transport Public Transport Public Transport
## 69 72 73 76
## Public Transport Public Transport Public Transport Public Transport
## 89 90 91 94
## Public Transport Public Transport Public Transport Public Transport
## 96 99 100 101
## Public Transport Public Transport Public Transport Public Transport
## 105 106 112 118
## Public Transport Public Transport Public Transport Car
## 119 120 126 129
## Public Transport Public Transport Public Transport Public Transport
## 132 141 142 151
## Public Transport Public Transport Car Car
## 154 161 164 165
## 2Wheeler Public Transport Public Transport Public Transport
## 174 176 177 185
## Public Transport Public Transport Public Transport Public Transport
## 188 189 190 192
## Public Transport Public Transport Public Transport Public Transport
## 195 201 209 210
## Public Transport Public Transport Car Public Transport
## 211 214 221 223
## Public Transport 2Wheeler Car Public Transport
## 224 230 234 244
## Public Transport Public Transport Public Transport Public Transport
## 246 250 251 255
## Public Transport Public Transport 2Wheeler Public Transport
## 257 259 265 267
## 2Wheeler Public Transport Public Transport Car
## 270 273 275 276
## Public Transport Public Transport 2Wheeler Car
## 277 286 287 290
## 2Wheeler Car 2Wheeler Car
## 291 294
## Car Car
## Levels: 2Wheeler Car Public Transport
#Confusion matrix
head(val1)
head(train1)
cm.SVMB.full=table(val1[,9],y_pred.svm.full)
cm.SVMB.full
## y_pred.svm.full
## 2Wheeler Car Public Transport
## 2Wheeler 6 1 10
## Car 0 10 5
## Public Transport 1 0 53
accuracy.svm.full<-sum(diag(cm.SVMB.full))/sum(cm.SVMB.full)
accuracy.svm.full
## [1] 0.8023256
##
##Accuracy of the radial-kernel SVM is 80.23% (nominally "post SMOTE", but
##see the write.csv note above)
###
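#Overall accuracy hides weak minority-class recall (only 6 of 17 2Wheeler
#cases are recovered above). A hedged per-class summary via caret's
#confusionMatrix():
confusionMatrix(y_pred.svm.full, val1$target)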
#Tune
set.seed(77)
tune.svm<-tune(svm, target~.,data=train1,ranges=list(epsilon=seq(0,1,0.1), cost=2^(2:9)))
summary(tune.svm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## epsilon cost
## 0 64
##
## - best performance: 0.2010823
##
## - Detailed performance results:
## epsilon cost error dispersion
## 1 0.0 4 0.2064935 0.05921759
## 2 0.1 4 0.2064935 0.05921759
## 3 0.2 4 0.2064935 0.05921759
## 4 0.3 4 0.2064935 0.05921759
## 5 0.4 4 0.2064935 0.05921759
## 6 0.5 4 0.2064935 0.05921759
## 7 0.6 4 0.2064935 0.05921759
## 8 0.7 4 0.2064935 0.05921759
## 9 0.8 4 0.2064935 0.05921759
## 10 0.9 4 0.2064935 0.05921759
## 11 1.0 4 0.2064935 0.05921759
## 12 0.0 8 0.2153680 0.08001342
## 13 0.1 8 0.2153680 0.08001342
## 14 0.2 8 0.2153680 0.08001342
## 15 0.3 8 0.2153680 0.08001342
## 16 0.4 8 0.2153680 0.08001342
## 17 0.5 8 0.2153680 0.08001342
## 18 0.6 8 0.2153680 0.08001342
## 19 0.7 8 0.2153680 0.08001342
## 20 0.8 8 0.2153680 0.08001342
## 21 0.9 8 0.2153680 0.08001342
## 22 1.0 8 0.2153680 0.08001342
## 23 0.0 16 0.2058442 0.08064332
## 24 0.1 16 0.2058442 0.08064332
## 25 0.2 16 0.2058442 0.08064332
## 26 0.3 16 0.2058442 0.08064332
## 27 0.4 16 0.2058442 0.08064332
## 28 0.5 16 0.2058442 0.08064332
## 29 0.6 16 0.2058442 0.08064332
## 30 0.7 16 0.2058442 0.08064332
## 31 0.8 16 0.2058442 0.08064332
## 32 0.9 16 0.2058442 0.08064332
## 33 1.0 16 0.2058442 0.08064332
## 34 0.0 32 0.2012987 0.10149338
## 35 0.1 32 0.2012987 0.10149338
## 36 0.2 32 0.2012987 0.10149338
## 37 0.3 32 0.2012987 0.10149338
## 38 0.4 32 0.2012987 0.10149338
## 39 0.5 32 0.2012987 0.10149338
## 40 0.6 32 0.2012987 0.10149338
## 41 0.7 32 0.2012987 0.10149338
## 42 0.8 32 0.2012987 0.10149338
## 43 0.9 32 0.2012987 0.10149338
## 44 1.0 32 0.2012987 0.10149338
## 45 0.0 64 0.2010823 0.09343058
## 46 0.1 64 0.2010823 0.09343058
## 47 0.2 64 0.2010823 0.09343058
## 48 0.3 64 0.2010823 0.09343058
## 49 0.4 64 0.2010823 0.09343058
## 50 0.5 64 0.2010823 0.09343058
## 51 0.6 64 0.2010823 0.09343058
## 52 0.7 64 0.2010823 0.09343058
## 53 0.8 64 0.2010823 0.09343058
## 54 0.9 64 0.2010823 0.09343058
## 55 1.0 64 0.2010823 0.09343058
## 56 0.0 128 0.2194805 0.09422258
## 57 0.1 128 0.2194805 0.09422258
## 58 0.2 128 0.2194805 0.09422258
## 59 0.3 128 0.2194805 0.09422258
## 60 0.4 128 0.2194805 0.09422258
## 61 0.5 128 0.2194805 0.09422258
## 62 0.6 128 0.2194805 0.09422258
## 63 0.7 128 0.2194805 0.09422258
## 64 0.8 128 0.2194805 0.09422258
## 65 0.9 128 0.2194805 0.09422258
## 66 1.0 128 0.2194805 0.09422258
## 67 0.0 256 0.2142857 0.11591712
## 68 0.1 256 0.2142857 0.11591712
## 69 0.2 256 0.2142857 0.11591712
## 70 0.3 256 0.2142857 0.11591712
## 71 0.4 256 0.2142857 0.11591712
## 72 0.5 256 0.2142857 0.11591712
## 73 0.6 256 0.2142857 0.11591712
## 74 0.7 256 0.2142857 0.11591712
## 75 0.8 256 0.2142857 0.11591712
## 76 0.9 256 0.2142857 0.11591712
## 77 1.0 256 0.2142857 0.11591712
## 78 0.0 512 0.2285714 0.10580360
## 79 0.1 512 0.2285714 0.10580360
## 80 0.2 512 0.2285714 0.10580360
## 81 0.3 512 0.2285714 0.10580360
## 82 0.4 512 0.2285714 0.10580360
## 83 0.5 512 0.2285714 0.10580360
## 84 0.6 512 0.2285714 0.10580360
## 85 0.7 512 0.2285714 0.10580360
## 86 0.8 512 0.2285714 0.10580360
## 87 0.9 512 0.2285714 0.10580360
## 88 1.0 512 0.2285714 0.10580360
best.svm<-tune.svm$best.model
summary(best.svm)
##
## Call:
## best.tune(method = svm, train.x = target ~ ., data = train1,
## ranges = list(epsilon = seq(0, 1, 0.1), cost = 2^(2:9)))
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 64
## gamma: 0.125
##
## Number of Support Vectors: 96
##
## ( 47 34 15 )
##
##
## Number of Classes: 3
##
## Levels:
## 2Wheeler Car Public Transport
best.par<-tune.svm$best.parameters
summary(best.par)
## epsilon cost
## Min. :0 Min. :64
## 1st Qu.:0 1st Qu.:64
## Median :0 Median :64
## Mean :0 Mean :64
## 3rd Qu.:0 3rd Qu.:64
## Max. :0 Max. :64
y_pred.svm.best<-predict(best.svm,newdata=val1[-9])
#Confusion matrix
cm.SVMB.best=table(val1[,9],y_pred.svm.best)
cm.SVMB.best
## y_pred.svm.best
## 2Wheeler Car Public Transport
## 2Wheeler 6 2 9
## Car 0 10 5
## Public Transport 1 0 53
accuracy.svm.best<-sum(diag(cm.SVMB.best))/sum(cm.SVMB.best)
accuracy.svm.best
## [1] 0.8023256
#################
## Accuracy after tuning the SVM is unchanged at 80.23%; note that epsilon,
## the parameter swept above, applies only to regression SVMs.
################
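#epsilon is ignored for C-classification, which is why every epsilon row
#within a cost block shows identical error above. A hedged re-tune over the
#parameters that do matter here:
set.seed(77)
tune.svm2<-tune(svm, target~., data=train1,
                ranges=list(cost=2^(2:9), gamma=c(0.05,0.125,0.25,0.5)))
tune.svm2$best.parameters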
#K-nearest neighbour
names(Cars1)
## [1] "Age" "MBA" "Work.Exp" "Salary" "Distance" "license"
## [7] "Female" "Male" "target"
kneig = knn(train = Cars1[,-9],test = Cars1[,-9],cl= Cars1$target)
cm.knn= table(Actual=Cars1$target, Predicted = kneig)
accuracy.knn<-sum(diag(cm.knn))/sum(cm.knn)
accuracy.knn
## [1] 1
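#This 100% figure is not a generalization estimate: knn() classified the very
#rows it was trained on (and with the default k=1, each point is its own
#nearest neighbour). A hedged leave-one-out sketch on standardized features:
kneig.loo<-knn.cv(train=scale(Cars1[,-9]), cl=Cars1$target, k=3)
mean(kneig.loo==Cars1$target)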
#Predicting the 2 test rows using various methods
Cars2=read.csv("Actual_Cars_data.csv", header=T)
Cars2 = na.omit(Cars2)
predictdata = read.csv("cars.test.csv",header = T)
Cars2$Gender <- as.numeric(Cars2$Gender)
predictdata$Gender <- as.numeric(predictdata$Gender)
predictdata$Engineer <- as.numeric(predictdata$Engineer)
predictdata$MBA <- as.numeric(predictdata$MBA)
predictdata$license <- as.numeric(predictdata$license)
names(Cars2)
## [1] "Age" "Gender" "Engineer" "MBA" "Work.Exp" "Salary"
## [7] "Distance" "license" "Transport"
names(predictdata)
## [1] "Age" "Gender" "Engineer" "MBA" "Work.Exp" "Salary"
## [7] "Distance" "license"
#Using KNN
knn.test = knn(train= Cars2[,-9], test= predictdata[,-9], cl=Cars2[,9], k=3)
predictdata$Transport.KNN = knn.test
predictdata
##KNN predicts Public Transport
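#KNN distances above are dominated by the widest-range features (Salary, Age).
#A hedged variant that standardizes both frames on the training means/SDs:
ctr<-colMeans(Cars2[,-9]); csd<-apply(Cars2[,-9], 2, sd)
knn.scaled<-knn(train=scale(Cars2[,-9], center=ctr, scale=csd),
                test=scale(predictdata[,1:8], center=ctr, scale=csd),
                cl=Cars2[,9], k=3)
knn.scaled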
names(predictdata)
## [1] "Age" "Gender" "Engineer" "MBA"
## [5] "Work.Exp" "Salary" "Distance" "license"
## [9] "Transport.KNN"
#Let us use LDA
lda.predict = lda(Cars2$Transport~., data=Cars2[,-9])
a = predict(lda.predict, newdata = predictdata[,-9])
predictdata$Transport.LDA = a$class
predictdata
#Even LDA predicts Public Transport
# Let us use SVM
predict.svm = svm(Cars2$Transport~., data=Cars2[,-9], kernel= "polynomial")
p.svm = predict(predict.svm, newdata = predictdata[,-c(9,10)])
predictdata$Transport.SVM = p.svm
predictdata[,c(9,10,11)]
#All three methods predict Public Transport
####################################
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.