setwd("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/6.Feature Set 5/Directives")
#install.packages("naivebayes")
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.4.3
library(dplyr)
## Warning: Installed Rcpp (0.12.16) different from Rcpp used to build dplyr (0.12.11).
## Please reinstall dplyr to avoid random crashes or undefined behavior.
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(e1071)
library(readxl)
In order to calculate the accuracy, the original (=actual) labels were imported.
# Load the source spreadsheet and keep its Score column; these scores are the
# actual labels that the predictions are compared against later on.
Labels <- read_excel(
  path = "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Naive Bayes/1.Labels/Source Data.xlsx"
)
Label <- Labels[["Score"]]
As a next step, the Activation feature set (F4) and the Directives columns were imported and combined into feature set 5.
# Import feature set 4 and drop its first column.
# NOTE(review): the first column is presumably a row index written out with
# the CSV -- confirm against the file.
Features1 <- read.csv("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/6.Feature Set 5/Directives/Feature Set 4 TP.csv")
Features1 <- Features1[-1]
# Import the Directives data and keep columns 4 to 10.
Features2 <- read.csv("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/6.Feature Set 5/Directives/Directives.csv")
# Keep only as many leading rows as feature set 4 has, so both sources line up
# row by row without a hard-coded row count. Fail loudly if Directives is too
# short instead of silently padding with NA rows.
stopifnot(nrow(Features2) >= nrow(Features1))
Features2 <- Features2[seq_len(nrow(Features1)), 4:10]
# Combine both sources into feature set 5.
Features <- cbind(Features1, Features2)
In order to enable One-vs-all multiclass classification the labels were recoded.
# Recode the raw scores for one-vs-all classification.
# Score bands: 3-4 -> class 2, 5-6 -> class 3, 7-8 -> class 4, 9-10 -> class 5.
# Only the scores that belong to the rows of `Features` are used, so labels and
# features stay aligned without a hard-coded row count.
scores <- Label[seq_len(nrow(Features))]
# A missing score cannot be assigned to a class; stop instead of guessing.
stopifnot(!anyNA(scores))

#Class 2
# 1 = score belongs to the class, 0 = it does not (same coding for all classes).
Label2 <- as.factor(as.numeric(scores %in% c(3, 4)))
#Class 3
Label3 <- as.factor(as.numeric(scores %in% c(5, 6)))
#Class 4
Label4 <- as.factor(as.numeric(scores %in% c(7, 8)))
#Class 5
Label5 <- as.factor(as.numeric(scores %in% c(9, 10)))

#All Labels
# Multiclass label used as the ground truth during evaluation.
# NOTE(review): every score outside 5-10 ends up in class 2 here, including
# scores below 3, which Label2 codes as 0 -- confirm that this is intended.
All <- rep(2, length(scores))
All[scores %in% c(5, 6)] <- 3
All[scores %in% c(7, 8)] <- 4
All[scores %in% c(9, 10)] <- 5
#As Factor
All <- as.factor(All)
Since SVMs operate only on numeric data, the features were converted to numeric.
# Convert every feature column to numeric.
# All columns are converted, whatever their number, instead of relying on a
# hard-coded column count. `Features[] <-` keeps the data frame structure,
# column names and row names.
# NOTE(review): as.numeric() on a factor column returns the level codes, not
# the printed values -- confirm that all columns are read in as numbers.
Features[] <- lapply(Features, as.numeric)
str(Features)
## 'data.frame': 1000 obs. of 445 variables:
## $ amaz_jj : num 0 0 0 0 1 0 0 0 0 0 ...
## $ arriv_jj : num 1 0 0 0 0 0 0 0 0 0 ...
## $ bad_jj : num 0 0 0 1 0 0 0 0 0 0 ...
## $ basic_jj : num 0 0 1 0 0 0 0 0 0 0 ...
## $ beauti_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ befor_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ best_jjs : num 1 0 0 0 0 0 0 0 0 0 ...
## $ big_jj : num 1 0 0 0 0 0 0 0 0 1 ...
## $ build_jj : num 0 0 0 1 0 0 0 0 0 0 ...
## $ central_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ clean_jj : num 0 0 0 1 0 0 1 0 1 0 ...
## $ clear_jj : num 0 0 0 0 1 0 0 0 0 0 ...
## $ close_jj : num 1 0 0 0 0 0 0 0 0 0 ...
## $ cold_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ difficult_jj : num 0 0 1 0 0 0 0 0 0 0 ...
## $ due_jj : num 1 0 0 0 0 0 0 0 0 0 ...
## $ earl_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ easi_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ english_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ enough_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ excel_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ extra_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ first_jj : num 0 1 0 0 0 0 0 0 0 0 ...
## $ free_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fresh_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ friend_jj : num 0 0 0 0 0 0 0 1 0 0 ...
## $ front_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ full_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ general_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ good_jj : num 0 0 1 0 0 1 0 1 0 1 ...
## $ great_jj : num 0 1 0 1 0 1 0 0 0 0 ...
## $ guest_jjs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ high_jj : num 1 0 0 0 0 0 0 1 0 0 ...
## $ hot_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ huge_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ littl_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ locat_jj : num 0 0 0 1 0 0 0 0 0 0 ...
## $ london_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ loud_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ main_jj : num 0 0 0 0 0 1 0 0 0 0 ...
## $ major_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ modern_jj : num 0 0 0 0 0 1 0 0 0 0 ...
## $ much_jj : num 0 0 0 0 0 0 0 0 0 1 ...
## $ new_jj : num 1 1 0 0 0 0 0 0 0 0 ...
## $ next_jj : num 1 0 0 1 0 0 0 0 0 0 ...
## $ nice_jj : num 0 0 1 1 0 0 0 0 0 0 ...
## $ nois_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ noisi_jj : num 0 0 0 1 0 0 0 0 0 0 ...
## $ ok_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ old_jj : num 0 0 0 1 0 0 0 0 0 0 ...
## $ onli_jj : num 1 0 0 0 0 0 0 0 0 0 ...
## $ open_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ overal_jj : num 0 0 0 1 0 0 0 0 0 0 ...
## $ particular_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ perfect_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pillow_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pleasant_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ poor_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ public_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ quiet_jj : num 0 0 0 0 0 0 1 0 0 0 ...
## $ realli_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ recept_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ safe_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ second_jj : num 0 1 0 1 0 0 0 0 0 0 ...
## $ select_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ servic_jj : num 0 0 0 0 0 0 0 0 0 1 ...
## $ short_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ shower_jjr : num 0 0 0 0 0 0 0 0 0 0 ...
## $ sleep_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ small_jj : num 1 0 0 0 0 0 0 0 0 0 ...
## $ spacious_jj : num 0 0 0 0 0 0 1 0 0 0 ...
## $ special_jj : num 1 0 0 0 0 0 0 0 0 0 ...
## $ standard_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ stay_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ steep_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ super_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ sure_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ underground_jj: num 0 0 0 0 0 0 0 0 0 0 ...
## $ upgrad_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ veri_jj : num 0 0 0 0 0 0 0 0 1 0 ...
## $ warm_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ whole_jj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ask_vb : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bed_vbd : num 0 0 0 1 0 0 0 0 0 1 ...
## $ build_vb : num 0 0 0 0 0 0 0 0 0 0 ...
## $ came_vbd : num 1 0 0 1 0 0 0 0 0 0 ...
## $ check_vb : num 1 0 0 0 0 0 0 0 0 0 ...
## $ definit_vb : num 0 0 0 0 0 0 0 0 0 0 ...
## $ done_vbn : num 0 0 0 0 0 0 0 0 0 0 ...
## $ expens_vbz : num 0 0 0 0 0 0 0 0 0 0 ...
## $ gave_vbd : num 0 0 0 0 0 0 0 0 0 0 ...
## $ get_vb : num 0 0 0 0 1 0 0 0 0 0 ...
## $ given_vbn : num 0 0 0 0 0 0 0 0 0 0 ...
## $ go_vb : num 0 0 1 0 0 0 0 0 0 0 ...
## $ go_vbp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ got_vbd : num 1 0 0 0 0 0 0 0 0 0 ...
## $ like_vb : num 0 0 0 0 0 0 0 0 0 0 ...
## $ love_vb : num 0 0 0 0 0 0 0 0 0 0 ...
## $ made_vbd : num 1 0 0 0 0 0 1 0 0 0 ...
## [list output truncated]
# Random train/test split: each row is drawn into group 1 (train) with
# probability 0.8 or group 2 (test) with probability 0.2. The seed makes the
# split reproducible.
set.seed(1234)
ind <- sample(2, nrow(Features), replace = TRUE, prob = c(0.8, 0.2))
is_train <- ind == 1
is_test <- ind == 2

# Feature rows
train <- Features[is_train, ]
test <- Features[is_test, ]

# Binary one-vs-all labels, split with the same masks as the features
train.labels.2 <- Label2[is_train]
test.labels.2 <- Label2[is_test]
train.labels.3 <- Label3[is_train]
test.labels.3 <- Label3[is_test]
train.labels.4 <- Label4[is_train]
test.labels.4 <- Label4[is_test]
train.labels.5 <- Label5[is_train]
test.labels.5 <- Label5[is_test]

# Multiclass labels (ground truth for the evaluation)
train.labels <- All[is_train]
test.labels <- All[is_test]
In the following code chunks the SVM Models were created. Probability was chosen as an output to enable one-vs-all voting.
# One binary SVM per class. Each model is fitted on the training features plus
# that class's 0/1 indicator stored as `Score`, without scaling and with
# probability estimates enabled.
#SVM2
train2 <- train
train2[["Score"]] <- train.labels.2
SVM2 <- svm(formula = Score ~ ., data = train2, scale = FALSE, probability = TRUE)
#SVM3
train3 <- train
train3[["Score"]] <- train.labels.3
SVM3 <- svm(formula = Score ~ ., data = train3, scale = FALSE, probability = TRUE)
#SVM4
train4 <- train
train4[["Score"]] <- train.labels.4
SVM4 <- svm(formula = Score ~ ., data = train4, scale = FALSE, probability = TRUE)
#SVM5
train5 <- train
train5[["Score"]] <- train.labels.5
SVM5 <- svm(formula = Score ~ ., data = train5, scale = FALSE, probability = TRUE)

# Predict the test rows with every model and pull the class probabilities out
# of the "probabilities" attribute of each prediction.
P2 <- predict(SVM2, newdata = test, probability = TRUE)
Prob2 <- attr(P2, which = "probabilities")
P3 <- predict(SVM3, newdata = test, probability = TRUE)
Prob3 <- attr(P3, which = "probabilities")
P4 <- predict(SVM4, newdata = test, probability = TRUE)
Prob4 <- attr(P4, which = "probabilities")
P5 <- predict(SVM5, newdata = test, probability = TRUE)
Prob5 <- attr(P5, which = "probabilities")
Using these probabilities, each observation was assigned the class whose binary model yielded the highest positive-class probability.
# Put the probability matrices of the four binary models side by side
# (two columns per model).
Voting.df <- data.frame(Prob2, Prob3, Prob4, Prob5)
# NOTE(review): these labels are written by hand and assume the column order
# of each probability matrix -- verify them against colnames(Prob2) etc.
names(Voting.df) <- c(
  "Class 2: 1", "Class2: 0",
  "Class 3: 0", "Class3: 1",
  "Class 4: 0", "Class4: 1",
  "Class 5: 0", "Class5: 1"
)
head(Voting.df, n = 6L)
## Class 2: 1 Class2: 0 Class 3: 0 Class3: 1 Class 4: 0 Class4: 1
## 5 0.00896994 0.9910301 0.7423329 0.25766715 0.6214878 0.3785122
## 14 0.01353055 0.9864695 0.9353734 0.06462658 0.6310323 0.3689677
## 16 0.02652086 0.9734791 0.9077051 0.09229490 0.7297924 0.2702076
## 26 0.01703159 0.9829684 0.8426246 0.15737537 0.7121952 0.2878048
## 28 0.02325420 0.9767458 0.8764019 0.12359813 0.6896383 0.3103617
## 29 0.02161586 0.9783841 0.7117325 0.28826753 0.6659524 0.3340476
## Class 5: 0 Class5: 1
## 5 0.7647381 0.2352619
## 14 0.6252877 0.3747123
## 16 0.6394111 0.3605889
## 26 0.6119417 0.3880583
## 28 0.3634612 0.6365388
## 29 0.8595895 0.1404105
# Keep, for every binary model, only the probability of the positive class
# ("1"). The position of that column differs between the models, so it is
# looked up by column name in each probability matrix instead of being
# hard-coded.
prob_list <- list(Prob2, Prob3, Prob4, Prob5)
prob_widths <- vapply(prob_list, ncol, integer(1))
positive_col <- vapply(
  prob_list,
  function(p) match("1", colnames(p)),
  integer(1)
)
# A model without a "1" column was trained without positive examples.
stopifnot(!anyNA(positive_col))
# Offset of each model's first column inside Voting.df plus the position of
# its positive-class column.
SEQ <- cumsum(c(0L, head(prob_widths, -1))) + positive_col
Transformed.Voting.df <- Voting.df[SEQ]
colnames(Transformed.Voting.df) <- c("2","3","4","5")
head(Transformed.Voting.df)
## 2 3 4 5
## 5 0.00896994 0.25766715 0.3785122 0.2352619
## 14 0.01353055 0.06462658 0.3689677 0.3747123
## 16 0.02652086 0.09229490 0.2702076 0.3605889
## 26 0.01703159 0.15737537 0.2878048 0.3880583
## 28 0.02325420 0.12359813 0.3103617 0.6365388
## 29 0.02161586 0.28826753 0.3340476 0.1404105
# Vote: every test row gets the class whose model reports the highest
# positive-class probability (the first maximum wins on ties).
Evaluation <- Transformed.Voting.df
winner <- apply(as.matrix(Transformed.Voting.df), MARGIN = 1, FUN = which.max)
# Translate the winning column into its class label through the column names
# ("2" to "5") rather than through a fixed +1 offset.
Index <- as.numeric(colnames(Transformed.Voting.df)[winner])
Evaluation$Vote <- Index
Evaluation$Actual <- test.labels
head(Evaluation,100)
## 2 3 4 5 Vote Actual
## 5 0.008969940 0.25766715 0.3785122 0.23526189 4 4
## 14 0.013530546 0.06462658 0.3689677 0.37471230 5 5
## 16 0.026520864 0.09229490 0.2702076 0.36058888 5 5
## 26 0.017031593 0.15737537 0.2878048 0.38805827 5 4
## 28 0.023254200 0.12359813 0.3103617 0.63653879 5 4
## 29 0.021615863 0.28826753 0.3340476 0.14041049 4 4
## 39 0.096575877 0.05013732 0.2833911 0.18401079 4 5
## 40 0.007689031 0.19457177 0.4591637 0.22727519 4 3
## 60 0.012512930 0.04390206 0.3086926 0.58635300 5 5
## 61 0.126339022 0.13915024 0.2878105 0.25188831 4 3
## 72 0.014788173 0.05896982 0.2999720 0.75616831 5 4
## 81 0.021862866 0.15089503 0.2982282 0.39753074 5 3
## 86 0.028595778 0.10746275 0.3058422 0.60426347 5 5
## 90 0.040551205 0.13015573 0.3136177 0.19506194 4 4
## 92 0.094202743 0.10633937 0.3304228 0.14202402 4 4
## 113 0.039992356 0.16929183 0.3859987 0.11139709 4 5
## 116 0.015670583 0.10850525 0.3819989 0.37531346 4 4
## 117 0.020363441 0.10665776 0.3071756 0.43436640 5 5
## 122 0.069730005 0.12068627 0.3635117 0.20094548 4 4
## 123 0.017897163 0.08111000 0.3217716 0.36458897 5 2
## 124 0.054143301 0.10938830 0.3002118 0.17456286 4 4
## 131 0.006737198 0.13140358 0.2792546 0.61750868 5 4
## 135 0.308682680 0.13501443 0.4503738 0.07476070 4 3
## 137 0.013982300 0.03272387 0.3349908 0.57310717 5 5
## 140 0.033877337 0.13135765 0.3583033 0.37797247 5 4
## 142 0.038843759 0.05524177 0.3581089 0.60511917 5 5
## 149 0.013678166 0.11188070 0.3338507 0.62510426 5 4
## 154 0.022369831 0.11168413 0.2506650 0.47387389 5 5
## 156 0.054940618 0.09258194 0.3623109 0.20449581 4 3
## 158 0.149546626 0.14529973 0.4186452 0.02928768 4 3
## 169 0.019307022 0.06561817 0.2378568 0.67497017 5 5
## 185 0.008794933 0.08953548 0.2103407 0.74249724 5 5
## 187 0.006827037 0.10104775 0.3711644 0.39075180 5 5
## 192 0.049320996 0.16501424 0.4115207 0.06198452 4 3
## 194 0.023625246 0.10049116 0.3781683 0.28211733 4 4
## 195 0.028088261 0.14866297 0.2768519 0.40898362 5 4
## 196 0.050729315 0.37240281 0.3519989 0.05204611 3 5
## 197 0.203474206 0.27190454 0.2423694 0.08039968 3 3
## 199 0.014174682 0.09014140 0.2428369 0.55101347 5 5
## 210 0.141482591 0.17059555 0.3827134 0.06330054 4 3
## 216 0.013680373 0.06915786 0.2072190 0.78448665 5 5
## 220 0.006311984 0.21372523 0.2982777 0.18560628 4 4
## 227 0.187854835 0.01623465 0.3065295 0.30102144 4 5
## 234 0.029963705 0.11555460 0.3341516 0.32185746 4 3
## 240 0.020080290 0.14121658 0.3812388 0.22840211 4 5
## 245 0.076661422 0.08850394 0.3493080 0.51420035 5 4
## 249 0.018325436 0.14384272 0.2807481 0.49481189 5 5
## 261 0.020476400 0.11559107 0.3336434 0.42577310 5 3
## 277 0.012623178 0.07568487 0.2730304 0.85191626 5 5
## 283 0.021937118 0.09811821 0.3203753 0.44238230 5 5
## 290 0.014755965 0.09291608 0.2514336 0.77747811 5 4
## 293 0.012859616 0.08771263 0.3511743 0.26873683 4 5
## 302 0.011462215 0.13005329 0.2775993 0.57568135 5 4
## 305 0.029537728 0.08279859 0.2945444 0.65025263 5 4
## 308 0.020438268 0.12386981 0.2906515 0.55999210 5 4
## 311 0.012036448 0.08860496 0.2460488 0.67430384 5 5
## 320 0.021823162 0.10402206 0.2931855 0.63792259 5 2
## 322 0.032101994 0.07742145 0.2827552 0.76202938 5 5
## 330 0.011845651 0.06924417 0.2216778 0.90564218 5 4
## 332 0.049964174 0.12298955 0.4028285 0.25501301 4 4
## 333 0.034719361 0.09494105 0.3049548 0.62760417 5 5
## 339 0.016440554 0.09748882 0.3066599 0.46986190 5 5
## 341 0.027757502 0.09790572 0.4336191 0.21592424 4 4
## 344 0.053157011 0.06005152 0.2966739 0.75748894 5 5
## 349 0.013221066 0.11065266 0.2340092 0.68363635 5 5
## 355 0.026460767 0.10640358 0.2684180 0.77473965 5 5
## 356 0.029986808 0.10297335 0.3067079 0.45703286 5 3
## 365 0.016354512 0.15699100 0.2787720 0.41748305 5 3
## 366 0.018464978 0.11451527 0.2830460 0.51474303 5 4
## 369 0.009933948 0.12972724 0.3152632 0.41016706 5 4
## 371 0.013238878 0.09216215 0.2622614 0.70263713 5 5
## 373 0.013228099 0.07061495 0.3370108 0.58208550 5 5
## 389 0.039060306 0.09721878 0.2786441 0.67607125 5 2
## 390 0.024810994 0.13298704 0.2735313 0.64926745 5 4
## 396 0.033795083 0.08364046 0.4143470 0.25377028 4 4
## 412 0.004667192 0.09036212 0.3548040 0.45929294 5 5
## 413 0.027136827 0.11202732 0.3877304 0.33576659 4 3
## 415 0.016298063 0.09123903 0.3310444 0.54020044 5 4
## 422 0.063392990 0.10276902 0.3134085 0.53672039 5 5
## 425 0.011471670 0.07125098 0.2521237 0.88887895 5 5
## 434 0.018015926 0.07527461 0.3235391 0.47864164 5 5
## 438 0.010261977 0.10144728 0.3059179 0.68083852 5 4
## 441 0.156568596 0.17492483 0.2846521 0.22494651 4 5
## 442 0.026619335 0.10842265 0.2601110 0.59261890 5 5
## 445 0.015961954 0.14469511 0.3365806 0.50653330 5 5
## 447 0.043473023 0.14771770 0.3475054 0.08609948 4 3
## 453 0.022206502 0.15702560 0.3382819 0.44670561 5 4
## 454 0.141755202 0.05088024 0.2615173 0.16012225 4 5
## 462 0.006832734 0.07020427 0.2357424 0.78037325 5 5
## 474 0.013686654 0.09550583 0.2731114 0.62878661 5 3
## 476 0.028117968 0.19662585 0.2530625 0.41506226 5 3
## 493 0.022427165 0.11057226 0.2786160 0.47448741 5 5
## 502 0.014625404 0.16011324 0.3337716 0.31529361 4 4
## 503 0.040902886 0.03252506 0.3463613 0.38617576 5 5
## 506 0.012588605 0.13183076 0.3153006 0.40888169 5 5
## 508 0.016284396 0.05958420 0.3673441 0.60601347 5 5
## 512 0.024612295 0.14606130 0.2298947 0.72124827 5 5
## 513 0.022537913 0.10842390 0.2496111 0.52024332 5 5
## 521 0.228837513 0.04836192 0.3097010 0.64791543 5 2
## 524 0.010900264 0.14587327 0.3233630 0.35958624 5 5
Finally, the confusion matrix was created and accuracy, precision and recall were computed.
# Confusion matrix: rows = actual class, columns = predicted class.
# Both sides share one set of levels, so the table stays square and diag()
# lines up even when a class is never predicted (or never occurs).
class_levels <- sort(unique(c(
  as.character(Evaluation$Actual),
  as.character(Evaluation$Vote)
)))
CM <- table(
  factor(Evaluation$Actual, levels = class_levels),
  factor(Evaluation$Vote, levels = class_levels)
)
CM
##
## 2 3 4 5
## 2 0 0 1 6
## 3 0 1 14 10
## 4 1 0 22 39
## 5 0 2 12 100
#Proportions
Overall <- length(Evaluation$Actual)
Length2 <- length(which(Evaluation$Actual==2))
Length3 <- length(which(Evaluation$Actual==3))
Length4 <- length(which(Evaluation$Actual==4))
Length5 <- length(which(Evaluation$Actual==5))
# Number of actual observations per class (row totals) and number of
# predictions per class (column totals).
support <- rowSums(CM)
predicted <- colSums(CM)
#Accuracy
Accuracy <- sum(diag(CM))/sum(CM)
#Precision
# Correct predictions of a class divided by all predictions of that class
# (column totals). A class that is never predicted gets precision 0 instead
# of NaN. The per-class values are averaged weighted by class support.
class_precision <- ifelse(predicted > 0, diag(CM) / predicted, 0)
Precision <- sum(class_precision * support) / Overall
#Recall
# Correct predictions of a class divided by all actual members of that class
# (row totals). The support-weighted average of recall equals the accuracy.
class_recall <- ifelse(support > 0, diag(CM) / support, 0)
Recall <- sum(class_recall * support) / Overall
Accuracy
## [1] 0.5913462
Precision
## [1] 0.5274926
Recall
## [1] 0.5913462