PREPARATION

setwd("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/2.Feature Set 1/TFIDF/30")
#install.packages("naivebayes")
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.4.3
library(dplyr)
## Warning: Installed Rcpp (0.12.16) different from Rcpp used to build dplyr (0.12.11).
## Please reinstall dplyr to avoid random crashes or undefined behavior.
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(e1071)
library(readxl)

Import actual labels.

#Import Labels
# Read the labelled source data from the Excel workbook.
Labels <- read_excel(
  path = "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Naive Bayes/1.Labels/Source Data.xlsx"
)

# Keep only the score column; it is the target that gets recoded further down.
Label <- Labels[["Score"]]

Import the TFIDF feature set with a 30th percentile cut-off.

#Import Features
# Read the TFIDF feature matrix (one row per document, one column per term).
Features <- read.csv(
  file = "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/2.Feature Set 1/TFIDF/30/Feature Set 1 30th TFIDF.csv"
)

# Discard the first column.
# NOTE(review): presumably a row-index column written by the CSV export - confirm.
Features <- Features[, -1, drop = FALSE]

RECODE LABELS FOR ONE-VS-ALL

# Recode the 10-point scores into four binary one-vs-all targets (Label2 ...
# Label5) and one combined multi-class target (All).
# The recoding is vectorised over the whole of `Label` instead of looping over
# a hard-coded 1:1000, so it follows the actual number of rows in the data.

# The original loop stopped with an error on a missing score. Keep failing
# loudly rather than silently filing an NA under the negative class.
stopifnot(!anyNA(Label))

# Returns a factor that is 1 where the score is one of `positive`, 0 otherwise.
one_vs_all <- function(scores, positive) {
  as.factor(as.numeric(scores %in% positive))
}

#Class 2
Label2 <- one_vs_all(Label, c(3, 4))
#Class 3
Label3 <- one_vs_all(Label, c(5, 6))
#Class 4
Label4 <- one_vs_all(Label, c(7, 8))
#Class 5
Label5 <- one_vs_all(Label, c(9, 10))

#All Labels
# Class 2 is the fall-through class: every score that is not 5-10 ends up here.
# NOTE(review): Label2 is positive only for scores 3 and 4, so a score of 1 or 2
# would be class 2 in `All` but negative in Label2 - confirm no such scores
# occur in the data.
All <- rep(2, length(Label))
All[Label %in% c(5, 6)] <- 3
All[Label %in% c(7, 8)] <- 4
All[Label %in% c(9, 10)] <- 5
#As Factor
All <- as.factor(All)

TRANSFORM FEATURES TO NUMERIC VARIABLES

#Transform every feature column to numeric
# Apply as.numeric() to all columns, however many there are, instead of looping
# over a hard-coded column count. Assigning into Features[] keeps the object a
# data.frame with its column and row names intact.
Features[] <- lapply(Features, as.numeric)
# Print the structure to confirm that every column is now numeric.
str(Features)
## 'data.frame':    1000 obs. of  1857 variables:
##  $ abit          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ abl           : num  0.0351 0 0 0 0 ...
##  $ about         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ abov          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ absolut       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accent        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accept        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ access        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accommod      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accomplish    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accur         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ acess         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ach           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ across        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ activ         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ actual        : num  0 0 0 0 0 ...
##  $ add           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ addit         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adequ         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adjac         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adjust        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ador          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adult         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ advanc        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ advantag      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adverti       : num  0 0 0 0 0.105 ...
##  $ advi          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ advic         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ affect        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ africa        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ after         : num  0 0 0 0 0.112 ...
##  $ afternoon     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ afterward     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ago           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ agreeabl      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ahead         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ air           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ aircon        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ aircondit     : num  0 0 0 0 0 ...
##  $ airi          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ airless       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ airport       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alarm         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ albert        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ albrt         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alcohol       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ aldo          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alittl        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ all           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alloc         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ allow         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ almost        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ along         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alreadi       : num  0 0 0 0.062 0 ...
##  $ also          : num  0 0 0 0.0933 0 ...
##  $ altern        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ although      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alway         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amaz          : num  0 0 0 0 0.0653 ...
##  $ ambianc       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ambienc       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amen          : num  0 0.107 0 0 0 ...
##  $ amend         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ american      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amount        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amsterdam     : num  0 0.0847 0 0.0463 0 ...
##  $ and           : num  0.031 0 0 0 0 ...
##  $ ann           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anna          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ annoy         : num  0.0357 0 0 0 0 ...
##  $ anoth         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ answer        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ant           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anymor        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anyon         : num  0 0.102 0 0 0 ...
##  $ anyth         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anyway        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anywh         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ apart         : num  0 0 0 0 0 ...
##  $ apolog        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ appal         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ appar         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ appeal        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ appear        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ appl          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ applic        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ appoint       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ appreci       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ approach      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ april         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ architectur   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ area          : num  0 0 0 0.0285 0 ...
##  $ aren          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ arena         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ aroom         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ around        : num  0 0 0 0.0371 0 ...
##  $ arrang        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ arriv         : num  0.0239 0 0 0 0.064 ...
##   [list output truncated]

PARTITIONING TRAINING & VALIDATION

#Features
# Reproducible random split of the feature rows into training and validation.
# Each row is independently assigned to group 1 (training) with probability 0.8
# or group 2 (validation) with probability 0.2, so the split is approximately,
# not exactly, 80/20. `ind` is reused below to split the label vectors the
# same way.
set.seed(1234)
# TRUE is spelled out because the shorthand T is an ordinary variable that can
# be reassigned.
ind <- sample(2, nrow(Features), replace = TRUE, prob = c(0.8, 0.2))
train <- Features[ind == 1, ]
test <- Features[ind == 2, ]

Labels

# Split every label vector with the same row assignment (`ind`) that was used
# for the features, so features and labels stay aligned row by row.
in_train <- ind == 1
in_test <- ind == 2

# One-vs-all targets for class 2
train.labels.2 <- Label2[in_train]
test.labels.2 <- Label2[in_test]

# One-vs-all targets for class 3
train.labels.3 <- Label3[in_train]
test.labels.3 <- Label3[in_test]

# One-vs-all targets for class 4
train.labels.4 <- Label4[in_train]
test.labels.4 <- Label4[in_test]

# One-vs-all targets for class 5
train.labels.5 <- Label5[in_train]
test.labels.5 <- Label5[in_test]

# Combined multi-class target, used for the final evaluation
train.labels <- All[in_train]
test.labels <- All[in_test]

SVM MODEL

# One binary SVM per class (one-vs-all). Each model is trained on the same
# feature rows with a different 0/1 target stored in the `Score` column.

# Training frames: features plus the binary target of the respective class.
train2 <- train
train2[["Score"]] <- train.labels.2

train3 <- train
train3[["Score"]] <- train.labels.3

train4 <- train
train4[["Score"]] <- train.labels.4

train5 <- train
train5[["Score"]] <- train.labels.5

# Fit the models. Scaling is switched off, and probability = TRUE makes svm()
# fit the probability model that predict() needs further down.
#SVM2
SVM2 <- svm(Score ~ ., data = train2, scale = FALSE, probability = TRUE)
#SVM3
SVM3 <- svm(Score ~ ., data = train3, scale = FALSE, probability = TRUE)
#SVM4
SVM4 <- svm(Score ~ ., data = train4, scale = FALSE, probability = TRUE)
#SVM5
SVM5 <- svm(Score ~ ., data = train5, scale = FALSE, probability = TRUE)

# Predict the validation rows with each model, requesting class probabilities.
P2 <- predict(SVM2, newdata = test, probability = TRUE)
P3 <- predict(SVM3, newdata = test, probability = TRUE)
P4 <- predict(SVM4, newdata = test, probability = TRUE)
P5 <- predict(SVM5, newdata = test, probability = TRUE)

# The probabilities are attached to each prediction as the "probabilities"
# attribute.
Prob2 <- attr(P2, "probabilities")
Prob3 <- attr(P3, "probabilities")
Prob4 <- attr(P4, "probabilities")
Prob5 <- attr(P5, "probabilities")

VOTING

Use probabilities as an input for the voting procedure.

# Put the probability columns of all four one-vs-all models side by side.
# NOTE(review): the labels below are hard-coded. They assume that the first
# column of Prob2 is the "1" class and that the first column of Prob3, Prob4
# and Prob5 is the "0" class - confirm against colnames() of each matrix.
vote_column_labels <- c(
  "Class 2: 1", "Class2: 0",
  "Class 3: 0", "Class3: 1",
  "Class 4: 0", "Class4: 1",
  "Class 5: 0", "Class5: 1"
)
Voting.df <- setNames(data.frame(Prob2, Prob3, Prob4, Prob5), vote_column_labels)

# Show the first rows
head(Voting.df)
##    Class 2: 1 Class2: 0 Class 3: 0  Class3: 1 Class 4: 0 Class4: 1
## 5  0.04692655 0.9530734  0.8323141 0.16768589  0.7023211 0.2976789
## 14 0.02228479 0.9777152  0.9167291 0.08327088  0.6814337 0.3185663
## 16 0.01981856 0.9801814  0.9281491 0.07185086  0.7516857 0.2483143
## 26 0.03885615 0.9611439  0.9080667 0.09193334  0.7244762 0.2755238
## 28 0.03999167 0.9600083  0.7269670 0.27303302  0.7351252 0.2648748
## 29 0.02539728 0.9746027  0.8706528 0.12934723  0.6950132 0.3049868
##    Class 5: 0 Class5: 1
## 5   0.7057269 0.2942731
## 14  0.5531563 0.4468437
## 16  0.3899368 0.6100632
## 26  0.5462239 0.4537761
## 28  0.6809468 0.3190532
## 29  0.6227421 0.3772579
# Keep, for every one-vs-all model, only the probability of the positive class
# (level "1"), giving one column per candidate class 2-5.
# The probability matrices name their columns after the factor levels, and the
# column order comes from the fitted model rather than being fixed. The "1"
# column is therefore located by name instead of with hard-coded positions.
prob_list <- list(Prob2, Prob3, Prob4, Prob5)
# Number of Voting.df columns that precede each model's own block of columns.
offsets <- cumsum(c(0, vapply(prob_list[-length(prob_list)], ncol, integer(1))))
SEQ <- offsets +
  vapply(prob_list, function(p) match("1", colnames(p)), integer(1))
# Fail early if a model has no "1" column (e.g. a class absent from training).
stopifnot(!anyNA(SEQ))

Transformed.Voting.df <- Voting.df[SEQ]
colnames(Transformed.Voting.df) <- c("2", "3", "4", "5")
head(Transformed.Voting.df)
##             2          3         4         5
## 5  0.04692655 0.16768589 0.2976789 0.2942731
## 14 0.02228479 0.08327088 0.3185663 0.4468437
## 16 0.01981856 0.07185086 0.2483143 0.6100632
## 26 0.03885615 0.09193334 0.2755238 0.4537761
## 28 0.03999167 0.27303302 0.2648748 0.3190532
## 29 0.02539728 0.12934723 0.3049868 0.3772579
# Voting: each row is assigned the class whose one-vs-all model reports the
# highest positive-class probability.
# which.max() returns the column position (1-4); the columns stand for classes
# 2-5, hence the offset of one.
Index <- as.numeric(apply(Transformed.Voting.df, MARGIN = 1, FUN = which.max))
Index <- Index + 1

# Evaluation frame: the four probabilities, the voted class, the actual class.
Evaluation <- Transformed.Voting.df
Evaluation[["Vote"]] <- Index
Evaluation[["Actual"]] <- test.labels
head(Evaluation, n = 100)
##              2          3         4            5 Vote Actual
## 5   0.04692655 0.16768589 0.2976789 0.2942730836    4      4
## 14  0.02228479 0.08327088 0.3185663 0.4468436543    5      5
## 16  0.01981856 0.07185086 0.2483143 0.6100632190    5      5
## 26  0.03885615 0.09193334 0.2755238 0.4537761433    5      4
## 28  0.03999167 0.27303302 0.2648748 0.3190531525    5      4
## 29  0.02539728 0.12934723 0.3049868 0.3772579423    5      4
## 39  0.03225661 0.08485048 0.2899245 0.5000000000    5      5
## 40  0.02791010 0.14415399 0.3509652 0.3193606269    4      3
## 60  0.02244474 0.10191471 0.2634859 0.5831549571    5      5
## 61  0.04293729 0.19469974 0.2843128 0.3727440650    5      3
## 72  0.01887934 0.05148822 0.2830851 0.5516962098    5      4
## 81  0.02868761 0.11243599 0.3294949 0.3778524697    5      3
## 86  0.03622932 0.11907888 0.2987395 0.4928516271    5      5
## 90  0.08781516 0.13231460 0.3037537 0.2150666393    4      4
## 92  0.02242731 0.13861375 0.3043761 0.2550467806    4      4
## 113 0.03083626 0.07206089 0.3283978 0.3756817587    5      5
## 116 0.05176527 0.19003380 0.2555539 0.3615086985    5      4
## 117 0.02270332 0.07693756 0.3764143 0.4500734723    5      5
## 122 0.04290547 0.10491129 0.3581545 0.1236995475    4      4
## 123 0.02935237 0.06774438 0.4057065 0.3426231974    4      2
## 124 0.02627472 0.12048940 0.3211063 0.3569908090    5      4
## 131 0.02730114 0.12255986 0.3157600 0.4124097714    5      4
## 135 0.04085696 0.13999263 0.3131905 0.2752240166    4      3
## 137 0.01608029 0.07860844 0.3087267 0.5840435836    5      5
## 140 0.04063716 0.15107022 0.3101048 0.3111817916    5      4
## 142 0.02299509 0.06100677 0.3552151 0.4507173909    5      5
## 149 0.03207331 0.08096582 0.3461447 0.4595470820    5      4
## 154 0.02104629 0.10713521 0.2418738 0.6224135324    5      5
## 156 0.04408692 0.09412309 0.3464442 0.3276176428    4      3
## 158 0.04217183 0.16871771 0.2815797 0.3473552787    5      3
## 169 0.02577609 0.06737245 0.2142887 0.7613892506    5      5
## 185 0.02735043 0.08712102 0.2650423 0.5422414688    5      5
## 187 0.01944295 0.10167937 0.3330202 0.4637094750    5      5
## 192 0.02598313 0.19123803 0.3224908 0.2694920285    4      3
## 194 0.19544138 0.65948284 0.2537345 0.1351230147    3      4
## 195 0.02167714 0.30470751 0.2853504 0.2350640321    3      4
## 196 0.04686736 0.13492068 0.3166975 0.3043457476    4      5
## 197 0.08106960 0.17321587 0.2540127 0.2372967989    4      3
## 199 0.02073469 0.06566576 0.2528099 0.7419574167    5      5
## 210 0.08048519 0.14651835 0.3103369 0.1692949464    4      3
## 216 0.05137974 0.07920486 0.1880214 0.7923754415    5      5
## 220 0.02473627 0.17646922 0.2710235 0.3751341222    5      4
## 227 0.03721557 0.07369087 0.2958850 0.4804523919    5      5
## 234 0.02708268 0.11906784 0.3610474 0.3000761264    4      3
## 240 0.02080226 0.06545871 0.5613175 0.1610634842    4      5
## 245 0.14008587 0.14907863 0.3963608 0.1683807618    4      4
## 249 0.03788292 0.11406446 0.3715662 0.3348729592    4      5
## 261 0.05022787 0.34397716 0.4007921 0.1949668203    4      3
## 277 0.02584104 0.04519664 0.2040012 0.8909037598    5      5
## 283 0.03613260 0.12006152 0.2866262 0.5111685626    5      5
## 290 0.00782589 0.02558933 0.1555891 0.9714777941    5      4
## 293 0.04417507 0.06110192 0.3880079 0.2817081324    4      5
## 302 0.02109125 0.16455694 0.2528143 0.4704500398    5      4
## 305 0.03097455 0.11172712 0.3842496 0.3134299851    4      4
## 308 0.02276611 0.08842611 0.3012715 0.1726885480    4      4
## 311 0.03056796 0.05451522 0.2618905 0.7415156869    5      5
## 320 0.02271925 0.05636143 0.1902482 0.9551184351    5      2
## 322 0.02513604 0.04904128 0.2294708 0.8198763558    5      5
## 330 0.01802514 0.04750162 0.1615313 0.7984239673    5      4
## 332 0.03461809 0.09328405 0.8306350 0.0305976184    4      4
## 333 0.02910950 0.02833768 0.3061660 0.8232435453    5      5
## 339 0.01369778 0.12876836 0.2308076 0.5747612711    5      5
## 341 0.05727140 0.09604462 0.4652806 0.1311120596    4      4
## 344 0.02830126 0.07596179 0.3194249 0.6510223024    5      5
## 349 0.02020746 0.06129258 0.2057091 0.9022887958    5      5
## 355 0.02504210 0.02334981 0.1584316 0.9718569524    5      5
## 356 0.04339732 0.09000655 0.3047676 0.3940734193    5      3
## 365 0.06959273 0.22231382 0.3228310 0.2197959144    4      3
## 366 0.04210835 0.09595045 0.2433727 0.5529519749    5      4
## 369 0.01502884 0.05540137 0.3273764 0.4775488458    5      4
## 371 0.01989465 0.08513382 0.2696018 0.8344921411    5      5
## 373 0.03116423 0.04945786 0.3419122 0.5723273917    5      5
## 389 0.23028985 0.06082960 0.8599565 0.0002353072    4      2
## 390 0.11898223 0.52953666 0.7626101 0.0001847357    4      4
## 396 0.02265194 0.05328127 0.2863200 0.6380914549    5      4
## 412 0.01387290 0.06797259 0.3424959 0.4935461795    5      5
## 413 0.02661140 0.09003471 0.2788438 0.4534474637    5      3
## 415 0.02103694 0.08019458 0.3828692 0.4487479149    5      4
## 422 0.02824716 0.13958037 0.2702158 0.3971285518    5      5
## 425 0.02185902 0.05939023 0.2839997 0.6363518299    5      5
## 434 0.04185669 0.08421752 0.3228192 0.3886757595    5      5
## 438 0.01730632 0.07424458 0.2873048 0.6862537523    5      4
## 441 0.03517408 0.11243348 0.3024119 0.4250930720    5      5
## 442 0.02631141 0.09038816 0.2278588 0.6425058637    5      5
## 445 0.02337146 0.09550871 0.3487867 0.3966759296    5      5
## 447 0.03755188 0.09519742 0.2775516 0.3771252944    5      3
## 453 0.08688449 0.23844194 0.3152877 0.2256103514    4      4
## 454 0.02403543 0.11178312 0.2751485 0.4562861401    5      5
## 462 0.01430085 0.05273255 0.2542055 0.7498840631    5      5
## 474 0.04858910 0.15297236 0.2993886 0.4065741549    5      3
## 476 0.01914223 0.12299868 0.2637640 0.4654137662    5      3
## 493 0.01427421 0.07310598 0.5000000 0.2295688079    4      5
## 502 0.02207945 0.10914397 0.4062248 0.3223819108    4      4
## 503 0.02950357 0.09452876 0.2656599 0.5247322425    5      5
## 506 0.02361365 0.17823023 0.2585278 0.4454028445    5      5
## 508 0.02260229 0.06360659 0.3531484 0.5704404497    5      5
## 512 0.03170636 0.13886052 0.3227134 0.4857793628    5      5
## 513 0.02635301 0.12759301 0.2956366 0.4809159868    5      5
## 521 0.06909524 0.12536680 0.2653970 0.3641871435    5      2
## 524 0.10204442 0.13263492 0.3735893 0.1949408961    4      5
# Confusion matrix: rows are the actual classes, columns the voted classes.
# Only classes that actually occur appear, so the table need not be square.
CM <- table(Evaluation[["Actual"]], Evaluation[["Vote"]])
print(CM)
##    
##       3   4   5
##   2   0   4   3
##   3   0  14  11
##   4   2  26  34
##   5   1  13 100
#Proportions
# Number of evaluated rows and number of rows per actual class.
Overall <- length(Evaluation$Actual)
Length2 <- length(which(Evaluation$Actual==2))
Length3 <- length(which(Evaluation$Actual==3))
Length4 <- length(which(Evaluation$Actual==4))
Length5 <- length(which(Evaluation$Actual==5))

# Row totals (actual classes) and column totals (voted classes) of CM.
Rows <- rowSums(CM)
Col <- colSums(CM)

# CM only contains the classes that actually occur, so it is not necessarily
# square (e.g. when no row is voted into class 2). Pad it to a square matrix
# over all classes so that the diagonal is well defined and cells are matched
# by class name rather than by position.
classes <- sort(union(rownames(CM), colnames(CM)))
CM.square <- matrix(
  0,
  nrow = length(classes), ncol = length(classes),
  dimnames = list(classes, classes)
)
CM.square[rownames(CM), colnames(CM)] <- as.vector(CM)

TruePositives <- diag(CM.square)     # correctly voted rows per class
ActualCount <- rowSums(CM.square)    # rows per actual class (support)
VotedCount <- colSums(CM.square)     # rows per voted class
Total <- sum(CM.square)

#Accuracy
# Share of correctly voted rows, computed from the matrix instead of from
# hand-typed counts.
Accuracy <- sum(TruePositives) / Total

#Precision
# Per class: correct votes / all votes for that class (column totals).
# A class that is never voted for gets precision 0 instead of 0/0 = NaN.
ClassPrecision <- ifelse(VotedCount > 0, TruePositives / VotedCount, 0)
# Average over classes, weighted by the number of actual rows per class.
Precision <- sum(ClassPrecision * ActualCount) / Total

#Recall
# Per class: correct votes / all actual rows of that class (row totals).
ClassRecall <- ifelse(ActualCount > 0, TruePositives / ActualCount, 0)
# Average over classes, weighted by the number of actual rows per class.
Recall <- sum(ClassRecall * ActualCount) / Total


Accuracy
## [1] 0.6057692
Precision
## [1] 0.5062872
Recall
## [1] 0.6057692