PREPARATION

setwd("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/2.Feature Set 1/TFIDF/90")
#install.packages("naivebayes")
library(naivebayes)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(e1071)
library(readxl)

Import the actual labels (the score column from the source data).

#Import Labels
Labels <- read_excel("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Naive Bayes/1.Labels/Source Data.xlsx")

Label <- Labels$Score

Import the TFIDF feature set with a 90th percentile cut-off.

#Import Features
Features <- read.csv("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/2.Feature Set 1/TFIDF/90/Feature Set 1 90th TFIDF.csv")

#Drop the first column (the row index written by write.csv)
Features <- Features[-1]
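
The percentile filter itself was applied when the feature file was built. For reference, a minimal sketch of how such a cut-off could be derived; the matrix tfidf and the keep-above-the-90th-percentile reading are assumptions, not the thesis pipeline.

#Sketch only: keep terms whose maximum TF-IDF weight lies above the
#90th percentile of all terms ('tfidf' is a hypothetical document-term matrix)
term.max <- apply(tfidf, 2, max)
cutoff   <- quantile(term.max, 0.90)
tfidf.90 <- tfidf[, term.max >= cutoff]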

RECODE LABELS FOR ONE-VS-ALL

Collapse the raw scores into four classes (class 5: scores 9-10, class 4: scores 7-8, class 3: scores 5-6, class 2: everything below) and build one binary indicator per class.

#Class 2: scores 3-4 vs. rest
Label2 <- as.factor(ifelse(Label %in% c(3, 4), 1, 0))

#Class 3: scores 5-6 vs. rest
Label3 <- as.factor(ifelse(Label %in% c(5, 6), 1, 0))

#Class 4: scores 7-8 vs. rest
Label4 <- as.factor(ifelse(Label %in% c(7, 8), 1, 0))

#Class 5: scores 9-10 vs. rest
Label5 <- as.factor(ifelse(Label %in% c(9, 10), 1, 0))

#All Labels: collapse the raw scores into the four classes
All <- ifelse(Label %in% c(9, 10), 5,
       ifelse(Label %in% c(7, 8), 4,
       ifelse(Label %in% c(5, 6), 3, 2)))
All <- as.factor(All)
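
A quick distribution check (not part of the original run) confirms the recoding before modelling:

#Optional sanity check: class sizes after recoding
table(All)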

TRANSFORM FEATURES TO NUMERIC VARIABLES

#Transform integer columns to numeric
for(i in seq_len(ncol(Features))){
  Features[,i] <- as.numeric(Features[,i])
}
str(Features)
## 'data.frame':    1000 obs. of  268 variables:
##  $ access      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ air         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ airport     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ all         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ also        : num  0 0 0 0.0933 0 ...
##  $ although    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alway       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amaz        : num  0 0 0 0 0.0653 ...
##  $ anyth       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ area        : num  0 0 0 0.0285 0 ...
##  $ around      : num  0 0 0 0.0371 0 ...
##  $ arriv       : num  0.0239 0 0 0 0.064 ...
##  $ ask         : num  0.0239 0 0.1319 0 0 ...
##  $ attent      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ away        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ awesom      : num  0 0 0 0 0 ...
##  $ back        : num  0 0.0698 0.1376 0 0 ...
##  $ bad         : num  0 0 0 0.0417 0 ...
##  $ bar         : num  0 0 0.1048 0.0291 0 ...
##  $ basement    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ bath        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ bathroom    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ beauti      : num  0.0197 0 0 0 0 ...
##  $ bed         : num  0 0 0 0.0337 0 ...
##  $ bedroom     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ best        : num  0.031 0 0 0 0 ...
##  $ better      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ big         : num  0.0492 0 0 0 0 ...
##  $ birthday    : num  0 0 0 0 0.0999 ...
##  $ bit         : num  0 0.12 0.118 0 0 ...
##  $ book        : num  0.132 0 0 0 0.177 ...
##  $ breakfast   : num  0 0 0.0523 0 0 ...
##  $ brilliant   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ broken      : num  0.0365 0 0 0 0 ...
##  $ buffet      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ build       : num  0 0 0 0.0603 0.0528 ...
##  $ busi        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ can         : num  0.0681 0.0636 0 0 0 ...
##  $ center      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ centr       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ central     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ chang       : num  0.0546 0 0 0 0 ...
##  $ charg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ check       : num  0.0495 0.0693 0 0 0 ...
##  $ choic       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ citi        : num  0.0251 0 0 0 0 ...
##  $ clean       : num  0 0 0 0.04 0 ...
##  $ close       : num  0.0187 0 0 0 0 ...
##  $ coff        : num  0 0 0.138 0 0 ...
##  $ cold        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ comfi       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ comfort     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ complain    : num  0 0 0 0.0509 0 ...
##  $ complet     : num  0 0 0 0 0.0852 ...
##  $ condit      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ construct   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ conveni     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ cook        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ cool        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ couldn      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ court       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ day         : num  0.0646 0 0 0.033 0 ...
##  $ decor       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ definit     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ delici      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ design      : num  0 0 0 0 0 ...
##  $ desk        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ didn        : num  0 0 0 0 0.057 ...
##  $ difficult   : num  0 0 0.187 0 0 ...
##  $ don         : num  0.0296 0 0 0 0 ...
##  $ door        : num  0 0 0 0.0717 0 ...
##  $ doubl       : num  0.0286 0 0 0 0 ...
##  $ drink       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ due         : num  0.0286 0 0 0 0 ...
##  $ earl        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ easi        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ effici      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ enough      : num  0 0 0 0 0 ...
##  $ especi      : num  0 0 0 0 0.0791 ...
##  $ etc         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ euro        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ even        : num  0.0424 0.0593 0 0 0 ...
##  $ everyth     : num  0 0 0 0 0 ...
##  $ excel       : num  0 0.0462 0 0 0 ...
##  $ except      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ execut      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ expen       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ experi      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ extra       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ extrem      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ facil       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fantast     : num  0 0.0744 0 0 0 ...
##  $ far         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ feel        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ first       : num  0 0.0714 0 0 0 ...
##  $ floor       : num  0.022 0 0 0.101 0 ...
##  $ food        : num  0 0.0572 0 0 0 ...
##  $ free        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fresh       : num  0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]

PARTITIONING TRAINING & VALIDATION

#Features: 80/20 random split into training and validation sets
set.seed(1234)
ind <- sample(2, nrow(Features), replace = TRUE, prob = c(0.8, 0.2))
train <- Features[ind == 1,]
test <- Features[ind == 2,]

Partition the labels with the same split indices.

train.labels.2 <- Label2[ind == 1]
test.labels.2 <- Label2[ind ==2]

train.labels.3 <- Label3[ind == 1]
test.labels.3 <- Label3[ind ==2]

train.labels.4 <- Label4[ind == 1]
test.labels.4 <- Label4[ind ==2]

train.labels.5 <- Label5[ind == 1]
test.labels.5 <- Label5[ind ==2]

train.labels <- All[ind == 1]
test.labels <- All[ind ==2]
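
The random split does not preserve class proportions, and the classes here are imbalanced. A stratified alternative, sketched with caret (an extra dependency, not used in this thesis), would look like:

#Sketch only: stratified 80/20 split preserving class proportions
library(caret)
idx     <- createDataPartition(All, p = 0.8, list = FALSE)
train.s <- Features[idx, ]
test.s  <- Features[-idx, ]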

SVM MODEL

#One-vs-all: train one binary SVM per class on the same features
train2 <- train
train2$Score <- train.labels.2
SVM2 <- svm(Score ~ ., data = train2, scale = FALSE, probability = TRUE)

train3 <- train
train3$Score <- train.labels.3
SVM3 <- svm(Score ~ ., data = train3, scale = FALSE, probability = TRUE)

train4 <- train
train4$Score <- train.labels.4
SVM4 <- svm(Score ~ ., data = train4, scale = FALSE, probability = TRUE)

train5 <- train
train5$Score <- train.labels.5
SVM5 <- svm(Score ~ ., data = train5, scale = FALSE, probability = TRUE)

#Predict class-membership probabilities on the validation set
P2 <- predict(SVM2, newdata = test, probability = TRUE)
P3 <- predict(SVM3, newdata = test, probability = TRUE)
P4 <- predict(SVM4, newdata = test, probability = TRUE)
P5 <- predict(SVM5, newdata = test, probability = TRUE)

#Extract the probability matrices attached by e1071
Prob2 <- attr(P2, "probabilities")
Prob3 <- attr(P3, "probabilities")
Prob4 <- attr(P4, "probabilities")
Prob5 <- attr(P5, "probabilities")

VOTING

The predicted probabilities serve as input for the voting procedure: each observation is assigned the class whose one-vs-all model returns the highest positive-class probability.

Voting.df <- data.frame(Prob2, Prob3, Prob4, Prob5)
colnames(Voting.df) <- c("Class 2: 1", "Class 2: 0", "Class 3: 0", "Class 3: 1",
                         "Class 4: 0", "Class 4: 1", "Class 5: 0", "Class 5: 1")

head(Voting.df)
##    Class 2: 1 Class 2: 0 Class 3: 0 Class 3: 1 Class 4: 0 Class 4: 1
## 5  0.04885169 0.9511483  0.8603780 0.13962197  0.7468755 0.2531245
## 14 0.02774571 0.9722543  0.9051723 0.09482768  0.6400074 0.3599926
## 16 0.02528299 0.9747170  0.9038296 0.09617038  0.7624328 0.2375672
## 26 0.03513537 0.9648646  0.8828457 0.11715428  0.7020542 0.2979458
## 28 0.02662306 0.9733769  0.7654968 0.23450319  0.7518647 0.2481353
## 29 0.02489697 0.9751030  0.8735787 0.12642128  0.6944193 0.3055807
##    Class 5: 0 Class 5: 1
## 5   0.6868148 0.3131852
## 14  0.5972668 0.4027332
## 16  0.3910458 0.6089542
## 26  0.5744081 0.4255919
## 28  0.6413601 0.3586399
## 29  0.6205540 0.3794460
#Keep only the positive-class probability of each model
SEQ <- c(1, 4, 6, 8)
Transformed.Voting.df <- Voting.df[SEQ]
colnames(Transformed.Voting.df) <- c("2", "3", "4", "5")
head(Transformed.Voting.df)
##             2          3         4         5
## 5  0.04885169 0.13962197 0.2531245 0.3131852
## 14 0.02774571 0.09482768 0.3599926 0.4027332
## 16 0.02528299 0.09617038 0.2375672 0.6089542
## 26 0.03513537 0.11715428 0.2979458 0.4255919
## 28 0.02662306 0.23450319 0.2481353 0.3586399
## 29 0.02489697 0.12642128 0.3055807 0.3794460
Evaluation <- Transformed.Voting.df
#which.max returns the column position (1-4); add 1 to map it to the class label (2-5)
Index <- as.numeric(apply(Transformed.Voting.df, MARGIN = 1, which.max))
Index <- Index + 1
Evaluation$Vote <- Index
Evaluation$Actual <- test.labels
head(Evaluation,100)
##               2          3         4            5 Vote Actual
## 5   0.048851689 0.13962197 0.2531245 0.3131852426    5      4
## 14  0.027745711 0.09482768 0.3599926 0.4027332225    5      5
## 16  0.025282992 0.09617038 0.2375672 0.6089541751    5      5
## 26  0.035135366 0.11715428 0.2979458 0.4255919202    5      4
## 28  0.026623060 0.23450319 0.2481353 0.3586398766    5      4
## 29  0.024896973 0.12642128 0.3055807 0.3794459851    5      4
## 39  0.036536618 0.12424408 0.2888428 0.4612490148    5      5
## 40  0.035644377 0.12323624 0.3283707 0.3505065109    5      3
## 60  0.029408125 0.09154327 0.2712927 0.6158405073    5      5
## 61  0.051942141 0.14855692 0.2971700 0.3837800502    5      3
## 72  0.019718569 0.10571773 0.3094063 0.5142240078    5      4
## 81  0.021781236 0.15234691 0.2885612 0.4038576564    5      3
## 86  0.040817876 0.12311165 0.2884011 0.4744722074    5      5
## 90  0.067847027 0.13577413 0.2917451 0.2791338510    4      4
## 92  0.041583049 0.11751313 0.3040554 0.3013099746    4      4
## 113 0.036300932 0.11350701 0.3278792 0.3701295504    5      5
## 116 0.044133424 0.12091228 0.3039658 0.3940207368    5      4
## 117 0.049930287 0.07249002 0.3616510 0.4076591590    5      5
## 122 0.051663587 0.10171284 0.3020654 0.1674888718    4      4
## 123 0.024064351 0.08677774 0.3889846 0.2851514819    4      2
## 124 0.033283393 0.13255552 0.3104288 0.3295816876    5      4
## 131 0.023889227 0.11900661 0.3098417 0.4582640960    5      4
## 135 0.039383987 0.13523788 0.2942044 0.3164267907    5      3
## 137 0.016532450 0.08273318 0.3044123 0.5302957495    5      5
## 140 0.048896451 0.10015208 0.3626998 0.3409149006    4      4
## 142 0.026067860 0.08799262 0.3041699 0.5000000000    5      5
## 149 0.025286992 0.13028532 0.3140719 0.4370963571    5      4
## 154 0.030788708 0.11691151 0.2284364 0.5642755009    5      5
## 156 0.046506981 0.11395710 0.3373108 0.3559665822    5      3
## 158 0.059433902 0.13542878 0.2710934 0.3828574060    5      3
## 169 0.037464298 0.09769099 0.2264764 0.7292931912    5      5
## 185 0.030784259 0.08644208 0.2749857 0.5000000000    5      5
## 187 0.027673133 0.11803391 0.2780658 0.4802115610    5      5
## 192 0.029014122 0.14023549 0.3565525 0.2655653626    4      3
## 194 0.285376955 0.20966591 0.2289379 0.2397666583    2      4
## 195 0.032563728 0.12747305 0.2895316 0.3223163917    5      4
## 196 0.043812747 0.12981392 0.2729996 0.3466345979    5      5
## 197 0.032954490 0.15030899 0.2729243 0.2887154439    5      3
## 199 0.025662717 0.08612619 0.2899891 0.7325962547    5      5
## 210 0.051784746 0.14537557 0.3024988 0.2033648978    4      3
## 216 0.039140476 0.08141991 0.2035340 0.7802598447    5      5
## 220 0.035204821 0.14863718 0.2510545 0.3626297741    5      4
## 227 0.035804817 0.09962221 0.3008678 0.4503308965    5      5
## 234 0.035224975 0.13314629 0.3300683 0.3214264667    4      3
## 240 0.034768282 0.08754134 0.5492909 0.1630282680    4      5
## 245 0.060786248 0.14471316 0.3610944 0.2042830197    4      4
## 249 0.039774930 0.15139285 0.2927583 0.3494543342    5      5
## 261 0.039420221 0.13548024 0.4101987 0.2396298680    4      3
## 277 0.026004215 0.07797192 0.2167359 0.8716678772    5      5
## 283 0.032632695 0.10891841 0.2935209 0.5000000000    5      5
## 290 0.007695373 0.03532522 0.1807015 0.9664885353    5      4
## 293 0.030570066 0.08538436 0.3394252 0.3276588569    4      5
## 302 0.017716884 0.13161495 0.2741405 0.5118427627    5      4
## 305 0.035667484 0.11901602 0.3222612 0.3534082204    5      4
## 308 0.033923830 0.14276317 0.2390172 0.1702995706    4      4
## 311 0.037346808 0.09548608 0.2737962 0.6877678800    5      5
## 320 0.010940518 0.06948904 0.1910065 0.9456523686    5      2
## 322 0.023888585 0.08286825 0.2122464 0.7979931778    5      5
## 330 0.019085387 0.08041492 0.2094494 0.7440514704    5      4
## 332 0.041388900 0.11899486 0.8285014 0.0391859853    4      4
## 333 0.031269309 0.05868737 0.3432452 0.8000304480    5      5
## 339 0.012312252 0.11425930 0.3414204 0.4930447976    5      5
## 341 0.032470212 0.11255587 0.4853232 0.1241306871    4      4
## 344 0.040705955 0.07542003 0.2772083 0.6505276677    5      5
## 349 0.024071618 0.06035080 0.2220790 0.8907913770    5      5
## 355 0.033124961 0.03109111 0.1901577 0.9618078218    5      5
## 356 0.030796402 0.13054699 0.2674326 0.4272684139    5      3
## 365 0.048344331 0.19930491 0.3001073 0.2431249581    4      3
## 366 0.024222984 0.11219326 0.2843450 0.5166032150    5      4
## 369 0.020542735 0.06503988 0.3831389 0.3890316789    5      4
## 371 0.024731375 0.08087753 0.2288317 0.8141855561    5      5
## 373 0.038572859 0.10269661 0.2972136 0.5804506445    5      5
## 389 0.035259077 0.11223026 0.8182538 0.0002804723    4      2
## 390 0.144617347 0.27687532 0.5529048 0.0003362382    4      4
## 396 0.033604821 0.06774184 0.2762720 0.6190074074    5      4
## 412 0.013650242 0.10569122 0.3461724 0.4945300356    5      5
## 413 0.033173766 0.11182884 0.2575729 0.4612524911    5      3
## 415 0.016784184 0.11871202 0.4164798 0.3994187212    4      4
## 422 0.034594112 0.10761905 0.2713939 0.4546613067    5      5
## 425 0.022516500 0.10788824 0.2571681 0.5801644876    5      5
## 434 0.025022107 0.10167848 0.3115039 0.3921660944    5      5
## 438 0.026097190 0.10953294 0.2872527 0.6784594023    5      4
## 441 0.036130759 0.10960426 0.3120295 0.4057825631    5      5
## 442 0.023161919 0.10963241 0.2124435 0.6642278775    5      5
## 445 0.032884813 0.11498510 0.3397893 0.4110311362    5      5
## 447 0.033216845 0.11091875 0.3033982 0.3580821038    5      3
## 453 0.037114733 0.18916646 0.2907818 0.3106796215    5      4
## 454 0.026184094 0.11553035 0.2835597 0.4466551262    5      5
## 462 0.022765854 0.07083547 0.2786164 0.7346099402    5      5
## 474 0.054665854 0.08137064 0.3007620 0.4098527811    5      3
## 476 0.030106481 0.13942062 0.2574207 0.5000000000    5      3
## 493 0.022504694 0.09402321 0.4688219 0.2448016290    4      5
## 502 0.021622399 0.14917779 0.3902132 0.3329139630    4      4
## 503 0.032147540 0.09894937 0.2702238 0.5166714694    5      5
## 506 0.029381837 0.12797009 0.2680221 0.4267302424    5      5
## 508 0.033996134 0.06842316 0.3461537 0.5938929754    5      5
## 512 0.038367371 0.10771707 0.3024371 0.4824878707    5      5
## 513 0.032314168 0.10613563 0.3140933 0.3923348220    5      5
## 521 0.057494234 0.10453153 0.3212953 0.4030173539    5      2
## 524 0.065021048 0.11180550 0.3340279 0.2297464362    4      5
#Confusion matrix: rows = actual class, columns = voted class
CM <- table(Evaluation$Actual, Evaluation$Vote)
CM
##    
##       2   4   5
##   2   0   4   3
##   3   0  10  15
##   4   1  21  40
##   5   0   8 106
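
Class 3 never receives a vote, so the matrix has only three columns. Forcing both factors onto the full level set (a sketch, not part of the original run) yields a square matrix that is easier to index for per-class metrics:

#Square 4x4 confusion matrix that keeps classes with zero votes
CM.sq <- table(factor(Evaluation$Actual, levels = 2:5),
               factor(Evaluation$Vote, levels = 2:5))
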
#Class frequencies in the validation set
Overall <- length(Evaluation$Actual)
Length2 <- length(which(Evaluation$Actual == 2))
Length3 <- length(which(Evaluation$Actual == 3))
Length4 <- length(which(Evaluation$Actual == 4))
Length5 <- length(which(Evaluation$Actual == 5))

#Accuracy: correct votes over all validation instances
#(class 3 is never voted, so only classes 2, 4 and 5 appear on the diagonal)
Accuracy <- sum(CM["2","2"], CM["4","4"], CM["5","5"])/sum(CM)

#Precision per voted class: true positives over predicted positives
Col <- colSums(CM)
Precision2 <- CM["2","2"]/Col["2"]
Precision4 <- CM["4","4"]/Col["4"]
Precision5 <- CM["5","5"]/Col["5"]

#Weighted average over the actual class frequencies;
#class 3 contributes 0 because it is never predicted
Precision <- unname(Precision2*Length2 + Precision4*Length4 + Precision5*Length5)/Overall

#Recall per actual class: true positives over actual positives
Rows <- rowSums(CM)
Recall2 <- CM["2","2"]/Rows["2"]
Recall4 <- CM["4","4"]/Rows["4"]
Recall5 <- CM["5","5"]/Rows["5"]

Recall <- unname(Recall2*Length2 + Recall4*Length4 + Recall5*Length5)/Overall


Accuracy
## [1] 0.6105769
Precision
## [1] 0.4998173
Recall
## [1] 0.6105769
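
For completeness, a weighted F1 score can be derived from the two values above (a sketch; roughly 0.55 for these numbers):

#Harmonic mean of the weighted precision and recall
F1 <- 2 * Precision * Recall / (Precision + Recall)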