PREPARATION

setwd("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/2.Feature Set 1/TFIDF/70")
#install.packages("naivebayes")
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.4.3
library(dplyr)
## Warning: Installed Rcpp (0.12.16) different from Rcpp used to build dplyr (0.12.11).
## Please reinstall dplyr to avoid random crashes or undefined behavior.
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(e1071)
library(readxl)

Import the actual labels (review scores).

#Import Labels
Labels <- read_excel("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Naive Bayes/1.Labels/Source Data.xlsx")

Label <- Labels$Score

Import the TF-IDF feature set with a 70th-percentile cut-off.

#Import Features
Features <- read.csv("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/2.Feature Set 1/TFIDF/70/Feature Set 1 70th TFIDF.csv")

#Drop the first column (row index)
Features <- Features[-1]
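For reference, a hypothetical sketch (not the thesis pipeline) of how a TF-IDF matrix with a 70th-percentile cut-off could be built with the tm package; the reviews vector and the pruning rule are assumptions.

#Hypothetical sketch: TF-IDF document-term matrix, keeping only terms
#whose total weight exceeds the 70th percentile across terms
library(tm)
corpus <- VCorpus(VectorSource(reviews))  #'reviews': assumed character vector
dtm <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
m <- as.matrix(dtm)
cutoff <- quantile(colSums(m), probs = 0.70)
m <- m[, colSums(m) > cutoff]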

RECODE LABELS FOR ONE-VS-ALL

Collapse the review scores into four classes and build one binary label vector per class: scores 3-4 form class 2, 5-6 class 3, 7-8 class 4, and 9-10 class 5. The combined multi-class label (All, where any remaining lower score also falls into class 2) is kept for evaluating the voting procedure.

#Class 2: scores 3-4 vs. the rest
Label2 <- as.factor(ifelse(Label %in% c(3, 4), 1, 0))

#Class 3: scores 5-6 vs. the rest
Label3 <- as.factor(ifelse(Label %in% c(5, 6), 1, 0))

#Class 4: scores 7-8 vs. the rest
Label4 <- as.factor(ifelse(Label %in% c(7, 8), 1, 0))

#Class 5: scores 9-10 vs. the rest
Label5 <- as.factor(ifelse(Label %in% c(9, 10), 1, 0))

#All labels: collapse the scores into classes 2-5
All <- ifelse(Label %in% c(9, 10), 5,
       ifelse(Label %in% c(7, 8), 4,
       ifelse(Label %in% c(5, 6), 3, 2)))
#As factor
All <- as.factor(All)
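A quick sanity check (not in the original script): the one-vs-all vectors for classes 3-5 must agree with the combined label. Class 2 is excluded because All also absorbs scores below 3.

#Sanity check: binary vectors and combined label must agree
stopifnot(all((Label3 == 1) == (All == 3)))
stopifnot(all((Label4 == 1) == (All == 4)))
stopifnot(all((Label5 == 1) == (All == 5)))
table(All)  #class distribution after recoding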

TRANSFORM FEATURES TO NUMERIC VARIABLES

#Transform integer columns to numeric
for(i in 1:ncol(Features)){
  Features[,i] <- as.numeric(Features[,i])
}
str(Features)
## 'data.frame':    1000 obs. of  802 variables:
##  $ abl          : num  0.0351 0 0 0 0 ...
##  $ absolut      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accent       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ access       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accommod     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ actual       : num  0 0 0 0 0 ...
##  $ adequ        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adult        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ advanc       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adverti      : num  0 0 0 0 0.105 ...
##  $ advi         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ advic        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ affect       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ air          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ aircondit    : num  0 0 0 0 0 ...
##  $ airi         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ airport      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alarm        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ all          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alloc        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ allow        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ also         : num  0 0 0 0.0933 0 ...
##  $ although     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alway        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amaz         : num  0 0 0 0 0.0653 ...
##  $ amen         : num  0 0.107 0 0 0 ...
##  $ amsterdam    : num  0 0.0847 0 0.0463 0 ...
##  $ and          : num  0.031 0 0 0 0 ...
##  $ anna         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ annoy        : num  0.0357 0 0 0 0 ...
##  $ anoth        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ant          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anyon        : num  0 0.102 0 0 0 ...
##  $ anyth        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anyway       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ apart        : num  0 0 0 0 0 ...
##  $ appal        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ appear       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ appoint      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ approach     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ architectur  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ area         : num  0 0 0 0.0285 0 ...
##  $ arena        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ around       : num  0 0 0 0.0371 0 ...
##  $ arriv        : num  0.0239 0 0 0 0.064 ...
##  $ ask          : num  0.0239 0 0.1319 0 0 ...
##  $ aspect       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ athmosph     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ atmosph      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ attent       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ attitud      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ attract      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ atttent      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ avail        : num  0.031 0 0 0 0 ...
##  $ averag       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ away         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ awesom       : num  0 0 0 0 0 ...
##  $ back         : num  0 0.0698 0.1376 0 0 ...
##  $ bacon        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ bad          : num  0 0 0 0.0417 0 ...
##  $ bag          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ bang         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ bar          : num  0 0 0.1048 0.0291 0 ...
##  $ bare         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ base         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ basement     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ basic        : num  0 0 0.187 0 0 ...
##  $ bath         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ bathroom     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ bathtub      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ beauti       : num  0.0197 0 0 0 0 ...
##  $ bed          : num  0 0 0 0.0337 0 ...
##  $ bedroom      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ best         : num  0.031 0 0 0 0 ...
##  $ better       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ big          : num  0.0492 0 0 0 0 ...
##  $ bigger       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ bike         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ birthday     : num  0 0 0 0 0.0999 ...
##  $ bit          : num  0 0.12 0.118 0 0 ...
##  $ black        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ blanket      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ blind        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ block        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ board        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ boiler       : num  0 0 0.241 0 0 ...
##  $ bonus        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ book         : num  0.132 0 0 0 0.177 ...
##  $ box          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ boyfriend    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ bread        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ breakfast    : num  0 0 0.0523 0 0 ...
##  $ bright       : num  0 0 0 0 0 ...
##  $ brilliant    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ broken       : num  0.0365 0 0 0 0 ...
##  $ brought      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ buffet       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ build        : num  0 0 0 0.0603 0.0528 ...
##  $ bus          : num  0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]

PARTITIONING TRAINING & VALIDATION

#Features: random 80/20 split into training and test sets
set.seed(1234)
ind <- sample(2, nrow(Features), replace = TRUE, prob = c(0.8, 0.2))
train <- Features[ind == 1,]
test <- Features[ind == 2,]

Partition the labels with the same indicator so they stay aligned with the feature rows.

train.labels.2 <- Label2[ind == 1]
test.labels.2 <- Label2[ind == 2]

train.labels.3 <- Label3[ind == 1]
test.labels.3 <- Label3[ind == 2]

train.labels.4 <- Label4[ind == 1]
test.labels.4 <- Label4[ind == 2]

train.labels.5 <- Label5[ind == 1]
test.labels.5 <- Label5[ind == 2]

train.labels <- All[ind == 1]
test.labels <- All[ind == 2]

SVM MODEL

Train one binary SVM per class on the training features (one-vs-all), with probability estimates enabled so the voting step can compare class-membership probabilities.

#One-vs-all SVM for class 2
train2 <- train
train2$Score <- train.labels.2
SVM2 <- svm(Score ~ ., data = train2, scale = FALSE, probability = TRUE)

#One-vs-all SVM for class 3
train3 <- train
train3$Score <- train.labels.3
SVM3 <- svm(Score ~ ., data = train3, scale = FALSE, probability = TRUE)

#One-vs-all SVM for class 4
train4 <- train
train4$Score <- train.labels.4
SVM4 <- svm(Score ~ ., data = train4, scale = FALSE, probability = TRUE)

#One-vs-all SVM for class 5
train5 <- train
train5$Score <- train.labels.5
SVM5 <- svm(Score ~ ., data = train5, scale = FALSE, probability = TRUE)

#Predict on the test set, keeping the class-membership probabilities
P2 <- predict(SVM2, newdata = test, probability = TRUE)
P3 <- predict(SVM3, newdata = test, probability = TRUE)
P4 <- predict(SVM4, newdata = test, probability = TRUE)
P5 <- predict(SVM5, newdata = test, probability = TRUE)

#Extract the probability matrices attached to the predictions
Prob2 <- attr(P2, "probabilities")
Prob3 <- attr(P3, "probabilities")
Prob4 <- attr(P4, "probabilities")
Prob5 <- attr(P5, "probabilities")
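The models above rely on e1071's defaults (radial kernel, cost = 1). A hypothetical extension, not part of this pipeline: grid-search cost and gamma with cross-validation via e1071's tune(); the grid values below are illustrative assumptions.

#Hypothetical: cross-validated grid search for the class-2 SVM
Tuned2 <- tune(svm, Score ~ ., data = train2, scale = FALSE,
               ranges = list(cost = 10^(-1:2), gamma = 10^(-3:-1)))
summary(Tuned2)
Best2 <- Tuned2$best.model  #refit with the best parameter pair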

VOTING

The class-membership probabilities from the four binary SVMs are the input to the voting procedure: each test case is assigned the class with the highest probability. In the first test row below (row 5), for example, class 5 wins with a probability of 0.306.

#Combine the probability matrices; note that the positive ("1") column
#comes first for class 2 but second for classes 3-5
Voting.df <- data.frame(Prob2, Prob3, Prob4, Prob5)
colnames(Voting.df) <- c("Class 2: 1","Class2: 0","Class 3: 0","Class3: 1","Class 4: 0","Class4: 1","Class 5: 0","Class5: 1")

head(Voting.df)
##    Class 2: 1 Class2: 0 Class 3: 0  Class3: 1 Class 4: 0 Class4: 1
## 5  0.04668780 0.9533122  0.8558887 0.14411134  0.7103309 0.2896691
## 14 0.02789076 0.9721092  0.9009667 0.09903335  0.6708631 0.3291369
## 16 0.02231681 0.9776832  0.9134637 0.08653633  0.7620809 0.2379191
## 26 0.03822932 0.9617707  0.8972014 0.10279863  0.7229603 0.2770397
## 28 0.04522425 0.9547758  0.7486731 0.25132687  0.7304311 0.2695689
## 29 0.02281651 0.9771835  0.9090137 0.09098627  0.6992499 0.3007501
##    Class 5: 0 Class5: 1
## 5   0.6937458 0.3062542
## 14  0.5793271 0.4206729
## 16  0.3920841 0.6079159
## 26  0.5436353 0.4563647
## 28  0.6696402 0.3303598
## 29  0.6164656 0.3835344
#Keep only the P(class == 1) columns, one per class
SEQ <- c(1,4,6,8)
Transformed.Voting.df <- Voting.df[SEQ]
colnames(Transformed.Voting.df) <- c("2","3","4","5")
head(Transformed.Voting.df)
##             2          3         4         5
## 5  0.04668780 0.14411134 0.2896691 0.3062542
## 14 0.02789076 0.09903335 0.3291369 0.4206729
## 16 0.02231681 0.08653633 0.2379191 0.6079159
## 26 0.03822932 0.10279863 0.2770397 0.4563647
## 28 0.04522425 0.25132687 0.2695689 0.3303598
## 29 0.02281651 0.09098627 0.3007501 0.3835344
Evaluation <- Transformed.Voting.df
#Vote: position of the highest probability (1-4), shifted to classes 2-5
Index <- as.numeric(apply(Transformed.Voting.df, MARGIN = 1, which.max))
Index <- Index + 1
Evaluation$Vote <- Index
Evaluation$Actual <- test.labels
head(Evaluation,100)
##              2          3         4            5 Vote Actual
## 5   0.04668780 0.14411134 0.2896691 0.3062542204    5      4
## 14  0.02789076 0.09903335 0.3291369 0.4206728690    5      5
## 16  0.02231681 0.08653633 0.2379191 0.6079159098    5      5
## 26  0.03822932 0.10279863 0.2770397 0.4563647356    5      4
## 28  0.04522425 0.25132687 0.2695689 0.3303598085    5      4
## 29  0.02281651 0.09098627 0.3007501 0.3835344186    5      4
## 39  0.03305619 0.08995137 0.2843293 0.5000000000    5      5
## 40  0.02633637 0.13788211 0.3373378 0.3307879032    4      3
## 60  0.02506159 0.09503349 0.2573851 0.5741836575    5      5
## 61  0.03975009 0.18223615 0.2906639 0.3734977224    5      3
## 72  0.01994690 0.05119684 0.2934963 0.5417508529    5      4
## 81  0.03143006 0.11354223 0.3182546 0.3793170679    5      3
## 86  0.02751853 0.12779138 0.3020795 0.5000000000    5      5
## 90  0.05727079 0.12494698 0.3026338 0.2190275477    4      4
## 92  0.02565524 0.14301612 0.3009383 0.2655729266    4      4
## 113 0.03109249 0.07842272 0.3184917 0.3724311041    5      5
## 116 0.05143052 0.16958643 0.2620891 0.3680350127    5      4
## 117 0.02540218 0.07802040 0.3561931 0.4453888427    5      5
## 122 0.04046065 0.13339775 0.3338676 0.1280884357    4      4
## 123 0.02711407 0.07182488 0.4050457 0.3404043601    4      2
## 124 0.02716514 0.10528435 0.3043763 0.3659000305    5      4
## 131 0.02683660 0.10843195 0.3384214 0.4157019993    5      4
## 135 0.04359338 0.12392758 0.3018791 0.2841510135    4      3
## 137 0.01275128 0.08412494 0.3040230 0.5775762988    5      5
## 140 0.03537626 0.14052454 0.3157547 0.3085613075    4      4
## 142 0.02289964 0.05836258 0.3509510 0.4605295809    5      5
## 149 0.02427140 0.07416831 0.3408025 0.4643272878    5      4
## 154 0.02292391 0.11320711 0.2374482 0.6173066013    5      5
## 156 0.04091144 0.09106159 0.3364255 0.3342467407    4      3
## 158 0.04944712 0.16794116 0.2749314 0.3457240152    5      3
## 169 0.02486484 0.06386500 0.2193025 0.7552772077    5      5
## 185 0.02985575 0.08746761 0.2528275 0.5312441380    5      5
## 187 0.01999354 0.09820672 0.3224571 0.4668851100    5      5
## 192 0.03087044 0.14285503 0.3293447 0.2759630565    4      3
## 194 0.22853524 0.55359697 0.2495643 0.1380200886    3      4
## 195 0.02273870 0.25864157 0.2752491 0.2376984946    4      4
## 196 0.04005926 0.11846741 0.3074524 0.3203900323    5      5
## 197 0.07093429 0.15648532 0.2570035 0.2428793629    4      3
## 199 0.02062199 0.06611638 0.2651854 0.7504706727    5      5
## 210 0.08050338 0.14196916 0.2920090 0.1817187798    4      3
## 216 0.03136323 0.07251198 0.1890957 0.7852045677    5      5
## 220 0.03204837 0.14822793 0.2609382 0.3889631056    5      4
## 227 0.03608857 0.06667880 0.2827288 0.4819063474    5      5
## 234 0.02949519 0.12055925 0.3737274 0.2991850406    4      3
## 240 0.01772800 0.06583320 0.5501839 0.1613092824    4      5
## 245 0.09536381 0.14800760 0.3859078 0.1741048164    4      4
## 249 0.03138296 0.12058163 0.3531731 0.3323475578    4      5
## 261 0.02840585 0.39398858 0.3894780 0.1945946236    3      3
## 277 0.02252080 0.04964714 0.2096614 0.8838221303    5      5
## 283 0.02384335 0.13904064 0.2957860 0.5251045637    5      5
## 290 0.01069258 0.02909132 0.1639922 0.9686232380    5      4
## 293 0.03384441 0.05876396 0.3711986 0.2995474619    4      5
## 302 0.01829217 0.11093310 0.2443212 0.4929561782    5      4
## 305 0.04194273 0.10962057 0.3861270 0.3049839382    4      4
## 308 0.02838902 0.12245262 0.3136746 0.1742143171    4      4
## 311 0.03250225 0.06052099 0.2554438 0.7265893016    5      5
## 320 0.01503806 0.06112370 0.2038668 0.9509761656    5      2
## 322 0.02534775 0.04280721 0.2250635 0.8149021274    5      5
## 330 0.02146590 0.05644429 0.1770706 0.7877506562    5      4
## 332 0.03409575 0.09671216 0.8181575 0.0314774332    4      4
## 333 0.02446328 0.03011527 0.3337695 0.8139345417    5      5
## 339 0.01859570 0.13880927 0.2480972 0.5669771971    5      5
## 341 0.04289687 0.07842570 0.4859418 0.1268576573    4      4
## 344 0.03067522 0.06194310 0.3062120 0.6478734617    5      5
## 349 0.01866830 0.06145658 0.2150480 0.8961124715    5      5
## 355 0.01623829 0.02562462 0.1711914 0.9678454260    5      5
## 356 0.04652645 0.10894022 0.3106021 0.3929103029    5      3
## 365 0.07226638 0.15768169 0.3066844 0.2198171737    4      3
## 366 0.03073180 0.06779255 0.2545362 0.5556347003    5      4
## 369 0.01566237 0.05495422 0.3386898 0.4530046005    5      4
## 371 0.01583100 0.09257094 0.2606355 0.8258508221    5      5
## 373 0.03353986 0.05400197 0.3426102 0.5626151739    5      5
## 389 0.10949805 0.07885363 0.8506742 0.0002591507    4      2
## 390 0.24229518 0.56121064 0.7664688 0.0002053102    4      4
## 396 0.02368806 0.04066344 0.2863621 0.6342019755    5      4
## 412 0.01733431 0.08659807 0.3591604 0.4860565893    5      5
## 413 0.02210061 0.11287010 0.2781288 0.4480579888    5      3
## 415 0.01850532 0.08176998 0.3910118 0.4457900910    5      4
## 422 0.02315691 0.13633230 0.2753871 0.4115202386    5      5
## 425 0.02111399 0.07055621 0.2816232 0.6353375182    5      5
## 434 0.03401370 0.08461636 0.3141675 0.3833170543    5      5
## 438 0.01560432 0.08454230 0.2874751 0.6813725189    5      4
## 441 0.03061994 0.10647435 0.2997158 0.4221386926    5      5
## 442 0.02584918 0.10413704 0.2278435 0.6349063724    5      5
## 445 0.02709484 0.10258863 0.3424323 0.3921436117    5      5
## 447 0.03447430 0.09742633 0.2695485 0.3914068171    5      3
## 453 0.03450107 0.22960758 0.3011368 0.2643526126    4      4
## 454 0.02098104 0.11076878 0.2840189 0.4430664930    5      5
## 462 0.01557827 0.05990222 0.2510160 0.7465997249    5      5
## 474 0.04925204 0.11410126 0.2966320 0.4364747398    5      3
## 476 0.02773956 0.10271579 0.2704870 0.4700992521    5      3
## 493 0.01236140 0.08012259 0.5000000 0.2278779018    4      5
## 502 0.02029159 0.12543068 0.3936436 0.3281053883    4      4
## 503 0.02963062 0.09239408 0.2597120 0.5257826390    5      5
## 506 0.02601148 0.16713533 0.2590329 0.4435236961    5      5
## 508 0.02479170 0.06225357 0.3461246 0.5708906748    5      5
## 512 0.02813992 0.09843015 0.3047279 0.5000000000    5      5
## 513 0.02622257 0.12327240 0.2956174 0.4740116580    5      5
## 521 0.08358890 0.11891853 0.2527259 0.3629107250    5      2
## 524 0.08583100 0.08851636 0.3542563 0.2120163269    4      5
CM <- table(Evaluation$Actual,Evaluation$Vote)
CM
##    
##       3   4   5
##   2   0   4   3
##   3   1  13  11
##   4   1  26  35
##   5   1  11 102
Class 2 never receives the highest probability on the test set, so the confusion matrix (rows: actual, columns: predicted) has no column for it.

#Class counts in the test set
Overall <- length(Evaluation$Actual)
Length2 <- length(which(Evaluation$Actual==2))
Length3 <- length(which(Evaluation$Actual==3))
Length4 <- length(which(Evaluation$Actual==4))
Length5 <- length(which(Evaluation$Actual==5))


#Accuracy: correctly classified cases over all cases
Accuracy <- sum(CM[2,1], CM[3,2], CM[4,3])/sum(CM)

#Precision per predicted class: correct votes over all votes for that class
Cols <- colSums(CM)
Precision3 <- CM[2,1]/Cols[1]
Precision4 <- CM[3,2]/Cols[2]
Precision5 <- CM[4,3]/Cols[3]

#Weighted by the class counts in the test set
Precision <- (Precision3*Length3+Precision4*Length4+Precision5*Length5)/Overall

#Recall per actual class: correct votes over all cases of that class
Rows <- rowSums(CM)
Recall3 <- CM[2,1]/Rows[2]
Recall4 <- CM[3,2]/Rows[3]
Recall5 <- CM[4,3]/Rows[4]

#Weighted by the class counts in the test set
Recall <- (Recall3*Length3+Recall4*Length4+Recall5*Length5)/Overall


Accuracy
## [1] 0.6201923
Precision
##         3 
## 0.5538068
Recall
##         3 
## 0.6201923
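As a cross-check (not in the original script), accuracy can be recomputed directly from the Vote and Actual columns, which should reproduce the 0.6201923 above.

#Sanity check: share of test cases where the vote matches the actual class
mean(Evaluation$Vote == as.numeric(as.character(Evaluation$Actual)))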