PREPARATION

setwd("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/2.Feature Set 1/TFIDF/Full")
#install.packages("naivebayes")
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.4.3
library(dplyr)
## Warning: Installed Rcpp (0.12.16) different from Rcpp used to build dplyr (0.12.11).
## Please reinstall dplyr to avoid random crashes or undefined behavior.
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(e1071)
library(readxl)

Import actual labels.

# Read the label workbook; the Score column holds the rating of each review.
labels.path <- "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Naive Bayes/1.Labels/Source Data.xlsx"
Labels <- read_excel(labels.path)

# Keep the raw scores as a plain vector for the recoding steps.
Label <- Labels$Score

Import full TFIDF Feature set (no percentile cut-off)

# Read the full TF-IDF feature table and drop its first column in one step.
# NOTE(review): the first column is presumably a row index written by the
# export -- confirm against the CSV.
Features <- read.csv(
  "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/2.Feature Set 1/TFIDF/Full/Feature Set 1 Full TFIDF.csv"
)[-1]

RECODE LABELS FOR ONE-VS-ALL

# Recode the raw scores into one-vs-all targets (Label2..Label5) and into a
# single multi-class target (All). The recoding is vectorised over the whole
# score vector, so it no longer assumes exactly 1000 observations.
# Labels must line up row-by-row with the feature table used later on.

# A missing score cannot be assigned to a class; fail loudly instead of
# silently counting it as "not in class".
stopifnot(!anyNA(Label))

#Class 2
# 1 when the score is 3 or 4, otherwise 0.
# NOTE(review): scores below 3 are 0 here but fall into class 2 in `All`
# below -- confirm whether such scores occur in the data.
Label2 <- as.factor(as.numeric(Label %in% c(3, 4)))

#Class 3
# 1 when the score is 5 or 6, otherwise 0.
Label3 <- as.factor(as.numeric(Label %in% c(5, 6)))

#Class 4
# 1 when the score is 7 or 8, otherwise 0.
Label4 <- as.factor(as.numeric(Label %in% c(7, 8)))

#Class 5
# 1 when the score is 9 or 10, otherwise 0.
Label5 <- as.factor(as.numeric(Label %in% c(9, 10)))

#All Labels
# Multi-class target: 9-10 -> 5, 7-8 -> 4, 5-6 -> 3, everything else -> 2.
All <- rep(2, length(Label))
All[Label %in% c(5, 6)] <- 3
All[Label %in% c(7, 8)] <- 4
All[Label %in% c(9, 10)] <- 5
#As Factor
All <- as.factor(All)

TRANSFORM FEATURES TO NUMERIC VARIABLES

#Transform every feature column to numeric
# Convert all columns, however many the feature file contains, instead of
# relying on a hard-coded column count. Assigning into Features[] keeps the
# object a data.frame with its original column names.
Features[] <- lapply(Features, as.numeric)
str(Features)
## 'data.frame':    1000 obs. of  2672 variables:
##  $ abil          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ abit          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ abl           : num  0.0351 0 0 0 0 ...
##  $ abnorm        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ about         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ abov          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ abrupt        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ absolut       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accent        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accept        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ access        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accid         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accommod      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accomplish    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accur         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accustom      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ acess         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ach           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ acknowledg    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ acomod        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ across        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ activ         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ actual        : num  0 0 0 0 0 ...
##  $ adaptor       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ add           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ addit         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adequ         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adjac         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adjust        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ador          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adult         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ advanc        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ advantag      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adverti       : num  0 0 0 0 0.105 ...
##  $ advi          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ advic         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ affair        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ affect        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ afford        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ afraid        : num  0 0 0 0.0753 0 ...
##  $ africa        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ after         : num  0 0 0 0 0.112 ...
##  $ afterdinn     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ afternoon     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ afterward     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ago           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ agr           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ agreeabl      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ahead         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ air           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ aircon        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ aircondit     : num  0 0 0 0 0 ...
##  $ airi          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ airless       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ airport       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alarm         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ albeit        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ albert        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ albrt         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alcohol       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ aldo          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alittl        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ all           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ allevi        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alloc         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ allow         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ almost        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ along         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alongsid      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alot          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alreadi       : num  0 0 0 0.062 0 ...
##  $ alright       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ also          : num  0 0 0 0.0933 0 ...
##  $ altern        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ although      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alway         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amaz          : num  0 0 0 0 0.0653 ...
##  $ ambianc       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ambienc       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amen          : num  0 0.107 0 0 0 ...
##  $ amend         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ america       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ american      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amongst       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amount        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ampl          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amsterdam     : num  0 0.0847 0 0.0463 0 ...
##  $ and           : num  0.031 0 0 0 0 ...
##  $ angl          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ angri         : num  0.0493 0 0 0.0753 0 ...
##  $ ann           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anna          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ annex         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ announc       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ annoy         : num  0.0357 0 0 0 0 ...
##  $ anoth         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ansterdam     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ answer        : num  0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]

PARTITIONING TRAINING & VALIDATION

#Features
# Fix the RNG seed so the random split is reproducible.
set.seed(1234)
# Draw a group id for every feature row: 1 (probability 0.8) marks training
# rows, 2 (probability 0.2) marks validation rows. TRUE is spelled out because
# `T` is an ordinary variable that can be reassigned.
ind <- sample(2, nrow(Features), replace = TRUE, prob = c(0.8, 0.2))
train <- Features[ind == 1, ]
test <- Features[ind == 2, ]

Labels

# Split every label vector with the same group ids used for the features,
# so that labels and feature rows stay aligned.
in.train <- ind == 1
in.test <- ind == 2

# One-vs-all targets, training part
train.labels.2 <- Label2[in.train]
train.labels.3 <- Label3[in.train]
train.labels.4 <- Label4[in.train]
train.labels.5 <- Label5[in.train]

# One-vs-all targets, validation part
test.labels.2 <- Label2[in.test]
test.labels.3 <- Label3[in.test]
test.labels.4 <- Label4[in.test]
test.labels.5 <- Label5[in.test]

# Multi-class target
train.labels <- All[in.train]
test.labels <- All[in.test]

SVM MODEL

# One-vs-all SVMs: one binary model per class (2, 3, 4, 5).

# Step 1: attach the binary target of each class to a copy of the training
# features.
train2 <- train
train2[["Score"]] <- train.labels.2
train3 <- train
train3[["Score"]] <- train.labels.3
train4 <- train
train4[["Score"]] <- train.labels.4
train5 <- train
train5[["Score"]] <- train.labels.5

# Step 2: fit the models in class order. Features are passed unscaled and
# probability = TRUE is requested so class probabilities can be predicted.
SVM2 <- svm(Score ~ ., data = train2, scale = FALSE, probability = TRUE)
SVM3 <- svm(Score ~ ., data = train3, scale = FALSE, probability = TRUE)
SVM4 <- svm(Score ~ ., data = train4, scale = FALSE, probability = TRUE)
SVM5 <- svm(Score ~ ., data = train5, scale = FALSE, probability = TRUE)

# Step 3: predict the validation rows with each model.
P2 <- predict(SVM2, test, probability = TRUE)
P3 <- predict(SVM3, test, probability = TRUE)
P4 <- predict(SVM4, test, probability = TRUE)
P5 <- predict(SVM5, test, probability = TRUE)

# Step 4: pull the per-class probability matrices off the predictions.
Prob2 <- attr(P2, "probabilities")
Prob3 <- attr(P3, "probabilities")
Prob4 <- attr(P4, "probabilities")
Prob5 <- attr(P5, "probabilities")

VOTING

Use probabilities as an input for the voting procedure.

# Put the probability matrices of the four one-vs-all models side by side.
Voting.df <- data.frame(Prob2, Prob3, Prob4, Prob5)

# Label every column from the matrix it came from ("Class <k>: <label>")
# instead of typing the labels by hand: the order of the 0/1 columns is not
# the same for every model, so hand-written names can silently mislabel them.
colnames(Voting.df) <- c(
  paste0("Class 2: ", colnames(Prob2)),
  paste0("Class 3: ", colnames(Prob3)),
  paste0("Class 4: ", colnames(Prob4)),
  paste0("Class 5: ", colnames(Prob5))
)

head(Voting.df)
##    Class 2: 1 Class2: 0 Class 3: 0  Class3: 1 Class 4: 0 Class4: 1
## 5  0.03741647 0.9625835  0.8252794 0.17472059  0.6976505 0.3023495
## 14 0.01782711 0.9821729  0.9244645 0.07553551  0.6696734 0.3303266
## 16 0.01330174 0.9866983  0.9279899 0.07201010  0.7493122 0.2506878
## 26 0.03510767 0.9648923  0.8930392 0.10696078  0.7208495 0.2791505
## 28 0.03125477 0.9687452  0.7009480 0.29905199  0.7428127 0.2571873
## 29 0.02125255 0.9787475  0.8639767 0.13602333  0.6973001 0.3026999
##    Class 5: 0 Class5: 1
## 5   0.7109404 0.2890596
## 14  0.5588675 0.4411325
## 16  0.3906866 0.6093134
## 26  0.5501470 0.4498530
## 28  0.6851740 0.3148260
## 29  0.6323034 0.3676966
# Keep, for every model, only the probability of its positive class ("1").
# Voting.df holds the columns of Prob2, Prob3, Prob4 and Prob5 in that order,
# so the positive columns are located by their class label rather than by
# hard-coded positions; the position of the "1" column differs between models.
SEQ <- which(
  c(colnames(Prob2), colnames(Prob3), colnames(Prob4), colnames(Prob5)) == "1"
)
# Exactly one positive-class column per model is expected.
stopifnot(length(SEQ) == 4)
Transformed.Voting.df <- Voting.df[SEQ]
colnames(Transformed.Voting.df) <- c("2", "3", "4", "5")
head(Transformed.Voting.df)
##             2          3         4         5
## 5  0.03741647 0.17472059 0.3023495 0.2890596
## 14 0.01782711 0.07553551 0.3303266 0.4411325
## 16 0.01330174 0.07201010 0.2506878 0.6093134
## 26 0.03510767 0.10696078 0.2791505 0.4498530
## 28 0.03125477 0.29905199 0.2571873 0.3148260
## 29 0.02125255 0.13602333 0.3026999 0.3676966
# Voting: each validation row goes to the class whose one-vs-all model
# reports the highest positive-class probability.
Evaluation <- Transformed.Voting.df

# Position (1..4) of the largest probability in every row; the first maximum
# wins on ties.
winning.column <- apply(as.matrix(Transformed.Voting.df), 1, which.max)

# Columns 1..4 stand for classes 2..5, hence the shift by one.
Index <- as.numeric(winning.column) + 1

Evaluation$Vote <- Index
Evaluation$Actual <- test.labels
head(Evaluation, 100)
##               2          3         4            5 Vote Actual
## 5   0.037416468 0.17472059 0.3023495 0.2890596317    4      4
## 14  0.017827114 0.07553551 0.3303266 0.4411324797    5      5
## 16  0.013301742 0.07201010 0.2506878 0.6093133857    5      5
## 26  0.035107672 0.10696078 0.2791505 0.4498530382    5      4
## 28  0.031254768 0.29905199 0.2571873 0.3148260123    5      4
## 29  0.021252548 0.13602333 0.3026999 0.3676965958    5      4
## 39  0.030898262 0.08580307 0.2931842 0.5000000000    5      5
## 40  0.022038150 0.17488799 0.3530405 0.3157463273    4      3
## 60  0.018306889 0.11214332 0.2611371 0.5818320219    5      5
## 61  0.040522680 0.19931055 0.2909625 0.3670662734    5      3
## 72  0.012746117 0.05031849 0.2759687 0.5504925975    5      4
## 81  0.024686245 0.12600224 0.3197435 0.3748402112    5      3
## 86  0.029862239 0.13671611 0.3126539 0.4926435102    5      5
## 90  0.092667244 0.10528310 0.3111719 0.2113235574    4      4
## 92  0.021050171 0.16117844 0.3027949 0.2526995236    4      4
## 113 0.022242168 0.06612885 0.3366246 0.3734876418    5      5
## 116 0.043697211 0.19976956 0.2493013 0.3589411118    5      4
## 117 0.020694174 0.10569411 0.3893446 0.4485605475    5      5
## 122 0.054133362 0.13757054 0.3562905 0.1205063839    4      4
## 123 0.021668619 0.06222019 0.4293218 0.3400823265    4      2
## 124 0.022689709 0.13669499 0.3233422 0.3527543609    5      4
## 131 0.025332634 0.15096548 0.3042234 0.4084615605    5      4
## 135 0.036992245 0.15855692 0.3164694 0.2683822969    4      3
## 137 0.011557142 0.07079650 0.2910938 0.5854606994    5      5
## 140 0.033013561 0.19824526 0.3177220 0.3076789030    4      4
## 142 0.019145013 0.06810994 0.3353849 0.4505646259    5      5
## 149 0.026835850 0.08824215 0.3576602 0.4561666052    5      4
## 154 0.028165485 0.12126667 0.2474598 0.6212410248    5      5
## 156 0.042180524 0.09341816 0.3555188 0.3171279483    4      3
## 158 0.038882107 0.18049328 0.2850176 0.3446855218    5      3
## 169 0.019201500 0.05529163 0.2027361 0.7625902178    5      5
## 185 0.017077592 0.07884758 0.2621885 0.5439613390    5      5
## 187 0.012699054 0.09969859 0.3305230 0.4590653050    5      5
## 192 0.021088073 0.17734512 0.3180637 0.2681233483    4      3
## 194 0.284678761 0.64035190 0.2563261 0.1326999505    3      4
## 195 0.017804569 0.31124708 0.2929832 0.2307620894    3      4
## 196 0.049649328 0.14926022 0.3275724 0.3013484288    4      5
## 197 0.064900083 0.16826283 0.2546144 0.2340286490    4      3
## 199 0.017118694 0.06470830 0.2506535 0.7451032956    5      5
## 210 0.087208435 0.16889314 0.3276000 0.1657052743    4      3
## 216 0.034735060 0.08999539 0.1824819 0.7939982978    5      5
## 220 0.023028494 0.20324833 0.2678536 0.3719357673    5      4
## 227 0.034469672 0.07717825 0.2940601 0.4780477153    5      5
## 234 0.025369790 0.13008832 0.3536845 0.2962062359    4      3
## 240 0.021006336 0.08638538 0.5654067 0.1582008998    4      5
## 245 0.133369834 0.17931689 0.4022274 0.1675539524    4      4
## 249 0.042182896 0.13882380 0.3600550 0.3325010507    4      5
## 261 0.040983533 0.30804582 0.4019439 0.1916605766    4      3
## 277 0.015865805 0.04253851 0.1982928 0.8896620934    5      5
## 283 0.028278796 0.13699369 0.2979834 0.5117647565    5      5
## 290 0.009012931 0.02057348 0.1484060 0.9707621225    5      4
## 293 0.031055796 0.05108379 0.3856622 0.2794263283    4      5
## 302 0.019841360 0.17725199 0.2499733 0.4697672034    5      4
## 305 0.033074225 0.13890100 0.3577666 0.3127348113    4      4
## 308 0.028733674 0.14618508 0.3117253 0.1709664371    4      4
## 311 0.022122003 0.05792426 0.2514989 0.7408362830    5      5
## 320 0.019251498 0.03233792 0.1777136 0.9536075825    5      2
## 322 0.016393160 0.04001089 0.2347203 0.8257830925    5      5
## 330 0.010919698 0.05987310 0.1665512 0.7984481514    5      4
## 332 0.044554322 0.10572643 0.8306462 0.0293967466    4      4
## 333 0.016717959 0.02258387 0.3055452 0.8192140947    5      5
## 339 0.017657102 0.13501000 0.2202148 0.5768949959    5      5
## 341 0.043604808 0.08837580 0.4703951 0.1282297326    4      4
## 344 0.022088920 0.06186437 0.3221320 0.6555760883    5      5
## 349 0.020703238 0.05720334 0.1953555 0.9023534902    5      5
## 355 0.012317548 0.01725703 0.1528846 0.9731915061    5      5
## 356 0.050570798 0.09969227 0.3018859 0.3924097832    5      3
## 365 0.066808063 0.21909685 0.3225787 0.2166017975    4      3
## 366 0.046959831 0.09852977 0.2567884 0.5562807586    5      4
## 369 0.010283760 0.05816167 0.3322517 0.4819778213    5      4
## 371 0.013828980 0.07948887 0.2557849 0.8363540584    5      5
## 373 0.025552776 0.04751141 0.3453037 0.5676286538    5      5
## 389 0.401060743 0.06696648 0.8710832 0.0002170279    4      2
## 390 0.190435629 0.70251729 0.7959817 0.0001691107    4      4
## 396 0.013110844 0.03395022 0.2832280 0.6434339027    5      4
## 412 0.010453355 0.07250264 0.3477017 0.4890175242    5      5
## 413 0.022838093 0.11980322 0.2850946 0.4509584760    5      3
## 415 0.015099034 0.07044108 0.3880623 0.4447846657    5      4
## 422 0.020996662 0.16374428 0.2659642 0.3929502414    5      5
## 425 0.014993467 0.05997513 0.2833060 0.6277514807    5      5
## 434 0.032913781 0.09324043 0.2971854 0.3887153976    5      5
## 438 0.011007894 0.07678396 0.2865649 0.6834928365    5      4
## 441 0.034478404 0.12372785 0.3041702 0.4214714045    5      5
## 442 0.027514028 0.09393353 0.2236135 0.6375821569    5      5
## 445 0.022628329 0.11095636 0.3387803 0.3926556389    5      5
## 447 0.028791003 0.10879906 0.2874135 0.3763802965    5      3
## 453 0.064877540 0.27451927 0.3289878 0.2214745024    4      4
## 454 0.020513686 0.10966761 0.2744635 0.4524884635    5      5
## 462 0.011333396 0.05320518 0.2484347 0.7481929047    5      5
## 474 0.056392110 0.17138572 0.3007924 0.4043834265    5      3
## 476 0.015443359 0.13915284 0.2688888 0.4659042438    5      3
## 493 0.012787223 0.07781358 0.5132826 0.2268967542    4      5
## 502 0.018900264 0.11423706 0.4082753 0.3170469413    4      4
## 503 0.025793015 0.10766069 0.2613522 0.5215568683    5      5
## 506 0.019802225 0.15628183 0.2531119 0.4456369295    5      5
## 508 0.018801045 0.06494197 0.3392477 0.5664422488    5      5
## 512 0.029216750 0.14811447 0.3109878 0.4805260870    5      5
## 513 0.022825469 0.13159684 0.2962530 0.4778380383    5      5
## 521 0.060920217 0.13614955 0.2703392 0.3620449335    5      2
## 524 0.070055224 0.10341984 0.3857416 0.1918922306    4      5
# Confusion matrix: actual classes in rows, voted classes in columns.
CM <- table(Evaluation[["Actual"]], Evaluation[["Vote"]])
print(CM)
##    
##       3   4   5
##   2   0   4   3
##   3   0  14  11
##   4   2  27  33
##   5   1  13 100
#Proportions
# Number of evaluated validation rows and the support of each actual class.
Overall <- length(Evaluation$Actual)
Length2 <- sum(Evaluation$Actual == 2, na.rm = TRUE)
Length3 <- sum(Evaluation$Actual == 3, na.rm = TRUE)
Length4 <- sum(Evaluation$Actual == 4, na.rm = TRUE)
Length5 <- sum(Evaluation$Actual == 5, na.rm = TRUE)

# CM has actual classes in rows and voted classes in columns, but a class that
# never occurs on one side is missing from that side (here no row is ever
# voted into class 2). Expand CM to a square matrix indexed by class name so
# every cell is addressed by class instead of by position.
Classes <- union(rownames(CM), colnames(CM))
Square.CM <- matrix(
  0,
  nrow = length(Classes), ncol = length(Classes),
  dimnames = list(Classes, Classes)
)
Square.CM[rownames(CM), colnames(CM)] <- unclass(CM)

# Correct votes per class, actual counts (rows) and voted counts (columns).
Correct <- diag(Square.CM)
Rows <- rowSums(Square.CM)
Col <- colSums(Square.CM)

#Accuracy
# Share of correct votes, computed from the matrix rather than typed in.
Accuracy <- sum(Correct) / sum(Square.CM)

#Precision
# Precision of a class = correct votes / all votes for that class (column
# total). A class that was never voted for gets precision 0.
Precision.by.class <- ifelse(Col > 0, Correct / Col, 0)

#Recall
# Recall of a class = correct votes / all actual members (row total).
Recall.by.class <- ifelse(Rows > 0, Correct / Rows, 0)

# Support-weighted averages over all classes: each class is weighted by the
# number of actual observations it has.
Precision <- sum(Precision.by.class * Rows) / sum(Rows)
Recall <- sum(Recall.by.class * Rows) / sum(Rows)


# Print the three summary metrics; each "##" line below a name is the value
# echoed by the recorded run.
Accuracy
## [1] 0.6105769
Precision
##         3 
## 0.6105769
Recall
##         3 
## 0.5116014