PREPARATION

# NOTE(review): setwd() pins the session to a machine-specific directory. The
# reads in the visible part of this script pass their own "~/..." paths, so
# confirm whether anything still depends on this working directory.
setwd("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/2.Feature Set 1/TFIDF/10")
# One-time setup: uncomment to install the package before the first run.
#install.packages("naivebayes")
# NOTE(review): no naivebayes function is called in the visible part of this
# script -- confirm the package is still needed.
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.4.3
library(dplyr)
## Warning: Installed Rcpp (0.12.16) different from Rcpp used to build dplyr (0.12.11).
## Please reinstall dplyr to avoid random crashes or undefined behavior.
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(e1071)
library(readxl)

Import actual Labels.

#Import Labels
# Read the labelled source workbook and keep its Score column as the raw
# label vector that is recoded further down.
labels.file <- "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Naive Bayes/1.Labels/Source Data.xlsx"
Labels <- read_excel(labels.file)

Label <- Labels$Score

Import the TF-IDF feature set with a 10th-percentile cut-off.

#Import Features
# Load the feature table from CSV, then drop its first column so that only
# the remaining columns are used as features.
features.file <- "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/2.Feature Set 1/TFIDF/10/Feature Set 1 10th TFIDF.csv"
Features <- read.csv(features.file)

Features <- Features[-1L]

RECODE LABELS FOR ONE-VS-ALL

# Recode the raw scores in `Label` into the targets used for one-vs-all
# training: one 0/1 factor per class plus the combined multi-class factor
# `All`. Score pairs map to classes as 3-4 -> 2, 5-6 -> 3, 7-8 -> 4, 9-10 -> 5.
# The recoding is vectorised over the whole of `Label` instead of looping over
# a hard-coded 1:1000, so it follows the data if the number of rows changes.

# A missing score cannot be assigned to a class; fail early rather than
# silently recoding it as "not in class".
stopifnot(!anyNA(Label))

#Class 2
Label2 <- as.factor(ifelse(Label %in% c(3, 4), 1, 0))

#Class 3
Label3 <- as.factor(ifelse(Label %in% c(5, 6), 1, 0))

#Class 4
Label4 <- as.factor(ifelse(Label %in% c(7, 8), 1, 0))

#Class 5
Label5 <- as.factor(ifelse(Label %in% c(9, 10), 1, 0))

#All Labels
# NOTE(review): every score outside 5-10 falls into class 2 here, whereas
# Label2 marks only scores 3 and 4 as positive; the two disagree if scores
# below 3 occur -- confirm against the data.
All <- ifelse(
  Label %in% c(9, 10), 5,
  ifelse(
    Label %in% c(7, 8), 4,
    ifelse(Label %in% c(5, 6), 3, 2)
  )
)
#As Factor
All <- as.factor(All)

TRANSFORM FEATURES TO NUMERIC VARIABLES

#Transform every feature column to numeric
# Apply as.numeric() to each column, however many there are, instead of
# looping over a hard-coded column count. Assigning into `Features[]` keeps
# the object a data frame with its column names.
Features[] <- lapply(Features, as.numeric)
str(Features)
## 'data.frame':    1000 obs. of  2396 variables:
##  $ abit          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ abl           : num  0.0351 0 0 0 0 ...
##  $ about         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ abov          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ absolut       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accent        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accept        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ access        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accid         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accommod      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accomplish    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accur         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ acess         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ach           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ acknowledg    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ acomod        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ across        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ activ         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ actual        : num  0 0 0 0 0 ...
##  $ adaptor       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ add           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ addit         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adequ         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adjac         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adjust        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ador          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adult         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ advanc        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ advantag      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ adverti       : num  0 0 0 0 0.105 ...
##  $ advi          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ advic         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ affect        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ afford        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ afraid        : num  0 0 0 0.0753 0 ...
##  $ africa        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ after         : num  0 0 0 0 0.112 ...
##  $ afternoon     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ afterward     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ago           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ agr           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ agreeabl      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ahead         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ air           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ aircon        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ aircondit     : num  0 0 0 0 0 ...
##  $ airi          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ airless       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ airport       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alarm         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ albeit        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ albert        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ albrt         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alcohol       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ aldo          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alittl        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ all           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ allevi        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alloc         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ allow         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ almost        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ along         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alot          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alreadi       : num  0 0 0 0.062 0 ...
##  $ alright       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ also          : num  0 0 0 0.0933 0 ...
##  $ altern        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ although      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alway         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amaz          : num  0 0 0 0 0.0653 ...
##  $ ambianc       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ambienc       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amen          : num  0 0.107 0 0 0 ...
##  $ amend         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ american      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amount        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amsterdam     : num  0 0.0847 0 0.0463 0 ...
##  $ and           : num  0.031 0 0 0 0 ...
##  $ angl          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ann           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anna          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ annex         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ announc       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ annoy         : num  0.0357 0 0 0 0 ...
##  $ anoth         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ansterdam     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ answer        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ant           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anymor        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anyon         : num  0 0.102 0 0 0 ...
##  $ anyth         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anyway        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anywh         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ apart         : num  0 0 0 0 0 ...
##  $ apex          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ apolog        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ appal         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ appar         : num  0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]

PARTITIONING TRAINING & VALIDATION

#Features
# Draw a group id for every row of Features: 1 (training) with probability
# 0.8, 2 (validation) with probability 0.2. Rows are assigned independently,
# so the split is only approximately 80/20 and is not stratified by class.
# The fixed seed keeps the assignment reproducible.
set.seed(1234)
ind <- sample(2, nrow(Features), replace = TRUE, prob = c(0.8, 0.2))
train <- Features[ind == 1, ]
test <- Features[ind == 2, ]

Labels

# Split every label vector with the same row assignment `ind` that was used
# for the features, so features and labels stay aligned row by row.
in.train <- ind == 1
in.test <- ind == 2

# One-vs-all targets
train.labels.2 <- Label2[in.train]
train.labels.3 <- Label3[in.train]
train.labels.4 <- Label4[in.train]
train.labels.5 <- Label5[in.train]

test.labels.2 <- Label2[in.test]
test.labels.3 <- Label3[in.test]
test.labels.4 <- Label4[in.test]
test.labels.5 <- Label5[in.test]

# Combined multi-class labels
train.labels <- All[in.train]
test.labels <- All[in.test]

SVM MODEL

# One-vs-all SVMs. Each model gets its own copy of the training features with
# the matching 0/1 target attached as column `Score`.
train2 <- train
train3 <- train
train4 <- train
train5 <- train

train2$Score <- train.labels.2
train3$Score <- train.labels.3
train4$Score <- train.labels.4
train5$Score <- train.labels.5

# Fit the four models on the unscaled features with probability estimates
# enabled. The models are fitted in the order 2, 3, 4, 5.
#SVM2
SVM2 <- svm(Score ~ ., data = train2, scale = FALSE, probability = TRUE)
#SVM3
SVM3 <- svm(Score ~ ., data = train3, scale = FALSE, probability = TRUE)
#SVM4
SVM4 <- svm(Score ~ ., data = train4, scale = FALSE, probability = TRUE)
#SVM5
SVM5 <- svm(Score ~ ., data = train5, scale = FALSE, probability = TRUE)

# Predict on the validation features and pull the probability matrix out of
# the "probabilities" attribute of each prediction.
P2 <- predict(SVM2, newdata = test, probability = TRUE)
Prob2 <- attr(P2, "probabilities")

P3 <- predict(SVM3, newdata = test, probability = TRUE)
Prob3 <- attr(P3, "probabilities")

P4 <- predict(SVM4, newdata = test, probability = TRUE)
Prob4 <- attr(P4, "probabilities")

P5 <- predict(SVM5, newdata = test, probability = TRUE)
Prob5 <- attr(P5, "probabilities")

VOTING

The positive-class probabilities from the four one-vs-all models are used as input for the voting procedure: each observation is assigned the class whose model reports the highest probability.

# Collect the probability columns of the four one-vs-all models side by side.
# The columns of each probability matrix are picked by their level name
# ("0"/"1") in exactly the order that the labels below describe. The labels
# therefore stay correct whatever column order predict() returned, instead of
# relying on an assumed order.
Voting.df <- data.frame(
  Prob2[, c("1", "0")],
  Prob3[, c("0", "1")],
  Prob4[, c("0", "1")],
  Prob5[, c("0", "1")]
)
colnames(Voting.df) <- c("Class 2: 1","Class2: 0","Class 3: 0","Class3: 1","Class 4: 0","Class4: 1","Class 5: 0","Class5: 1")

head(Voting.df)
##    Class 2: 1 Class2: 0 Class 3: 0  Class3: 1 Class 4: 0 Class4: 1
## 5  0.03818164 0.9618184  0.8445282 0.15547176  0.7021886 0.2978114
## 14 0.02100280 0.9789972  0.9197674 0.08023261  0.6640827 0.3359173
## 16 0.01537635 0.9846237  0.9338277 0.06617229  0.7490503 0.2509497
## 26 0.03786958 0.9621304  0.9052764 0.09472360  0.7222275 0.2777725
## 28 0.03757358 0.9624264  0.7261978 0.27380224  0.7386476 0.2613524
## 29 0.02258985 0.9774101  0.8794324 0.12056755  0.6987523 0.3012477
##    Class 5: 0 Class5: 1
## 5   0.7101294 0.2898706
## 14  0.5587509 0.4412491
## 16  0.3921667 0.6078333
## 26  0.5494612 0.4505388
## 28  0.6831995 0.3168005
## 29  0.6243519 0.3756481
# Positions of the positive-class columns in Voting.df as labelled above.
# Kept for any code that still indexes Voting.df by position; the selection
# below no longer depends on it.
SEQ <- c(1,4,6,8)

# Keep one column per class: the probability that the observation belongs to
# that class. The "1" column of each probability matrix is selected by name,
# so the result does not depend on the column order of the matrices.
Transformed.Voting.df <- data.frame(
  "2" = Prob2[, "1"],
  "3" = Prob3[, "1"],
  "4" = Prob4[, "1"],
  "5" = Prob5[, "1"],
  check.names = FALSE
)
head(Transformed.Voting.df)
##             2          3         4         5
## 5  0.03818164 0.15547176 0.2978114 0.2898706
## 14 0.02100280 0.08023261 0.3359173 0.4412491
## 16 0.01537635 0.06617229 0.2509497 0.6078333
## 26 0.03786958 0.09472360 0.2777725 0.4505388
## 28 0.03757358 0.27380224 0.2613524 0.3168005
## 29 0.02258985 0.12056755 0.3012477 0.3756481
# Vote: for every validation row pick the class with the highest probability
# and store it next to the actual class.
Evaluation <- Transformed.Voting.df

# The columns of Transformed.Voting.df are named after the classes they stand
# for, so the winning column is translated to its class through the column
# names rather than by adding a fixed offset to the column position.
Classes.Voted <- as.numeric(colnames(Transformed.Voting.df))
Winner <- apply(as.matrix(Transformed.Voting.df), MARGIN = 1, which.max)
Index <- Classes.Voted[Winner]

Evaluation$Vote <- Index
Evaluation$Actual <- test.labels
head(Evaluation,100)
##               2          3         4            5 Vote Actual
## 5   0.038181639 0.15547176 0.2978114 0.2898706043    4      4
## 14  0.021002798 0.08023261 0.3359173 0.4412491293    5      5
## 16  0.015376345 0.06617229 0.2509497 0.6078333058    5      5
## 26  0.037869579 0.09472360 0.2777725 0.4505388267    5      4
## 28  0.037573580 0.27380224 0.2613524 0.3168005403    5      4
## 29  0.022589850 0.12056755 0.3012477 0.3756481325    5      4
## 39  0.030592838 0.08578453 0.2912015 0.5000000000    5      5
## 40  0.022718256 0.14801904 0.3590708 0.3162584448    4      3
## 60  0.021584213 0.10037105 0.2627470 0.5810768187    5      5
## 61  0.037346816 0.19031195 0.2956103 0.3686840196    5      3
## 72  0.016285025 0.04664604 0.2830236 0.5495992051    5      4
## 81  0.028686378 0.11588716 0.3421449 0.3748018829    5      3
## 86  0.035310003 0.12081539 0.3003863 0.4906887587    5      5
## 90  0.078577184 0.12554426 0.3098202 0.2126480606    4      4
## 92  0.028450195 0.15628750 0.3098685 0.2531634626    4      4
## 113 0.028079048 0.06466468 0.3424152 0.3718152615    5      5
## 116 0.043449758 0.19987941 0.2653301 0.3589459628    5      4
## 117 0.036180765 0.06489630 0.3839651 0.4476680001    5      5
## 122 0.047840831 0.10759261 0.3650936 0.1218210265    4      4
## 123 0.026338348 0.06340522 0.4188837 0.3402201879    4      2
## 124 0.023658222 0.10617956 0.3235465 0.3539933045    5      4
## 131 0.030976519 0.11635976 0.3210432 0.4094035596    5      4
## 135 0.039554055 0.13881523 0.3225515 0.2709346845    4      3
## 137 0.016810254 0.07230888 0.3035049 0.5818155076    5      5
## 140 0.047957859 0.12468294 0.3293954 0.3082099114    4      4
## 142 0.022866624 0.06239578 0.3429161 0.4486701816    5      5
## 149 0.038086860 0.07878263 0.3560355 0.4568854504    5      4
## 154 0.024955117 0.09565987 0.2388775 0.6187062782    5      5
## 156 0.045862279 0.08846731 0.3523094 0.3187385411    4      3
## 158 0.038118416 0.15811149 0.2855218 0.3452013961    5      3
## 169 0.025297698 0.05256913 0.2020893 0.7597796071    5      5
## 185 0.018420384 0.09643769 0.2694042 0.5384865791    5      5
## 187 0.016937782 0.10465724 0.3384917 0.4614470292    5      5
## 192 0.025561003 0.19243354 0.3318065 0.2666732811    4      3
## 194 0.183914654 0.66302172 0.2566762 0.1337540839    3      4
## 195 0.028642216 0.27268901 0.2862556 0.2325611879    4      4
## 196 0.047494316 0.11530312 0.3196075 0.3015560750    4      5
## 197 0.083686392 0.16828669 0.2540324 0.2350210628    4      3
## 199 0.019355150 0.06193176 0.2485947 0.7407440246    5      5
## 210 0.079609492 0.13935989 0.3278457 0.1669280418    4      3
## 216 0.038575151 0.07215352 0.1802531 0.7910278874    5      5
## 220 0.023421024 0.15750785 0.2718751 0.3719463302    5      4
## 227 0.041515876 0.06998549 0.2935571 0.4779714145    5      5
## 234 0.029683551 0.12380251 0.3483891 0.2976597973    4      3
## 240 0.029256972 0.06335722 0.5558158 0.1589525729    4      5
## 245 0.126979135 0.18737727 0.4262045 0.1669676038    4      4
## 249 0.028224759 0.11468586 0.3590326 0.3328329539    4      5
## 261 0.046702710 0.34236268 0.4000962 0.1928046873    4      3
## 277 0.018655848 0.03811587 0.2038610 0.8894263617    5      5
## 283 0.027473975 0.10502194 0.2748507 0.5091833088    5      5
## 290 0.008801241 0.01841556 0.1429213 0.9717877578    5      4
## 293 0.030828246 0.05560175 0.3831336 0.2792487262    4      5
## 302 0.018592285 0.18142852 0.2400851 0.4707115884    5      4
## 305 0.033248137 0.10901835 0.3880683 0.3121311613    4      4
## 308 0.015468679 0.09588874 0.3044043 0.1705840908    4      4
## 311 0.023644429 0.04727674 0.2573469 0.7410263225    5      5
## 320 0.018302995 0.03891481 0.1938830 0.9543266859    5      2
## 322 0.021873774 0.04423799 0.2320681 0.8206593118    5      5
## 330 0.009278512 0.04478738 0.1604451 0.7987634514    5      4
## 332 0.046892986 0.10013246 0.8406066 0.0300076477    4      4
## 333 0.021909013 0.02147178 0.3333679 0.8198486945    5      5
## 339 0.013315538 0.08892196 0.2214835 0.5727948564    5      5
## 341 0.038216983 0.07712122 0.4734935 0.1290986031    4      4
## 344 0.026875715 0.06202343 0.3294884 0.6529550570    5      5
## 349 0.021016254 0.04988112 0.1989856 0.9028061350    5      5
## 355 0.018285762 0.01332988 0.1626699 0.9717529770    5      5
## 356 0.053355157 0.10247983 0.3128616 0.3913178502    5      3
## 365 0.054011188 0.19906758 0.3378470 0.2181146196    4      3
## 366 0.045034492 0.10026741 0.2483308 0.5509598332    5      4
## 369 0.015739227 0.04076636 0.3353147 0.4789342605    5      4
## 371 0.017218976 0.07516175 0.2713630 0.8346406705    5      5
## 373 0.023295599 0.04465106 0.3500923 0.5687523356    5      5
## 389 0.361942664 0.09013756 0.8615288 0.0002280336    4      2
## 390 0.231439468 0.52479573 0.7573463 0.0001786078    4      4
## 396 0.019471918 0.04058484 0.2861237 0.6377289064    5      4
## 412 0.013315830 0.06796631 0.3506868 0.4904207841    5      5
## 413 0.028943176 0.10295357 0.2884834 0.4515013939    5      3
## 415 0.017119366 0.08247124 0.3929728 0.4460533436    5      4
## 422 0.023115135 0.13245157 0.2702334 0.3946163337    5      5
## 425 0.013637127 0.05456433 0.2836367 0.6359376191    5      5
## 434 0.041346739 0.08148757 0.3067403 0.3873253559    5      5
## 438 0.015091452 0.07856271 0.2871240 0.6849597820    5      4
## 441 0.037882644 0.10092857 0.3026317 0.4215633154    5      5
## 442 0.022724647 0.09515182 0.2195513 0.6422249499    5      5
## 445 0.027157425 0.09478386 0.3366993 0.3943161999    5      5
## 447 0.034689807 0.08931374 0.2818816 0.3747871404    5      3
## 453 0.074608643 0.25745808 0.3287340 0.2225987667    4      4
## 454 0.020599107 0.11123314 0.2778320 0.4540698942    5      5
## 462 0.012408731 0.04805849 0.2475114 0.7473467295    5      5
## 474 0.066166317 0.13238458 0.3003370 0.4036945023    5      3
## 476 0.015844746 0.11457258 0.2592295 0.4629135428    5      3
## 493 0.014972706 0.06105938 0.5054259 0.2273277347    4      5
## 502 0.019785371 0.11419798 0.4154750 0.3193647199    4      4
## 503 0.026248770 0.09176566 0.2636241 0.5204691506    5      5
## 506 0.018909295 0.15469834 0.2603761 0.4440069919    5      5
## 508 0.019438074 0.05825576 0.3576710 0.5678299111    5      5
## 512 0.028144990 0.12464845 0.3223665 0.4817929111    5      5
## 513 0.023266827 0.12454233 0.2960895 0.4780164914    5      5
## 521 0.089271103 0.12567625 0.2596351 0.3621024013    5      2
## 524 0.081589939 0.10234623 0.3810107 0.1933533250    4      5
# Confusion matrix: actual classes in the rows, voted classes in the columns.
CM <- table(Evaluation[["Actual"]], Evaluation[["Vote"]])
print(CM)
##    
##       3   4   5
##   2   0   4   3
##   3   0  14  11
##   4   1  29  32
##   5   1  13 100
#Proportions
# Number of evaluated observations and number of actual observations per
# class.
Overall <- length(Evaluation$Actual)
Length2 <- sum(Evaluation$Actual == 2, na.rm = TRUE)
Length3 <- sum(Evaluation$Actual == 3, na.rm = TRUE)
Length4 <- sum(Evaluation$Actual == 4, na.rm = TRUE)
Length5 <- sum(Evaluation$Actual == 5, na.rm = TRUE)

# CM has the actual classes in its rows and the voted classes in its columns,
# and it only has a column for a class that was voted for at least once. All
# cells are therefore looked up by class label, never by position.
stopifnot(sum(CM) > 0)
Classes <- rownames(CM)
Rows <- rowSums(CM)   # actual observations per class (support)
Col <- colSums(CM)    # voted observations per class

# Correctly classified observations per class (diagonal cells); 0 for a class
# that was never voted for.
Hits <- vapply(Classes, function(k) {
  if (k %in% colnames(CM)) CM[k, k] else 0
}, numeric(1))

# Number of votes per class, aligned with Classes; 0 if never voted for.
Predicted <- vapply(Classes, function(k) {
  if (k %in% names(Col)) Col[[k]] else 0
}, numeric(1))

#Accuracy
Accuracy <- sum(Hits) / sum(CM)

#Precision
# Precision = correct votes / all votes for the class (column totals). A class
# that was never voted for contributes 0. The per-class values are averaged
# with the class supports as weights.
PrecisionByClass <- ifelse(Predicted > 0, Hits / Predicted, 0)
Precision <- sum(PrecisionByClass * Rows) / sum(Rows)

#Recall
# Recall = correct votes / all actual members of the class (row totals).
# Weighted by support this is, by construction, equal to the accuracy.
RecallByClass <- ifelse(Rows > 0, Hits / Rows, 0)
Recall <- sum(RecallByClass * Rows) / sum(Rows)


Accuracy
## [1] 0.6201923
Precision
## [1] 0.5194657
Recall
## [1] 0.6201923