# Machine-specific working directory. The data files read further down are
# given by absolute path, so nothing visible in this script depends on it.
# NOTE(review): consider dropping this call or using a project-relative path.
setwd("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/2.Feature Set 1/TFIDF/90")
#install.packages("naivebayes")
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.4.3
library(dplyr)
## Warning: Installed Rcpp (0.12.16) different from Rcpp used to build dplyr (0.12.11).
## Please reinstall dplyr to avoid random crashes or undefined behavior.
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(e1071)
library(readxl)
Import actual labels.
# Import Labels ----
# Read the labelled source workbook and keep its Score column as the raw
# label vector used by the rest of the script.
Labels <- read_excel(
  path = "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Naive Bayes/1.Labels/Source Data.xlsx"
)
Label <- Labels$Score
Import the TFIDF feature set with a 90th percentile cut-off.
# Import Features ----
# Read the TF-IDF feature table, then discard its first column so that only
# the term columns remain.
Features <- read.csv(
  file = "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/2.Feature Set 1/TFIDF/90/Feature Set 1 90th TFIDF.csv"
)
Features <- Features[, -1, drop = FALSE]
# Derive the class labels from the raw review scores in `Label`.
# Scores are bucketed in pairs (3-4, 5-6, 7-8, 9-10) into classes 2..5. For
# each class a 0/1 one-vs-rest indicator factor is built, plus one multi-class
# factor (`All`) that holds the class number itself.
#
# The labels are used row-by-row alongside `Features`, so take exactly one
# score per feature row instead of relying on a hard-coded row count.
Score.raw <- Label[seq_len(nrow(Features))]
# A missing score cannot be bucketed; stop rather than silently mislabel it.
stopifnot(!anyNA(Score.raw))

#Class 2
Label2 <- as.factor(as.numeric(Score.raw %in% c(3, 4)))

#Class 3
Label3 <- as.factor(as.numeric(Score.raw %in% c(5, 6)))

#Class 4
Label4 <- as.factor(as.numeric(Score.raw %in% c(7, 8)))

#Class 5
Label5 <- as.factor(as.numeric(Score.raw %in% c(9, 10)))

#All Labels
# NOTE(review): every score outside 5-10 falls through to class 2 here, whereas
# Label2 only flags scores 3 and 4 -- confirm the data holds no other scores.
All <- rep(2, length(Score.raw))
All[Score.raw %in% c(5, 6)] <- 3
All[Score.raw %in% c(7, 8)] <- 4
All[Score.raw %in% c(9, 10)] <- 5
#As Factor
All <- as.factor(All)
# Coerce every feature column to numeric (double). Assigning into `Features[]`
# keeps the data-frame structure and column names, and covers however many
# columns the feature file has instead of a fixed count.
Features[] <- lapply(Features, as.numeric)
str(Features)
## 'data.frame': 1000 obs. of 268 variables:
## $ access : num 0 0 0 0 0 0 0 0 0 0 ...
## $ air : num 0 0 0 0 0 0 0 0 0 0 ...
## $ airport : num 0 0 0 0 0 0 0 0 0 0 ...
## $ all : num 0 0 0 0 0 0 0 0 0 0 ...
## $ also : num 0 0 0 0.0933 0 ...
## $ although : num 0 0 0 0 0 0 0 0 0 0 ...
## $ alway : num 0 0 0 0 0 0 0 0 0 0 ...
## $ amaz : num 0 0 0 0 0.0653 ...
## $ anyth : num 0 0 0 0 0 0 0 0 0 0 ...
## $ area : num 0 0 0 0.0285 0 ...
## $ around : num 0 0 0 0.0371 0 ...
## $ arriv : num 0.0239 0 0 0 0.064 ...
## $ ask : num 0.0239 0 0.1319 0 0 ...
## $ attent : num 0 0 0 0 0 0 0 0 0 0 ...
## $ away : num 0 0 0 0 0 0 0 0 0 0 ...
## $ awesom : num 0 0 0 0 0 ...
## $ back : num 0 0.0698 0.1376 0 0 ...
## $ bad : num 0 0 0 0.0417 0 ...
## $ bar : num 0 0 0.1048 0.0291 0 ...
## $ basement : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bath : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bathroom : num 0 0 0 0 0 0 0 0 0 0 ...
## $ beauti : num 0.0197 0 0 0 0 ...
## $ bed : num 0 0 0 0.0337 0 ...
## $ bedroom : num 0 0 0 0 0 0 0 0 0 0 ...
## $ best : num 0.031 0 0 0 0 ...
## $ better : num 0 0 0 0 0 0 0 0 0 0 ...
## $ big : num 0.0492 0 0 0 0 ...
## $ birthday : num 0 0 0 0 0.0999 ...
## $ bit : num 0 0.12 0.118 0 0 ...
## $ book : num 0.132 0 0 0 0.177 ...
## $ breakfast : num 0 0 0.0523 0 0 ...
## $ brilliant : num 0 0 0 0 0 0 0 0 0 0 ...
## $ broken : num 0.0365 0 0 0 0 ...
## $ buffet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ build : num 0 0 0 0.0603 0.0528 ...
## $ busi : num 0 0 0 0 0 0 0 0 0 0 ...
## $ can : num 0.0681 0.0636 0 0 0 ...
## $ center : num 0 0 0 0 0 0 0 0 0 0 ...
## $ centr : num 0 0 0 0 0 0 0 0 0 0 ...
## $ central : num 0 0 0 0 0 0 0 0 0 0 ...
## $ chang : num 0.0546 0 0 0 0 ...
## $ charg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ check : num 0.0495 0.0693 0 0 0 ...
## $ choic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ citi : num 0.0251 0 0 0 0 ...
## $ clean : num 0 0 0 0.04 0 ...
## $ close : num 0.0187 0 0 0 0 ...
## $ coff : num 0 0 0.138 0 0 ...
## $ cold : num 0 0 0 0 0 0 0 0 0 0 ...
## $ comfi : num 0 0 0 0 0 0 0 0 0 0 ...
## $ comfort : num 0 0 0 0 0 0 0 0 0 0 ...
## $ complain : num 0 0 0 0.0509 0 ...
## $ complet : num 0 0 0 0 0.0852 ...
## $ condit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ construct : num 0 0 0 0 0 0 0 0 0 0 ...
## $ conveni : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cook : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cool : num 0 0 0 0 0 0 0 0 0 0 ...
## $ couldn : num 0 0 0 0 0 0 0 0 0 0 ...
## $ court : num 0 0 0 0 0 0 0 0 0 0 ...
## $ day : num 0.0646 0 0 0.033 0 ...
## $ decor : num 0 0 0 0 0 0 0 0 0 0 ...
## $ definit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ delici : num 0 0 0 0 0 0 0 0 0 0 ...
## $ design : num 0 0 0 0 0 ...
## $ desk : num 0 0 0 0 0 0 0 0 0 0 ...
## $ didn : num 0 0 0 0 0.057 ...
## $ difficult : num 0 0 0.187 0 0 ...
## $ don : num 0.0296 0 0 0 0 ...
## $ door : num 0 0 0 0.0717 0 ...
## $ doubl : num 0.0286 0 0 0 0 ...
## $ drink : num 0 0 0 0 0 0 0 0 0 0 ...
## $ due : num 0.0286 0 0 0 0 ...
## $ earl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ easi : num 0 0 0 0 0 0 0 0 0 0 ...
## $ effici : num 0 0 0 0 0 0 0 0 0 0 ...
## $ enough : num 0 0 0 0 0 ...
## $ especi : num 0 0 0 0 0.0791 ...
## $ etc : num 0 0 0 0 0 0 0 0 0 0 ...
## $ euro : num 0 0 0 0 0 0 0 0 0 0 ...
## $ even : num 0.0424 0.0593 0 0 0 ...
## $ everyth : num 0 0 0 0 0 ...
## $ excel : num 0 0.0462 0 0 0 ...
## $ except : num 0 0 0 0 0 0 0 0 0 0 ...
## $ execut : num 0 0 0 0 0 0 0 0 0 0 ...
## $ expen : num 0 0 0 0 0 0 0 0 0 0 ...
## $ experi : num 0 0 0 0 0 0 0 0 0 0 ...
## $ extra : num 0 0 0 0 0 0 0 0 0 0 ...
## $ extrem : num 0 0 0 0 0 0 0 0 0 0 ...
## $ facil : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fantast : num 0 0.0744 0 0 0 ...
## $ far : num 0 0 0 0 0 0 0 0 0 0 ...
## $ feel : num 0 0 0 0 0 0 0 0 0 0 ...
## $ first : num 0 0.0714 0 0 0 ...
## $ floor : num 0.022 0 0 0.101 0 ...
## $ food : num 0 0.0572 0 0 0 ...
## $ free : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fresh : num 0 0 0 0 0 0 0 0 0 0 ...
## [list output truncated]
# Train/test split ----
# Each row is independently assigned to partition 1 (train, p = 0.8) or
# partition 2 (test, p = 0.2); the seed makes the draw reproducible.
# The same assignment vector subsets the features and every label vector, so
# rows and labels stay aligned.
set.seed(1234)
ind <- sample(2, nrow(Features), replace = TRUE, prob = c(0.8, 0.2))
train <- Features[ind == 1, ]
test <- Features[ind == 2, ]
# One-vs-rest labels for classes 2..5
train.labels.2 <- Label2[ind == 1]
test.labels.2 <- Label2[ind == 2]
train.labels.3 <- Label3[ind == 1]
test.labels.3 <- Label3[ind == 2]
train.labels.4 <- Label4[ind == 1]
test.labels.4 <- Label4[ind == 2]
train.labels.5 <- Label5[ind == 1]
test.labels.5 <- Label5[ind == 2]
# Multi-class labels
train.labels <- All[ind == 1]
test.labels <- All[ind == 2]
# One-vs-rest SVMs ----
# For each class k in 2..5: copy the training features, attach the 0/1
# indicator for class k as `Score`, and fit an SVM on the unscaled features
# with probability estimation switched on.

# SVM for class 2
train2 <- train
train2[["Score"]] <- train.labels.2
SVM2 <- svm(Score ~ ., data = train2, scale = FALSE, probability = TRUE)

# SVM for class 3
train3 <- train
train3[["Score"]] <- train.labels.3
SVM3 <- svm(Score ~ ., data = train3, scale = FALSE, probability = TRUE)

# SVM for class 4
train4 <- train
train4[["Score"]] <- train.labels.4
SVM4 <- svm(Score ~ ., data = train4, scale = FALSE, probability = TRUE)

# SVM for class 5
train5 <- train
train5[["Score"]] <- train.labels.5
SVM5 <- svm(Score ~ ., data = train5, scale = FALSE, probability = TRUE)

# Score the held-out rows with each model, requesting class probabilities.
P2 <- predict(SVM2, newdata = test, probability = TRUE)
P3 <- predict(SVM3, newdata = test, probability = TRUE)
P4 <- predict(SVM4, newdata = test, probability = TRUE)
P5 <- predict(SVM5, newdata = test, probability = TRUE)

# The probability matrices travel as an attribute of the prediction objects.
Prob2 <- attr(P2, "probabilities")
Prob3 <- attr(P3, "probabilities")
Prob4 <- attr(P4, "probabilities")
Prob5 <- attr(P5, "probabilities")
Use probabilities as an input for the voting procedure. Choose the class with the highest probability.
# Put the four probability matrices side by side, one pair of columns per
# one-vs-rest model, and label each column with its model and outcome.
# NOTE(review): the labels are fixed by hand, so they assume the column order
# of each probability matrix (outcome 1 first for class 2, second for the
# other classes) -- confirm against colnames(Prob2) etc. if the data or seed
# changes.
Voting.df <- data.frame(Prob2, Prob3, Prob4, Prob5)
names(Voting.df) <- c(
  "Class 2: 1", "Class2: 0",
  "Class 3: 0", "Class3: 1",
  "Class 4: 0", "Class4: 1",
  "Class 5: 0", "Class5: 1"
)
head(Voting.df)
## Class 2: 1 Class2: 0 Class 3: 0 Class3: 1 Class 4: 0 Class4: 1
## 5 0.04885169 0.9511483 0.8603780 0.13962197 0.7468755 0.2531245
## 14 0.02774571 0.9722543 0.9051723 0.09482768 0.6400074 0.3599926
## 16 0.02528299 0.9747170 0.9038296 0.09617038 0.7624328 0.2375672
## 26 0.03513537 0.9648646 0.8828457 0.11715428 0.7020542 0.2979458
## 28 0.02662306 0.9733769 0.7654968 0.23450319 0.7518647 0.2481353
## 29 0.02489697 0.9751030 0.8735787 0.12642128 0.6944193 0.3055807
## Class 5: 0 Class5: 1
## 5 0.6868148 0.3131852
## 14 0.5972668 0.4027332
## 16 0.3910458 0.6089542
## 26 0.5744081 0.4255919
## 28 0.6413601 0.3586399
## 29 0.6205540 0.3794460
# Keep only the positive-outcome ("1") probability of each one-vs-rest model.
# The position of the "1" column inside a probability matrix is not fixed, so
# look it up by column name rather than hard-coding it.
Prob.list <- list(Prob2, Prob3, Prob4, Prob5)
Positive.col <- vapply(
  Prob.list,
  function(p) match("1", colnames(p)),
  integer(1)
)
# Every model must expose a "1" column; otherwise the vote would be meaningless.
stopifnot(!anyNA(Positive.col))
# Voting.df holds the matrices side by side in the same order, so each model's
# columns start right after the columns of the models before it.
Col.offset <- cumsum(c(0L, vapply(Prob.list, ncol, integer(1))))[seq_along(Prob.list)]
SEQ <- Col.offset + Positive.col
Transformed.Voting.df <- Voting.df[SEQ]
# Name each column after the class its model votes for.
colnames(Transformed.Voting.df) <- c("2", "3", "4", "5")
head(Transformed.Voting.df)
## 2 3 4 5
## 5 0.04885169 0.13962197 0.2531245 0.3131852
## 14 0.02774571 0.09482768 0.3599926 0.4027332
## 16 0.02528299 0.09617038 0.2375672 0.6089542
## 26 0.03513537 0.11715428 0.2979458 0.4255919
## 28 0.02662306 0.23450319 0.2481353 0.3586399
## 29 0.02489697 0.12642128 0.3055807 0.3794460
# Voting ----
# For every test row pick the model with the highest positive-class
# probability. which.max gives the winning column position (1..4); the
# columns stand for classes 2..5, hence the shift by one.
Index <- as.numeric(
  apply(X = Transformed.Voting.df, MARGIN = 1, FUN = which.max)
) + 1

# Collect the probabilities, the voted class and the true class side by side.
Evaluation <- Transformed.Voting.df
Evaluation[["Vote"]] <- Index
Evaluation[["Actual"]] <- test.labels
head(Evaluation, n = 100)
## 2 3 4 5 Vote Actual
## 5 0.048851689 0.13962197 0.2531245 0.3131852426 5 4
## 14 0.027745711 0.09482768 0.3599926 0.4027332225 5 5
## 16 0.025282992 0.09617038 0.2375672 0.6089541751 5 5
## 26 0.035135366 0.11715428 0.2979458 0.4255919202 5 4
## 28 0.026623060 0.23450319 0.2481353 0.3586398766 5 4
## 29 0.024896973 0.12642128 0.3055807 0.3794459851 5 4
## 39 0.036536618 0.12424408 0.2888428 0.4612490148 5 5
## 40 0.035644377 0.12323624 0.3283707 0.3505065109 5 3
## 60 0.029408125 0.09154327 0.2712927 0.6158405073 5 5
## 61 0.051942141 0.14855692 0.2971700 0.3837800502 5 3
## 72 0.019718569 0.10571773 0.3094063 0.5142240078 5 4
## 81 0.021781236 0.15234691 0.2885612 0.4038576564 5 3
## 86 0.040817876 0.12311165 0.2884011 0.4744722074 5 5
## 90 0.067847027 0.13577413 0.2917451 0.2791338510 4 4
## 92 0.041583049 0.11751313 0.3040554 0.3013099746 4 4
## 113 0.036300932 0.11350701 0.3278792 0.3701295504 5 5
## 116 0.044133424 0.12091228 0.3039658 0.3940207368 5 4
## 117 0.049930287 0.07249002 0.3616510 0.4076591590 5 5
## 122 0.051663587 0.10171284 0.3020654 0.1674888718 4 4
## 123 0.024064351 0.08677774 0.3889846 0.2851514819 4 2
## 124 0.033283393 0.13255552 0.3104288 0.3295816876 5 4
## 131 0.023889227 0.11900661 0.3098417 0.4582640960 5 4
## 135 0.039383987 0.13523788 0.2942044 0.3164267907 5 3
## 137 0.016532450 0.08273318 0.3044123 0.5302957495 5 5
## 140 0.048896451 0.10015208 0.3626998 0.3409149006 4 4
## 142 0.026067860 0.08799262 0.3041699 0.5000000000 5 5
## 149 0.025286992 0.13028532 0.3140719 0.4370963571 5 4
## 154 0.030788708 0.11691151 0.2284364 0.5642755009 5 5
## 156 0.046506981 0.11395710 0.3373108 0.3559665822 5 3
## 158 0.059433902 0.13542878 0.2710934 0.3828574060 5 3
## 169 0.037464298 0.09769099 0.2264764 0.7292931912 5 5
## 185 0.030784259 0.08644208 0.2749857 0.5000000000 5 5
## 187 0.027673133 0.11803391 0.2780658 0.4802115610 5 5
## 192 0.029014122 0.14023549 0.3565525 0.2655653626 4 3
## 194 0.285376955 0.20966591 0.2289379 0.2397666583 2 4
## 195 0.032563728 0.12747305 0.2895316 0.3223163917 5 4
## 196 0.043812747 0.12981392 0.2729996 0.3466345979 5 5
## 197 0.032954490 0.15030899 0.2729243 0.2887154439 5 3
## 199 0.025662717 0.08612619 0.2899891 0.7325962547 5 5
## 210 0.051784746 0.14537557 0.3024988 0.2033648978 4 3
## 216 0.039140476 0.08141991 0.2035340 0.7802598447 5 5
## 220 0.035204821 0.14863718 0.2510545 0.3626297741 5 4
## 227 0.035804817 0.09962221 0.3008678 0.4503308965 5 5
## 234 0.035224975 0.13314629 0.3300683 0.3214264667 4 3
## 240 0.034768282 0.08754134 0.5492909 0.1630282680 4 5
## 245 0.060786248 0.14471316 0.3610944 0.2042830197 4 4
## 249 0.039774930 0.15139285 0.2927583 0.3494543342 5 5
## 261 0.039420221 0.13548024 0.4101987 0.2396298680 4 3
## 277 0.026004215 0.07797192 0.2167359 0.8716678772 5 5
## 283 0.032632695 0.10891841 0.2935209 0.5000000000 5 5
## 290 0.007695373 0.03532522 0.1807015 0.9664885353 5 4
## 293 0.030570066 0.08538436 0.3394252 0.3276588569 4 5
## 302 0.017716884 0.13161495 0.2741405 0.5118427627 5 4
## 305 0.035667484 0.11901602 0.3222612 0.3534082204 5 4
## 308 0.033923830 0.14276317 0.2390172 0.1702995706 4 4
## 311 0.037346808 0.09548608 0.2737962 0.6877678800 5 5
## 320 0.010940518 0.06948904 0.1910065 0.9456523686 5 2
## 322 0.023888585 0.08286825 0.2122464 0.7979931778 5 5
## 330 0.019085387 0.08041492 0.2094494 0.7440514704 5 4
## 332 0.041388900 0.11899486 0.8285014 0.0391859853 4 4
## 333 0.031269309 0.05868737 0.3432452 0.8000304480 5 5
## 339 0.012312252 0.11425930 0.3414204 0.4930447976 5 5
## 341 0.032470212 0.11255587 0.4853232 0.1241306871 4 4
## 344 0.040705955 0.07542003 0.2772083 0.6505276677 5 5
## 349 0.024071618 0.06035080 0.2220790 0.8907913770 5 5
## 355 0.033124961 0.03109111 0.1901577 0.9618078218 5 5
## 356 0.030796402 0.13054699 0.2674326 0.4272684139 5 3
## 365 0.048344331 0.19930491 0.3001073 0.2431249581 4 3
## 366 0.024222984 0.11219326 0.2843450 0.5166032150 5 4
## 369 0.020542735 0.06503988 0.3831389 0.3890316789 5 4
## 371 0.024731375 0.08087753 0.2288317 0.8141855561 5 5
## 373 0.038572859 0.10269661 0.2972136 0.5804506445 5 5
## 389 0.035259077 0.11223026 0.8182538 0.0002804723 4 2
## 390 0.144617347 0.27687532 0.5529048 0.0003362382 4 4
## 396 0.033604821 0.06774184 0.2762720 0.6190074074 5 4
## 412 0.013650242 0.10569122 0.3461724 0.4945300356 5 5
## 413 0.033173766 0.11182884 0.2575729 0.4612524911 5 3
## 415 0.016784184 0.11871202 0.4164798 0.3994187212 4 4
## 422 0.034594112 0.10761905 0.2713939 0.4546613067 5 5
## 425 0.022516500 0.10788824 0.2571681 0.5801644876 5 5
## 434 0.025022107 0.10167848 0.3115039 0.3921660944 5 5
## 438 0.026097190 0.10953294 0.2872527 0.6784594023 5 4
## 441 0.036130759 0.10960426 0.3120295 0.4057825631 5 5
## 442 0.023161919 0.10963241 0.2124435 0.6642278775 5 5
## 445 0.032884813 0.11498510 0.3397893 0.4110311362 5 5
## 447 0.033216845 0.11091875 0.3033982 0.3580821038 5 3
## 453 0.037114733 0.18916646 0.2907818 0.3106796215 5 4
## 454 0.026184094 0.11553035 0.2835597 0.4466551262 5 5
## 462 0.022765854 0.07083547 0.2786164 0.7346099402 5 5
## 474 0.054665854 0.08137064 0.3007620 0.4098527811 5 3
## 476 0.030106481 0.13942062 0.2574207 0.5000000000 5 3
## 493 0.022504694 0.09402321 0.4688219 0.2448016290 4 5
## 502 0.021622399 0.14917779 0.3902132 0.3329139630 4 4
## 503 0.032147540 0.09894937 0.2702238 0.5166714694 5 5
## 506 0.029381837 0.12797009 0.2680221 0.4267302424 5 5
## 508 0.033996134 0.06842316 0.3461537 0.5938929754 5 5
## 512 0.038367371 0.10771707 0.3024371 0.4824878707 5 5
## 513 0.032314168 0.10613563 0.3140933 0.3923348220 5 5
## 521 0.057494234 0.10453153 0.3212953 0.4030173539 5 2
## 524 0.065021048 0.11180550 0.3340279 0.2297464362 4 5
# Confusion matrix: actual classes in the rows, voted classes in the columns.
# Vote is numeric, so a class that is never voted for gets no column.
CM <- table(Evaluation[["Actual"]], Evaluation[["Vote"]])
CM
##
## 2 4 5
## 2 0 4 3
## 3 0 10 15
## 4 1 21 40
## 5 0 8 106
# Evaluation metrics ----
# Everything is computed from a square confusion matrix (rows = actual class,
# columns = voted class) built over the union of observed classes. A class
# that is never voted for therefore still has a column, and the diagonal always
# holds the correctly classified counts.
Actual.chr <- as.character(Evaluation$Actual)
Vote.chr <- as.character(Evaluation$Vote)
Classes <- sort(union(Actual.chr, Vote.chr))
CM.square <- table(
  factor(Actual.chr, levels = Classes),
  factor(Vote.chr, levels = Classes)
)

#Proportions
Overall <- length(Evaluation$Actual)
Length2 <- length(which(Evaluation$Actual == 2))
Length3 <- length(which(Evaluation$Actual == 3))
Length4 <- length(which(Evaluation$Actual == 4))
Length5 <- length(which(Evaluation$Actual == 5))

# Correctly classified rows per class, actual counts and voted counts.
Correct <- diag(CM.square)
Rows <- rowSums(CM.square)
Col <- colSums(CM.square)

#Accuracy
Accuracy <- sum(Correct) / sum(CM.square)

#Precision
# Share of the rows voted into a class that truly belong to it. A class that
# is never voted for has no defined precision and contributes 0.
Precision.by.class <- ifelse(Col > 0, Correct / Col, 0)
# Average over classes, weighted by the number of actual rows per class.
Precision <- sum(Precision.by.class * Rows) / sum(Rows)

#Recall
# Share of the rows of a class that were voted into it. A class with no
# actual rows contributes 0.
Recall.by.class <- ifelse(Rows > 0, Correct / Rows, 0)
Recall <- sum(Recall.by.class * Rows) / sum(Rows)

Accuracy
## [1] 0.6105769
Precision
## [1] 0.4998173
Recall
## [1] 0.6105769