# Point the session at this experiment's folder. The two imports further down
# use absolute "~/..." paths, so this only matters for relative reads/writes.
# NOTE(review): machine-specific path; setwd() errors if the directory does not
# exist, so the script stops here on any other machine.
setwd("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/2.Feature Set 1/TFIDF/70")
#install.packages("naivebayes")
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.4.3
library(dplyr)
## Warning: Installed Rcpp (0.12.16) different from Rcpp used to build dplyr (0.12.11).
## Please reinstall dplyr to avoid random crashes or undefined behavior.
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(e1071)
library(readxl)
Import actual labels.
#Import Labels
# Read the label spreadsheet and keep its `Score` column as the raw label
# vector from which every class target below is derived.
Labels <- readxl::read_excel(
  path = "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Naive Bayes/1.Labels/Source Data.xlsx"
)
Label <- Labels$Score
Import TFIDF feature set with a 70th percentile cut-off.
#Import Features
# Read the feature table, then discard its first column.
# NOTE(review): the first column is presumably a row index written by the
# export step -- confirm against the CSV.
Features <- read.csv(
  file = "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/SVM/2.Feature Set 1/TFIDF/70/Feature Set 1 70th TFIDF.csv"
)
Features <- Features[, -1, drop = FALSE]
#Class 2
# One-vs-rest target for class 2: 1 when the raw score is 3 or 4, otherwise 0.
# Computed over the whole `Label` vector rather than a fixed 1:1000 range, so
# the target always has exactly one entry per imported label.
# The element-wise loop used to stop on a missing score; keep that fail-fast
# behaviour instead of letting NA leak into the target.
stopifnot(!anyNA(Label))
Label2 <- as.numeric(Label == 3 | Label == 4)
#As Factor
Label2 <- as.factor(Label2)
#Class 3
# One-vs-rest target for class 3: 1 when the raw score is 5 or 6, otherwise 0.
# Computed over the whole `Label` vector rather than a fixed 1:1000 range, so
# the target always has exactly one entry per imported label.
# The element-wise loop used to stop on a missing score; keep that fail-fast
# behaviour instead of letting NA leak into the target.
stopifnot(!anyNA(Label))
Label3 <- as.numeric(Label == 5 | Label == 6)
#As Factor
Label3 <- as.factor(Label3)
#Class 4
# One-vs-rest target for class 4: 1 when the raw score is 7 or 8, otherwise 0.
# Computed over the whole `Label` vector rather than a fixed 1:1000 range, so
# the target always has exactly one entry per imported label.
# The element-wise loop used to stop on a missing score; keep that fail-fast
# behaviour instead of letting NA leak into the target.
stopifnot(!anyNA(Label))
Label4 <- as.numeric(Label == 7 | Label == 8)
#As Factor
Label4 <- as.factor(Label4)
#Class 5
# One-vs-rest target for class 5: 1 when the raw score is 9 or 10, otherwise 0.
# Computed over the whole `Label` vector rather than a fixed 1:1000 range, so
# the target always has exactly one entry per imported label.
# The element-wise loop used to stop on a missing score; keep that fail-fast
# behaviour instead of letting NA leak into the target.
stopifnot(!anyNA(Label))
Label5 <- as.numeric(Label == 9 | Label == 10)
#As Factor
Label5 <- as.factor(Label5)
#All Labels
# Multi-class ground truth: scores 9-10 -> 5, 7-8 -> 4, 5-6 -> 3, and every
# other score -> 2. Computed over the whole `Label` vector rather than a fixed
# 1:1000 range, so there is exactly one class per imported label.
# NOTE(review): the fallback class 2 also absorbs any score outside 3-4
# (e.g. 1 or 2), whereas Label2 only flags scores 3 and 4 -- confirm that is
# intended.
# The element-wise loop used to stop on a missing score; keep that fail-fast
# behaviour instead of silently assigning NA scores to class 2.
stopifnot(!anyNA(Label))
All <- rep(2, length(Label))
# The three score bands are disjoint, so the assignment order does not matter.
All[Label == 5 | Label == 6] <- 3
All[Label == 7 | Label == 8] <- 4
All[Label == 9 | Label == 10] <- 5
#As Factor
All <- as.factor(All)
#Transform every feature column to numeric
# Coerces all columns (however many the feature file contains) with
# as.numeric(); `Features[] <-` keeps the object a data.frame with its
# row and column names.
# NOTE(review): as.numeric() on a factor column yields level codes, not the
# printed values -- fine as long as read.csv parsed every column as a number.
Features[] <- lapply(Features, as.numeric)
str(Features)
## 'data.frame': 1000 obs. of 802 variables:
## $ abl : num 0.0351 0 0 0 0 ...
## $ absolut : num 0 0 0 0 0 0 0 0 0 0 ...
## $ accent : num 0 0 0 0 0 0 0 0 0 0 ...
## $ access : num 0 0 0 0 0 0 0 0 0 0 ...
## $ accommod : num 0 0 0 0 0 0 0 0 0 0 ...
## $ actual : num 0 0 0 0 0 ...
## $ adequ : num 0 0 0 0 0 0 0 0 0 0 ...
## $ adult : num 0 0 0 0 0 0 0 0 0 0 ...
## $ advanc : num 0 0 0 0 0 0 0 0 0 0 ...
## $ adverti : num 0 0 0 0 0.105 ...
## $ advi : num 0 0 0 0 0 0 0 0 0 0 ...
## $ advic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ affect : num 0 0 0 0 0 0 0 0 0 0 ...
## $ air : num 0 0 0 0 0 0 0 0 0 0 ...
## $ aircondit : num 0 0 0 0 0 ...
## $ airi : num 0 0 0 0 0 0 0 0 0 0 ...
## $ airport : num 0 0 0 0 0 0 0 0 0 0 ...
## $ alarm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ all : num 0 0 0 0 0 0 0 0 0 0 ...
## $ alloc : num 0 0 0 0 0 0 0 0 0 0 ...
## $ allow : num 0 0 0 0 0 0 0 0 0 0 ...
## $ also : num 0 0 0 0.0933 0 ...
## $ although : num 0 0 0 0 0 0 0 0 0 0 ...
## $ alway : num 0 0 0 0 0 0 0 0 0 0 ...
## $ amaz : num 0 0 0 0 0.0653 ...
## $ amen : num 0 0.107 0 0 0 ...
## $ amsterdam : num 0 0.0847 0 0.0463 0 ...
## $ and : num 0.031 0 0 0 0 ...
## $ anna : num 0 0 0 0 0 0 0 0 0 0 ...
## $ annoy : num 0.0357 0 0 0 0 ...
## $ anoth : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ant : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anyon : num 0 0.102 0 0 0 ...
## $ anyth : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anyway : num 0 0 0 0 0 0 0 0 0 0 ...
## $ apart : num 0 0 0 0 0 ...
## $ appal : num 0 0 0 0 0 0 0 0 0 0 ...
## $ appear : num 0 0 0 0 0 0 0 0 0 0 ...
## $ appoint : num 0 0 0 0 0 0 0 0 0 0 ...
## $ approach : num 0 0 0 0 0 0 0 0 0 0 ...
## $ architectur : num 0 0 0 0 0 0 0 0 0 0 ...
## $ area : num 0 0 0 0.0285 0 ...
## $ arena : num 0 0 0 0 0 0 0 0 0 0 ...
## $ around : num 0 0 0 0.0371 0 ...
## $ arriv : num 0.0239 0 0 0 0.064 ...
## $ ask : num 0.0239 0 0.1319 0 0 ...
## $ aspect : num 0 0 0 0 0 0 0 0 0 0 ...
## $ athmosph : num 0 0 0 0 0 0 0 0 0 0 ...
## $ atmosph : num 0 0 0 0 0 0 0 0 0 0 ...
## $ attent : num 0 0 0 0 0 0 0 0 0 0 ...
## $ attitud : num 0 0 0 0 0 0 0 0 0 0 ...
## $ attract : num 0 0 0 0 0 0 0 0 0 0 ...
## $ atttent : num 0 0 0 0 0 0 0 0 0 0 ...
## $ avail : num 0.031 0 0 0 0 ...
## $ averag : num 0 0 0 0 0 0 0 0 0 0 ...
## $ away : num 0 0 0 0 0 0 0 0 0 0 ...
## $ awesom : num 0 0 0 0 0 ...
## $ back : num 0 0.0698 0.1376 0 0 ...
## $ bacon : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bad : num 0 0 0 0.0417 0 ...
## $ bag : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bang : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bar : num 0 0 0.1048 0.0291 0 ...
## $ bare : num 0 0 0 0 0 0 0 0 0 0 ...
## $ base : num 0 0 0 0 0 0 0 0 0 0 ...
## $ basement : num 0 0 0 0 0 0 0 0 0 0 ...
## $ basic : num 0 0 0.187 0 0 ...
## $ bath : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bathroom : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bathtub : num 0 0 0 0 0 0 0 0 0 0 ...
## $ beauti : num 0.0197 0 0 0 0 ...
## $ bed : num 0 0 0 0.0337 0 ...
## $ bedroom : num 0 0 0 0 0 0 0 0 0 0 ...
## $ best : num 0.031 0 0 0 0 ...
## $ better : num 0 0 0 0 0 0 0 0 0 0 ...
## $ big : num 0.0492 0 0 0 0 ...
## $ bigger : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bike : num 0 0 0 0 0 0 0 0 0 0 ...
## $ birthday : num 0 0 0 0 0.0999 ...
## $ bit : num 0 0.12 0.118 0 0 ...
## $ black : num 0 0 0 0 0 0 0 0 0 0 ...
## $ blanket : num 0 0 0 0 0 0 0 0 0 0 ...
## $ blind : num 0 0 0 0 0 0 0 0 0 0 ...
## $ block : num 0 0 0 0 0 0 0 0 0 0 ...
## $ board : num 0 0 0 0 0 0 0 0 0 0 ...
## $ boiler : num 0 0 0.241 0 0 ...
## $ bonus : num 0 0 0 0 0 0 0 0 0 0 ...
## $ book : num 0.132 0 0 0 0.177 ...
## $ box : num 0 0 0 0 0 0 0 0 0 0 ...
## $ boyfriend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bread : num 0 0 0 0 0 0 0 0 0 0 ...
## $ breakfast : num 0 0 0.0523 0 0 ...
## $ bright : num 0 0 0 0 0 ...
## $ brilliant : num 0 0 0 0 0 0 0 0 0 0 ...
## $ broken : num 0.0365 0 0 0 0 ...
## $ brought : num 0 0 0 0 0 0 0 0 0 0 ...
## $ buffet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ build : num 0 0 0 0.0603 0.0528 ...
## $ bus : num 0 0 0 0 0 0 0 0 0 0 ...
## [list output truncated]
#Features
# Random train/test split: each row is independently assigned to partition 1
# (train, probability 0.8) or partition 2 (test, probability 0.2). The same
# assignment vector `ind` is applied to the features and to every label
# vector so rows and labels stay aligned.
set.seed(1234)
n_obs <- nrow(Features)
# Logical indexing silently recycles or pads with NA on a length mismatch, so
# make sure every label vector has one entry per feature row first.
stopifnot(
  length(Label2) == n_obs,
  length(Label3) == n_obs,
  length(Label4) == n_obs,
  length(Label5) == n_obs,
  length(All) == n_obs
)
ind <- sample(2, n_obs, replace = TRUE, prob = c(0.8, 0.2))
is_train <- ind == 1
is_test <- ind == 2
train <- Features[is_train, ]
test <- Features[is_test, ]
train.labels.2 <- Label2[is_train]
test.labels.2 <- Label2[is_test]
train.labels.3 <- Label3[is_train]
test.labels.3 <- Label3[is_test]
train.labels.4 <- Label4[is_train]
test.labels.4 <- Label4[is_test]
train.labels.5 <- Label5[is_train]
test.labels.5 <- Label5[is_test]
train.labels <- All[is_train]
test.labels <- All[is_test]
#SVM2
# One binary SVM per class (2, 3, 4, 5). Each model is trained on a copy of
# the training features with that class's 0/1 target attached as `Score`.
# Features are passed unscaled, and probability = TRUE makes svm() fit the
# probability model that predict() needs for class probabilities.
train2 <- train
train3 <- train
train4 <- train
train5 <- train
train2[["Score"]] <- train.labels.2
train3[["Score"]] <- train.labels.3
train4[["Score"]] <- train.labels.4
train5[["Score"]] <- train.labels.5
# The models are fitted in the order 2, 3, 4, 5.
SVM2 <- svm(Score ~ ., data = train2, scale = FALSE, probability = TRUE)
SVM3 <- svm(Score ~ ., data = train3, scale = FALSE, probability = TRUE)
SVM4 <- svm(Score ~ ., data = train4, scale = FALSE, probability = TRUE)
SVM5 <- svm(Score ~ ., data = train5, scale = FALSE, probability = TRUE)
# Score the test rows with each binary model and pull out the per-row class
# probabilities that predict() attaches as the "probabilities" attribute.
# Class 2
P2 <- predict(SVM2, newdata = test, probability = TRUE)
Prob2 <- attr(P2, "probabilities")
# Class 3
P3 <- predict(SVM3, newdata = test, probability = TRUE)
Prob3 <- attr(P3, "probabilities")
# Class 4
P4 <- predict(SVM4, newdata = test, probability = TRUE)
Prob4 <- attr(P4, "probabilities")
# Class 5
P5 <- predict(SVM5, newdata = test, probability = TRUE)
Prob5 <- attr(P5, "probabilities")
Use probabilities as an input for the voting procedure. Choose the class with the highest probability.
# Put the four probability matrices side by side, one row per test observation.
Voting.df <- data.frame(Prob2, Prob3, Prob4, Prob5)
# Label every column "Class <k>: <level>" using the level names each matrix
# actually carries. The level order is not the same for every model (class 2
# lists "1" first, the others list "0" first), so fixed labels could end up on
# the wrong column after a different split.
class_ids <- c(2, 3, 4, 5)
prob_mats <- list(Prob2, Prob3, Prob4, Prob5)
colnames(Voting.df) <- unlist(Map(
  function(id, m) paste0("Class ", id, ": ", colnames(m)),
  class_ids, prob_mats
))
head(Voting.df)
## Class 2: 1 Class 2: 0 Class 3: 0 Class 3: 1 Class 4: 0 Class 4: 1
## 5 0.04668780 0.9533122 0.8558887 0.14411134 0.7103309 0.2896691
## 14 0.02789076 0.9721092 0.9009667 0.09903335 0.6708631 0.3291369
## 16 0.02231681 0.9776832 0.9134637 0.08653633 0.7620809 0.2379191
## 26 0.03822932 0.9617707 0.8972014 0.10279863 0.7229603 0.2770397
## 28 0.04522425 0.9547758 0.7486731 0.25132687 0.7304311 0.2695689
## 29 0.02281651 0.9771835 0.9090137 0.09098627 0.6992499 0.3007501
## Class 5: 0 Class 5: 1
## 5 0.6937458 0.3062542
## 14 0.5793271 0.4206729
## 16 0.3920841 0.6079159
## 26 0.5436353 0.4563647
## 28 0.6696402 0.3303598
## 29 0.6164656 0.3835344
# Keep, for each one-vs-rest model, only the probability of its positive
# level ("1"). The position of that column is looked up by name in each
# probability matrix, because the level order differs between models, and is
# then shifted by the number of columns the preceding matrices occupy in
# Voting.df.
prob_mats <- list(Prob2, Prob3, Prob4, Prob5)
pos_col <- vapply(prob_mats, function(m) match("1", colnames(m)), integer(1))
# Every model must expose a "1" column; otherwise the vote would be meaningless.
stopifnot(!anyNA(pos_col))
col_offset <- cumsum(c(0L, vapply(prob_mats, ncol, integer(1))))[seq_along(prob_mats)]
SEQ <- col_offset + pos_col
Transformed.Voting.df <- Voting.df[SEQ]
colnames(Transformed.Voting.df) <- c("2","3","4","5")
head(Transformed.Voting.df)
## 2 3 4 5
## 5 0.04668780 0.14411134 0.2896691 0.3062542
## 14 0.02789076 0.09903335 0.3291369 0.4206729
## 16 0.02231681 0.08653633 0.2379191 0.6079159
## 26 0.03822932 0.10279863 0.2770397 0.4563647
## 28 0.04522425 0.25132687 0.2695689 0.3303598
## 29 0.02281651 0.09098627 0.3007501 0.3835344
# Voting: for every test row pick the column with the highest positive-class
# probability (which.max keeps the first one on ties). Columns 1..4 hold
# classes "2".."5", so adding 1 turns the column position into the class label.
Index <- 1 + as.numeric(
  apply(Transformed.Voting.df, MARGIN = 1, FUN = which.max)
)
# Evaluation table: the four probabilities, the voted class and the true class.
Evaluation <- Transformed.Voting.df
Evaluation[["Vote"]] <- Index
Evaluation[["Actual"]] <- test.labels
head(Evaluation, n = 100)
## 2 3 4 5 Vote Actual
## 5 0.04668780 0.14411134 0.2896691 0.3062542204 5 4
## 14 0.02789076 0.09903335 0.3291369 0.4206728690 5 5
## 16 0.02231681 0.08653633 0.2379191 0.6079159098 5 5
## 26 0.03822932 0.10279863 0.2770397 0.4563647356 5 4
## 28 0.04522425 0.25132687 0.2695689 0.3303598085 5 4
## 29 0.02281651 0.09098627 0.3007501 0.3835344186 5 4
## 39 0.03305619 0.08995137 0.2843293 0.5000000000 5 5
## 40 0.02633637 0.13788211 0.3373378 0.3307879032 4 3
## 60 0.02506159 0.09503349 0.2573851 0.5741836575 5 5
## 61 0.03975009 0.18223615 0.2906639 0.3734977224 5 3
## 72 0.01994690 0.05119684 0.2934963 0.5417508529 5 4
## 81 0.03143006 0.11354223 0.3182546 0.3793170679 5 3
## 86 0.02751853 0.12779138 0.3020795 0.5000000000 5 5
## 90 0.05727079 0.12494698 0.3026338 0.2190275477 4 4
## 92 0.02565524 0.14301612 0.3009383 0.2655729266 4 4
## 113 0.03109249 0.07842272 0.3184917 0.3724311041 5 5
## 116 0.05143052 0.16958643 0.2620891 0.3680350127 5 4
## 117 0.02540218 0.07802040 0.3561931 0.4453888427 5 5
## 122 0.04046065 0.13339775 0.3338676 0.1280884357 4 4
## 123 0.02711407 0.07182488 0.4050457 0.3404043601 4 2
## 124 0.02716514 0.10528435 0.3043763 0.3659000305 5 4
## 131 0.02683660 0.10843195 0.3384214 0.4157019993 5 4
## 135 0.04359338 0.12392758 0.3018791 0.2841510135 4 3
## 137 0.01275128 0.08412494 0.3040230 0.5775762988 5 5
## 140 0.03537626 0.14052454 0.3157547 0.3085613075 4 4
## 142 0.02289964 0.05836258 0.3509510 0.4605295809 5 5
## 149 0.02427140 0.07416831 0.3408025 0.4643272878 5 4
## 154 0.02292391 0.11320711 0.2374482 0.6173066013 5 5
## 156 0.04091144 0.09106159 0.3364255 0.3342467407 4 3
## 158 0.04944712 0.16794116 0.2749314 0.3457240152 5 3
## 169 0.02486484 0.06386500 0.2193025 0.7552772077 5 5
## 185 0.02985575 0.08746761 0.2528275 0.5312441380 5 5
## 187 0.01999354 0.09820672 0.3224571 0.4668851100 5 5
## 192 0.03087044 0.14285503 0.3293447 0.2759630565 4 3
## 194 0.22853524 0.55359697 0.2495643 0.1380200886 3 4
## 195 0.02273870 0.25864157 0.2752491 0.2376984946 4 4
## 196 0.04005926 0.11846741 0.3074524 0.3203900323 5 5
## 197 0.07093429 0.15648532 0.2570035 0.2428793629 4 3
## 199 0.02062199 0.06611638 0.2651854 0.7504706727 5 5
## 210 0.08050338 0.14196916 0.2920090 0.1817187798 4 3
## 216 0.03136323 0.07251198 0.1890957 0.7852045677 5 5
## 220 0.03204837 0.14822793 0.2609382 0.3889631056 5 4
## 227 0.03608857 0.06667880 0.2827288 0.4819063474 5 5
## 234 0.02949519 0.12055925 0.3737274 0.2991850406 4 3
## 240 0.01772800 0.06583320 0.5501839 0.1613092824 4 5
## 245 0.09536381 0.14800760 0.3859078 0.1741048164 4 4
## 249 0.03138296 0.12058163 0.3531731 0.3323475578 4 5
## 261 0.02840585 0.39398858 0.3894780 0.1945946236 3 3
## 277 0.02252080 0.04964714 0.2096614 0.8838221303 5 5
## 283 0.02384335 0.13904064 0.2957860 0.5251045637 5 5
## 290 0.01069258 0.02909132 0.1639922 0.9686232380 5 4
## 293 0.03384441 0.05876396 0.3711986 0.2995474619 4 5
## 302 0.01829217 0.11093310 0.2443212 0.4929561782 5 4
## 305 0.04194273 0.10962057 0.3861270 0.3049839382 4 4
## 308 0.02838902 0.12245262 0.3136746 0.1742143171 4 4
## 311 0.03250225 0.06052099 0.2554438 0.7265893016 5 5
## 320 0.01503806 0.06112370 0.2038668 0.9509761656 5 2
## 322 0.02534775 0.04280721 0.2250635 0.8149021274 5 5
## 330 0.02146590 0.05644429 0.1770706 0.7877506562 5 4
## 332 0.03409575 0.09671216 0.8181575 0.0314774332 4 4
## 333 0.02446328 0.03011527 0.3337695 0.8139345417 5 5
## 339 0.01859570 0.13880927 0.2480972 0.5669771971 5 5
## 341 0.04289687 0.07842570 0.4859418 0.1268576573 4 4
## 344 0.03067522 0.06194310 0.3062120 0.6478734617 5 5
## 349 0.01866830 0.06145658 0.2150480 0.8961124715 5 5
## 355 0.01623829 0.02562462 0.1711914 0.9678454260 5 5
## 356 0.04652645 0.10894022 0.3106021 0.3929103029 5 3
## 365 0.07226638 0.15768169 0.3066844 0.2198171737 4 3
## 366 0.03073180 0.06779255 0.2545362 0.5556347003 5 4
## 369 0.01566237 0.05495422 0.3386898 0.4530046005 5 4
## 371 0.01583100 0.09257094 0.2606355 0.8258508221 5 5
## 373 0.03353986 0.05400197 0.3426102 0.5626151739 5 5
## 389 0.10949805 0.07885363 0.8506742 0.0002591507 4 2
## 390 0.24229518 0.56121064 0.7664688 0.0002053102 4 4
## 396 0.02368806 0.04066344 0.2863621 0.6342019755 5 4
## 412 0.01733431 0.08659807 0.3591604 0.4860565893 5 5
## 413 0.02210061 0.11287010 0.2781288 0.4480579888 5 3
## 415 0.01850532 0.08176998 0.3910118 0.4457900910 5 4
## 422 0.02315691 0.13633230 0.2753871 0.4115202386 5 5
## 425 0.02111399 0.07055621 0.2816232 0.6353375182 5 5
## 434 0.03401370 0.08461636 0.3141675 0.3833170543 5 5
## 438 0.01560432 0.08454230 0.2874751 0.6813725189 5 4
## 441 0.03061994 0.10647435 0.2997158 0.4221386926 5 5
## 442 0.02584918 0.10413704 0.2278435 0.6349063724 5 5
## 445 0.02709484 0.10258863 0.3424323 0.3921436117 5 5
## 447 0.03447430 0.09742633 0.2695485 0.3914068171 5 3
## 453 0.03450107 0.22960758 0.3011368 0.2643526126 4 4
## 454 0.02098104 0.11076878 0.2840189 0.4430664930 5 5
## 462 0.01557827 0.05990222 0.2510160 0.7465997249 5 5
## 474 0.04925204 0.11410126 0.2966320 0.4364747398 5 3
## 476 0.02773956 0.10271579 0.2704870 0.4700992521 5 3
## 493 0.01236140 0.08012259 0.5000000 0.2278779018 4 5
## 502 0.02029159 0.12543068 0.3936436 0.3281053883 4 4
## 503 0.02963062 0.09239408 0.2597120 0.5257826390 5 5
## 506 0.02601148 0.16713533 0.2590329 0.4435236961 5 5
## 508 0.02479170 0.06225357 0.3461246 0.5708906748 5 5
## 512 0.02813992 0.09843015 0.3047279 0.5000000000 5 5
## 513 0.02622257 0.12327240 0.2956174 0.4740116580 5 5
## 521 0.08358890 0.11891853 0.2527259 0.3629107250 5 2
## 524 0.08583100 0.08851636 0.3542563 0.2120163269 4 5
# Confusion matrix: rows are the actual classes, columns are the voted classes.
# A class that is never voted for gets no column.
CM <- table(Evaluation[["Actual"]], Evaluation[["Vote"]])
print(CM)
##
## 3 4 5
## 2 0 4 3
## 3 1 13 11
## 4 1 26 35
## 5 1 11 102
#Proportions
# Number of test observations overall and per actual class.
Overall <- length(Evaluation$Actual)
Length2 <- sum(Evaluation$Actual == 2, na.rm = TRUE)
Length3 <- sum(Evaluation$Actual == 3, na.rm = TRUE)
Length4 <- sum(Evaluation$Actual == 4, na.rm = TRUE)
Length5 <- sum(Evaluation$Actual == 5, na.rm = TRUE)
# Square confusion matrix (rows = actual, columns = voted) over the same class
# set on both axes. A class that is never voted for keeps an all-zero column,
# so the diagonal always pairs a class with itself.
classes <- sort(union(levels(Evaluation$Actual), as.character(Evaluation$Vote)))
CM.square <- table(
  factor(as.character(Evaluation$Actual), levels = classes),
  factor(as.character(Evaluation$Vote), levels = classes)
)
Correct <- diag(CM.square)
Actual.total <- rowSums(CM.square) # observations that truly belong to each class
Voted.total <- colSums(CM.square)  # observations voted into each class
#Accuracy
# Share of correct votes, computed from the matrix rather than typed-in counts.
Accuracy <- sum(Correct) / sum(CM.square)
#Precision
# Per class: correct votes / all votes for that class (column totals).
# A class that was never voted for gets precision 0.
Precision.by.class <- ifelse(Voted.total > 0, Correct / Voted.total, 0)
# Average weighted by the number of actual observations per class.
Precision <- sum(Precision.by.class * Actual.total) / sum(Actual.total)
#Recall
# Per class: correct votes / all actual members of that class (row totals).
Recall.by.class <- ifelse(Actual.total > 0, Correct / Actual.total, 0)
# With these weights the average recall is by construction equal to Accuracy.
Recall <- sum(Recall.by.class * Actual.total) / sum(Actual.total)
Accuracy
## [1] 0.6201923
Precision
## [1] 0.5538068
Recall
## [1] 0.6201923