library(tokenizers)
## Warning: package 'tokenizers' was built under R version 3.4.4
library(tm)
## Warning: package 'tm' was built under R version 3.4.3
## Loading required package: NLP
library(ngram)
## Warning: package 'ngram' was built under R version 3.4.3
library(NLP)
library(stringr)
# Run from the subsetting step's directory; the relative Excel file names
# written later in this script are resolved against it.
setwd(dir = "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Feature Set3/Code/5. Subsetting")

# Read the tagged-reviews workbook and keep its AttenuatorTaggedReviews column
# as a list with one element per row of the sheet.
TaggedReviews <- readxl::read_excel(
  path = "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Feature Set4/Input/9.Reviews with Booster and Attenuator Tagging.xlsx"
)
Destination.Text <- as.list(TaggedReviews$AttenuatorTaggedReviews)
# Since the negation words are already tagged, a subset without negations has been created for further tagging.
# Build the untagged text set: for every review, delete each whitespace-
# delimited token that ends in _NOT, _LOW or _HIGH, then collapse the
# whitespace left behind. The patterns are applied one after another, in this
# order, to each review.
tag_patterns <- c("\\S+_NOT\\b", "\\S+_LOW\\b", "\\S+_HIGH\\b")

# Iterate over whatever Destination.Text contains instead of a fixed row count,
# so a workbook with more or fewer reviews is processed completely.
POS_Set <- lapply(Destination.Text, function(review) {
  cleaned <- unlist(review)
  for (tag_pattern in tag_patterns) {
    cleaned <- gsub(pattern = tag_pattern, replacement = "", x = cleaned)
  }
  stripWhitespace(cleaned)
})
POS_Set[[1]]
## [1] "I am so angry that i made this post available via all possible sites i use when planing my trips so will make the mistake of booking this place"

# The first column is an all-NA placeholder, kept so that POS_Text remains the
# second column of the written sheet.
df <- data.frame(Placeholder = rep(NA, length(POS_Set)))
# Store the cleaned text as a plain character column; a list column is not
# reliably handled by table/spreadsheet writers. vapply() also asserts that
# every review produced exactly one string.
df$POS_Text <- vapply(POS_Set, as.character, character(1))
WriteXLS::WriteXLS(df, ExcelFileName = "10.POS Set.xlsx")
# For each tag type, pull the tagged word out of every review and record the
# position of the review it came from, then write one workbook per tag type.
# str_extract() returns only the FIRST match per review (NA when there is
# none), so a review contributes at most one word per tag type.
# NOTE(review): if every tagged word per review is wanted, this would need
# str_extract_all() - confirm the intent.

# str_extract() expects a character vector; flatten the list of reviews first.
review_text <- as.character(unlist(Destination.Text))

Tagged.Words.Neg <- str_extract(string = review_text, pattern = "\\S+_NOT\\b")
Tagged.Words.Boost <- str_extract(string = review_text, pattern = "\\S+_HIGH\\b")
Tagged.Words.Atte <- str_extract(string = review_text, pattern = "\\S+_LOW\\b")

# Logical masks of the reviews that contain a tag of each type. The words and
# their indices are both derived from the same mask so they always line up.
has_neg <- !is.na(Tagged.Words.Neg)
has_boost <- !is.na(Tagged.Words.Boost)
has_atte <- !is.na(Tagged.Words.Atte)

# Logical subsetting instead of x[-which(is.na(x))]: the latter returns an
# empty vector when there is no NA at all (negative indexing with integer(0)).
Tagged.Words.Clean1 <- Tagged.Words.Neg[has_neg]
Tagged.Words.Clean2 <- Tagged.Words.Boost[has_boost]
Tagged.Words.Clean3 <- Tagged.Words.Atte[has_atte]

# Positions (within Destination.Text) of the reviews holding each tagged word.
Index1 <- which(has_neg)
Index2 <- which(has_boost)
Index3 <- which(has_atte)

# Each table is sized from the data rather than a fixed count. The first column
# is a 1..n row counter, kept so that Index and Tagged.Words remain the second
# and third columns of the written sheet.

# Negations
Neg.df <- data.frame(Row = seq_along(Index1))
Neg.df$Index <- Index1
Neg.df$Tagged.Words <- Tagged.Words.Clean1

# Boosters
Boost.df <- data.frame(Row = seq_along(Index2))
Boost.df$Index <- Index2
Boost.df$Tagged.Words <- Tagged.Words.Clean2

# Attenuators
Att.df <- data.frame(Row = seq_along(Index3))
Att.df$Index <- Index3
Att.df$Tagged.Words <- Tagged.Words.Clean3

WriteXLS::WriteXLS(Neg.df, ExcelFileName = "11. Neg Words and Index.xlsx")
WriteXLS::WriteXLS(Boost.df, ExcelFileName = "11. Boost Words and Index.xlsx")
WriteXLS::WriteXLS(Att.df, ExcelFileName = "11. Attenuator Words and Index.xlsx")