Subsetting

library(tokenizers)

## Warning: package 'tokenizers' was built under R version 3.4.4

library(tm)

## Warning: package 'tm' was built under R version 3.4.3

## Loading required package: NLP

library(ngram)

## Warning: package 'ngram' was built under R version 3.4.3

library(NLP)
library(stringr)

# Set the working directory. The WriteXLS calls later in this script pass
# relative file names, so their output is written into this folder.
# NOTE(review): the path is specific to one machine - confirm it exists
# before running elsewhere.
setwd("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Feature Set3/Code/5. Subsetting")

IMPORT DATA

# Read the tagged-reviews workbook, then keep the AttenuatorTaggedReviews
# column as a list holding one element per row.
tagged_reviews_path <- "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Feature Set4/Input/9.Reviews with Booster and Attenuator Tagging.xlsx"
TaggedReviews <- readxl::read_excel(path = tagged_reviews_path)

Destination.Text <- as.list(TaggedReviews$AttenuatorTaggedReviews)

Create POS Subset Without Negations

Since the negation words are already tagged, a subset without negations (and without the tagged booster and attenuator words) has been created for further tagging.

# Build the POS subset: in every review, delete each token that carries a
# _NOT, _LOW or _HIGH tag and collapse the whitespace left behind.
# The reviews are iterated directly instead of over a fixed 1:4735 range, so
# the step keeps working when the number of reviews changes, and lapply()
# allocates the result list once instead of growing it element by element.
POS_Set <- lapply(Destination.Text, function(review) {
  Extract <- unlist(review)
  # One removal pass per tag suffix, in the order NOT, LOW, HIGH.
  for (tag in c("NOT", "LOW", "HIGH")) {
    Extract <- gsub(
      pattern = paste0("\\S+_", tag, "\\b"),
      replacement = "",
      x = Extract
    )
  }
  stripWhitespace(Extract)
})

# Print the first cleaned review as a spot check.
POS_Set[[1]]

## [1] "I am so angry that i made this post available via all possible sites i use when planing my trips so will make the mistake of booking this place"

# Wrap the cleaned reviews in a data frame and export it to Excel.
# The row count follows POS_Set instead of a fixed 4735, so the list-column
# assignment below cannot fail on a length mismatch when the input changes.
# The leading all-NA column is retained so POS_Text stays the second column
# of the exported sheet; it now has an explicit header rather than one
# auto-generated from the constructing expression.
df <- data.frame(Placeholder = rep(NA, length(POS_Set)))
df$POS_Text <- POS_Set

WriteXLS::WriteXLS(df, ExcelFileName = "10.POS Set.xlsx")

Extract Negations

# Extract the first tagged word of each kind from every review.
# str_extract() yields one value per review: the first match of the pattern,
# or NA when the review contains no word with that tag.
Tagged.Words.Neg <- str_extract(string = Destination.Text, pattern = "\\S+_NOT\\b")
Tagged.Words.Boost <- str_extract(string = Destination.Text, pattern = "\\S+_HIGH\\b")
Tagged.Words.Atte <- str_extract(string = Destination.Text, pattern = "\\S+_LOW\\b")

# Positions of the reviews in which a tagged word was found.
Index1 <- which(!is.na(Tagged.Words.Neg))
Index2 <- which(!is.na(Tagged.Words.Boost))
Index3 <- which(!is.na(Tagged.Words.Atte))

# Keep only the found words. Selecting by the positive positions above is
# safe when nothing is NA, whereas x[-which(is.na(x))] would then return an
# empty vector (a negated empty index selects nothing). It also guarantees
# that each Clean vector lines up element by element with its Index vector.
Tagged.Words.Clean1 <- Tagged.Words.Neg[Index1]
Tagged.Words.Clean2 <- Tagged.Words.Boost[Index2]
Tagged.Words.Clean3 <- Tagged.Words.Atte[Index3]

# Build one table per tag type pairing each extracted word with the position
# of the review it came from, then export the three tables to Excel.
# Table sizes are derived from the index vectors instead of the fixed counts
# 364 / 79 / 76, so the tables still build when the input data changes.

# Assemble a three-column table: running row number, review position, word.
# The running number stays as the first column to preserve the sheet layout;
# it carries an explicit header instead of an auto-generated one.
build_tag_frame <- function(index, words) {
  stopifnot(length(index) == length(words))
  data.frame(
    Row = seq_along(index),
    Index = index,
    Tagged.Words = words,
    stringsAsFactors = FALSE # keep the words as character, not factor
  )
}

# Negations
Neg.df <- build_tag_frame(Index1, Tagged.Words.Clean1)

# Boosters
Boost.df <- build_tag_frame(Index2, Tagged.Words.Clean2)

# Attenuators
Att.df <- build_tag_frame(Index3, Tagged.Words.Clean3)

WriteXLS::WriteXLS(Neg.df, ExcelFileName = "11. Neg Words and Index.xlsx")
WriteXLS::WriteXLS(Boost.df, ExcelFileName = "11. Boost Words and Index.xlsx")
WriteXLS::WriteXLS(Att.df, ExcelFileName = "11. Attenuator Words and Index.xlsx")

Subsetting

Lisa

7/14/2018

IMPORT DATA

Create POS Subset Without Negations

Extract Negations