#install.packages("tokenizers")
#install.packages("FSelectorRcpp")
library(NLP)
library(FSelectorRcpp)
library(tokenizers)
library(tm)
library(SnowballC)
library(stats)
library(ngram)
library(stringr)
library(readxl)
setwd("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Feature Set4/Code/2.Attenuator/1. Booster and Attenuator Detection")
Data <- read_excel("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Feature Set4/Input/5.Reviews with Booster Tagging.xlsx")
Text <- as.character(Data$BoosterTaggedReviews)
Text <- as.list(Text)
Activation <- read_excel("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Feature Set4/Input/1.Boosters and Attenuators.xlsx")
First, a list of common attenuators is retrieved to spot any sentence containing attenuation.
#Get List of Attenuators
Attenuator <- as.list(Activation$Tentative)
#Asterisk: positions of wildcard entries (terms ending in "*")
Asterix <- str_extract(Attenuator, "\\*$")
Asterix <- as.list(which(!is.na(Asterix)))
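As a quick illustration with hypothetical terms (stand-ins for entries of the Tentative list), str_extract returns the trailing asterisk for wildcard terms and NA otherwise, so which(!is.na(...)) yields their positions:
#Example (hypothetical terms, not taken from the actual list)
Example.Terms <- c("almost*", "apparently", "maybe*")
str_extract(Example.Terms, "\\*$")                  # "*" NA "*"
which(!is.na(str_extract(Example.Terms, "\\*$")))   # 1 3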
#Other: positions of exact-match entries (no trailing asterisk)
Other <- list()
for(z in seq_along(Attenuator)){
  if(z %in% Asterix) next
  Other[[z]] <- z
}
Other <- as.list(unlist(Other))
#Adjust format (Asterisk): wildcard terms become prefix patterns
Attenuators.Clean <- Attenuator
for (j in Asterix){
  Extract <- Attenuator[[j]]
  #Drop the trailing "*" and anchor the pattern at the start of the word
  Stem <- sub("\\*$", "", Extract)
  Attenuators.Clean[[j]] <- paste("^", Stem, sep = "")
}
#Adjust format (Other): exact terms must match the whole word
for (k in Other){
  Extract <- Attenuator[[k]]
  Attenuators.Clean[[k]] <- paste("^", Extract, "$", sep = "")
}
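As an illustration with hypothetical terms (not necessarily in the actual list), an exact entry such as "perhaps" becomes the whole-word pattern "^perhaps$", while a wildcard entry such as "almost*" becomes the prefix pattern "^almost":
#Example (hypothetical terms): whole-word vs. prefix matching
grepl("^perhaps$", c("perhaps", "perhapses"))   # TRUE FALSE
grepl("^almost", c("almost", "almostly"))       # TRUE TRUE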
trim.leading <- function (x){
  #Remove leading whitespace and return the trimmed string
  sub("^\\s+", "", x)
}
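For example, the helper removes leading but not trailing whitespace:
#Example usage of trim.leading
trim.leading("   maybe it helps ")   # "maybe it helps "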
In this step the reviews are checked for attenuators. For every word, the code tests whether it matches any element of the attenuator pattern list and returns a logical value. The results are stored in a two-dimensional list structure: the first level is the list of reviews, and each review element contains a list of word-level logical flags.
Reviews <- list()
for (k in seq_along(Text)){   #4735 reviews in the corpus
  #Extract the review and split it into single words
  Extract <- Text[[k]]
  Words <- as.list(unlist(strsplit(Extract, split = " ")))
  Result <- list()
  for(l in seq_along(Words)){
    #TRUE if the word matches any attenuator pattern
    Word <- Words[[l]]
    Result[[l]] <- any(sapply(Attenuators.Clean, grepl, Word))
  }
  Reviews[[k]] <- Result
}
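A minimal sketch of the word-level test, using hypothetical patterns and a hypothetical word list (not taken from the data):
#Example (hypothetical patterns and words): word-level attenuator flags
Example.Patterns <- list("^perhaps$", "^almost")
Example.Words <- list("it", "almost", "works")
sapply(Example.Words, function(w) any(sapply(Example.Patterns, grepl, w)))   # FALSE TRUE FALSE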
In order to retrieve the affected sentences, the two-dimensional list structure containing the logical elements was used to determine the “location” of the Attenuator in the text. The Review IDs of the matches were saved.
First, it was determined which reviews contained an Attenuator:
Attenuator.Final <- list()
for (m in seq_along(Reviews)){
  #TRUE if any word in the review was flagged as an attenuator
  Extract <- Reviews[[m]]
  Attenuator.Final[[m]] <- any(Extract == TRUE)
}
Second, the Review IDs were saved in the “Index.List” structure.
Attenuator.Index.List <- which(unlist(Attenuator.Final) == TRUE)
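This works because which() on the flattened logical vector returns the positions of the TRUE entries, i.e. the IDs of the flagged reviews. A minimal illustration:
#Example: which() turns review-level flags into review IDs
which(c(FALSE, TRUE, FALSE, TRUE))   # 2 4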
Next, the index list was used to retrieve the reviews containing Attenuators; those reviews were stored in the “Attenuator.Text” column of a data frame.
Extract <- Text[Attenuator.Index.List]
#623 reviews contain at least one attenuator
df <- data.frame(Attenuator.Text = unlist(Extract), stringsAsFactors = FALSE)
Finally, the reviews containing Attenuators are exported as an Excel file for further processing.
WriteXLS::WriteXLS(df,ExcelFileName = "6. Attenuator Fragments.xlsx")
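As an optional sanity check (not part of the original pipeline), the exported file can be read back to verify that the number of exported reviews matches the index list:
#Optional check, assuming the file was written to the current working directory
Check <- readxl::read_excel("6. Attenuator Fragments.xlsx")
nrow(Check) == length(Attenuator.Index.List)   # should be TRUE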
In order to be able to reinsert the tagged sentences, the Review IDs of the reviews containing Attenuators are extracted as well.
df.Vec <- data.frame(Attenuator.Index.List)
Those indices are exported as an Excel file.
WriteXLS::WriteXLS(df.Vec,ExcelFileName = "6. Booster Sentence Indicators.xlsx")