PREPROCESSING

1. Preparation

#install.packages("tokenizers")
#install.packages("FSelectorRcpp")
library(NLP)
library(FSelectorRcpp)
library(tokenizers)

## Warning: package 'tokenizers' was built under R version 3.4.4

library(tm)

## Warning: package 'tm' was built under R version 3.4.3

library(SnowballC)
library(stats)
library(ngram)

## Warning: package 'ngram' was built under R version 3.4.3

library(stringr)
library(readxl)

# Work from the booster-detection folder: the Excel exports written further
# down use bare file names and therefore land in this directory.
setwd("~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Feature Set4/Code/1.Booster/1. Booster and Attenuator Detection")

# Load the tagged reviews and keep the `Tagged` column as a list holding one
# character string per row.
Data <- read_excel(
  path = "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Feature Set4/Input/1.Reviews with Negation Tagging.xlsx"
)
Text <- as.list(as.character(Data$Tagged))

# Workbook whose `Certainty` column supplies the booster terms.
Activation <- read_excel(
  path = "~/Google Drive/UM/Smart Services/Thesis/Thesis/Code/Feature Set4/Input/1.Boosters and Attenuators.xlsx"
)

BOOSTER DETECTION

2. Booster List

First, the list of boosters is retrieved so that any review containing a booster can be spotted.

#Get List of Boosters
# One list element per entry of the Certainty column.
Booster <- as.list(Activation$Certainty)

#Asterix
# Positions of the entries whose last character is a literal "*".
# str_extract() yields NA where the pattern is absent, so the non-NA
# positions are exactly the starred entries.
Asterix <- as.list(which(!is.na(str_extract(Booster, "\\*$"))))

#Other
# Positions of all booster entries that do NOT end in "*", i.e. every index
# of `Booster` that is not listed in `Asterix`.
# Computed as a set difference over the real length of `Booster` rather than
# looping over a fixed 1:178 and collecting print(z); the result is the same
# list of integer indices, without the console output and without breaking
# when the booster list changes size.
Other <- as.list(setdiff(seq_along(Booster), unlist(Asterix)))

#Adjust format (Asterix)
# Turn every booster entry into an anchored regular expression.
# Entries listed in `Asterix` end in "*" and become prefix patterns ("^stem");
# entries listed in `Other` become whole-word patterns ("^word$").
Boosters.Clean <- Booster

for (j in Asterix) {
  # Drop the trailing "*" before anchoring. Left in place, the regex engine
  # reads it as "zero or more of the preceding character", so a pattern such
  # as "^real*" would also match unrelated words like "read" or "reach".
  stem <- sub("\\*$", "", Booster[[j]])
  Boosters.Clean[[j]] <- paste0("^", stem)
}

#Adjust format (Other)
for (k in Other) {
  # Anchor on both sides so only the exact word matches.
  Boosters.Clean[[k]] <- paste0("^", Booster[[k]], "$")
}

3. Trim Leading Function

# Remove leading whitespace from each element of `x`.
#
# x:       character vector (or anything sub() can coerce to one).
# Returns: `x` with the run of whitespace at the start of each element
#          removed; everything after the first non-whitespace character is
#          left untouched.
trim.leading <- function(x) {
  # Return the value directly instead of print()-ing it, so that assigning
  # the result or calling the helper inside a loop does not write to the
  # console. A bare top-level call still auto-prints as before.
  sub("^\\s+", "", x)
}

4. Identify Boosters

In this step the reviews are checked for boosters. The function checks each word to determine whether it matches any element of the booster list and returns a logical value. The results are stored in a two-dimensional list structure: the first level is a list of sentences, and each sentence element contains a list with one logical value per word.

# Flag every word of every review that matches a booster pattern.
# Result: `Reviews` is a list with one element per entry of `Text`; each
# element is itself a list holding one logical value per space-separated
# token of that review (TRUE when any pattern in `Boosters.Clean` matches).
#
# Iterating over the tokens actually produced by strsplit() - rather than
# over 1:4735 and 1:wordcount() - keeps the loop correct when the number of
# reviews changes, when a review is empty (1:0 would index a non-existent
# word), and when the word count and the token count disagree.
Reviews <- lapply(Text, function(review) {
  #Extract Sentences
  tokens <- unlist(strsplit(review, split = " "))

  lapply(tokens, function(token) {
    # vapply() guarantees one logical per pattern, whatever the list size.
    any(vapply(Boosters.Clean, grepl, logical(1), token))
  })
})

5. Extract Sentence ID (Part1)

In order to retrieve the sentence, the two-dimensional list structure containing the logical elements was used to determine the “location” of the booster in the text. Both the Review ID and the Sentence ID number were saved.

First it was determined which sentences contained a Booster:

# Collapse the per-word flags into one logical per review: TRUE when at least
# one word of the review matched a booster pattern.
# Iterates over `Reviews` itself instead of a fixed 1:4735, so the result
# always has exactly one entry per review.
Booster.Final <- lapply(Reviews, function(word_flags) {
  any(unlist(word_flags))
})

6. Extract Review and Sentence Number (Part2)

Second, both the Review and the Sentence ID were saved in the “Index.List” structure.

# Integer positions of the reviews flagged TRUE in `Booster.Final`.
# isTRUE() treats NA flags as "no booster", as the `== TRUE` comparison did.
# The former as.list()/unlist() round trip is dropped: it changed nothing for
# a non-empty result and turned an empty one into NULL instead of integer(0).
Booster.Index.List <- which(vapply(Booster.Final, isTRUE, logical(1)))

7. Extract Booster Reviews

Finally, the index list was used to retrieve reviews containing boosters. Those reviews were stored in the structure “Booster.Sentences”.

# Reviews flagged as containing at least one booster.
Extract <- Text[Booster.Index.List]

# Size the frame from the data instead of a fixed 516 rows, so the column
# assignment below cannot fail when the number of matching reviews changes.
df <- data.frame(matrix(NA, ncol = 1, nrow = length(Extract)))
# NOTE(review): data.frame() derives the placeholder column's header from the
# text of the call, so it is pinned here to the header the fixed-size call
# produced; the exported sheet keeps its layout. TODO confirm whether anything
# downstream reads this all-NA column - if not, it can be dropped.
names(df) <- "matrix.NA..ncol...1..nrow...516."
df$Boost.Text <- Extract

Finally, the reviews containing boosters are exported as an Excel file for further processing.

# Write the booster reviews to a workbook in the current working directory.
WriteXLS::WriteXLS(
  x = df,
  ExcelFileName = "2. Booster Fragments.xlsx"
)

# One-column frame holding the positions of the booster reviews in `Text`.
df.Vec <- data.frame(Booster.Index.List)

Those indices are exported as an Excel file.

# Write the review positions to a second workbook in the working directory.
WriteXLS::WriteXLS(
  x = df.Vec,
  ExcelFileName = "2. Booster Sentence Indicators.xlsx"
)

Detecting Boosters

Lisa

7/15/2018