In this file, the text file that was combined from three source files is transformed into N-grams (N = 2, 3 and 4). These N-grams are used in a prediction algorithm.
In this part, the previously cleaned Totaltext data set is read.
## Directory that holds the en_US corpus files
URLfile <- "C:/Users/menno_000/Documents/R/Course data analytics/Capstone/final/en_US/"
## Reading the combined text file, one element of Totaltext per line
con <- file(description = paste0(URLfile, "en_US.combined.txt"), open = "r")
Totaltext <- readLines(con = con, skipNul = TRUE, encoding = "UTF-8")
## Release the connection and drop the handle from the workspace
close(con)
rm(con)
## Normalise raw text and wrap it in a tm corpus.
##
## inputfile: character vector of text, one document per element.
## Returns:   a SimpleCorpus whose documents are ASCII-only, lower case,
##            without punctuation or digits, and with runs of whitespace
##            collapsed to a single space.
cleaningData <- function(inputfile) {
  ## Drop everything that cannot be represented in ASCII (sub = "" removes
  ## the non-convertible bytes instead of turning the element into NA).
  ascii_text <- iconv(inputfile, "latin1", "ASCII", sub = "")
  corpus <- SimpleCorpus(VectorSource(ascii_text), control = list(language = "en"))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  ## Whitespace is stripped last so the gaps left by the removals above
  ## are collapsed as well.
  corpus <- tm_map(corpus, stripWhitespace)
  ## Return the corpus explicitly rather than relying on the (invisible)
  ## value of a trailing assignment.
  corpus
}
## Clean the raw lines, then re-wrap the cleaned corpus as a VCorpus, which is
## the corpus type the n-gram steps below work on.
Totaltext <- cleaningData(Totaltext)
textdata <- VCorpus(VectorSource(Totaltext))
## The intermediate corpus is no longer needed once textdata exists
rm(Totaltext)
## Tokenizers that emit terms of exactly n consecutive words (n = 2, 3, 4)
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
FourgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))

## Count how often every n-gram occurs in the whole corpus.
##
## corpus:    the corpus to tokenize. The data was converted to a VCorpus
##            because the Weka tokenizer did not work on the original corpus
##            type in the author's setup.
##            NOTE(review): tm appears to ignore custom tokenizers for
##            SimpleCorpus objects - confirm against the tm documentation.
## tokenizer: function that turns one document into its n-gram terms.
## Returns:   a data frame with the columns "word" (the n-gram) and "freq"
##            (its total count over all documents), sorted so that the most
##            frequent n-grams come first.
ngram_frequencies <- function(corpus, tokenizer) {
  ## Build the document term matrix and sum the per-document counts per term
  counts <- aggregate(
    count ~ term,
    data = tidy(DocumentTermMatrix(corpus, control = list(tokenize = tokenizer))),
    FUN = sum
  )
  ## The clusters are sorted so the clusters with the highest frequency can be extracted
  counts <- counts[order(as.numeric(counts$count), decreasing = TRUE), ]
  names(counts) <- c("word", "freq")
  counts
}

## Saving the 2 word clusters (the table is not kept in memory)
saveRDS(ngram_frequencies(textdata, BigramTokenizer),
        "C:/Users/menno_000/Documents/R/Course data analytics/Capstone/final/en_US/en_US.dtm2.rds")
## Saving the 3 word clusters (the table is not kept in memory)
saveRDS(ngram_frequencies(textdata, TrigramTokenizer),
        "C:/Users/menno_000/Documents/R/Course data analytics/Capstone/final/en_US/en_US.dtm3.rds")
## Saving the 4 word clusters; dtm4 stays in the workspace afterwards
dtm4 <- ngram_frequencies(textdata, FourgramTokenizer)
saveRDS(dtm4, "C:/Users/menno_000/Documents/R/Course data analytics/Capstone/final/en_US/en_US.dtm4.rds")
## The corpus is not needed once all three tables are on disk
rm(textdata)
## Reload the n-gram frequency tables, each one from its own file.
## (dtm2 and dtm3 used to be read from the dtm4 file, which gave all three
## objects the 4 word clusters.)
dtm2 <- readRDS("C:/Users/menno_000/Documents/R/Course data analytics/Capstone/final/en_US/en_US.dtm2.rds")
dtm3 <- readRDS("C:/Users/menno_000/Documents/R/Course data analytics/Capstone/final/en_US/en_US.dtm3.rds")
dtm4 <- readRDS("C:/Users/menno_000/Documents/R/Course data analytics/Capstone/final/en_US/en_US.dtm4.rds")
## Turn an n-gram frequency table into a lookup table for prediction.
##
## ngrams: data frame with the columns "word" (n words separated by a single
##         space) and "freq" (numeric count).
## n:      number of words per n-gram.
## Returns: a data frame with the character columns word1 .. word<n-1> (the
##          predictors) and "result" (the predicted word). It holds one row
##          per distinct predictor combination, namely the most frequent
##          continuation, and only rows in which every word passes the
##          hunspell spell check.
build_prediction_table <- function(ngrams, n) {
  predictors <- paste0("word", seq_len(n - 1))

  ## Split the n-gram into separate columns. str_split_fixed() always yields
  ## exactly n columns, so the column names below can never be misaligned.
  pieces <- str_split_fixed(ngrams$word, " ", n)
  model <- as.data.frame(pieces, stringsAsFactors = FALSE)
  names(model) <- c(predictors, "result")

  ## Keep the frequency numeric. Binding it to the character matrix would
  ## turn it into text, and negating text (or a factor) does not give a
  ## usable sort key.
  model$freq <- as.numeric(ngrams$freq)

  ## Sort on the predictors and, within equal predictors, on decreasing
  ## frequency. unname() keeps the column names from being matched to
  ## arguments of order().
  sort_keys <- c(unname(as.list(model[predictors])), list(-model$freq))
  model <- model[do.call(order, sort_keys), ]

  ## Collect the first (most frequent) row of each unique predictor set
  model <- model[!duplicated(model[predictors]), ]
  model <- model[c(predictors, "result")]

  ## Drop every row that contains a word unknown to the dictionary
  for (column in names(model)) {
    model <- model[hunspell_check(model[[column]]), ]
  }
  model
}

dtm2 <- build_prediction_table(dtm2, 2)
dtm3 <- build_prediction_table(dtm3, 3)
dtm4 <- build_prediction_table(dtm4, 4)

## Save the prediction tables
saveRDS(dtm2, "C:/Users/menno_000/Documents/R/Course data analytics/Capstone/final/en_US/en_US.dtm2.rds")
saveRDS(dtm3, "C:/Users/menno_000/Documents/R/Course data analytics/Capstone/final/en_US/en_US.dtm3.rds")
saveRDS(dtm4, "C:/Users/menno_000/Documents/R/Course data analytics/Capstone/final/en_US/en_US.dtm4.rds")
## Predict the word that follows a sentence by backing off from the 4 word
## clusters to the 3 word clusters to the 2 word clusters.
zin <- "i think well"
## Clean the sentence with cleaningData() and put every word in its own
## column. trimws() removes the single leading/trailing space that whitespace
## stripping can leave behind; otherwise the last "word" would be empty.
zinschoon <- as.data.frame(cbind(str_split(trimws(unlist(cleaningData(zin))[1]), " ", simplify = TRUE)))
aantal <- ncol(zinschoon)

## No match yet; empty() treats NULL as empty
resultaat <- NULL
## Last three words against the 4 word clusters
if (aantal > 2) {
  resultaat <- filter(dtm4, as.character(dtm4$word1) == as.character(zinschoon[1, aantal - 2]) &
                        as.character(dtm4$word2) == as.character(zinschoon[1, aantal - 1]) &
                        as.character(dtm4$word3) == as.character(zinschoon[1, aantal]))
}
## Last two words against the 3 word clusters
if (aantal >= 2 && empty(resultaat)) {
  resultaat <- filter(dtm3, as.character(dtm3$word1) == as.character(zinschoon[1, aantal - 1]) &
                        as.character(dtm3$word2) == as.character(zinschoon[1, aantal]))
}
## Last word against the 2 word clusters
if (empty(resultaat)) {
  resultaat <- filter(dtm2, as.character(dtm2$word1) == as.character(zinschoon[1, aantal]))
}

if (empty(resultaat)) {
  antwoord <- "Sorry, I can't find the answer!"
} else {
  antwoord <- as.character(resultaat$result[1])
}
print(antwoord)
## [1] "be"