This documents the final code that samples and processes the text data as well as build the predictive model and test its accuracy.
It was created in i386-w64-mingw32, i386, mingw32, i386, mingw32, , 3, 3.3, 2017, 03, 06, 72310, R, R version 3.3.3 (2017-03-06), Another Canoe.
library(quanteda) #main processing engine
library(ngram) #to process the input strings
library(stringr) #to merge input data with ngrams
library(dplyr) #to manipulate data throughout the process
options(stringsAsFactors = FALSE)
the exploratory analysis revealed that the sampling the blog text would be the most efficient way to approach this problem. Sampling speeds up the process, and the blog text is the richer than the alternative news and twitter texts.
This code pulls a random sample from the text file. It splits the source text into chunks and then applies a 50/50 coin-flip routine to each chunk to see if it makes it to the final sample.
setwd()
extract <- function(filename, sample.size) {
i <- 1
mydata <- file(filename, "r")
dataR <- readLines(mydata)
chunksize <- as.integer(2*(length(dataR)/(sample.size)))
chunknumber <- as.integer(sample.size/chunksize)
sample.text <- as.data.frame(NULL)
close(mydata)
gc()
while(i < chunknumber) {
sample.text <- sample.text
pull <- dataR[(i*chunksize):((i+1)*chunksize)]
pull <- as.data.frame(pull)
set.seed(1234)
if(rbinom(n=1, size=1, prob=.5)<1) {
sample.text <- rbind(sample.text, pull)
} else {
sample.text <- sample.text
}
i <- i + 1
}
return(sample.text)
}
blog.sample <- extract("en_US.blogs.txt", 10000)
blog.sample$pull <- as.character(blog.sample$pull)
saveRDS(blog.sample, "blog.rds")
Next the code faces a Naughty Word filter. Since I could only think of about 12 bad words I took advantage of the 700+ list available at this web site: #https://www.frontgatemedia.com/a-list-of-723-bad-words-to-blacklist-and-how-to-use-facebooks-moderation-tool/ Note - at first I avoided reading this file because I did not want to know more than 12 bad words. However, some of the words here really aren’t so bad, like beer, so do be sure to read any file you use and edit appropriately.
naughty <- read.csv("Terms-to-Block.csv",
header=FALSE,
stringsAsFactors = FALSE)
naughty <- as.vector(naughty$V1)
naughty <- gsub(",", "", naughty)
Now all the pieces are in place to create and process the corpus. Quanteda is a powerful tool that quickly cleans, transforms and summarizes text data into training and test samples. The split will be 70:30 train:test.
qcorp <- corpus(as.vector(blog.sample), text_field = "pull")
set.seed(1234)
splitter <- c(1:ndoc(qcorp))
splittrn <- sample(splitter, .7*ndoc(qcorp), replace=FALSE)
splittst <- splitter[-splittrn]
train <- qcorp[splittrn]
test <- qcorp[splittst]
cleanTNT <- function(qcorp){
toks <- tokens(qcorp,
removeSymbols=TRUE,
removeNumbers=TRUE,
removeHyphens=TRUE)
toks <- removeFeatures(toks, naughty)
}
train <- cleanTNT(train)
test <- cleanTNT(test)
saveRDS(train, file="train.Rda")
saveRDS(test, file="test.Rda")
con <- file("train.Rda")
train <- readRDS(con)
close(con)
con <- file("test.Rda")
test <- readRDS(con)
close(con)
dfm.trn <- dfm(train)
dfm.tst <- dfm(test)
type.trn <- ntype(dfm.trn)
type.tst <- ntype(dfm.tst)
xuni <- dfm_weight(dfm.trn, type="frequency")
xbi <- collocations(train, size=2)
xtri <- collocations(train, size=3)
Saving the ngram files enables the backoff model to run unecumbered by the large corpus files. This model begins searching trigrams for a user typed term. If that search is unproductive it backs off to a search among bigrams. If that fails it moves to the unigrams and selects the alternatives with the highest frequency. In each case a maximum of 3 words are returned as choices for the user.
prediction <- function(unigram, bigram, trigram, typing){
typingn <- ntoken(typing)
unigram <- tbl_df(unigram)
unigram <- unigram %>% arrange(desc(relfreq)) %>% select(names)
unigram <- as.character(unigram[1,1])
bigram <- bigram
trigram <- trigram
if(typingn>1) {
lastwords <- word(typing, -2:-1)
trigram$match1 <- stringr::str_detect(trigram$word1, lastwords[1])
trigram2 <- trigram %>% filter(match1=="TRUE")
n1 <- nrow(trigram2)
if(n1>0) {
trigram2$match2 <- stringr::str_detect(trigram2$word2,lastwords[2])
trigram3 <- trigram2 %>% filter(match2=="TRUE")
n2 <- nrow(trigram3)
if(n2 >0) {
trigram4 <- trigram3 %>% arrange(desc(relfreq))
trigram5 <- trigram4 %>% select(word3)
trigram5[1,1]
} else {
trigram6 <- trigram %>% arrange(desc(relfreq))
trigram7 <- trigram6 %>% select(word3)
trigram7[1,1]
}
}
} else {
final <- word(typing, -1)
bigram$match1 <- stringr::str_detect(bigram$word1, final[1])
bigram2 <- bigram %>% filter(match1=="TRUE")
n3 <- nrow(bigram2)
if(n3>0) {
bigram3 <- bigram2 %>% arrange(desc(relfreq))
bigram4 <- bigram3 %>% select(word2)
bigram4[1,1]
} else
unigram
}
}
The final step is to assess the accuracy of the model using the test data. Beware - this takes a LOT of time to run - about 12 hours total.
ytri <- collocations(test, size=3)
x <- paste(ytri$word1, ytri$word2)
saveRDS(ytri, file="ytri.Rda")
con <- file("ytri.Rda")
ytri <- readRDS(con)
close(con)
ytri$match <- ytri$new==ytri$word3
ytrisum <- ytri %>% filter(match==TRUE)
answer <- nrow(ytrisum)/nrow(ytri)
ytricheck <- ytri %>% arrange(desc(count))
ytricheck <- ytricheck %>% select(-match) %>% select(-new2)
ytricheck.short <- ytricheck[1:104500,]
chunkn <- as.integer(length(x)/500)
check1 <- matrix(NA, nrow=500, ncol=209)
holder <- sapply(check1[,1:209], prediction, unigram=ug, bigram=bg, trigram=tg)
saveRDS(holder, file="testresults")
#create trigram predictionsa and accuracy rate
ytricheck.short$new <- holder
ytricheck.short$match <- as.character(ytricheck.short$new)==ytricheck.short$word3
ytricheck.short.sum <- ytricheck.short %>% filter(match==TRUE)
accuracy3gram <- nrow(ytricheck.short.sum)/nrow(ytricheck.short)
sum(ntoken(test))
#create bigram predictions and accuracy rate
ybi <- collocations(test, size=2)
x2 <- ybi$word1
holder2 <- sapply(x2, prediction, unigram=ug, bigram=bg, trigram=tg)
saveRDS(holder2, file="testresults2")
ybi$new <- holder2
ybi$match <- as.character(ybi$new)==ybi$word2
ybi.sum <- ybi %>% filter(match==TRUE)
accuracy2grams <- nrow(ybi.sum)/nrow(ybi)
topfeatures(dfm.tst)
ntoken(test)
ntype(test)
ntype(ybi$word2)
sum(ntoken(train))
#develop a random predictor alternative to see success rates
rp1 <- as.vector(colnames(dfm.trn))
rp2 <- sample(rp1, 104500, replace=TRUE)
ytricheck.short.rp3 <- cbind(ytricheck.short, rp2)
ytricheck.short.rp3$rp2match <- ytricheck.short.rp3$word3==ytricheck.short.rp3$rp2
ytricheck.short.rp3.sum <- ytricheck.short.rp3 %>% filter(rp2match==TRUE)
answerrp3 <- nrow(ytricheck.short.rp3.sum)/(nrow(ytricheck.short))
rp2b <- sample(rp1, 75151, replace=TRUE)
ybi.rp3 <- cbind(ybi, rp2b)
ybi.rp3$rp2bmatch <- ybi.rp3$word2==ybi.rp3$rp2b
ybi.rp3.sum <- ybi.rp3 %>% filter(rp2bmatch==TRUE)
answerrp3b <- nrow(ybi.rp3.sum)/nrow(ybi.rp3)
#calculate p values for model results vs. random
ppois(answercheck, answerrp3, lower.tail=FALSE) # returns 2.i7 e-05
ppois(answercheck2, answerrp3b, lower.tail=FALSE) # returns 2.66 e-05