Using three source files from Twitter, blogs, and news articles, build a text predictor that takes a phrase and returns a prediction for the next word.
Here is a link to the final app…
https://zburch.shinyapps.io/FinalProject
Here is a link to the server/ui code…
https://github.com/zburch/NLPFinalProject
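At a high level, the app applies the same cleaning used on the training data to the user's phrase, keeps its last one to three words, and passes them to the back-off model built below. The helper shown here is a hypothetical sketch of that glue code (the actual server logic is in the linked repository); it assumes the model function and pruned n-gram tables defined later in this document are already loaded.
#Hypothetical sketch: reduce a phrase to its last 1-3 words and query the
#back-off model defined later in this document (assumes model() and the
#n-gram tables are already loaded)
predictNextWord <- function(phrase){
#Mirror the global cleaning applied to the training text
phrase <- tolower(trimws(gsub("\\s+"," ",phrase)))
phrase <- gsub("[^a-z0-9# ]","",phrase)
words <- strsplit(phrase," ")[[1]]
k <- length(words)
if (k >= 3){
model(words[k-2],words[k-1],words[k])
} else if (k == 2){
model(middle=words[1],newest=words[2])
} else {
model(newest=words[1])
}
}
#Example: predictNextWord("Thanks for the") returns up to 3 candidate next words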
Download files
#Main file for text mining
download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip","corpus.zip")
zipPath <- paste(getwd(),"corpus.zip",sep="/")
unzip(zipPath)
#List of profane words
download.file("https://www.frontgatemedia.com/new/wp-content/uploads/2014/03/Terms-to-Block.csv","profanity.csv")
Read in a portion of the data and save to file
fileList <- c("en_US/en_US.twitter.txt",
"en_US/en_US.blogs.txt",
"en_US/en_US.news.txt")
df <- NULL
set.seed(123)
for (i in 1:length(fileList)){
con <- file(paste("final",fileList[i],sep="/"), "r")
text <- NULL
while(TRUE){
#Grab 5 lines at a time
newSet <- readLines(con,5)
#Stop at end of file
if(length(newSet)==0){
break
}
#Keep approx. 15% of the 5-line chunks
if(rbinom(1,1,prob=.15)){
text <- c(text,newSet)
}
}
close(con)
#Label each line with its source (twitter, blogs, or news),
#extracted from the middle piece of the filename
fileName <- rep(strsplit(fileList[i],
split=".",
fixed=TRUE)[[1]][2],
length(text))
newDF <- data.frame(fileName,text)
df <- rbind(df,newDF)
}
df$fileName <- as.factor(df$fileName)
write.csv(df,"subset.csv")
Perform basic global cleaning
df <- read.csv("subset.csv",row.names=NULL)
df <- df[,2:3]
rownames(df) <- c()
#Lowercase
df$text <- tolower(df$text)
#Strip extra whitespace
df$text <- gsub("\\s+"," ",df$text)
#Trim whitespace
df$text <- trimws(df$text)
df$text <- gsub("@","at",df$text,fixed=TRUE)
df$text <- gsub("&","and",df$text,fixed=TRUE)
#remove punctuation
df$text <- gsub("[^a-z0-9# ]","",df$text)
#Split into training/test sets: hold out 2,000 randomly chosen lines for testing
split <- rep(0,length(df$text))
split[1:2000] <- 1
split <- sample(split)
training <- df[!split>0,]
test <- df[split>0,]
write.csv(test,"testing.csv",row.names=FALSE)
Build ngrams (uni-, bi-, tri-, and quad-)
library(dplyr)
library(tidytext)
#Get unigrams of all data
unigrams <- training %>%
unnest_tokens(unigram,text,token="ngrams",n=1)
#Get bigrams of all data
bigrams <- training %>%
unnest_tokens(bigram,text,token="ngrams",n=2)
#Get trigrams of all data
trigrams <- training %>%
unnest_tokens(trigram,text,token="ngrams",n=3)
#Get quadgrams of all data
quadgrams <- training %>%
unnest_tokens(quadgram,text,token="ngrams",n=4)
Clean ngrams
#Remove any ngram containing non-alpha or space characters
unigrams <- unigrams[grepl("^[a-z]*$",unigrams$unigram),]
bigrams <- bigrams[grepl("^[a-z]* [a-z]*$",
bigrams$bigram),]
trigrams <- trigrams[grepl("^[a-z]* [a-z]* [a-z]*$",
trigrams$trigram),]
quadgrams <- quadgrams[grepl("^[a-z]* [a-z]* [a-z]* [a-z]*$",
quadgrams$quadgram),]
#Remove retweets
unigrams <- unigrams[!grepl("^rt$",unigrams$unigram),]
bigrams <- bigrams[!grepl("(^rt | rt$)",bigrams$bigram),]
trigrams <- trigrams[!grepl("(^rt | rt | rt$)",trigrams$trigram),]
quadgrams <- quadgrams[!grepl("(^rt | rt | rt$)",quadgrams$quadgram),]
#Bring in the profanity word list
badwords <- read.csv("profanity.csv")
#Get rid of extra header rows
badwords <- badwords[4:length(badwords[,1]),2]
#Strip whitespace
badwords <- gsub("\\s+"," ",badwords)
#Trim whitespace
badwords <- trimws(badwords)
#Lowercase
badwords <- tolower(badwords)
#Remove commas
badwords <- gsub(",","",badwords,fixed=TRUE)
#Remove any non-alpha or space characters
badwords <- badwords[grepl("^[a-z]*$",badwords)]
#Build a word-boundary-anchored pattern and remove profanity rows from the ngrams
profPattern <- paste0("\\<(",paste(badwords,collapse="|"),")\\>")
unigrams <- unigrams[!grepl(profPattern,unigrams$unigram),]
bigrams <- bigrams[!grepl(profPattern,bigrams$bigram),]
trigrams <- trigrams[!grepl(profPattern,trigrams$trigram),]
quadgrams <- quadgrams[!grepl(profPattern,quadgrams$quadgram),]
#Save files
write.csv(unigrams,"unigrams.csv",row.names=FALSE)
write.csv(bigrams,"bigrams.csv",row.names=FALSE)
write.csv(trigrams,"trigrams.csv",row.names=FALSE)
write.csv(quadgrams,"quadgrams.csv",row.names=FALSE)
Basic exploratory analysis
library(dplyr)
library(tidytext)
uniCt <- unigrams %>%
count(unigram,sort=TRUE) %>%
mutate(rsum = cumsum(n))
uniCt <- as.data.frame(uniCt)
head(uniCt)
class(uniCt)
#How many unique words needed to cover 50% of all words?
fiftyPct <- sum(uniCt$n)*.5
fiftyPctUnigrams <- uniCt[uniCt$rsum<=fiftyPct,]
length(fiftyPctUnigrams[,1])
#90%?
ninetyPct <- sum(uniCt$n)*.9
ninetyPctUnigrams <- uniCt[uniCt$rsum<=ninetyPct,]
length(ninetyPctUnigrams[,1])
Summarize ngrams
library(dplyr)
library(tidyr)
bigram <- read.csv("bigrams.csv")
trigram <- read.csv("trigrams.csv")
quadgram <- read.csv("quadgrams.csv")
#1) Remove any n-gram that occurs only once, as it is unlikely to
# generalize well and slows down the model.
#2) Order ngrams by frequency.
#3) For each (n-1)-gram prefix, keep only the numResults most
# frequent completions.
#Number of candidate predictions kept per prefix (the app returns 3 predictions)
numResults <- 3
bigram <- bigram %>% group_by(bigram) %>%
tally() %>% filter(n>1) %>%
arrange(desc(n)) %>%
separate(bigram, c('first','second'), sep=" ") %>%
group_by(first) %>% mutate(count = row_number()) %>%
filter(count<=numResults) %>% select(first,second,n)
trigram <- trigram %>% group_by(trigram) %>%
tally() %>% filter(n>1) %>%
arrange(desc(n)) %>%
separate(trigram, c('first','second','third'), sep=" ") %>%
group_by(first,second) %>% mutate(count = row_number()) %>%
filter(count<=numResults) %>% select(first,second,third,n)
quadgram <- quadgram %>% group_by(quadgram) %>%
tally() %>% filter(n>1) %>%
arrange(desc(n)) %>%
separate(quadgram, c('first','second','third','fourth'), sep=" ") %>%
group_by(first,second,third) %>% mutate(count = row_number()) %>%
filter(count<=numResults) %>% select(first,second,third,fourth,n)
write.csv(bigram,"biSummary.csv",row.names=FALSE)
write.csv(trigram,"triSummary.csv",row.names=FALSE)
write.csv(quadgram,"quadSummary.csv",row.names=FALSE)
Read in data
bigrams <- read.csv("biSummary.csv",row.names=NULL)
trigrams <- read.csv("triSummary.csv",row.names=NULL)
quadgrams <- read.csv("quadSummary.csv",row.names=NULL)
Create default values for unseen n-grams
library(dplyr)
#Count how often each word appears as the second word of a bigram
def <- bigrams %>% group_by(second) %>%
tally() %>% arrange(desc(n))
#Keep the 10 most common words as fallback predictions for unseen contexts
def <- def$second[1:10]
write.csv(def,"defaults.csv",row.names=FALSE)
Build Model
def <- read.csv("defaults.csv",row.names=NULL)
model <- function(oldest=NULL,middle=NULL,newest){
preds <- NULL
head(def)
if (is.null(oldest) & is.null(middle)){
preds <- bigrams[bigrams$first==newest,2]
if(length(preds)<numResults){
preds <- unique(c(as.character(preds),as.character(def)))
}
preds <- preds[1:numResults]
} else if(is.null(oldest)){
preds <- trigrams[trigrams$first==middle & trigrams$second==newest,3]
if (length(preds)<numResults){
biPreds <- bigrams[bigrams$first==newest,2]
preds <- unique(c(as.character(preds),as.character(biPreds)))
}
if (length(preds)<numResults){
preds <- unique(c(as.character(preds),as.character(def)))
}
preds <- preds[1:numResults]
} else {
preds <- quadgrams[quadgrams$first==oldest & quadgrams$second==middle &
quadgrams$third==newest,4]
if (length(preds)<numResults){
triPreds <- trigrams[trigrams$first==middle & trigrams$second==newest,3]
preds <- unique(c(as.character(preds),as.character(triPreds)))
}
if (length(preds)<numResults){
biPreds <- bigrams[bigrams$first==newest,2]
preds <- unique(c(as.character(preds),as.character(biPreds)))
}
if (length(preds)<numResults){
preds <- unique(c(as.character(preds),as.character(def)))
}
preds <- preds[1:numResults]
}
return(preds)
}
Build Test quadgrams
library(dplyr)
library(tidytext)
test <- read.csv("testing.csv")
#Get quadgrams of all data
quadgramsTest <- test %>%
unnest_tokens(quadgram,text,token="ngrams",n=4)
Clean Test Quadgrams
#Remove any ngram containing non-alpha or space characters
quadgramsTest <- quadgramsTest[grepl("^[a-z]* [a-z]* [a-z]* [a-z]*$",
quadgramsTest$quadgram),]
#Remove retweets
quadgramsTest <- quadgramsTest[!grepl("(^rt | rt | rt$)",quadgramsTest$quadgram),]
#Bring in the profanity word list
badwords <- read.csv("profanity.csv")
#Get rid of extra header rows
badwords <- badwords[4:length(badwords[,1]),2]
#Strip whitespace
badwords <- gsub("\\s+"," ",badwords)
#Trim whitespace
badwords <- trimws(badwords)
#Lowercase
badwords <- tolower(badwords)
#Remove commas
badwords <- gsub(",","",badwords,fixed=TRUE)
#Remove any non-alpha or space characters
badwords <- badwords[grepl("^[a-z]*$",badwords)]
#Build a word-boundary-anchored pattern and remove profanity rows from the ngrams
profPattern <- paste0("\\<(",paste(badwords,collapse="|"),")\\>")
quadgramsTest <- quadgramsTest[!grepl(profPattern,quadgramsTest$quadgram),]
#Save files
write.csv(quadgramsTest,"quadgramsTest.csv",row.names=FALSE)
Summarize Test ngrams
library(dplyr)
library(tidyr)
quadgramTest <- read.csv("quadgramsTest.csv")
quadLinesTest <- length(quadgramTest$quadgram)
quadgramTest <- quadgramTest %>% group_by(quadgram) %>%
tally() %>% arrange(desc(n)) %>%
separate(quadgram, c('first','second','third','fourth'), sep=" ")
write.csv(quadgramTest,"quadSummaryTest.csv",row.names=FALSE)
Run test through model
file <- read.csv("quadSummaryTest.csv")
test <- file[,1:3]
truth <- file[,4]
results <- NULL
for(i in 1:length(test$first)){
results <- rbind(results,model(as.character(test$first[i]),
as.character(test$second[i]),
as.character(test$third[i])))
}
validate <- NULL
for(i in 1:length(truth)){
validate <- c(validate,truth[i] %in% results[i,])
}
correct <- sum(validate)
testCases <- length(validate)
pct <- round(sum(validate)/length(validate),4)*100
In testing quadgram performance, the correct word appeared among the model's 3 predictions in 8,111 of 37,235 test quadgrams, an accuracy of 21.78%.
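As a quick sanity check, the reported accuracy follows directly from those counts:
#Accuracy implied by the reported counts
round(8111/37235,4)*100  #21.78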