Overview
For this project, I will build a text prediction Shiny app: the user types a few words into a text box, and the app predicts the most likely next word using a prediction algorithm trained on 899,288 samples of blog text, 1,010,242 samples of news text, and 2,360,148 tweets, as counted below.
invisible(notifyMe("Overview"))
# Set path to text samples.
wd <- getwd()
path <- paste0(wd,"/final/en_US")
# Read in samples of blog text and count samples.
f <- file(paste0(path,"/en_US.blogs.txt"), open="rb")
nlinesBlogs <- 0L
while (length(chunk <- readBin(f, "raw", 65536)) > 0) {
  nlinesBlogs <- nlinesBlogs + sum(chunk == as.raw(10L))
}
close(f)
# Read in samples of news text and count samples.
f <- file(paste0(path,"/en_US.news.txt"), open="rb")
nlinesNews <- 0L
while (length(chunk <- readBin(f, "raw", 65536)) > 0) {
  nlinesNews <- nlinesNews + sum(chunk == as.raw(10L))
}
close(f)
# Read in samples of Twitter text and count samples.
f <- file(paste0(path,"/en_US.twitter.txt"), open="rb")
nlinesTwitter <- 0L
while (length(chunk <- readBin(f, "raw", 65536)) > 0) {
  nlinesTwitter <- nlinesTwitter + sum(chunk == as.raw(10L))
}
close(f)
# Print out results.
print(paste(format(nlinesBlogs, big.mark = ',') , "Blog Samples"))
print(paste(format(nlinesNews, big.mark = ','), "News Samples"))
print(paste(format(nlinesTwitter, big.mark = ','), "Twitter Samples"))
[1] "899,288 Blog Samples"
[1] "1,010,242 News Samples"
[1] "2,360,148 Twitter Samples"
Reading in and Cleaning the Data
These data sets are very large; just reading in each file takes a surprisingly long time. After reading in the data, I cleaned each text sample by converting it to lowercase and removing all non-letter characters except apostrophes, which are kept because they mark contractions that are part of everyday speech.
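To make the cleaning concrete, here is a small sketch of the same base-R steps applied to a made-up sample line (the sample text is hypothetical; qdap::clean() is additionally used below to tidy the remaining whitespace):
sampleLine <- "Don't worry -- it's only 99% done!  "
sampleLine <- gsub("[^A-Za-z ']+", "", sampleLine, fixed = FALSE)  # drop non-letters except apostrophes
sampleLine <- tolower(sampleLine)                                  # lowercase
sampleLine <- trimws(sampleLine)                                   # trim leading/trailing white space
sampleLine                                                         # expected: "don't worry  it's only  done"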
require(qdap)
invisible(notifyMe("Reading In Blogs"))
## Read in text examples from blogs.
wd <- getwd()
path <- paste0(wd,"/final/en_US")
blogsText <- readLines(paste0(path,"/en_US.blogs.txt"), encoding="UTF-16", n = -1)
# Clean text by turning all text to lowercase, removing non-letters (except for apostrophes) and cleaning up extra white space.
blogsText <- sapply(blogsText, function(x) {gsub("[^A-Za-z ']+", "", x, fixed = FALSE)})
blogsText <- tolower(blogsText)
blogsText <- trimws(blogsText)
blogsText <- clean(blogsText)
names(blogsText) <- NULL
#########################################################
invisible(notifyMe("Reading In News"))
## Read in text examples from news.
wd <- getwd()
path <- paste0(wd,"/final/en_US")
newsText <- readLines(paste0(path,"/en_US.news.txt"), encoding="UTF-16", n = -1)
# Clean text by turning all text to lowercase, removing non-letters (except for apostrophes) and cleaning up extra white space.
newsText <- sapply(newsText, function(x) {gsub("[^A-Za-z ']+", "", x, fixed = FALSE)})
newsText <- tolower(newsText)
newsText <- trimws(newsText)
newsText <- clean(newsText)
names(newsText) <- NULL
#########################################################
invisible(notifyMe("Reading In Twitter"))
## Read in text examples from Twitter.
wd <- getwd()
path <- paste0(wd,"/final/en_US")
twitterText <- readLines(paste0(path,"/en_US.twitter.txt"), encoding="UTF-16", n = -1)
# Clean text by turning all text to lowercase, removing non-letters (except for apostrophes) and cleaning up extra white space.
twitterText <- sapply(twitterText, function(x) {gsub("[^A-Za-z ']+", "", x, fixed = FALSE)})
twitterText <- tolower(twitterText)
twitterText <- trimws(twitterText)
twitterText <- clean(twitterText)
names(twitterText) <- NULL
Parsing the Text
Next, I parsed each text sample into its unique 1-word, 2-word, and 3-word terms and counted the frequency of each term in every source; these counts will be used to develop the prediction algorithm. For the 2-word and 3-word terms I kept only those that appear in at least 0.1% of the samples (doc_proportion_min = 0.001). This cuts the computation time immensely and should not hurt the algorithm, because terms this infrequent carry too little evidence to be useful for prediction.
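To illustrate the pruning step, the short sketch below (assuming the same text2vec version used in this report) builds a digram vocabulary for a hypothetical two-line corpus and drops any digram that appears in fewer than half of the documents:
require(text2vec)
toyText <- c("the cat sat on the mat", "the dog sat on the rug")
toyIt <- itoken(toyText, progressbar = FALSE)
toyDigramVocab <- create_vocabulary(toyIt, ngram = as.integer(c(2, 2)))
# Keep only digrams appearing in at least 50% of the toy documents,
# analogous to doc_proportion_min = 0.001 on the full samples below.
toyDigramVocab <- prune_vocabulary(toyDigramVocab, doc_proportion_min = 0.5)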
require(Matrix)
require(text2vec)
# Tokenize blogs text.
blogsTextIt <- itoken(blogsText, chunks_number = 10, progressbar = FALSE)
invisible(notifyMe("Generating Unigrams"))
# Create and prune vocabulary.
blogsUnigramVocab <- create_vocabulary(blogsTextIt, ngram=as.integer(c(1,1)))
# blogsUnigramVocab <- prune_vocabulary(blogsUnigramVocab, doc_proportion_min = 0.001)
# Create vocabulary vectorizer function.
blogsUnigramVocabVec <- vocab_vectorizer(blogsUnigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
# Generate corpus.
blogsUnigramCorpus <- create_corpus(blogsTextIt, vectorizer = blogsUnigramVocabVec)
# Generate document term matrix.
blogsUnigramDTM <- get_dtm(blogsUnigramCorpus)
################################# And likewise...
newsTextIt <- itoken(newsText, chunks_number = 10, progressbar = FALSE)
newsUnigramVocab <- create_vocabulary(newsTextIt, ngram=as.integer(c(1,1)))
# newsUnigramVocab <- prune_vocabulary(newsUnigramVocab, doc_proportion_min = 0.001)
newsUnigramVocabVec <- vocab_vectorizer(newsUnigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
newsUnigramCorpus <- create_corpus(newsTextIt, vectorizer = newsUnigramVocabVec)
newsUnigramDTM <- get_dtm(newsUnigramCorpus)
################################# And likewise...
twitterTextIt <- itoken(twitterText, chunks_number = 10, progressbar = FALSE)
twitterUnigramVocab <- create_vocabulary(twitterTextIt, ngram=as.integer(c(1,1)))
# twitterUnigramVocab <- prune_vocabulary(twitterUnigramVocab, doc_proportion_min = 0.001)
twitterUnigramVocabVec <- vocab_vectorizer(twitterUnigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
twitterUnigramCorpus <- create_corpus(twitterTextIt, vectorizer = twitterUnigramVocabVec)
twitterUnigramDTM <- get_dtm(twitterUnigramCorpus)
################################# Generating Digrams...
invisible(notifyMe("Generating Digrams"))
blogsDigramVocab <- create_vocabulary(blogsTextIt, ngram=as.integer(c(2,2)))
blogsDigramVocab <- prune_vocabulary(blogsDigramVocab, doc_proportion_min = 0.001)
blogsDigramVocabVec <- vocab_vectorizer(blogsDigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
blogsDigramCorpus <- create_corpus(blogsTextIt, vectorizer = blogsDigramVocabVec)
blogsDigramDTM <- get_dtm(blogsDigramCorpus)
################################# And likewise...
newsDigramVocab <- create_vocabulary(newsTextIt, ngram=as.integer(c(2,2)))
newsDigramVocab <- prune_vocabulary(newsDigramVocab, doc_proportion_min = 0.001)
newsDigramVocabVec <- vocab_vectorizer(newsDigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
newsDigramCorpus <- create_corpus(newsTextIt, vectorizer = newsDigramVocabVec)
newsDigramDTM <- get_dtm(newsDigramCorpus)
################################# And likewise...
twitterDigramVocab <- create_vocabulary(twitterTextIt, ngram=as.integer(c(2,2)))
twitterDigramVocab <- prune_vocabulary(twitterDigramVocab, doc_proportion_min = 0.001)
twitterDigramVocabVec <- vocab_vectorizer(twitterDigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
twitterDigramCorpus <- create_corpus(twitterTextIt, vectorizer = twitterDigramVocabVec)
twitterDigramDTM <- get_dtm(twitterDigramCorpus)
################################# Generating Trigrams...
invisible(notifyMe("Generating Trigrams"))
blogsTrigramVocab <- create_vocabulary(blogsTextIt, ngram=as.integer(c(3,3)))
blogsTrigramVocab <- prune_vocabulary(blogsTrigramVocab, doc_proportion_min = 0.001)
blogsTrigramVocabVec <- vocab_vectorizer(blogsTrigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
blogsTrigramCorpus <- create_corpus(blogsTextIt, vectorizer = blogsTrigramVocabVec)
blogsTrigramDTM <- get_dtm(blogsTrigramCorpus)
################################# And likewise...
newsTrigramVocab <- create_vocabulary(newsTextIt, ngram=as.integer(c(3,3)))
newsTrigramVocab <- prune_vocabulary(newsTrigramVocab, doc_proportion_min = 0.001)
newsTrigramVocabVec <- vocab_vectorizer(newsTrigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
newsTrigramCorpus <- create_corpus(newsTextIt, vectorizer = newsTrigramVocabVec)
newsTrigramDTM <- get_dtm(newsTrigramCorpus)
################################# And likewise...
twitterTrigramVocab <- create_vocabulary(twitterTextIt, ngram=as.integer(c(3,3)))
twitterTrigramVocab <- prune_vocabulary(twitterTrigramVocab, doc_proportion_min = 0.001)
twitterTrigramVocabVec <- vocab_vectorizer(twitterTrigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
twitterTrigramCorpus <- create_corpus(twitterTextIt, vectorizer = twitterTrigramVocabVec)
twitterTrigramDTM <- get_dtm(twitterTrigramCorpus)
Word Counts
The number of unique words (unigrams) found in each of the blogs, news, and Twitter sources is given below.
invisible(notifyMe("Word Counts"))
# Use DTMs to determine word-counts.
print(paste(length(colnames(blogsUnigramDTM)), "Blogs Unigrams"))
[1] "3245 Blogs Unigrams"
print(paste(length(colnames(newsUnigramDTM)), "News Unigrams"))
[1] "3358 News Unigrams"
print(paste(length(colnames(twitterUnigramDTM)), "Twitter Unigrams"))
[1] "1218 Twitter Unigrams"
Top 10 Term Counts
The top 10 terms for each text source and each term length (1, 2, or 3 words) are given below in table form.
invisible(notifyMe("Top Ten"))
# Create function to order top 10 terms for each data source and ngram.
topTerms <- function(dtmName, maxVals = 10){
  dtm <- eval(as.name(dtmName))
  ## Order terms and counts.
  counts <- colSums(dtm)
  counts <- counts[order(counts, decreasing = TRUE)]
  terms <- names(counts)
  ## Calculate the percentage of all term occurrences for each term.
  percents <- paste0(signif(100 * counts / sum(counts), 2), "%")
  orderedGrams <- data.frame(terms = terms, terms_percents = percents, terms_counts = counts)
  rownames(orderedGrams) <- NULL
  return(orderedGrams[1:maxVals, ])
}
# Use topTerms function to create tables of top terms for each data source and ngram.
termsDf <- list()
termsDf[[1]] <- topTerms("blogsUnigramDTM")
termsDf[[2]] <- topTerms("newsUnigramDTM")
termsDf[[3]] <- topTerms("twitterUnigramDTM")
termsDf[[4]] <- topTerms("blogsDigramDTM")
termsDf[[5]] <- topTerms("newsDigramDTM")
termsDf[[6]] <- topTerms("twitterDigramDTM")
termsDf[[7]] <- topTerms("blogsTrigramDTM")
termsDf[[8]] <- topTerms("newsTrigramDTM")
termsDf[[9]] <- topTerms("twitterTrigramDTM")
require(knitr)
for(i in 1:length(termsDf)){
  source <- c("Blogs", "News", "Twitter")[[(i-1) %% 3 + 1]]
  rownames(termsDf[[i]]) <- 1:10
  print(kable(termsDf[[i]], caption = source, row.names = TRUE))
}
Blogs (Unigrams)

|    | terms | terms_percents | terms_counts |
|----|-------|----------------|--------------|
| 1  | the   | 5%             | 1855244      |
| 2  | and   | 2.9%           | 1086085      |
| 3  | to    | 2.9%           | 1065625      |
| 4  | a     | 2.4%           | 896784       |
| 5  | of    | 2.4%           | 875010       |
| 6  | i     | 2.1%           | 769212       |
| 7  | in    | 1.6%           | 593546       |
| 8  | that  | 1.2%           | 459439       |
| 9  | is    | 1.2%           | 431769       |
| 10 | it    | 1.1%           | 400742       |

News (Unigrams)

|    | terms | terms_percents | terms_counts |
|----|-------|----------------|--------------|
| 1  | the   | 5.9%           | 151490       |
| 2  | to    | 2.7%           | 69348        |
| 3  | and   | 2.7%           | 68215        |
| 4  | a     | 2.6%           | 67166        |
| 5  | of    | 2.3%           | 59088        |
| 6  | in    | 2%             | 51458        |
| 7  | for   | 1.1%           | 27107        |
| 8  | that  | 1%             | 26339        |
| 9  | is    | 0.85%          | 21950        |
| 10 | on    | 0.8%           | 20570        |

Twitter (Unigrams)

|    | terms | terms_percents | terms_counts |
|----|-------|----------------|--------------|
| 1  | the   | 3.2%           | 933603       |
| 2  | to    | 2.7%           | 786579       |
| 3  | i     | 2.4%           | 713305       |
| 4  | a     | 2.1%           | 608495       |
| 5  | you   | 1.9%           | 543432       |
| 6  | and   | 1.5%           | 433667       |
| 7  | for   | 1.3%           | 384515       |
| 8  | in    | 1.3%           | 376910       |
| 9  | of    | 1.2%           | 358957       |
| 10 | is    | 1.2%           | 357462       |

Blogs (Digrams)

|    | terms   | terms_percents | terms_counts |
|----|---------|----------------|--------------|
| 1  | of_the  | 1.8%           | 187084       |
| 2  | in_the  | 1.5%           | 154044       |
| 3  | to_the  | 0.82%          | 85975        |
| 4  | on_the  | 0.71%          | 75211        |
| 5  | to_be   | 0.65%          | 68028        |
| 6  | and_the | 0.56%          | 58547        |
| 7  | for_the | 0.55%          | 58057        |
| 8  | i_was   | 0.47%          | 49344        |
| 9  | and_i   | 0.47%          | 49023        |
| 10 | i_have  | 0.45%          | 47802        |

News (Digrams)

|    | terms    | terms_percents | terms_counts |
|----|----------|----------------|--------------|
| 1  | of_the   | 2.6%           | 14093        |
| 2  | in_the   | 2.5%           | 13709        |
| 3  | to_the   | 1.2%           | 6442         |
| 4  | on_the   | 1%             | 5537         |
| 5  | for_the  | 0.98%          | 5396         |
| 6  | at_the   | 0.82%          | 4517         |
| 7  | and_the  | 0.74%          | 4047         |
| 8  | in_a     | 0.73%          | 4043         |
| 9  | to_be    | 0.64%          | 3550         |
| 10 | with_the | 0.6%           | 3321         |

Twitter (Digrams)

|    | terms      | terms_percents | terms_counts |
|----|------------|----------------|--------------|
| 1  | in_the     | 1.5%           | 78175        |
| 2  | for_the    | 1.4%           | 73868        |
| 3  | of_the     | 1.1%           | 56825        |
| 4  | on_the     | 0.95%          | 48384        |
| 5  | to_be      | 0.92%          | 46884        |
| 6  | to_the     | 0.85%          | 43339        |
| 7  | thanks_for | 0.84%          | 42735        |
| 8  | at_the     | 0.73%          | 37174        |
| 9  | i_love     | 0.69%          | 35392        |
| 10 | going_to   | 0.67%          | 34177        |

Blogs (Trigrams)

|    | terms       | terms_percents | terms_counts |
|----|-------------|----------------|--------------|
| 1  | one_of_the  | 1.5%           | 14412        |
| 2  | a_lot_of    | 1.3%           | 12229        |
| 3  | as_well_as  | 0.71%          | 6870         |
| 4  | to_be_a     | 0.71%          | 6830         |
| 5  | it_was_a    | 0.71%          | 6785         |
| 6  | some_of_the | 0.7%           | 6708         |
| 7  | out_of_the  | 0.67%          | 6471         |
| 8  | the_end_of  | 0.67%          | 6464         |
| 9  | be_able_to  | 0.65%          | 6227         |
| 10 | a_couple_of | 0.62%          | 5994         |

News (Trigrams)

|    | terms            | terms_percents | terms_counts |
|----|------------------|----------------|--------------|
| 1  | one_of_the       | 2.8%           | 1083         |
| 2  | a_lot_of         | 2.3%           | 876          |
| 3  | as_well_as       | 1.2%           | 479          |
| 4  | according_to_the | 1.1%           | 435          |
| 5  | in_the_first     | 1.1%           | 425          |
| 6  | going_to_be      | 1.1%           | 419          |
| 7  | part_of_the      | 1.1%           | 410          |
| 8  | the_end_of       | 1.1%           | 407          |
| 9  | out_of_the       | 1%             | 393          |
| 10 | some_of_the      | 1%             | 392          |

Twitter (Trigrams)

|    | terms              | terms_percents | terms_counts |
|----|--------------------|----------------|--------------|
| 1  | thanks_for_the     | 8%             | 23515        |
| 2  | looking_forward_to | 3%             | 8712         |
| 3  | thank_you_for      | 2.9%           | 8594         |
| 4  | i_love_you         | 2.8%           | 8200         |
| 5  | for_the_follow     | 2.7%           | 7797         |
| 6  | going_to_be        | 2.5%           | 7394         |
| 7  | can’t_wait_to      | 2.5%           | 7235         |
| 8  | i_want_to          | 2.4%           | 7034         |
| 9  | a_lot_of           | 2.1%           | 6225         |
| 10 | to_be_a            | 2%             | 5988         |
Initial Plots
Below, the top 10 most common unigrams, digrams, and trigrams for the blogs, news, and Twitter text sources are shown graphically.
invisible(notifyMe("Initial Plots"))
# Create function to plot the top 10 terms for a given DTM.
addPlot <- function(dtmName, maxVals = 10){
  require(ggplot2)
  require(ggthemes)
  source("functions.R")
  ## Identify the text source and ngram size from the DTM name.
  sourceName <- c("Blogs", "News", "Twitter")[[which(sapply(c("blogs", "news", "twitter"),
                                                            function(x) {grep(x, dtmName)}) == 1)]]
  gramNum <- c(1, 2, 3)[[which(sapply(c("Unigram", "Digram", "Trigram"),
                                      function(x) {grep(x, dtmName)}) == 1)]]
  dtm <- eval(as.name(dtmName))
  ## Order terms and counts.
  counts <- colSums(dtm)
  counts <- counts[order(counts, decreasing = TRUE)]
  terms <- names(counts)
  ## Calculate the percentage of all term occurrences for each term.
  percents <- counts / sum(counts)
  orderedGrams <- data.frame(terms = terms, terms_percents = percents, terms_counts = counts)
  ## Build the title and subtitle from the source and ngram size.
  nGramTitle <- c("Unigrams", "Digrams", "Trigrams")[[gramNum]]
  numTerms <- format(length(terms), big.mark = ",")
  plotTitle <- paste("Top", maxVals, sourceName, nGramTitle)
  plotSubtitle <- paste0("Out of ", numTerms)
  ## Plot the top terms by percentage of all terms used.
  plot <- ggplot(data = orderedGrams[1:maxVals, ],
                 aes(x = factor(terms, levels = terms), y = terms_percents, group = 1)) +
    scale_y_continuous(labels = scales::percent, limits = c(0, NA)) +
    geom_line(size = 1.5) +
    geom_label(aes(x = factor(terms, levels = terms), y = terms_percents,
                   label = stackWords(as.character(terms))),
               hjust = "middle", vjust = "top") +
    geom_point(color = "red", size = 3) +
    labs(title = plotTitle, subtitle = plotSubtitle) +
    theme_minimal() +
    theme(plot.title = element_text(face = "bold", size = 16, hjust = .5),
          plot.subtitle = element_text(face = "bold", size = 14, hjust = .5),
          axis.text.x = element_blank(),
          axis.title.x = element_blank(),
          plot.margin = unit(c(2, 0, 4, 0), "lines"),
          axis.title.y = element_blank(),
          axis.text.y = element_text(size = 6))
  return(plot)
}
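The label helper stackWords() is sourced from functions.R, which is not included in this report. A hypothetical stand-in, assuming it simply places each word of an n-gram label on its own line so the labels stack vertically, might look like this:
stackWords <- function(phrases) {
  # Split each term on the underscore used to join ngrams and stack the words.
  sapply(phrases, function(p) paste(strsplit(p, "_")[[1]], collapse = "\n"))
}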
require(grid)
require(gridExtra)
plots <- list()
plots[[1]] <- addPlot("blogsUnigramDTM")
plots[[2]] <- addPlot("newsUnigramDTM")
plots[[3]] <- addPlot("twitterUnigramDTM")
plots[[4]] <- addPlot("blogsDigramDTM")
plots[[5]] <- addPlot("newsDigramDTM")
plots[[6]] <- addPlot("twitterDigramDTM")
plots[[7]] <- addPlot("blogsTrigramDTM")
plots[[8]] <- addPlot("newsTrigramDTM")
plots[[9]] <- addPlot("twitterTrigramDTM")
do.call("grid.arrange", c(plots, ncol=3))

Plans for the Algorithm and App
My plan is to combine the blogs, news, and Twitter texts into one data set and split it into a training set and a testing set. I intend to fit a generalized linear model (GLM) using the 1-word, 2-word, and 3-word document term matrices as inputs. The Shiny app will simply provide a text box for the user to type a phrase into; after the user hits submit, the app will feed the typed words into the model to predict the next word, and perhaps also show the two next-highest-probability words.
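Although the final model is still to be built, a minimal, hypothetical sketch (not the GLM described above) of how the trigram counts alone could be queried to suggest a next word from the last two words typed might look like this:
# Hypothetical helper: predictNextWord() is not part of this report's code.
predictNextWord <- function(lastTwoWords, trigramDTM, topN = 3) {
  trigramCounts <- colSums(trigramDTM)
  # Match trigrams that start with the two given words, e.g. "one_of_".
  prefix <- paste0("^", paste(lastTwoWords, collapse = "_"), "_")
  matches <- sort(trigramCounts[grepl(prefix, names(trigramCounts))], decreasing = TRUE)
  if (length(matches) == 0) return(character(0))
  # The predicted word is the last token of each matching trigram.
  head(unique(sub(".*_", "", names(matches))), topN)
}
# Example usage (assuming blogsTrigramDTM from above):
# predictNextWord(c("one", "of"), blogsTrigramDTM)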