Overview
For this project, I will build a text prediction Shiny app: the user types a few words into a text box, and the app predicts the most likely next word using a prediction algorithm trained on 899,288 samples of blog text, 1,010,242 samples of news text, and 2,360,148 tweets, as counted below.
invisible(notifyMe("Overview"))
# Set path to text samples.
wd <- getwd()
path <- paste0(wd,"/final/en_US")
# Read in samples of blog text and count samples.
f <- file(paste0(path,"/en_US.blogs.txt"), open="rb")
nlinesBlogs <- 0L
while (length(chunk <- readBin(f, "raw", 65536)) > 0) {
  nlinesBlogs <- nlinesBlogs + sum(chunk == as.raw(10L))
}
close(f)
# Read in samples of news text and count samples.
f <- file(paste0(path,"/en_US.news.txt"), open="rb")
nlinesNews <- 0L
while (length(chunk <- readBin(f, "raw", 65536)) > 0) {
  nlinesNews <- nlinesNews + sum(chunk == as.raw(10L))
}
close(f)
# Read in samples of Twitter text and count samples.
f <- file(paste0(path,"/en_US.twitter.txt"), open="rb")
nlinesTwitter <- 0L
while (length(chunk <- readBin(f, "raw", 65536)) > 0) {
  nlinesTwitter <- nlinesTwitter + sum(chunk == as.raw(10L))
}
close(f)
# Print out results.
print(paste(format(nlinesBlogs, big.mark = ',') , "Blog Samples"))
print(paste(format(nlinesNews, big.mark = ','), "News Samples"))
print(paste(format(nlinesTwitter, big.mark = ','), "Twitter Samples"))
[1] "899,288 Blog Samples"
[1] "1,010,242 News Samples"
[1] "2,360,148 Twitter Samples"
Reading in and Cleaning the Data
These data sets are very large; just reading in each file takes a surprisingly long time. After reading in the data, I cleaned each text sample by converting it to lowercase and removing all non-letter characters except apostrophes, which are kept because they mark contractions that are part of everyday speech.
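To make the cleaning concrete, here is a small sketch of the same base-R steps applied to a made-up sample line (the sample text is hypothetical; qdap::clean() is additionally used below to tidy the remaining whitespace):
sampleLine <- "Don't worry -- it's only 99% done!  "
sampleLine <- gsub("[^A-Za-z ']+", "", sampleLine, fixed = FALSE)  # drop non-letters except apostrophes
sampleLine <- tolower(sampleLine)                                  # lowercase
sampleLine <- trimws(sampleLine)                                   # trim leading/trailing white space
sampleLine                                                         # expected: "don't worry  it's only  done"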
require(qdap)
invisible(notifyMe("Reading In Blogs"))
## Read in text examples from blogs.
wd <- getwd()
path <- paste0(wd,"/final/en_US")
blogsText <- readLines(paste0(path,"/en_US.blogs.txt"), encoding="UTF-16", n = -1)
# Clean text by turning all text to lowercase, removing non-letters (except for apostrophes) and cleaning up extra white space.
blogsText <- sapply(blogsText, function(x) {gsub("[^A-Za-z ']+", "", x, fixed = FALSE)})
blogsText <- tolower(blogsText)
blogsText <- trimws(blogsText)
blogsText <- clean(blogsText)
names(blogsText) <- NULL
#########################################################
invisible(notifyMe("Reading In News"))
## Read in text examples from news.
wd <- getwd()
path <- paste0(wd,"/final/en_US")
newsText <- readLines(paste0(path,"/en_US.news.txt"), encoding="UTF-16", n = -1)
# Clean text by turning all text to lowercase, removing non-letters (except for apostrophes) and cleaning up extra white space.
newsText <- sapply(newsText, function(x) {gsub("[^A-Za-z ']+", "", x, fixed = FALSE)})
newsText <- tolower(newsText)
newsText <- trimws(newsText)
newsText <- clean(newsText)
names(newsText) <- NULL
#########################################################
invisible(notifyMe("Reading In Twitter"))
## Read in text examples from Twitter.
wd <- getwd()
path <- paste0(wd,"/final/en_US")
twitterText <- readLines(paste0(path,"/en_US.twitter.txt"), encoding="UTF-16", n = -1)
# Clean text by turning all text to lowercase, removing non-letters (except for apostrophes) and cleaning up extra white space.
twitterText <- sapply(twitterText, function(x) {gsub("[^A-Za-z ']+", "", x, fixed = FALSE)})
twitterText <- tolower(twitterText)
twitterText <- trimws(twitterText)
twitterText <- clean(twitterText)
names(twitterText) <- NULL
Parsing the Text
Next, I parsed each text sample into its unique 1-word, 2-word, and 3-word terms and counted the frequency of each term in every source; these counts will be used to develop the prediction algorithm. For the 2-word and 3-word terms I kept only those that appear in at least 0.1% of the samples (doc_proportion_min = 0.001). This cuts the computation time immensely and should not hurt the algorithm, because terms this infrequent carry too little evidence to be useful for prediction.
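To illustrate the pruning step, the short sketch below (assuming the same text2vec version used in this report) builds a digram vocabulary for a hypothetical two-line corpus and drops any digram that appears in fewer than half of the documents:
require(text2vec)
toyText <- c("the cat sat on the mat", "the dog sat on the rug")
toyIt <- itoken(toyText, progressbar = FALSE)
toyDigramVocab <- create_vocabulary(toyIt, ngram = as.integer(c(2, 2)))
# Keep only digrams appearing in at least 50% of the toy documents,
# analogous to doc_proportion_min = 0.001 on the full samples below.
toyDigramVocab <- prune_vocabulary(toyDigramVocab, doc_proportion_min = 0.5)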
require(Matrix)
require(text2vec)
# Tokenize blogs text.
blogsTextIt <- itoken(blogsText, chunks_number = 10, progressbar = FALSE)
invisible(notifyMe("Generating Unigrams"))
# Create and prune vocabulary.
blogsUnigramVocab <- create_vocabulary(blogsTextIt, ngram=as.integer(c(1,1)))
# blogsUnigramVocab <- prune_vocabulary(blogsUnigramVocab, doc_proportion_min = 0.001)
# Create vocabulary vectorizer function.
blogsUnigramVocabVec <- vocab_vectorizer(blogsUnigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
# Generate corpus.
blogsUnigramCorpus <- create_corpus(blogsTextIt, vectorizer = blogsUnigramVocabVec)
# Generate document term matrix.
blogsUnigramDTM <- get_dtm(blogsUnigramCorpus)
################################# And likewise...
newsTextIt <- itoken(newsText, chunks_number = 10, progressbar = FALSE)
newsUnigramVocab <- create_vocabulary(newsTextIt, ngram=as.integer(c(1,1)))
# newsUnigramVocab <- prune_vocabulary(newsUnigramVocab, doc_proportion_min = 0.001)
newsUnigramVocabVec <- vocab_vectorizer(newsUnigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
newsUnigramCorpus <- create_corpus(newsTextIt, vectorizer = newsUnigramVocabVec)
newsUnigramDTM <- get_dtm(newsUnigramCorpus)
################################# And likewise...
twitterTextIt <- itoken(twitterText, chunks_number = 10, progressbar = FALSE)
twitterUnigramVocab <- create_vocabulary(twitterTextIt, ngram=as.integer(c(1,1)))
# twitterUnigramVocab <- prune_vocabulary(twitterUnigramVocab, doc_proportion_min = 0.001)
twitterUnigramVocabVec <- vocab_vectorizer(twitterUnigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
twitterUnigramCorpus <- create_corpus(twitterTextIt, vectorizer = twitterUnigramVocabVec)
twitterUnigramDTM <- get_dtm(twitterUnigramCorpus)
################################# Generating Digrams...
invisible(notifyMe("Generating Digrams"))
blogsDigramVocab <- create_vocabulary(blogsTextIt, ngram=as.integer(c(2,2)))
blogsDigramVocab <- prune_vocabulary(blogsDigramVocab, doc_proportion_min = 0.001)
blogsDigramVocabVec <- vocab_vectorizer(blogsDigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
blogsDigramCorpus <- create_corpus(blogsTextIt, vectorizer = blogsDigramVocabVec)
blogsDigramDTM <- get_dtm(blogsDigramCorpus)
################################# And likewise...
newsDigramVocab <- create_vocabulary(newsTextIt, ngram=as.integer(c(2,2)))
newsDigramVocab <- prune_vocabulary(newsDigramVocab, doc_proportion_min = 0.001)
newsDigramVocabVec <- vocab_vectorizer(newsDigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
newsDigramCorpus <- create_corpus(newsTextIt, vectorizer = newsDigramVocabVec)
newsDigramDTM <- get_dtm(newsDigramCorpus)
################################# And likewise...
twitterDigramVocab <- create_vocabulary(twitterTextIt, ngram=as.integer(c(2,2)))
twitterDigramVocab <- prune_vocabulary(twitterDigramVocab, doc_proportion_min = 0.001)
twitterDigramVocabVec <- vocab_vectorizer(twitterDigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
twitterDigramCorpus <- create_corpus(twitterTextIt, vectorizer = twitterDigramVocabVec)
twitterDigramDTM <- get_dtm(twitterDigramCorpus)
################################# Generating Trigrams...
invisible(notifyMe("Generating Trigrams"))
blogsTrigramVocab <- create_vocabulary(blogsTextIt, ngram=as.integer(c(3,3)))
blogsTrigramVocab <- prune_vocabulary(blogsTrigramVocab, doc_proportion_min = 0.001)
blogsTrigramVocabVec <- vocab_vectorizer(blogsTrigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
blogsTrigramCorpus <- create_corpus(blogsTextIt, vectorizer = blogsTrigramVocabVec)
blogsTrigramDTM <- get_dtm(blogsTrigramCorpus)
################################# And likewise...
newsTrigramVocab <- create_vocabulary(newsTextIt, ngram=as.integer(c(3,3)))
newsTrigramVocab <- prune_vocabulary(newsTrigramVocab, doc_proportion_min = 0.001)
newsTrigramVocabVec <- vocab_vectorizer(newsTrigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
newsTrigramCorpus <- create_corpus(newsTextIt, vectorizer = newsTrigramVocabVec)
newsTrigramDTM <- get_dtm(newsTrigramCorpus)
################################# And likewise...
twitterTrigramVocab <- create_vocabulary(twitterTextIt, ngram=as.integer(c(3,3)))
twitterTrigramVocab <- prune_vocabulary(twitterTrigramVocab, doc_proportion_min = 0.001)
twitterTrigramVocabVec <- vocab_vectorizer(twitterTrigramVocab, grow_dtm = TRUE, skip_grams_window = 0L)
twitterTrigramCorpus <- create_corpus(twitterTextIt, vectorizer = twitterTrigramVocabVec)
twitterTrigramDTM <- get_dtm(twitterTrigramCorpus)
Word Counts
The number of unique words (unigrams) found in each of the blogs, news, and Twitter sources is given below.
invisible(notifyMe("Word Counts"))
# Use DTMs to determine word-counts.
print(paste(length(colnames(blogsUnigramDTM)), "Blogs Unigrams"))
[1] "3245 Blogs Unigrams"
print(paste(length(colnames(newsUnigramDTM)), "News Unigrams"))
[1] "3358 News Unigrams"
print(paste(length(colnames(twitterUnigramDTM)), "Twitter Unigrams"))
[1] "1218 Twitter Unigrams"
Top 10 Term Counts
The top 10 terms for each text source and each term length (1, 2, or 3 words) are given below in table form.
invisible(notifyMe("Top Ten"))
# Create function to order top 10 terms for each data source and ngram.
topTerms <- function(dtmName, maxVals = 10){
  dtm <- eval(as.name(dtmName))
  ## Order terms and counts.
  counts <- colSums(dtm)
  counts <- counts[order(counts, decreasing = TRUE)]
  terms <- names(counts)
  ## Calculate the percentage of all term occurrences for each term.
  percents <- paste0(signif(100 * counts / sum(counts), 2), "%")
  orderedGrams <- data.frame(terms = terms, terms_percents = percents, terms_counts = counts)
  rownames(orderedGrams) <- NULL
  return(orderedGrams[1:maxVals, ])
}
# Use topTerms function to create tables of top terms for each data source and ngram.
termsDf <- list()
termsDf[[1]] <- topTerms("blogsUnigramDTM")
termsDf[[2]] <- topTerms("newsUnigramDTM")
termsDf[[3]] <- topTerms("twitterUnigramDTM")
termsDf[[4]] <- topTerms("blogsDigramDTM")
termsDf[[5]] <- topTerms("newsDigramDTM")
termsDf[[6]] <- topTerms("twitterDigramDTM")
termsDf[[7]] <- topTerms("blogsTrigramDTM")
termsDf[[8]] <- topTerms("newsTrigramDTM")
termsDf[[9]] <- topTerms("twitterTrigramDTM")
require(knitr)
for(i in 1:length(termsDf)){
  source <- c("Blogs", "News", "Twitter")[[(i-1) %% 3 + 1]]
  rownames(termsDf[[i]]) <- 1:10
  print(kable(termsDf[[i]], caption = source, row.names = TRUE))
}
Blogs (Unigrams)

|    | terms | terms_percents | terms_counts |
|----|-------|----------------|--------------|
| 1  | the   | 5%             | 1855244      |
| 2  | and   | 2.9%           | 1086085      |
| 3  | to    | 2.9%           | 1065625      |
| 4  | a     | 2.4%           | 896784       |
| 5  | of    | 2.4%           | 875010       |
| 6  | i     | 2.1%           | 769212       |
| 7  | in    | 1.6%           | 593546       |
| 8  | that  | 1.2%           | 459439       |
| 9  | is    | 1.2%           | 431769       |
| 10 | it    | 1.1%           | 400742       |

News (Unigrams)

|    | terms | terms_percents | terms_counts |
|----|-------|----------------|--------------|
| 1  | the   | 5.9%           | 151490       |
| 2  | to    | 2.7%           | 69348        |
| 3  | and   | 2.7%           | 68215        |
| 4  | a     | 2.6%           | 67166        |
| 5  | of    | 2.3%           | 59088        |
| 6  | in    | 2%             | 51458        |
| 7  | for   | 1.1%           | 27107        |
| 8  | that  | 1%             | 26339        |
| 9  | is    | 0.85%          | 21950        |
| 10 | on    | 0.8%           | 20570        |

Twitter (Unigrams)

|    | terms | terms_percents | terms_counts |
|----|-------|----------------|--------------|
| 1  | the   | 3.2%           | 933603       |
| 2  | to    | 2.7%           | 786579       |
| 3  | i     | 2.4%           | 713305       |
| 4  | a     | 2.1%           | 608495       |
| 5  | you   | 1.9%           | 543432       |
| 6  | and   | 1.5%           | 433667       |
| 7  | for   | 1.3%           | 384515       |
| 8  | in    | 1.3%           | 376910       |
| 9  | of    | 1.2%           | 358957       |
| 10 | is    | 1.2%           | 357462       |

Blogs (Digrams)

|    | terms   | terms_percents | terms_counts |
|----|---------|----------------|--------------|
| 1  | of_the  | 1.8%           | 187084       |
| 2  | in_the  | 1.5%           | 154044       |
| 3  | to_the  | 0.82%          | 85975        |
| 4  | on_the  | 0.71%          | 75211        |
| 5  | to_be   | 0.65%          | 68028        |
| 6  | and_the | 0.56%          | 58547        |
| 7  | for_the | 0.55%          | 58057        |
| 8  | i_was   | 0.47%          | 49344        |
| 9  | and_i   | 0.47%          | 49023        |
| 10 | i_have  | 0.45%          | 47802        |

News (Digrams)

|    | terms    | terms_percents | terms_counts |
|----|----------|----------------|--------------|
| 1  | of_the   | 2.6%           | 14093        |
| 2  | in_the   | 2.5%           | 13709        |
| 3  | to_the   | 1.2%           | 6442         |
| 4  | on_the   | 1%             | 5537         |
| 5  | for_the  | 0.98%          | 5396         |
| 6  | at_the   | 0.82%          | 4517         |
| 7  | and_the  | 0.74%          | 4047         |
| 8  | in_a     | 0.73%          | 4043         |
| 9  | to_be    | 0.64%          | 3550         |
| 10 | with_the | 0.6%           | 3321         |

Twitter (Digrams)

|    | terms      | terms_percents | terms_counts |
|----|------------|----------------|--------------|
| 1  | in_the     | 1.5%           | 78175        |
| 2  | for_the    | 1.4%           | 73868        |
| 3  | of_the     | 1.1%           | 56825        |
| 4  | on_the     | 0.95%          | 48384        |
| 5  | to_be      | 0.92%          | 46884        |
| 6  | to_the     | 0.85%          | 43339        |
| 7  | thanks_for | 0.84%          | 42735        |
| 8  | at_the     | 0.73%          | 37174        |
| 9  | i_love     | 0.69%          | 35392        |
| 10 | going_to   | 0.67%          | 34177        |

Blogs (Trigrams)

|    | terms       | terms_percents | terms_counts |
|----|-------------|----------------|--------------|
| 1  | one_of_the  | 1.5%           | 14412        |
| 2  | a_lot_of    | 1.3%           | 12229        |
| 3  | as_well_as  | 0.71%          | 6870         |
| 4  | to_be_a     | 0.71%          | 6830         |
| 5  | it_was_a    | 0.71%          | 6785         |
| 6  | some_of_the | 0.7%           | 6708         |
| 7  | out_of_the  | 0.67%          | 6471         |
| 8  | the_end_of  | 0.67%          | 6464         |
| 9  | be_able_to  | 0.65%          | 6227         |
| 10 | a_couple_of | 0.62%          | 5994         |

News (Trigrams)

|    | terms            | terms_percents | terms_counts |
|----|------------------|----------------|--------------|
| 1  | one_of_the       | 2.8%           | 1083         |
| 2  | a_lot_of         | 2.3%           | 876          |
| 3  | as_well_as       | 1.2%           | 479          |
| 4  | according_to_the | 1.1%           | 435          |
| 5  | in_the_first     | 1.1%           | 425          |
| 6  | going_to_be      | 1.1%           | 419          |
| 7  | part_of_the      | 1.1%           | 410          |
| 8  | the_end_of       | 1.1%           | 407          |
| 9  | out_of_the       | 1%             | 393          |
| 10 | some_of_the      | 1%             | 392          |

Twitter (Trigrams)

|    | terms              | terms_percents | terms_counts |
|----|--------------------|----------------|--------------|
| 1  | thanks_for_the     | 8%             | 23515        |
| 2  | looking_forward_to | 3%             | 8712         |
| 3  | thank_you_for      | 2.9%           | 8594         |
| 4  | i_love_you         | 2.8%           | 8200         |
| 5  | for_the_follow     | 2.7%           | 7797         |
| 6  | going_to_be        | 2.5%           | 7394         |
| 7  | can’t_wait_to      | 2.5%           | 7235         |
| 8  | i_want_to          | 2.4%           | 7034         |
| 9  | a_lot_of           | 2.1%           | 6225         |
| 10 | to_be_a            | 2%             | 5988         |
Initial Plots
Below, the top 10 most common unigrams, digrams, and trigrams for the blogs, news, and Twitter text sources are shown graphically.
invisible(notifyMe("Initial Plots"))
# Create function to plot the top 10 terms for a given DTM.
addPlot <- function(dtmName, maxVals = 10){
  require(ggplot2)
  require(ggthemes)
  source("functions.R")
  ## Identify the text source and ngram size from the DTM name.
  sourceName <- c("Blogs", "News", "Twitter")[[which(sapply(c("blogs", "news", "twitter"),
                                                            function(x) {grep(x, dtmName)}) == 1)]]
  gramNum <- c(1, 2, 3)[[which(sapply(c("Unigram", "Digram", "Trigram"),
                                      function(x) {grep(x, dtmName)}) == 1)]]
  dtm <- eval(as.name(dtmName))
  ## Order terms and counts.
  counts <- colSums(dtm)
  counts <- counts[order(counts, decreasing = TRUE)]
  terms <- names(counts)
  ## Calculate the percentage of all term occurrences for each term.
  percents <- counts / sum(counts)
  orderedGrams <- data.frame(terms = terms, terms_percents = percents, terms_counts = counts)
  ## Build the title and subtitle from the source and ngram size.
  nGramTitle <- c("Unigrams", "Digrams", "Trigrams")[[gramNum]]
  numTerms <- format(length(terms), big.mark = ",")
  plotTitle <- paste("Top", maxVals, sourceName, nGramTitle)
  plotSubtitle <- paste0("Out of ", numTerms)
  ## Plot the top terms by percentage of all terms used.
  plot <- ggplot(data = orderedGrams[1:maxVals, ],
                 aes(x = factor(terms, levels = terms), y = terms_percents, group = 1)) +
    scale_y_continuous(labels = scales::percent, limits = c(0, NA)) +
    geom_line(size = 1.5) +
    geom_label(aes(x = factor(terms, levels = terms), y = terms_percents,
                   label = stackWords(as.character(terms))),
               hjust = "middle", vjust = "top") +
    geom_point(color = "red", size = 3) +
    labs(title = plotTitle, subtitle = plotSubtitle) +
    theme_minimal() +
    theme(plot.title = element_text(face = "bold", size = 16, hjust = .5),
          plot.subtitle = element_text(face = "bold", size = 14, hjust = .5),
          axis.text.x = element_blank(),
          axis.title.x = element_blank(),
          plot.margin = unit(c(2, 0, 4, 0), "lines"),
          axis.title.y = element_blank(),
          axis.text.y = element_text(size = 6))
  return(plot)
}
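The label helper stackWords() is sourced from functions.R, which is not included in this report. A hypothetical stand-in, assuming it simply places each word of an n-gram label on its own line so the labels stack vertically, might look like this:
stackWords <- function(phrases) {
  # Split each term on the underscore used to join ngrams and stack the words.
  sapply(phrases, function(p) paste(strsplit(p, "_")[[1]], collapse = "\n"))
}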
require(grid)
require(gridExtra)
plots <- list()
plots[[1]] <- addPlot("blogsUnigramDTM")
plots[[2]] <- addPlot("newsUnigramDTM")
plots[[3]] <- addPlot("twitterUnigramDTM")
plots[[4]] <- addPlot("blogsDigramDTM")
plots[[5]] <- addPlot("newsDigramDTM")
plots[[6]] <- addPlot("twitterDigramDTM")
plots[[7]] <- addPlot("blogsTrigramDTM")
plots[[8]] <- addPlot("newsTrigramDTM")
plots[[9]] <- addPlot("twitterTrigramDTM")
do.call("grid.arrange", c(plots, ncol=3))

Plans for the Algorithm and App
My plan is to combine the blogs, news, and Twitter texts into one data set and split it into a training set and a testing set. I intend to fit a generalized linear model (GLM) using the 1-word, 2-word, and 3-word document term matrices as inputs. The Shiny app will simply provide a text box for the user to type a phrase into; after the user hits submit, the app will feed the typed words into the model to predict the next word, and perhaps also show the two next-highest-probability words.
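Although the final model is still to be built, a minimal, hypothetical sketch (not the GLM described above) of how the trigram counts alone could be queried to suggest a next word from the last two words typed might look like this:
# Hypothetical helper: predictNextWord() is not part of this report's code.
predictNextWord <- function(lastTwoWords, trigramDTM, topN = 3) {
  trigramCounts <- colSums(trigramDTM)
  # Match trigrams that start with the two given words, e.g. "one_of_".
  prefix <- paste0("^", paste(lastTwoWords, collapse = "_"), "_")
  matches <- sort(trigramCounts[grepl(prefix, names(trigramCounts))], decreasing = TRUE)
  if (length(matches) == 0) return(character(0))
  # The predicted word is the last token of each matching trigram.
  head(unique(sub(".*_", "", names(matches))), topN)
}
# Example usage (assuming blogsTrigramDTM from above):
# predictNextWord(c("one", "of"), blogsTrigramDTM)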