Downloading the data

First, let’s load the packages, and then load the data.

suppressPackageStartupMessages(library(quanteda))
suppressPackageStartupMessages(library(wordcloud))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(quanteda.textstats))
suppressPackageStartupMessages(library(quanteda.textplots))
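
The files are read from ./final/en_US/ below, so the corpus needs to already be on disk. As a sketch of the download step (not part of the original code; the URL is the one commonly used for the Coursera SwiftKey zip, so treat it as an assumption):

if (!dir.exists("./final")) {
  # Assumed URL for the Coursera SwiftKey corpus zip
  zipUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  download.file(zipUrl, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")  # creates the final/ directory with the en_US files
}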

twitterText <- suppressMessages(readLines("./final/en_US/en_US.twitter.txt"))
## Warning in readLines("./final/en_US/en_US.twitter.txt"): line 167155 appears to
## contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt"): line 268547 appears to
## contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt"): line 1274086 appears to
## contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt"): line 1759032 appears to
## contain an embedded nul
newsText <- readLines("./final/en_US/en_US.news.txt")
blogsText <- readLines("./final/en_US/en_US.blogs.txt")
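
The embedded-nul warnings on the Twitter file are harmless for this analysis. If they were a concern, readLines() has a skipNul argument that drops the nul characters; a sketch (not what was run above) would be:

twitterText <- readLines("./final/en_US/en_US.twitter.txt", skipNul = TRUE)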

Exploring each file

First, let’s look at the number of lines, characters, and words in each file.

unlist(lapply(list(twitterText, newsText, blogsText), length))
## [1] 2360148 1010242  899288
unlist(lapply(list(twitterText, newsText, blogsText), 
              function(x) sum(vapply(x, nchar, numeric(1))))
       )
## [1] 162096031 203223159 206824505
twitterWord <- unlist(strsplit(twitterText, " "))
newsWord <- unlist(strsplit(newsText, " "))
blogWord <- unlist(strsplit(blogsText, " "))

unlist(lapply(list(twitterWord, newsWord, blogWord), length))
## [1] 30373543 34372530 37334131
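
For easier side-by-side comparison, the same counts can be arranged in a small data frame. This is a convenience sketch, not part of the original analysis:

fileSummary <- data.frame(
  file  = c("twitter", "news", "blogs"),
  lines = sapply(list(twitterText, newsText, blogsText), length),
  words = sapply(list(twitterWord, newsWord, blogWord), length),
  chars = sapply(list(twitterText, newsText, blogsText), function(x) sum(nchar(x)))
)
fileSummary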

Now let’s look at the most common words in each data set. To make computation easier, we will first take roughly 1/5 of each data set.

set.seed(3000)
twitterText <- twitterText[sample(c(TRUE, FALSE), 
                   length(twitterText), 
                   replace = TRUE, 
                   prob = c(1/5, 4/5))]
newsText <- newsText[sample(c(TRUE, FALSE), 
                   length(newsText), 
                   replace = TRUE, 
                   prob = c(1/5, 4/5))]
blogsText<- blogsText[sample(c(TRUE, FALSE), 
                   length(blogsText), 
                   replace = TRUE, 
                   prob = c(1/5, 4/5))]
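
Note that sampling with sample(c(TRUE, FALSE), ...) keeps approximately, not exactly, 1/5 of the lines. If an exact 20% sample were wanted instead, a sketch using a hypothetical helper (not part of the original code) could be:

sampleFifth <- function(x) x[sample.int(length(x), size = floor(length(x) / 5))]
# e.g. twitterText <- sampleFifth(twitterText)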

twitterCorp <- corpus(twitterText)
newsCorp <- corpus(newsText)
blogsCorp <- corpus(blogsText)

badWords <- readLines("./list.txt")

twitterTok <- tokens(twitterCorp, 
                     remove_punct = TRUE, 
                     remove_symbols = TRUE, 
                     remove_numbers = TRUE, 
                     remove_url = TRUE, 
                     remove_separators = TRUE)
newsTok <- tokens(newsCorp, 
                  remove_punct = TRUE, 
                  remove_symbols = TRUE, 
                  remove_numbers = TRUE, 
                  remove_url = TRUE, 
                  remove_separators = TRUE)
blogsTok <- tokens(blogsCorp, 
                   remove_punct = TRUE, 
                   remove_symbols = TRUE, 
                   remove_numbers = TRUE, 
                   remove_url = TRUE, 
                   remove_separators = TRUE)

twitterDfm <- dfm_remove(dfm(twitterTok), 
                         c(stopwords("en"), badWords))
newsDfm <- dfm_remove(dfm(newsTok), 
                      c(stopwords("en"), badWords))
blogsDfm <- dfm_remove(dfm(blogsTok), 
                       c(stopwords("en"), badWords))

Now let’s look at the most common words in each data set.

set.seed(24601)
textplot_wordcloud(twitterDfm, min_count = 6, random_order = FALSE,
                   rotation = .25, 
                   color = RColorBrewer::brewer.pal(8,"Dark2"), 
                   max_words = 50)

textplot_wordcloud(newsDfm, min_count = 6, random_order = FALSE,
                   rotation = .25, 
                   color = RColorBrewer::brewer.pal(8,"Dark2"), 
                   max_words = 50)

textplot_wordcloud(blogsDfm, min_count = 6, random_order = FALSE,
                   rotation = .25, 
                   color = RColorBrewer::brewer.pal(8,"Dark2"), 
                   max_words = 50)

# word cloud styling came from
# https://quanteda.io/articles/pkgdown/quickstart.html#exploring-corpus-texts
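
Since quanteda.textstats is already loaded, the same information can also be read off as a ranked table. This is a sketch for reference and was not part of the original output:

head(textstat_frequency(twitterDfm), 10)  # top 10 words in the Twitter sample
head(textstat_frequency(newsDfm), 10)     # top 10 words in the news sample
head(textstat_frequency(blogsDfm), 10)    # top 10 words in the blogs sample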

Exploring the combined files

Now that we have looked at each of our data sets, let’s look at the most common words across all of them combined, to get an idea of what to do when we encounter unobserved word sequences.

masterText <- c(twitterText, newsText, blogsText)
masterCorp <- corpus(masterText)
masterTok <- tokens(masterCorp, 
                   remove_punct = TRUE, 
                   remove_symbols = TRUE, 
                   remove_numbers = TRUE, 
                   remove_url = TRUE, 
                   remove_separators = TRUE)
masterDfm <- dfm_remove(dfm(masterTok), c(stopwords("en"), badWords))
barplot(topfeatures(masterDfm, 50), horiz = TRUE, cex.names = 0.8, las = 2)
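
Looking ahead to the prediction model, n-gram counts over these same tokens are the natural next step. A minimal sketch using quanteda’s tokens_ngrams() (not run for this report) might look like:

masterBigrams  <- tokens_ngrams(masterTok, n = 2)  # two-word sequences
masterTrigrams <- tokens_ngrams(masterTok, n = 3)  # three-word sequences
topfeatures(dfm(masterBigrams), 10)   # most frequent bigrams
topfeatures(dfm(masterTrigrams), 10)  # most frequent trigrams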

These words will make good fallback candidates for when n-grams fail. This was my basic summary; I hope it was not too brief, as I tried to keep it short in order to maximize efficiency. Thank you for reading.