#Load Packages
library(dplyr)
library(readr)
library(tokenizers)
library(tidyverse)
library(tidytext)
library(tm)
library(quanteda)
library(wordcloud)
library(RColorBrewer)
Load Data
# Read one raw corpus file into a character vector (one element per line).
# `encoding = "UTF-8"` marks the strings that are read as UTF-8.
read_corpus <- function(path) {
  readLines(path, encoding = "UTF-8")
}

news <- read_corpus("SwiftKey Dataset/en_US.news.txt")
blogs <- read_corpus("SwiftKey Dataset/en_US.blogs.txt")
twitter <- read_corpus("SwiftKey Dataset/en_US.twitter.txt")

Summary Statistics

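A basic summary of the data shows the size, word count, and line count for the News, Blogs, and Twitter datasets. Note that the reported size is the in-memory size of each R object (as given by object.size()), not the size of the file on disk.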
# Summary table for the three raw corpora: size, word count and line count.
# The corpora are collected once so each statistic iterates the same list.
raw_corpora <- list(news, blogs, twitter)

summary_df <- data.frame(
  "File" = c("News", "Blogs", "Twitter"),
  # object.size() measures the R object in memory, not the file on disk
  "File Size" = vapply(
    raw_corpora,
    function(x) format(object.size(x), units = "auto"),
    character(1)
  ),
  # a word is counted as any maximal run of non-whitespace characters
  "Word Count" = vapply(
    raw_corpora,
    function(x) sum(str_count(x, "\\S+")),
    integer(1)
  ),
  # one vector element per line read from the file
  "Line Count" = vapply(raw_corpora, length, integer(1))
)
# data.frame() converts the column names to File.Size, Word.Count, Line.Count
summary_df
##      File File.Size Word.Count Line.Count
## 1    News  257.3 Mb   34372529    1010242
## 2   Blogs  255.4 Mb   37334131     899288
## 3 Twitter    319 Mb   30373543    2360148

Profanity Filtering

Before we begin to clean the data, we run a profanity filter on the three datasets. The filter uses the list of words found at https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en and removes every whole-word, case-insensitive match of these words from all three datasets.
#Profanity Filtering

# Backslash-escape regex metacharacters so that a list entry is matched as
# literal text rather than being interpreted as part of the pattern.
escape_regex <- function(x) {
  gsub("([\\\\.|()\\[\\]{}^$*+?])", "\\\\\\1", x, perl = TRUE)
}

# Delete every case-insensitive match of `pattern` from each element of `text`.
remove_profanity <- function(text, pattern) {
  str_replace_all(text, regex(pattern, ignore_case = TRUE), "")
}

profanity <- trimws(readLines("profanityList.txt", encoding = "UTF-8"))
# Drop blank entries: an empty alternative inside "(a||b)" matches the empty
# string first, which would stop every later entry from ever being removed.
profanity <- unique(profanity[nzchar(profanity)])

# One alternation of all entries, anchored on word boundaries at both ends.
profanityPattern <- str_c(
  "\\b(", str_flatten(escape_regex(profanity), "|"), ")\\b"
)

newsClean <- remove_profanity(news, profanityPattern)
blogsClean <- remove_profanity(blogs, profanityPattern)
twitterClean <- remove_profanity(twitter, profanityPattern)

# Persist the cleaned corpora: the sampling step reads newsClean.txt,
# blogsClean.txt and twitterClean.txt back from the working directory.
# useBytes = TRUE writes the strings as they are, without re-encoding them.
writeLines(newsClean, "newsClean.txt", useBytes = TRUE)
writeLines(blogsClean, "blogsClean.txt", useBytes = TRUE)
writeLines(twitterClean, "twitterClean.txt", useBytes = TRUE)

Sampling

Since all three files are so large, we take a sample of the data to conduct further analysis. Here I chose to take a 5% random sample of each cleaned dataset.
# Fraction of each cleaned corpus that is kept for the exploratory analysis.
sampleFraction <- 0.05

# Number of lines to draw from `lines` for the given sampling `fraction`.
sample_size <- function(lines, fraction) {
  stopifnot(is.numeric(fraction), length(fraction) == 1,
            fraction > 0, fraction <= 1)
  round(length(lines) * fraction)
}

# Seed once, then sample news, blogs and twitter in that order so the three
# samples are reproducible from run to run.
set.seed(1234)

newsClean <- readLines("newsClean.txt", encoding = "UTF-8")
sampleSizeNews <- sample_size(newsClean, sampleFraction)
newsSample <- sample(newsClean, size = sampleSizeNews)

blogsClean <- readLines("blogsClean.txt", encoding = "UTF-8")
sampleSizeBlogs <- sample_size(blogsClean, sampleFraction)
blogsSample <- sample(blogsClean, size = sampleSizeBlogs)

twitterClean <- readLines("twitterClean.txt", encoding = "UTF-8")
sampleSizeTwitter <- sample_size(twitterClean, sampleFraction)
twitterSample <- sample(twitterClean, size = sampleSizeTwitter)

Data Cleaning and Wordclouds

Here we start cleaning the data for further analysis. First we tokenize each sampled dataset, dropping punctuation, numbers, and symbols. Then the following steps are performed on all three datasets:
  • Convert all words to lowercase
  • Remove any characters not in the English alphabet (this includes symbols and numbers)
  • Remove instances of characters repeated 3 or more times
  • Remove instances of single letters
  • Remove Stopwords
# Tokenise a character vector and apply the cleaning steps shared by all three
# datasets. Returns a quanteda tokens object.
#
# The order of the two tokens_split() calls matters: splitting on repeated
# letters can leave single-letter fragments behind, so single-letter removal
# has to run afterwards.
clean_tokens <- function(text) {
  toks <- tokens(text, remove_punct = TRUE, remove_numbers = TRUE,
                 remove_symbols = TRUE)
  #convert to lowercase
  toks <- tokens_tolower(toks)
  #keep only tokens made up entirely of letters of the English alphabet
  toks <- tokens_keep(toks, pattern = "^[a-zA-Z]+$", valuetype = "regex")
  #remove instances of characters repeated 3 or more times
  #(tokens_split() drops the matched separator by default)
  toks <- tokens_split(toks, separator = "([[:alpha:]])\\1{2,}",
                       valuetype = "regex")
  #remove instances of single letters
  toks <- tokens_split(toks, separator = "\\W*\\b\\w\\b\\W*",
                       valuetype = "regex")
  #remove stopwords
  tokens_remove(toks, pattern = stopwords('en'))
}

# Frequency table of every token as a data frame with columns Var1 and Freq.
token_frequencies <- function(toks) {
  as.data.frame(table(unlist(toks)))
}

# Word cloud of the 100 most frequent words in a Var1/Freq data frame.
plot_wordcloud <- function(freq_df) {
  wordcloud(words = freq_df$Var1, freq = freq_df$Freq, min.freq = 1,
            max.words = 100, random.order = FALSE, rot.per = 0.35,
            colors = brewer.pal(8, "Dark2"), scale = c(1, .8))
}

# Seed once so the word cloud layouts are reproducible for a full run.
set.seed(1234)

#News Dataset Cleaning
newsToken <- clean_tokens(newsSample)
newsFreq_df <- token_frequencies(newsToken)
#create a wordcloud of the 100 most frequent words in news
plot_wordcloud(newsFreq_df)

#Blogs Dataset Cleaning
blogsToken <- clean_tokens(blogsSample)
blogsFreq_df <- token_frequencies(blogsToken)
#create a wordcloud of the 100 most frequent words in blogs
plot_wordcloud(blogsFreq_df)

#Twitter Dataset Cleaning
twitterToken <- clean_tokens(twitterSample)
twitterFreq_df <- token_frequencies(twitterToken)
#create a wordcloud of the 100 most frequent words in twitter
plot_wordcloud(twitterFreq_df)

Exploratory Analysis

Here we build n-grams from the cleaned tokens to find the most frequent bigrams and trigrams in each of the three datasets.
# Count every n-gram in `ngrams` and rank the counts by frequency.
# The frequency table is built once and reused for both results.
# Returns a list with two data frames (columns Var1 and Freq):
#   all - every distinct n-gram with its count
#   top - the `top_n` most frequent n-grams, most frequent first
count_ngrams <- function(ngrams, top_n = 10) {
  counts <- table(unlist(ngrams))
  ranked <- sort(counts, decreasing = TRUE)
  # cap at the number of distinct n-grams so a small input gives no NA rows
  keep <- seq_len(min(top_n, length(ranked)))
  list(all = as.data.frame(counts), top = as.data.frame(ranked[keep]))
}

# Bar plot of a Var1/Freq data frame with the n-gram labels rotated 60 degrees.
plot_top_ngrams <- function(top_df, title) {
  #add room for the rotated labels, restoring the margins afterwards
  old_par <- par(mar = c(7, 4, 2, 2) + 0.2)
  on.exit(par(old_par), add = TRUE)
  # barplot() returns the bar midpoints, which are used to place the labels
  bar_mids <- barplot(height = top_df$Freq, names.arg = top_df$Var1,
                      main = title,
                      xlab = "",
                      xaxt = "n", # Do not plot the default labels
                      space = 1)
  #rotate 60 degrees and add rotated labels just below the x axis
  text(bar_mids, par("usr")[3] - 0.25,
       srt = 60, adj = 1, xpd = TRUE,
       labels = as.character(top_df$Var1), cex = 0.65)
  invisible(bar_mids)
}

#create a bigram of news
news_bigram <- tokens_ngrams(newsToken, n = 2)
news_bigram_counts <- count_ngrams(news_bigram)
#all bigram frequencies as a data frame
news_bigram_df <- news_bigram_counts$all
#top 10 most common bigrams
freq_news_bigram_top_10 <- news_bigram_counts$top
#plot 10 most frequent bigrams in news
plot_top_ngrams(freq_news_bigram_top_10,
                "Top 10 Most Frequent bigrams in News Dataset")

#create a trigram of news
news_trigram <- tokens_ngrams(newsToken, n = 3)
news_trigram_counts <- count_ngrams(news_trigram)
#all trigram frequencies as a data frame
news_trigram_df <- news_trigram_counts$all
#top 10 most common trigrams
freq_news_trigram_top_10 <- news_trigram_counts$top
#plot 10 most frequent trigrams in news
plot_top_ngrams(freq_news_trigram_top_10,
                "Top 10 Most Frequent Trigrams in News Dataset")

#create a bigram of blogs
blogs_bigram <- tokens_ngrams(blogsToken, n = 2)
blogs_bigram_counts <- count_ngrams(blogs_bigram)
#all bigram frequencies as a data frame
blogs_bigram_df <- blogs_bigram_counts$all
#top 10 most common bigrams
freq_blogs_bigram_top_10 <- blogs_bigram_counts$top
#plot 10 most frequent bigrams in blogs
plot_top_ngrams(freq_blogs_bigram_top_10,
                "Top 10 Most Frequent bigrams in Blogs Dataset")

#create a trigram of blogs
blogs_trigram <- tokens_ngrams(blogsToken, n = 3)
blogs_trigram_counts <- count_ngrams(blogs_trigram)
#all trigram frequencies as a data frame
blogs_trigram_df <- blogs_trigram_counts$all
#top 10 most common trigrams
freq_blogs_trigram_top_10 <- blogs_trigram_counts$top
#plot 10 most frequent trigrams in blogs
plot_top_ngrams(freq_blogs_trigram_top_10,
                "Top 10 Most Frequent Trigrams in Blogs Dataset")

#create a bigram of twitter
twitter_bigram <- tokens_ngrams(twitterToken, n = 2)
twitter_bigram_counts <- count_ngrams(twitter_bigram)
#all bigram frequencies as a data frame
twitter_bigram_df <- twitter_bigram_counts$all
#top 10 most common bigrams
freq_twitter_bigram_top_10 <- twitter_bigram_counts$top
#plot 10 most frequent bigrams in twitter
plot_top_ngrams(freq_twitter_bigram_top_10,
                "Top 10 Most Frequent bigrams in Twitter Dataset")

#create a trigram of twitter
twitter_trigram <- tokens_ngrams(twitterToken, n = 3)
twitter_trigram_counts <- count_ngrams(twitter_trigram)
#all trigram frequencies as a data frame
twitter_trigram_df <- twitter_trigram_counts$all
#top 10 most common trigrams
freq_twitter_trigram_top_10 <- twitter_trigram_counts$top
#plot 10 most frequent trigrams in twitter
plot_top_ngrams(freq_twitter_trigram_top_10,
                "Top 10 Most Frequent Trigrams in Twitter Dataset")

Plans for the Prediction Algorithm and Shiny App.

For the prediction algorithm, I plan to use n-grams of lengths 2 through 5, extending the bigrams and trigrams above with 4-grams and 5-grams. I plan to incorporate Katz’s back-off model into my prediction algorithm. I will create the Shiny app to take a word input from the user and output a predicted word.