Milestone Report

#Load Packages
library(dplyr)
library(readr)
library(tokenizers)
library(tidyverse)
library(tidytext)
library(tm)
library(quanteda)
library(wordcloud)
library(RColorBrewer)

Load Data

# Read a text corpus into a character vector, one element per line.
#
# The file is read through a connection opened in binary mode ("rb") so the
# result does not depend on platform-specific text-mode handling of control
# characters in the raw text. skipNul = TRUE drops embedded NUL bytes;
# without it readLines() ends the affected line at the NUL and warns.
#
# path: path to a UTF-8 encoded text file.
# Returns a character vector with one element per line of the file.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

news <- read_corpus("SwiftKey Dataset/en_US.news.txt")
blogs <- read_corpus("SwiftKey Dataset/en_US.blogs.txt")
twitter <- read_corpus("SwiftKey Dataset/en_US.twitter.txt")

Summary Statistics

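A basic summary of the data shows the file size, word count, and line count for the News, Blogs, and Twitter datasets. Note that the "File Size" column is the in-memory size of each loaded dataset as reported by object.size(), not the size of the file on disk.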

# Summary table for the three corpora: size, word count and line count.
# NOTE: object.size() measures the memory used by the loaded character
# vector, so "File Size" is the in-memory size rather than the size on disk.
corpora <- list(news, blogs, twitter)
summary_df <- data.frame(
  "File" = c("News", "Blogs", "Twitter"),
  "File Size" = vapply(
    corpora,
    function(txt) format(object.size(txt), units = "auto"),
    character(1)
  ),
  # a "word" is counted as any maximal run of non-whitespace characters
  "Word Count" = vapply(
    corpora,
    function(txt) sum(str_count(txt, "\\S+")),
    integer(1)
  ),
  "Line Count" = vapply(corpora, length, integer(1))
)
summary_df

##      File File.Size Word.Count Line.Count
## 1    News  257.3 Mb   34372529    1010242
## 2   Blogs  255.4 Mb   37334131     899288
## 3 Twitter    319 Mb   30373543    2360148

Profanity Filtering

Before we begin to clean the data, we run a profanity filter on the three datasets. The filter uses the list of words found at https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en and removes every instance of these words from all three datasets.

# Profanity filtering
# Build one case-insensitive regex from the word list and delete every
# whole-word match from each corpus.
profanity <- readLines("profanityList.txt", encoding = "UTF-8")
# Strip surrounding whitespace and drop blank lines: a blank entry would add
# an empty alternative to the pattern.
profanity <- str_trim(profanity)
profanity <- profanity[nzchar(profanity)]
# Escape regex metacharacters so that every entry is matched literally
# instead of being interpreted as regex syntax.
profanityEscaped <- str_replace_all(
  profanity, "([.^$\\\\|*+?{}\\[\\]()])", "\\\\\\1"
)
profanityPattern <- str_c("\\b(", str_flatten(profanityEscaped, "|"), ")\\b")
# Build the regex object once and reuse it for all three corpora.
profanityRegex <- regex(profanityPattern, ignore_case = TRUE)

# Delete every match of `pattern` from a character vector of lines.
strip_profanity <- function(lines, pattern) {
  str_replace_all(lines, pattern, "")
}

newsClean <- strip_profanity(news, profanityRegex)
blogsClean <- strip_profanity(blogs, profanityRegex)
twitterClean <- strip_profanity(twitter, profanityRegex)

Sampling

Since all three files are so large, we take a random sample of each one to conduct further analysis. Here I chose to sample 5% of the lines in each dataset.

set.seed(1234)
# Fraction of lines drawn from each cleaned corpus.
sampleFraction <- 0.05

# For each corpus: use the cleaned lines cached on disk when the cache file
# exists; otherwise keep the cleaned lines already held in memory from the
# profanity-filtering step instead of failing on the missing file.
if (file.exists("newsClean.txt")) {
  newsClean <- readLines("newsClean.txt", encoding = "UTF-8")
}
sampleSizeNews <- round(length(newsClean) * sampleFraction)
newsSample <- sample(newsClean, size = sampleSizeNews)

if (file.exists("blogsClean.txt")) {
  blogsClean <- readLines("blogsClean.txt", encoding = "UTF-8")
}
sampleSizeBlogs <- round(length(blogsClean) * sampleFraction)
blogsSample <- sample(blogsClean, size = sampleSizeBlogs)

if (file.exists("twitterClean.txt")) {
  twitterClean <- readLines("twitterClean.txt", encoding = "UTF-8")
}
sampleSizeTwitter <- round(length(twitterClean) * sampleFraction)
twitterSample <- sample(twitterClean, size = sampleSizeTwitter)

Data Cleaning and Wordclouds

Here we start cleaning the data for further analysis. First we tokenize each dataset, since the cleaning steps operate on tokens. Then the following steps are performed on all three datasets:

Convert all words to lowercase
Remove any characters not in the English alphabet (this includes symbols and numbers)
Remove instances of characters repeated 3 or more times
Remove instances of single letters
Remove Stopwords

# Tokenise and clean a character vector of text lines.
#
# The same steps are applied, in the same order, to every dataset:
#   1. tokenise, dropping punctuation, numbers and symbols
#   2. convert to lowercase
#   3. keep only tokens made up entirely of letters a-z
#   4. split tokens at runs of 3 or more identical letters
#      NOTE(review): tokens_split() splits at the matched run rather than
#      dropping the whole token - confirm this is the intended handling
#   5. apply the single-letter pattern (meant to remove one-letter tokens,
#      including one-letter fragments left behind by step 4)
#   6. remove English stopwords
#
# text: character vector of lines.
# Returns a quanteda tokens object.
clean_tokens <- function(text) {
  toks <- tokens(text, remove_punct = TRUE, remove_numbers = TRUE,
                 remove_symbols = TRUE)
  toks <- tokens_tolower(toks)
  toks <- tokens_keep(toks, pattern = "^[a-zA-Z]+$", valuetype = "regex")
  toks <- tokens_split(toks, separator = "([[:alpha:]])\\1{2,}",
                       valuetype = "regex")
  toks <- tokens_split(toks, separator = "\\W*\\b\\w\\b\\W*",
                       valuetype = "regex")
  tokens_remove(toks, pattern = stopwords("en"))
}

# Count how often each token occurs; returns a data frame with the columns
# Var1 (the token) and Freq (its count).
token_frequencies <- function(toks) {
  as.data.frame(table(unlist(toks)))
}

# Draw a wordcloud of (at most) the 100 most frequent words in `freq_df`,
# a data frame with columns Var1 and Freq.
plot_wordcloud <- function(freq_df) {
  wordcloud(words = freq_df$Var1, freq = freq_df$Freq, min.freq = 1,
            max.words = 100, random.order = FALSE, rot.per = 0.35,
            colors = brewer.pal(8, "Dark2"), scale = c(1, .8))
  invisible(NULL)
}

# Seed once so the wordcloud layouts are reproducible.
set.seed(1234)

# News dataset: clean, count, and plot the wordcloud
newsToken <- clean_tokens(newsSample)
newsFreq_df <- token_frequencies(newsToken)
plot_wordcloud(newsFreq_df)

# Blogs dataset: clean, count, and plot the wordcloud
blogsToken <- clean_tokens(blogsSample)
blogsFreq_df <- token_frequencies(blogsToken)
plot_wordcloud(blogsFreq_df)

# Twitter dataset: clean, count, and plot the wordcloud
twitterToken <- clean_tokens(twitterSample)
twitterFreq_df <- token_frequencies(twitterToken)
plot_wordcloud(twitterFreq_df)

Exploratory Analysis

Here we use an n-gram model to find the most frequent bigrams and trigrams in each of the three datasets.

# Build and tabulate the n-grams of a tokens object.
#
# toks:  quanteda tokens object.
# n:     n-gram length passed to tokens_ngrams().
# top_n: how many of the most frequent n-grams to keep in `top`.
# Returns a list with
#   ngrams: the result of tokens_ngrams()
#   counts: data frame (Var1, Freq) with the count of every n-gram
#   top:    data frame (Var1, Freq) with the `top_n` most frequent n-grams
count_ngrams <- function(toks, n, top_n = 10L) {
  grams <- tokens_ngrams(toks, n = n)
  # tabulate once and reuse the table for both data frames
  gram_table <- table(unlist(grams))
  ranked <- sort(gram_table, decreasing = TRUE)
  # cap at the number of distinct n-grams so short inputs give no NA rows
  top <- ranked[seq_len(min(top_n, length(ranked)))]
  list(
    ngrams = grams,
    counts = as.data.frame(gram_table),
    top = as.data.frame(top)
  )
}

# Bar plot of the n-gram counts in `top_df` (columns Var1 and Freq) with the
# n-gram labels rotated 60 degrees below the bars.
plot_top_ngrams <- function(top_df, title) {
  # add room for the rotated labels; restore the margins afterwards
  old_par <- par(mar = c(7, 4, 2, 2) + 0.2)
  on.exit(par(old_par), add = TRUE)
  # barplot() returns the bar midpoints, used below to place the labels
  bar_mids <- barplot(height = top_df$Freq, names.arg = top_df$Var1,
                      main = title,
                      xlab = "",
                      xaxt = "n", # do not plot the default labels
                      space = 1)
  # rotate 60 degrees and add rotated labels
  text(bar_mids, par("usr")[3] - 0.25,
       srt = 60, adj = 1, xpd = TRUE,
       labels = paste(top_df$Var1), cex = 0.65)
  invisible(bar_mids)
}

# News: bigrams
ngram_result <- count_ngrams(newsToken, n = 2L)
news_bigram <- ngram_result$ngrams
news_bigram_df <- ngram_result$counts
freq_news_bigram_top_10 <- ngram_result$top
plot_top_ngrams(freq_news_bigram_top_10,
                "Top 10 Most Frequent bigrams in News Dataset")

# News: trigrams
ngram_result <- count_ngrams(newsToken, n = 3L)
news_trigram <- ngram_result$ngrams
news_trigram_df <- ngram_result$counts
freq_news_trigram_top_10 <- ngram_result$top
plot_top_ngrams(freq_news_trigram_top_10,
                "Top 10 Most Frequent Trigrams in News Dataset")

# Blogs: bigrams
ngram_result <- count_ngrams(blogsToken, n = 2L)
blogs_bigram <- ngram_result$ngrams
blogs_bigram_df <- ngram_result$counts
freq_blogs_bigram_top_10 <- ngram_result$top
plot_top_ngrams(freq_blogs_bigram_top_10,
                "Top 10 Most Frequent bigrams in Blogs Dataset")

# Blogs: trigrams
ngram_result <- count_ngrams(blogsToken, n = 3L)
blogs_trigram <- ngram_result$ngrams
blogs_trigram_df <- ngram_result$counts
freq_blogs_trigram_top_10 <- ngram_result$top
plot_top_ngrams(freq_blogs_trigram_top_10,
                "Top 10 Most Frequent Trigrams in Blogs Dataset")

# Twitter: bigrams
ngram_result <- count_ngrams(twitterToken, n = 2L)
twitter_bigram <- ngram_result$ngrams
twitter_bigram_df <- ngram_result$counts
freq_twitter_bigram_top_10 <- ngram_result$top
plot_top_ngrams(freq_twitter_bigram_top_10,
                "Top 10 Most Frequent bigrams in Twitter Dataset")

# Twitter: trigrams
ngram_result <- count_ngrams(twitterToken, n = 3L)
twitter_trigram <- ngram_result$ngrams
twitter_trigram_df <- ngram_result$counts
freq_twitter_trigram_top_10 <- ngram_result$top
plot_top_ngrams(freq_twitter_trigram_top_10,
                "Top 10 Most Frequent Trigrams in Twitter Dataset")

Plans for the Prediction Algorithm and Shiny App.

For the prediction algorithm, I plan to use n-grams of lengths 2 through 5, extending the bigrams and trigrams explored above with 4-grams and 5-grams. I plan to incorporate Katz’s back-off model into my prediction algorithm. I will create the Shiny app to take a word input from the user and output a predicted word.

Milestone Report

Marager

2022-12-19

Load Data

Summary Statistics

A basic summary of the data shows the file size, word count, and line count for the News, Blogs, and Twitter datasets.

Profanity Filtering

Sampling

Since all three files are so large, we take a random sample of each one to conduct further analysis. Here I chose to sample 5% of the lines in each dataset.

Data Cleaning and Wordclouds

Here we start cleaning the data for further analysis. First we tokenize each dataset, since the cleaning steps operate on tokens. Then the following steps are performed on all three datasets:

Exploratory Analysis

Here we use an n-gram model to find the most frequent bigrams and trigrams in each of the three datasets.

Plans for the Prediction Algorithm and Shiny App.

For the prediction algorithm, I plan to use n-grams of lengths 2 through 5, extending the bigrams and trigrams explored above with 4-grams and 5-grams. I plan to incorporate Katz’s back-off model into my prediction algorithm. I will create the Shiny app to take a word input from the user and output a predicted word.