#Load Packages
library(dplyr)
library(readr)
library(tokenizers)
library(tidyverse)
library(tidytext)
library(tm)
library(quanteda)
library(wordcloud)
library(RColorBrewer)
# Load the three raw en_US corpora. Each result is a character vector with one
# element per line of the file, marked as UTF-8.
read_utf8_lines <- function(path) {
  readLines(con = path, encoding = "UTF-8")
}
news <- read_utf8_lines("SwiftKey Dataset/en_US.news.txt")
blogs <- read_utf8_lines("SwiftKey Dataset/en_US.blogs.txt")
twitter <- read_utf8_lines("SwiftKey Dataset/en_US.twitter.txt")
# Summary table of the raw corpora: in-memory object size, number of
# whitespace-delimited words, and number of lines.
# vapply() is used instead of sapply() so each column has a guaranteed type
# and length regardless of the input.
# data.frame() rewrites the quoted column names to File.Size, Word.Count and
# Line.Count (check.names = TRUE is the default).
corpora <- list(news, blogs, twitter)
summary_df <- data.frame(
  "File" = c("News", "Blogs", "Twitter"),
  "File Size" = vapply(
    corpora,
    function(x) format(object.size(x), units = "auto"),
    character(1)
  ),
  "Word Count" = vapply(
    corpora,
    function(x) sum(str_count(x, "\\S+")),
    integer(1)
  ),
  "Line Count" = vapply(corpora, length, integer(1))
)
summary_df
## File File.Size Word.Count Line.Count
## 1 News 257.3 Mb 34372529 1010242
## 2 Blogs 255.4 Mb 37334131 899288
## 3 Twitter 319 Mb 30373543 2360148
#Profanity Filtering
profanity <- readLines("profanityList.txt", encoding = "UTF-8")
# Ignore blank / whitespace-only entries: they would add an empty (or
# whitespace) alternative to the alternation below.
profanityTerms <- profanity[nzchar(trimws(profanity))]
# Whole-word alternation of every listed term.
profanityPattern <- str_c("\\b(", str_flatten(profanityTerms, "|"), ")\\b")
# Build the case-insensitive regex once and reuse it for all three corpora.
profanityRegex <- regex(profanityPattern, ignore_case = TRUE)

# Return the profanity-filtered lines for one corpus.
#   cache_path: file holding previously cleaned lines; used when it exists.
#   raw_lines:  raw corpus lines, filtered in memory when there is no cache.
# The script previously always re-read the cache files (which it never
# writes), discarding the in-memory result and failing on a fresh checkout.
load_or_clean <- function(cache_path, raw_lines) {
  if (file.exists(cache_path)) {
    readLines(cache_path, encoding = "UTF-8")
  } else {
    str_replace_all(raw_lines, profanityRegex, "")
  }
}

newsClean <- load_or_clean("newsClean.txt", news)
blogsClean <- load_or_clean("blogsClean.txt", blogs)
twitterClean <- load_or_clean("twitterClean.txt", twitter)

# Draw a 5% random sample of lines from each cleaned corpus. The seed is set
# once and the draws happen in the order news, blogs, twitter.
set.seed(1234)
sampleSizeNews <- round(length(newsClean) * 0.05)
newsSample <- sample(newsClean, size = sampleSizeNews)
sampleSizeBlogs <- round(length(blogsClean) * 0.05)
blogsSample <- sample(blogsClean, size = sampleSizeBlogs)
sampleSizeTwitter <- round(length(twitterClean) * 0.05)
twitterSample <- sample(twitterClean, size = sampleSizeTwitter)
#News Dataset Cleaning
set.seed(1234)
# Tokenise the news sample and normalise it in one pipeline:
#   1. tokenise, dropping punctuation, numbers and symbols
#   2. lowercase every token
#   3. keep only tokens made up entirely of the letters a-z / A-Z
#   4. split tokens on any run of the same letter repeated 3 or more times
#   5. split tokens on isolated single word characters
#   6. remove English stopwords
newsToken <- newsSample %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_tolower() %>%
  tokens_keep(pattern = "^[a-zA-Z]+$", valuetype = "regex") %>%
  tokens_split(separator = "([[:alpha:]])\\1{2,}", valuetype = "regex") %>%
  tokens_split(separator = "\\W*\\b\\w\\b\\W*", valuetype = "regex") %>%
  tokens_remove(pattern = stopwords("en"))
# Frequency table of all remaining tokens (columns Var1 and Freq).
newsFreq_df <- as.data.frame(table(unlist(newsToken)))
# Word cloud limited to 100 words, drawn in decreasing order of frequency.
wordcloud(
  words = newsFreq_df$Var1,
  freq = newsFreq_df$Freq,
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2"),
  scale = c(1, .8)
)
#Blogs Dataset Cleaning
# Tokenise the blogs sample and normalise it in one pipeline:
#   1. tokenise, dropping punctuation, numbers and symbols
#   2. lowercase every token
#   3. keep only tokens made up entirely of the letters a-z / A-Z
#   4. split tokens on any run of the same letter repeated 3 or more times
#   5. split tokens on isolated single word characters
#   6. remove English stopwords
blogsToken <- blogsSample %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_tolower() %>%
  tokens_keep(pattern = "^[a-zA-Z]+$", valuetype = "regex") %>%
  tokens_split(separator = "([[:alpha:]])\\1{2,}", valuetype = "regex") %>%
  tokens_split(separator = "\\W*\\b\\w\\b\\W*", valuetype = "regex") %>%
  tokens_remove(pattern = stopwords("en"))
# Frequency table of all remaining tokens (columns Var1 and Freq).
blogsFreq_df <- as.data.frame(table(unlist(blogsToken)))
# Word cloud limited to 100 words, drawn in decreasing order of frequency.
wordcloud(
  words = blogsFreq_df$Var1,
  freq = blogsFreq_df$Freq,
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2"),
  scale = c(1, .8)
)
#Twitter Dataset Cleaning
# Tokenise the twitter sample and normalise it in one pipeline:
#   1. tokenise, dropping punctuation, numbers and symbols
#   2. lowercase every token
#   3. keep only tokens made up entirely of the letters a-z / A-Z
#   4. split tokens on any run of the same letter repeated 3 or more times
#   5. split tokens on isolated single word characters
#   6. remove English stopwords
# Step 4 now runs before step 5, the same order used for the News and Blogs
# datasets. With the previous order, a single-letter fragment left behind by
# the repeated-letter split was never seen by the single-letter pass.
twitterToken <- twitterSample %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_tolower() %>%
  tokens_keep(pattern = "^[a-zA-Z]+$", valuetype = "regex") %>%
  tokens_split(separator = "([[:alpha:]])\\1{2,}", valuetype = "regex") %>%
  tokens_split(separator = "\\W*\\b\\w\\b\\W*", valuetype = "regex") %>%
  tokens_remove(pattern = stopwords("en"))
# Frequency table of all remaining tokens (columns Var1 and Freq).
twitterFreq_df <- as.data.frame(table(unlist(twitterToken)))
# Word cloud limited to 100 words, drawn in decreasing order of frequency.
wordcloud(
  words = twitterFreq_df$Var1,
  freq = twitterFreq_df$Freq,
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2"),
  scale = c(1, .8)
)
#create a bigram of news
news_bigram <- tokens_ngrams(newsToken, n = 2L)
# Tabulate the bigrams once and reuse the table for both the full frequency
# data frame and the top-10 subset.
news_bigram_counts <- table(unlist(news_bigram))
news_bigram_df <- as.data.frame(news_bigram_counts)
news_bigram_sorted <- sort(news_bigram_counts, decreasing = TRUE)
# Take at most 10 entries; indexing with 1:10 would pad the result with NA
# rows when fewer than 10 distinct bigrams exist.
news_bigram_top_n <- min(10L, length(news_bigram_sorted))
freq_news_bigram_top_10 <- as.data.frame(
  news_bigram_sorted[seq_len(news_bigram_top_n)]
)
#plot 10 most frequent bigrams in news
par(mar = c(7, 4, 2, 2) + 0.2) #add room for the rotated labels
# With width 1 and space = 1 the bar midpoints fall at 1.5, 3.5, ...;
# end_point is the midpoint of the last bar.
end_point <- 0.5 + nrow(freq_news_bigram_top_10) +
  nrow(freq_news_bigram_top_10) - 1
barplot(
  height = freq_news_bigram_top_10$Freq,
  names.arg = freq_news_bigram_top_10$Var1,
  main = "Top 10 Most Frequent bigrams in News Dataset",
  xlab = "",
  xaxt = "n", # Do not plot the default labels
  space = 1
)
#rotate 60 degrees and add rotated labels just below the x axis
text(
  seq(1.5, end_point, by = 2), par("usr")[3] - 0.25,
  srt = 60, adj = 1, xpd = TRUE,
  labels = as.character(freq_news_bigram_top_10$Var1), cex = 0.65
)
#create a trigram of news
news_trigram <- tokens_ngrams(newsToken, n = 3L)
# Tabulate the trigrams once and reuse the table for both the full frequency
# data frame and the top-10 subset.
news_trigram_counts <- table(unlist(news_trigram))
news_trigram_df <- as.data.frame(news_trigram_counts)
news_trigram_sorted <- sort(news_trigram_counts, decreasing = TRUE)
# Take at most 10 entries; indexing with 1:10 would pad the result with NA
# rows when fewer than 10 distinct trigrams exist.
news_trigram_top_n <- min(10L, length(news_trigram_sorted))
freq_news_trigram_top_10 <- as.data.frame(
  news_trigram_sorted[seq_len(news_trigram_top_n)]
)
#plot 10 most frequent trigrams in news
par(mar = c(7, 4, 2, 2) + 0.2) #add room for the rotated labels
# With width 1 and space = 1 the bar midpoints fall at 1.5, 3.5, ...;
# end_point is the midpoint of the last bar.
end_point <- 0.5 + nrow(freq_news_trigram_top_10) +
  nrow(freq_news_trigram_top_10) - 1
barplot(
  height = freq_news_trigram_top_10$Freq,
  names.arg = freq_news_trigram_top_10$Var1,
  main = "Top 10 Most Frequent Trigrams in News Dataset",
  xlab = "",
  xaxt = "n", # Do not plot the default labels
  space = 1
)
#rotate 60 degrees and add rotated labels just below the x axis
text(
  seq(1.5, end_point, by = 2), par("usr")[3] - 0.25,
  srt = 60, adj = 1, xpd = TRUE,
  labels = as.character(freq_news_trigram_top_10$Var1), cex = 0.65
)
#create a bigram of blogs
blogs_bigram <- tokens_ngrams(blogsToken, n = 2L)
# Tabulate the bigrams once and reuse the table for both the full frequency
# data frame and the top-10 subset.
blogs_bigram_counts <- table(unlist(blogs_bigram))
blogs_bigram_df <- as.data.frame(blogs_bigram_counts)
blogs_bigram_sorted <- sort(blogs_bigram_counts, decreasing = TRUE)
# Take at most 10 entries; indexing with 1:10 would pad the result with NA
# rows when fewer than 10 distinct bigrams exist.
blogs_bigram_top_n <- min(10L, length(blogs_bigram_sorted))
freq_blogs_bigram_top_10 <- as.data.frame(
  blogs_bigram_sorted[seq_len(blogs_bigram_top_n)]
)
#plot 10 most frequent bigrams in blogs
par(mar = c(7, 4, 2, 2) + 0.2) #add room for the rotated labels
# With width 1 and space = 1 the bar midpoints fall at 1.5, 3.5, ...;
# end_point is the midpoint of the last bar.
end_point <- 0.5 + nrow(freq_blogs_bigram_top_10) +
  nrow(freq_blogs_bigram_top_10) - 1
barplot(
  height = freq_blogs_bigram_top_10$Freq,
  names.arg = freq_blogs_bigram_top_10$Var1,
  main = "Top 10 Most Frequent bigrams in Blogs Dataset",
  xlab = "",
  xaxt = "n", # Do not plot the default labels
  space = 1
)
#rotate 60 degrees and add rotated labels just below the x axis
text(
  seq(1.5, end_point, by = 2), par("usr")[3] - 0.25,
  srt = 60, adj = 1, xpd = TRUE,
  labels = as.character(freq_blogs_bigram_top_10$Var1), cex = 0.65
)
#create a trigram of blogs
blogs_trigram <- tokens_ngrams(blogsToken, n = 3L)
# Tabulate the trigrams once and reuse the table for both the full frequency
# data frame and the top-10 subset.
blogs_trigram_counts <- table(unlist(blogs_trigram))
blogs_trigram_df <- as.data.frame(blogs_trigram_counts)
blogs_trigram_sorted <- sort(blogs_trigram_counts, decreasing = TRUE)
# Take at most 10 entries; indexing with 1:10 would pad the result with NA
# rows when fewer than 10 distinct trigrams exist.
blogs_trigram_top_n <- min(10L, length(blogs_trigram_sorted))
freq_blogs_trigram_top_10 <- as.data.frame(
  blogs_trigram_sorted[seq_len(blogs_trigram_top_n)]
)
#plot 10 most frequent trigrams in blogs
par(mar = c(7, 4, 2, 2) + 0.2) #add room for the rotated labels
# With width 1 and space = 1 the bar midpoints fall at 1.5, 3.5, ...;
# end_point is the midpoint of the last bar.
end_point <- 0.5 + nrow(freq_blogs_trigram_top_10) +
  nrow(freq_blogs_trigram_top_10) - 1
barplot(
  height = freq_blogs_trigram_top_10$Freq,
  names.arg = freq_blogs_trigram_top_10$Var1,
  main = "Top 10 Most Frequent Trigrams in Blogs Dataset",
  xlab = "",
  xaxt = "n", # Do not plot the default labels
  space = 1
)
#rotate 60 degrees and add rotated labels just below the x axis
text(
  seq(1.5, end_point, by = 2), par("usr")[3] - 0.25,
  srt = 60, adj = 1, xpd = TRUE,
  labels = as.character(freq_blogs_trigram_top_10$Var1), cex = 0.65
)
#create a bigram of twitter
twitter_bigram <- tokens_ngrams(twitterToken, n = 2L)
# Tabulate the bigrams once and reuse the table for both the full frequency
# data frame and the top-10 subset.
twitter_bigram_counts <- table(unlist(twitter_bigram))
twitter_bigram_df <- as.data.frame(twitter_bigram_counts)
twitter_bigram_sorted <- sort(twitter_bigram_counts, decreasing = TRUE)
# Take at most 10 entries; indexing with 1:10 would pad the result with NA
# rows when fewer than 10 distinct bigrams exist.
twitter_bigram_top_n <- min(10L, length(twitter_bigram_sorted))
freq_twitter_bigram_top_10 <- as.data.frame(
  twitter_bigram_sorted[seq_len(twitter_bigram_top_n)]
)
#plot 10 most frequent bigrams in twitter
par(mar = c(7, 4, 2, 2) + 0.2) #add room for the rotated labels
# With width 1 and space = 1 the bar midpoints fall at 1.5, 3.5, ...;
# end_point is the midpoint of the last bar.
end_point <- 0.5 + nrow(freq_twitter_bigram_top_10) +
  nrow(freq_twitter_bigram_top_10) - 1
barplot(
  height = freq_twitter_bigram_top_10$Freq,
  names.arg = freq_twitter_bigram_top_10$Var1,
  main = "Top 10 Most Frequent bigrams in Twitter Dataset",
  xlab = "",
  xaxt = "n", # Do not plot the default labels
  space = 1
)
#rotate 60 degrees and add rotated labels just below the x axis
text(
  seq(1.5, end_point, by = 2), par("usr")[3] - 0.25,
  srt = 60, adj = 1, xpd = TRUE,
  labels = as.character(freq_twitter_bigram_top_10$Var1), cex = 0.65
)
#create a trigram of twitter
twitter_trigram <- tokens_ngrams(twitterToken, n = 3L)
# Tabulate the trigrams once and reuse the table for both the full frequency
# data frame and the top-10 subset.
twitter_trigram_counts <- table(unlist(twitter_trigram))
twitter_trigram_df <- as.data.frame(twitter_trigram_counts)
twitter_trigram_sorted <- sort(twitter_trigram_counts, decreasing = TRUE)
# Take at most 10 entries; indexing with 1:10 would pad the result with NA
# rows when fewer than 10 distinct trigrams exist.
twitter_trigram_top_n <- min(10L, length(twitter_trigram_sorted))
freq_twitter_trigram_top_10 <- as.data.frame(
  twitter_trigram_sorted[seq_len(twitter_trigram_top_n)]
)
#plot 10 most frequent trigrams in twitter
par(mar = c(7, 4, 2, 2) + 0.2) #add room for the rotated labels
# With width 1 and space = 1 the bar midpoints fall at 1.5, 3.5, ...;
# end_point is the midpoint of the last bar.
end_point <- 0.5 + nrow(freq_twitter_trigram_top_10) +
  nrow(freq_twitter_trigram_top_10) - 1
barplot(
  height = freq_twitter_trigram_top_10$Freq,
  names.arg = freq_twitter_trigram_top_10$Var1,
  main = "Top 10 Most Frequent Trigrams in Twitter Dataset",
  xlab = "",
  xaxt = "n", # Do not plot the default labels
  space = 1
)
#rotate 60 degrees and add rotated labels just below the x axis
text(
  seq(1.5, end_point, by = 2), par("usr")[3] - 0.25,
  srt = 60, adj = 1, xpd = TRUE,
  labels = as.character(freq_twitter_trigram_top_10$Var1), cex = 0.65
)