To understand the relationships and distributions of words in the data set, we first perform an exploratory data analysis.
library(quanteda)
library(readtext)
library(stringi)
library(ggplot2)
library(cowplot)
library(reshape2)
library(dplyr)
if(!file.exists(file.path(getwd(), 'data', 'final', 'en_US', 'en_US.blogs.txt'))){
  # create the data directory if necessary, then download and unzip the corpus
  dir.create(file.path(getwd(), 'data'), showWarnings = FALSE)
  download.file('https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip',
                destfile = file.path(getwd(), 'data', 'Coursera-SwiftKey.zip'),
                method = 'curl', quiet = TRUE)
  unzip(file.path(getwd(), 'data', 'Coursera-SwiftKey.zip'), exdir = file.path(getwd(), 'data'))
}
blogs <- readtext(file.path(getwd(), 'data', 'final', 'en_US' ,'en_US.blogs.txt'))
news <- readtext(file.path(getwd(), 'data', 'final', 'en_US' ,'en_US.news.txt'))
tweets <- readtext(file.path(getwd(), 'data', 'final', 'en_US', 'en_US.twitter.txt'))
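These files are large, so before any heavy processing it is worth a quick look at their in-memory size (a sketch using base R's object.size(); exact numbers vary by platform):
sapply(list(blogs = blogs, news = news, tweets = tweets),
       function(x) format(object.size(x), units = 'Mb'))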
lines <- data.frame('source' = c('blogs', 'news', 'tweets'),
                    'lines' = c(stri_count_fixed(blogs, '\n'),
                                stri_count_fixed(news, '\n'),
                                stri_count_fixed(tweets, '\n')))
lines
## source lines
## 1 blogs 899287
## 2 news 1010241
## 3 tweets 2360147
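Line counts alone say little about how much text each source holds; stri_stats_general() from stringi adds character-level statistics (a sketch using the readtext objects above, whose text lives in the $text column):
# per-source counts of lines, non-empty lines, characters, and non-whitespace characters
rbind(blogs  = stri_stats_general(blogs$text),
      news   = stri_stats_general(news$text),
      tweets = stri_stats_general(tweets$text))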
blog_corpus <- corpus(blogs)
docvars(blog_corpus, 'Source') <- 'blogs'
news_corpus <- corpus(news)
docvars(news_corpus, 'Source') <- 'news'
twitter_corpus <- corpus(tweets)
docvars(twitter_corpus, 'Source') <- 'tweets'
# unfortunately summary() crashes repeatedly on the combined corpus, so we
# show per-document summaries below
# combine all three sources into a single corpus
#combined_corpus <- blog_corpus + news_corpus + twitter_corpus
combined_corpus <- c(blog_corpus, news_corpus, twitter_corpus)
# remove the raw text objects to free memory
#rm(blogs, blog_corpus, news, news_corpus, tweets, twitter_corpus)
rm(blogs, news, tweets)
corpus_summary <- data.frame()
for (i in seq_along(combined_corpus)){
  corpus_summary <- rbind(corpus_summary, summary(combined_corpus[i]))
}
corpus_summary
## Corpus consisting of 1 document, showing 1 document:
##
## Text Types Tokens Sentences Source
## en_US.blogs.txt 488731 42829003 2072623 blogs
## en_US.news.txt 436848 39913287 1869517 news
## en_US.twitter.txt 563897 36726231 2580555 tweets
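The summary also lets us compare sentence length across sources; for example, the average number of tokens per sentence can be derived directly from it (a one-line sketch):
# average tokens per sentence by source
corpus_summary$Tokens / corpus_summary$Sentences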
word_plot <- ggplot(data = corpus_summary, aes(x = Source, y = Tokens, fill = Source)) +
  geom_col() +
  guides(fill = 'none') +
  scale_y_continuous(expand = c(0, 0)) +
  ylab('Word counts')
sentence_plot <- ggplot(data = corpus_summary, aes(x = Source, y = Sentences, fill = Source)) +
  geom_col() +
  scale_y_continuous(expand = c(0, 0)) +
  guides(fill = 'none')
plot_grid(word_plot, sentence_plot, labels = 'AUTO')
In the pre-processing step we tokenise the corpus into words and sentences and remove profanity.
tokenised_word_list <- list()
for(i in seq_along(combined_corpus)){
  tokenised_word_list[[i]] <- tokens(combined_corpus[i], what = 'word',
                                     remove_numbers = TRUE, remove_punct = TRUE,
                                     remove_symbols = TRUE, remove_separators = TRUE,
                                     remove_url = TRUE)
}
#profanity_url <- 'https://github.com/dsojevic/profanity-list/blob/main/en.txt'
profanity_url <- 'https://raw.githubusercontent.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en'
profanity_file_name <- file.path(getwd(), 'profanity.txt')
if(!file.exists(profanity_file_name)){
  download.file(profanity_url, profanity_file_name)
}
profanity <- readLines(profanity_file_name)
for(i in seq_along(tokenised_word_list)){
  tokenised_word_list[[i]] <- tokens_remove(tokenised_word_list[[i]], pattern = profanity)
}
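As a spot check that the filter worked, none of the profanity entries should survive as a token type (a sketch; types() returns the unique tokens of a tokens object):
# expect FALSE for each source, e.g. for the blogs tokens
any(profanity %in% types(tokenised_word_list[[1]]))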
# summary() on a tokens object reports Length/Class/Mode per document;
# Length is the token count
word_summary <- data.frame()
for(i in seq_along(tokenised_word_list)){
  word_summary <- rbind(word_summary, summary(tokenised_word_list[[i]]))
}
word_summary
## Var1 Var2 Freq
## 1 en_US.blogs.txt Length 36901924
## 2 en_US.blogs.txt Class -none-
## 3 en_US.blogs.txt Mode character
## 4 en_US.news.txt Length 33542726
## 5 en_US.news.txt Class -none-
## 6 en_US.news.txt Mode character
## 7 en_US.twitter.txt Length 29474639
## 8 en_US.twitter.txt Class -none-
## 9 en_US.twitter.txt Mode character
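The Length rows above are the post-filtering token counts; quanteda's ntoken() should reproduce the same numbers more directly (a sketch):
# total tokens per source
sapply(tokenised_word_list, function(toks) sum(ntoken(toks)))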
tokenised_sentence_list <- list()
for(i in seq_along(combined_corpus)){
  tokenised_sentence_list[[i]] <- tokens(combined_corpus[i], what = 'sentence',
                                         remove_numbers = TRUE, remove_punct = TRUE,
                                         remove_symbols = TRUE, remove_separators = TRUE,
                                         remove_url = TRUE)
}
# the profanity list was already downloaded and loaded above, so we reuse it here
for(i in seq_along(tokenised_sentence_list)){
  # note: with sentence tokens, tokens_remove() only drops sentences that
  # consist entirely of a single profane word, not sentences that contain one
  tokenised_sentence_list[[i]] <- tokens_remove(tokenised_sentence_list[[i]], pattern = profanity)
}
sentence_summary <- data.frame()
for(i in seq_along(tokenised_sentence_list)){
  sentence_summary <- rbind(sentence_summary, summary(tokenised_sentence_list[[i]]))
}
sentence_summary
## Var1 Var2 Freq
## 1 en_US.blogs.txt Length 2072537
## 2 en_US.blogs.txt Class -none-
## 3 en_US.blogs.txt Mode character
## 4 en_US.news.txt Length 1869516
## 5 en_US.news.txt Class -none-
## 6 en_US.news.txt Mode character
## 7 en_US.twitter.txt Length 2578299
## 8 en_US.twitter.txt Class -none-
## 9 en_US.twitter.txt Mode character
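For comparison with the raw corpus summary, a small sketch of sentence counts before and after pre-processing (the differences reflect both the tokeniser options and the profanity filter):
data.frame(source = corpus_summary$Source,
           before = corpus_summary$Sentences,
           after  = sapply(tokenised_sentence_list, ntoken))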
plot_df <- data.frame(matrix(ncol = 1, nrow = 3))
colnames(plot_df) <- c('Freq')
row.names(plot_df) <- c('blogs', 'news', 'tweets')
plot_df[1, 1] <- word_summary[1, 3]
plot_df[2, 1] <- word_summary[4, 3]
plot_df[3, 1] <- word_summary[7, 3]
# refer to the Freq column by name: aes() is evaluated lazily, so indexing
# plot_df[, 1] here would pick up the sentence counts assigned below by the
# time both plots are drawn
tokenized_word_plot <- ggplot(data = plot_df, aes(x = row.names(plot_df), y = Freq, fill = row.names(plot_df))) +
  geom_col() +
  guides(fill = 'none') +
  xlab('Source') +
  ylab('Word counts')
plot_df[1, 1] <- sentence_summary[1, 3]
plot_df[2, 1] <- sentence_summary[4, 3]
plot_df[3, 1] <- sentence_summary[7, 3]
tokenized_sentence_plot <- ggplot(data = plot_df, aes(x = row.names(plot_df), y = Freq, fill = row.names(plot_df))) +
  geom_col() +
  guides(fill = 'none') +
  xlab('Source') +
  ylab('Sentence counts')
plot_grid(tokenized_word_plot, tokenized_sentence_plot, labels = 'AUTO')
# make a document-feature matrix per source
word_dfm_list <- list()
for(i in seq_along(tokenised_word_list)){
  word_dfm_list[[i]] <- dfm(tokenised_word_list[[i]])
}
top_words <- list()
for(i in seq_along(word_dfm_list)){
  top_words[[i]] <- topfeatures(word_dfm_list[[i]], n = 20)
}
# note: cbind() aligns the three top-20 vectors by rank rather than by word;
# the row names come from the blogs ranking, so the news and tweets columns
# hold the frequencies of their own rank-i words
gram1 <- cbind(as.data.frame(top_words[[1]]), as.data.frame(top_words[[2]]), as.data.frame(top_words[[3]]))
colnames(gram1) <- c('blogs', 'news', 'tweets')
gram1$words <- rownames(gram1)
df <- melt(gram1, id.vars = c('words'))
ggplot(data = df, aes(x = reorder(words, value), y = value)) +
  geom_col(aes(fill = variable)) +
  coord_flip() +
  ylab('Frequency') + xlab('Top 20 common words') +
  scale_y_continuous(expand = c(0, 0)) +
  guides(fill = guide_legend(title = 'Source')) +
  theme(legend.position = 'top')
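A word cloud offers another view of the same frequencies (an optional sketch; it assumes the separate quanteda.textplots package is installed, where textplot_wordcloud() lives in current quanteda releases):
library(quanteda.textplots)
textplot_wordcloud(word_dfm_list[[1]], max_words = 100)  # blogs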
percent_dfm_word <- list()
percent_df_word <- list()
for(i in seq_along(word_dfm_list)){
  percent_dfm_word[[i]] <- dfm_weight(word_dfm_list[[i]], scheme = 'prop') * 100
  percent_df_word[[i]] <- data.frame(topfeatures(percent_dfm_word[[i]]))
  colnames(percent_df_word[[i]]) <- 'percent'
  percent_df_word[[i]]$words <- rownames(percent_df_word[[i]])
}
percent_df_word[[1]]$source <- "blogs"
percent_df_word[[2]]$source <- "news"
percent_df_word[[3]]$source <- "tweets"
combined_percent_df_word <- bind_rows(percent_df_word[[1]],percent_df_word[[2]], percent_df_word[[3]])
ggplot(data = combined_percent_df_word, aes(x = words, y = percent, colour = source, group = source)) +
  geom_point() +
  geom_line() +
  scale_x_discrete(limits = combined_percent_df_word$words) +
  labs(x = 'Words', y = 'Percentage', color = 'Source')
stemmed_dfm_list <- list()
no_common_word_dfm_list <- list()
for(i in seq_along(word_dfm_list)){
  stemmed_dfm_list[[i]] <- dfm_wordstem(word_dfm_list[[i]])
  no_common_word_dfm_list[[i]] <- dfm_select(stemmed_dfm_list[[i]], pattern = stopwords('english'), selection = 'remove')
}
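To see what dfm_wordstem() does, a toy sketch: inflected forms collapse onto a common stem. Note that stemming is applied before stopword removal here, so a stemmed variant of a stopword could in principle survive the dfm_select() filter.
# 'running' and 'runs' collapse to the stem 'run'; 'runner' keeps its own stem
dfm_wordstem(dfm(tokens('run running runs runner')))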
top_20_word_list <- list()
top_20_word_df_list <- list()
for(i in seq_along(no_common_word_dfm_list)){
  top_20_word_list[[i]] <- topfeatures(no_common_word_dfm_list[[i]], n = 20)
  top_20_word_df_list[[i]] <- as.data.frame(top_20_word_list[[i]])
  top_20_word_df_list[[i]]$words <- row.names(top_20_word_df_list[[i]])
}
top_20_word_df_list[[1]]$source <- "blogs"
top_20_word_df_list[[2]]$source <- "news"
top_20_word_df_list[[3]]$source <- "tweets"
combined_top_20_df <- bind_rows(top_20_word_df_list[[1]], top_20_word_df_list[[2]], top_20_word_df_list[[3]])
colnames(combined_top_20_df) <- c('frequency', 'words', 'source')
ggplot(data = combined_top_20_df, aes(x = reorder(words, frequency), y = frequency, fill = source)) +
  geom_col(width = 0.7) +
  ylab('Frequency') + xlab('Top 20 common words') +
  scale_y_continuous(expand = c(0, 0)) +
  guides(fill = guide_legend(title = 'Source')) +
  theme(legend.position = 'top', axis.text.x = element_text(angle = 45, hjust = 1))
no_common_word_dfm_list_percentage <- list()
no_common_word_df_list_percentage <- list()
for(i in seq_along(no_common_word_dfm_list)){
  no_common_word_dfm_list_percentage[[i]] <- dfm_weight(no_common_word_dfm_list[[i]], scheme = 'prop') * 100
  no_common_word_df_list_percentage[[i]] <- data.frame(topfeatures(no_common_word_dfm_list_percentage[[i]]))
  colnames(no_common_word_df_list_percentage[[i]]) <- 'percent'
  no_common_word_df_list_percentage[[i]]$words <- row.names(no_common_word_df_list_percentage[[i]])
}
no_common_word_df_list_percentage[[1]]$source <- 'blogs'
no_common_word_df_list_percentage[[2]]$source <- 'news'
no_common_word_df_list_percentage[[3]]$source <- 'tweets'
combined_percent_df_no_common_word <- bind_rows(no_common_word_df_list_percentage[[1]], no_common_word_df_list_percentage[[2]], no_common_word_df_list_percentage[[3]])
ggplot(data = combined_percent_df_no_common_word, aes(x = words, y = percent, group = source, colour = source)) +
  geom_line() +
  geom_point() +
  scale_x_discrete(limits = combined_percent_df_no_common_word$words) +
  theme(legend.position = 'top', axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = 'Words', y = 'Percentage', color = 'Source')
tokenised_word_list_2_gram <- list()
dfm_list_2_gram <- list()
for(i in seq_along(tokenised_word_list)){
  tokenised_word_list_2_gram[[i]] <- tokens_ngrams(tokenised_word_list[[i]], n = 2L, concatenator = ' ')
  dfm_list_2_gram[[i]] <- dfm(tokenised_word_list_2_gram[[i]], tolower = TRUE)
}
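A toy sketch of what tokens_ngrams() produces with n = 2: every pair of adjacent tokens becomes one feature.
tokens_ngrams(tokens('to be or not to be'), n = 2L, concatenator = ' ')
# yields: 'to be' 'be or' 'or not' 'not to' 'to be'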
top_20_word_list_2gram <- list()
top_20_word_df_list_2gram <- list()
for(i in seq_along(dfm_list_2_gram)){
  top_20_word_list_2gram[[i]] <- topfeatures(dfm_list_2_gram[[i]], n = 20)
  top_20_word_df_list_2gram[[i]] <- as.data.frame(top_20_word_list_2gram[[i]])
  top_20_word_df_list_2gram[[i]]$words <- row.names(top_20_word_df_list_2gram[[i]])
}
top_20_word_df_list_2gram[[1]]$source <- "blogs"
top_20_word_df_list_2gram[[2]]$source <- "news"
top_20_word_df_list_2gram[[3]]$source <- "tweets"
combined_top_20_df_2gram <- bind_rows(top_20_word_df_list_2gram[[1]], top_20_word_df_list_2gram[[2]], top_20_word_df_list_2gram[[3]])
colnames(combined_top_20_df_2gram) <- c('frequency', 'words', 'source')
ggplot(data = combined_top_20_df_2gram, aes(x = reorder(words, frequency), y = frequency, fill = source)) +
  geom_col(width = 0.7) +
  ylab('Frequency') + xlab('Top 20 common 2-grams') +
  scale_y_continuous(expand = c(0, 0)) +
  guides(fill = guide_legend(title = 'Source')) +
  theme(legend.position = 'top', axis.text.x = element_text(angle = 45, hjust = 1))
# the analogous 3-gram analysis is left commented out; building a trigram
# dfm on the full corpus is very memory-intensive
#tokenised_word_list_3_gram <- list()
#dfm_list_3_gram <- list()
#for(i in seq_along(tokenised_word_list)){
#  tokenised_word_list_3_gram[[i]] <- tokens_ngrams(tokenised_word_list[[i]], n = 3L, concatenator = ' ')
#  dfm_list_3_gram[[i]] <- dfm(tokenised_word_list_3_gram[[i]], tolower = TRUE)
#}
#top_20_word_list_3gram <- list()
#top_20_word_df_list_3gram <- list()
#for(i in seq_along(dfm_list_3_gram)){
#  top_20_word_list_3gram[[i]] <- topfeatures(dfm_list_3_gram[[i]], n = 20)
#  top_20_word_df_list_3gram[[i]] <- as.data.frame(top_20_word_list_3gram[[i]])
#  top_20_word_df_list_3gram[[i]]$words <- row.names(top_20_word_df_list_3gram[[i]])
#}
#top_20_word_df_list_3gram[[1]]$source <- "blogs"
#top_20_word_df_list_3gram[[2]]$source <- "news"
#top_20_word_df_list_3gram[[3]]$source <- "tweets"
#combined_top_20_df_3gram <- bind_rows(top_20_word_df_list_3gram[[1]], top_20_word_df_list_3gram[[2]], top_20_word_df_list_3gram[[3]])
#colnames(combined_top_20_df_3gram) <- c('frequency', 'words', 'source')
#ggplot(data = combined_top_20_df_3gram, aes(x = reorder(words, frequency), y = frequency, fill = source)) +
#  geom_col(width = 0.7) +
#  ylab('Frequency') + xlab('Top 20 common 3-grams') +
#  scale_y_continuous(expand = c(0, 0)) +
#  guides(fill = guide_legend(title = 'Source')) +
#  theme(legend.position = 'top', axis.text.x = element_text(angle = 45, hjust = 1))
word_coverage <- function(input_dfm_list, target_coverage){
  # note: topfeatures() is left at its default of n = 10, so the cumulative
  # percentage below is relative to the ten most frequent features rather
  # than the whole vocabulary (a full-vocabulary sketch follows the output)
  df_feature_list <- list()
  return_list <- list()
  for(i in seq_along(input_dfm_list)){
    df_feature_list[[i]] <- data.frame(topfeatures(input_dfm_list[[i]]), nfeat(input_dfm_list[[i]]))
    names(df_feature_list[[i]]) <- c('count', 'total')
    df_feature_list[[i]]$words <- row.names(df_feature_list[[i]])
    df_feature_list[[i]]$percentage <- cumsum(df_feature_list[[i]]$count) / sum(df_feature_list[[i]]$count) * 100
    number_words <- min(which(round(df_feature_list[[i]]$percentage) >= target_coverage))
    return_list[[i]] <- c(number_words, df_feature_list[[i]][number_words, ])
  }
  return(return_list)
}
word_coverage(word_dfm_list, 50) #50%
## [[1]]
## [[1]][[1]]
## [1] 4
##
## [[1]]$count
## [1] 897901
##
## [[1]]$total
## [1] 394276
##
## [[1]]$words
## [1] "a"
##
## [[1]]$percentage
## [1] 58.13065
##
##
## [[2]]
## [[2]][[1]]
## [1] 3
##
## [[2]]$count
## [1] 884894
##
## [[2]]$total
## [1] 351172
##
## [[2]]$words
## [1] "and"
##
## [[2]]$percentage
## [1] 51.28499
##
##
## [[3]]
## [[3]][[1]]
## [1] 4
##
## [[3]]$count
## [1] 609415
##
## [[3]]$total
## [1] 423957
##
## [[3]]$words
## [1] "a"
##
## [[3]]$percentage
## [1] 55.33224
word_coverage(word_dfm_list, 90) #90%
## [[1]]
## [[1]][[1]]
## [1] 8
##
## [[1]]$count
## [1] 460355
##
## [[1]]$total
## [1] 394276
##
## [[1]]$words
## [1] "that"
##
## [[1]]$percentage
## [1] 90.11554
##
##
## [[2]]
## [[2]][[1]]
## [1] 8
##
## [[2]]$count
## [1] 346961
##
## [[2]]$total
## [1] 351172
##
## [[2]]$words
## [1] "that"
##
## [[2]]$percentage
## [1] 92.48097
##
##
## [[3]]
## [[3]][[1]]
## [1] 9
##
## [[3]]$count
## [1] 359138
##
## [[3]]$total
## [1] 423957
##
## [[3]]$words
## [1] "of"
##
## [[3]]$percentage
## [1] 93.50452
word_coverage(no_common_word_dfm_list, 50) #50%
## [[1]]
## [[1]][[1]]
## [1] 5
##
## [[1]]$count
## [1] 99194
##
## [[1]]$total
## [1] 316494
##
## [[1]]$words
## [1] "can"
##
## [[1]]$percentage
## [1] 58.00009
##
##
## [[2]]
## [[2]][[1]]
## [1] 3
##
## [[2]]$count
## [1] 86632
##
## [[2]]$total
## [1] 278387
##
## [[2]]$words
## [1] "one"
##
## [[2]]$percentage
## [1] 49.93069
##
##
## [[3]]
## [[3]][[1]]
## [1] 5
##
## [[3]]$count
## [1] 128234
##
## [[3]]$total
## [1] 353012
##
## [[3]]$words
## [1] "go"
##
## [[3]]$percentage
## [1] 57.16677
word_coverage(no_common_word_dfm_list, 90) #90%
## [[1]]
## [[1]][[1]]
## [1] 9
##
## [[1]]$count
## [1] 70847
##
## [[1]]$total
## [1] 316494
##
## [[1]]$words
## [1] "day"
##
## [[1]]$percentage
## [1] 92.67862
##
##
## [[2]]
## [[2]][[1]]
## [1] 9
##
## [[2]]$count
## [1] 60014
##
## [[2]]$total
## [1] 278387
##
## [[2]]$words
## [1] "get"
##
## [[2]]$percentage
## [1] 93.30444
##
##
## [[3]]
## [[3]][[1]]
## [1] 9
##
## [[3]]$count
## [1] 90152
##
## [[3]]$total
## [1] 353012
##
## [[3]]$words
## [1] "can"
##
## [[3]]$percentage
## [1] 92.54845
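Because topfeatures() is left at its default of n = 10 in word_coverage(), the percentages above are relative to the ten most frequent features rather than the whole corpus. A minimal sketch of the full-vocabulary calculation (an alternative reading of the coverage question, not the original code): sort every feature count in decreasing order, take the cumulative share of all tokens, and report how many unique words reach the target.
full_word_coverage <- function(input_dfm_list, target_coverage){
  sapply(input_dfm_list, function(d){
    counts <- topfeatures(d, n = nfeat(d))    # all features, sorted decreasing
    cum_share <- cumsum(counts) / sum(counts) * 100
    min(which(cum_share >= target_coverage))  # unique words needed
  })
}
# e.g. full_word_coverage(word_dfm_list, 50) and full_word_coverage(word_dfm_list, 90)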