This report details exploratory data analysis of the SwiftKey data. Specifically, the three English text files (blogs, news, and Twitter) are sampled, cleaned, and tokenized into unigrams, bigrams, and trigrams, and the most frequent n-grams in each are plotted.
The following packages are loaded and a seed is set so that the sampling is reproducible.
library(tm)
library(tidyverse)
library(quanteda)
library(gridExtra)
set.seed(20201103)
The following code processes all three text files in the en_US folder:
base_path = getwd()
datasets = 'Datasets'
language = 'en_US'
file_path = file.path(base_path, datasets, language)
sample_rate = 0.05
top = 30  # number of top features to plot
info_df = data.frame()

# Profanity list used to filter the corpora
# bad_words = readLines('ignore_words.txt')
bad_words = readLines(url('https://www.cs.cmu.edu/~biglou/resources/bad-words.txt'))

for (file_name in list.files(file_path)) {
  data_from = strsplit(file_name, '\\.')[[1]][2]  # 'blogs', 'news' or 'twitter'

  # Read each file once and reuse the lines for both the summary and the sample
  all_lines = read_lines(file.path(file_path, file_name))
  size = round(file.info(file.path(file_path, file_name))$size / 10^6, 1)

  # Get some quick info about these txt files
  info_df = rbind(info_df, data.frame('File_name' = file_name,
                                      'File_size' = str_c(size, 'MB'),
                                      'Total_lines' = length(all_lines)))

  # Sampling
  temp = sample(x = all_lines, size = round(length(all_lines) * sample_rate))
  # temp = sample(x = all_lines, size = 50)  # This is for unit testing

  # Remove all non-ASCII characters
  temp = iconv(temp, from = 'utf-8', to = 'ASCII', sub = '')

  # Clean/process the lines in temp, then convert to a quanteda corpus
  corpus = Corpus(VectorSource(temp)) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeNumbers) %>%
    tm_map(removeWords, bad_words) %>%
    quanteda::corpus()

  # Tokenize
  ## 1-gram
  uni_gram = tokens(corpus)
  uni_dfm = dfm(uni_gram) %>% dfm_wordstem()
  uni_top = topfeatures(uni_dfm, n = top)
  uni_df = data.frame(words = names(uni_top), freq = uni_top)
  assign(paste(data_from, '_uni_plot', sep = ''),
         ggplot(uni_df, aes(y = reorder(words, freq), x = freq)) +
           geom_bar(stat = 'identity', fill = 'steelblue') +
           xlab('') + ylab('') + ggtitle(str_to_title(data_from)))

  ## 2-gram
  bi_gram = tokens_ngrams(uni_gram, n = 2, concatenator = ' ')
  bi_dfm = dfm(bi_gram) %>% dfm_wordstem()
  bi_top = topfeatures(bi_dfm, n = top)
  bi_df = data.frame(words = names(bi_top), freq = bi_top)
  assign(paste(data_from, '_bi_plot', sep = ''),
         ggplot(bi_df, aes(y = reorder(words, freq), x = freq)) +
           geom_bar(stat = 'identity', fill = 'steelblue') +
           xlab('') + ylab('') + ggtitle(str_to_title(data_from)))

  ## 3-gram
  tri_gram = tokens_ngrams(uni_gram, n = 3, concatenator = ' ')
  tri_dfm = dfm(tri_gram) %>% dfm_wordstem()
  tri_top = topfeatures(tri_dfm, n = top)
  tri_df = data.frame(words = names(tri_top), freq = tri_top)
  assign(paste(data_from, '_tri_plot', sep = ''),
         ggplot(tri_df, aes(y = reorder(words, freq), x = freq)) +
           geom_bar(stat = 'identity', fill = 'steelblue') +
           xlab('') + ylab('') + ggtitle(str_to_title(data_from)))

  # Remove the raw and sampled lines to free memory before the next file
  rm(all_lines, temp)
  showConnections()  # confirm no connections were left open
}
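Because the loop creates the plot objects with assign() (for example blogs_uni_plot or twitter_tri_plot), a quick sanity check, not part of the processing itself, is to list them:
# List the ggplot objects the loop created via assign(),
# e.g. blogs_uni_plot, news_bi_plot, twitter_tri_plot
ls(pattern = '_(uni|bi|tri)_plot$')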
The three text files are summarized below:
## File_name File_size Total_lines
## 1 en_US.blogs.txt 210.2MB 899288
## 2 en_US.news.txt 205.8MB 77259
## 3 en_US.twitter.txt 167.1MB 2360148
The files contain from roughly 77,000 to over 2.3 million lines each, which amounts to many millions of words to tokenize. For this exploratory analysis, 5% of the lines in each file are sampled and analyzed.
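As a rough check on that claim (a minimal sketch, not part of the processing loop above; it counts whitespace-separated tokens, which only approximates the real tokenization), the total word count of each file can be estimated like this:
# Approximate each file's word count by counting whitespace-separated
# tokens; this only gauges the scale of the data.
for (file_name in list.files(file_path)) {
  n_words = sum(str_count(read_lines(file.path(file_path, file_name)), '\\S+'))
  print(str_c(file_name, ': ~', round(n_words / 10^6, 1), ' million words'))
}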
Top 30 most frequent words for each file:
grid.arrange(blogs_uni_plot, news_uni_plot, twitter_uni_plot,
             ncol = 3,
             top = 'Top 30 Unigram words',
             bottom = 'Frequency')
Top 30 most frequent n-grams for Blogs:
blogs_bi_plot = blogs_bi_plot + ggtitle('Bigram')
blogs_tri_plot = blogs_tri_plot + ggtitle('Trigram')
grid.arrange(blogs_bi_plot, blogs_tri_plot,
             ncol = 2,
             bottom = 'Frequency')
Top 30 most frequent n-grams for News:
news_bi_plot = news_bi_plot + ggtitle('Bigram')
news_tri_plot = news_tri_plot + ggtitle('Trigram')
grid.arrange(news_bi_plot, news_tri_plot,
             ncol = 2,
             bottom = 'Frequency')
Top 30 most frequent n-grams for Twitter:
twitter_bi_plot = twitter_bi_plot + ggtitle('Bigram')
twitter_tri_plot = twitter_tri_plot + ggtitle('Trigram')
grid.arrange(twitter_bi_plot, twitter_tri_plot,
             ncol = 2,
             bottom = 'Frequency')