This milestone report presents exploratory data analysis (EDA) for a predictive text model built on the Coursera SwiftKey dataset. The workflow is designed to be memory-efficient, reproducible, and safe to knit in environments with limited RAM.
library(downloader)
library(stringi)
library(knitr)
library(ggplot2)
library(quanteda)
if (!file.exists("./projectData")) dir.create("./projectData")
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "./projectData/Coursera-SwiftKey.zip"
if (!file.exists(zip_file)) {
  download.file(url, destfile = zip_file, mode = "wb")
}
if (!file.exists("./projectData/final")) {
  unzip(zip_file, exdir = "./projectData")
}
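Before sampling, it is worth confirming why sampling is needed at all. A quick check of the raw file sizes, in a minimal base-R sketch (the paths assume the unzip above succeeded):

# Report the size of each English-language source file in megabytes.
en_files <- list.files("./projectData/final/en_US", full.names = TRUE)
setNames(round(file.size(en_files) / 1024^2, 1), basename(en_files))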
# Reservoir sampling: stream the file one line at a time and keep a uniform
# random sample of at most n lines, so the full file never sits in memory.
sampleLines <- function(file, n = 3000, seed = 5000) {
  set.seed(seed)
  con <- file(file, open = "r")
  sample <- character(0)
  i <- 0
  repeat {
    line <- readLines(con, n = 1, warn = FALSE)
    if (length(line) == 0) break  # end of file
    i <- i + 1
    if (i <= n) {
      sample[i] <- line  # fill the reservoir with the first n lines
    } else {
      # Keep line i with probability n / i, which leaves every line seen
      # so far equally likely to end up in the final sample.
      j <- sample.int(i, 1)
      if (j <= n) {
        sample[j] <- line
      }
    }
  }
  close(con)
  sample
}
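A small sanity check on a synthetic file (illustrative only; the temporary file below is not part of the SwiftKey data) confirms that the function returns exactly n lines:

# Write 10,000 dummy lines, sample 100 of them, and verify the count.
tmp <- tempfile()
writeLines(as.character(seq_len(10000)), tmp)
length(sampleLines(tmp, n = 100))  # expected: 100
unlink(tmp)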
base_path <- "./projectData/final/en_US"
blogs_samp <- sampleLines(file.path(base_path, "en_US.blogs.txt"))
news_samp <- sampleLines(file.path(base_path, "en_US.news.txt"))
twitter_samp <- sampleLines(file.path(base_path, "en_US.twitter.txt"))
all_samp <- c(blogs_samp, news_samp, twitter_samp)
dir.create("./projectData/sample", showWarnings = FALSE)
writeLines(all_samp, "./projectData/sample/all_samp.txt")
data_summary <- data.frame(
  source = c("blogs", "news", "twitter"),
  sampled.lines = c(length(blogs_samp), length(news_samp), length(twitter_samp)),
  total.words = c(
    sum(stri_count_words(blogs_samp)),
    sum(stri_count_words(news_samp)),
    sum(stri_count_words(twitter_samp))
  ),
  mean.words = c(
    mean(stri_count_words(blogs_samp)),
    mean(stri_count_words(news_samp)),
    mean(stri_count_words(twitter_samp))
  )
)
kable(data_summary, caption = "Summary of Sampled Data")
| source | sampled.lines | total.words | mean.words |
|---|---|---|---|
| blogs | 3000 | 127801 | 42.60033 |
| news | 3000 | 103107 | 34.36900 |
| twitter | 3000 | 39381 | 13.12700 |

Twitter lines average far fewer words per line than blogs or news, which is consistent with the platform's character limit.
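With the summary recorded, the per-source vectors are no longer needed; dropping them is a small optional housekeeping step that keeps the knit within the memory budget stated above:

# Free the per-source samples; every later step uses all_samp only.
rm(blogs_samp, news_samp, twitter_samp)
invisible(gc())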
tokens_all <- tokens(
  all_samp,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  remove_url = TRUE
)
tokens_all <- tokens_tolower(tokens_all)
tokens_all <- tokens_remove(tokens_all, stopwords("en"))
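# Illustrative check (not part of the pipeline): the same options applied
# to a toy string drop the URL, the number, and the punctuation.
tokens_tolower(tokens("Visit https://example.com for 10 FREE tips!!",
                      remove_punct = TRUE, remove_numbers = TRUE,
                      remove_symbols = TRUE, remove_url = TRUE))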
dfm1 <- dfm(tokens_all)
top_uni <- topfeatures(dfm1, 30)
top_uni
## said one just like can time get new people now day
## 922 826 678 616 605 541 520 492 458 436 411
## also good two see know make first much last back year
## 402 383 356 352 347 342 333 332 327 323 318
## us think go going even way well really
## 317 307 305 305 297 287 286 283
dfm2 <- dfm(tokens_ngrams(tokens_all, n = 2))
top_bi <- topfeatures(dfm2, 30)
top_bi
## right_now last_year years_ago high_school last_week
## 48 47 41 41 40
## new_york first_time new_jersey can_get one_day
## 39 32 32 31 26
## st_louis good_morning don’t_know feel_like many_people
## 25 24 23 23 22
## last_night social_media even_though looks_like get_back
## 22 22 21 21 20
## two_years long_time los_angeles united_states make_sure
## 20 19 19 19 18
## last_time can_see three_years looking_forward can_help
## 18 18 18 17 17
dfm3 <- dfm(tokens_ngrams(tokens_all, n = 3))
top_tri <- topfeatures(dfm3, 30)
top_tri
## new_york_times dah_dah_dah
## 6 5
## couple_years_ago uk_singles_chart
## 5 5
## president_barack_obama airline_ticket_discount
## 5 5
## two_years_ago gov_chris_christie
## 5 5
## miley_beautiful_miley beautiful_miley_beautiful
## 5 5
## don’t_get_wrong monitoring_social_media
## 4 4
## long_time_ago world_book_day
## 4 4
## new_york_giants cinco_de_mayo
## 4 4
## saturday_may_5th mr_omand_wants
## 3 3
## omand_wants_green wants_green_paper
## 3 3
## green_paper_published paper_published_monitoring
## 3 3
## published_monitoring_social social_media_sites
## 3 3
## media_sites_private sites_private_industry
## 3 3
## private_industry_link percent_income_percent
## 3 3
## income_percent_income new_york_n.y
## 3 3
plot_top <- function(x, title) {
  df <- data.frame(
    term = names(x),
    freq = as.numeric(x)
  )
  ggplot(df, aes(reorder(term, freq), freq)) +
    geom_col(fill = "steelblue") +
    coord_flip() +
    labs(title = title, x = NULL, y = "Frequency") +
    theme_minimal()
}
plot_top(top_uni, "Top 30 Unigrams")
plot_top(top_bi, "Top 30 Bigrams")
plot_top(top_tri, "Top 30 Trigrams")
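quanteda stores the document-feature matrix sparsely, which is what keeps the counts above RAM-safe. A quick, illustrative footprint check (a sketch only; converting to a dense matrix is itself memory-hungry, so this is safe only on a sample this small):

# Compare the sparse dfm's footprint with its dense equivalent.
print(object.size(dfm1), units = "Mb")
print(object.size(as.matrix(dfm1)), units = "Mb")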
For comparison, an earlier draft of this analysis built the same frequency tables with the tm and RWeka packages. That path was retired because tm's term-document matrices have to be densified with as.matrix() before frequencies can be summed, which is exactly the memory blow-up that quanteda avoided with its sparse dfm. The legacy code is kept below for reference.

corpus <- tm::VCorpus(tm::VectorSource(all_samp))
toSpace <- tm::content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm::tm_map(corpus, toSpace, "(f|ht)tps?://\\S+")  # strip URLs without consuming the rest of the line
corpus <- tm::tm_map(corpus, toSpace, "@[^\\s]+")           # strip @handles
corpus <- tm::tm_map(corpus, toSpace, "#")                  # strip hashtag marks
corpus <- tm::tm_map(corpus, tm::content_transformer(tolower))
corpus <- tm::tm_map(corpus, tm::removeWords, tm::stopwords("english"))
corpus <- tm::tm_map(corpus, tm::removePunctuation)
corpus <- tm::tm_map(corpus, tm::removeNumbers)
corpus <- tm::tm_map(corpus, tm::stripWhitespace)
corpus <- tm::tm_map(corpus, tm::PlainTextDocument)
# Collapse a term-document matrix into a sorted frequency table. Note the
# as.matrix() call: it densifies the TDM, which is the memory cost
# discussed above.
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  data.frame(word = names(freq), freq = freq)
}
# RWeka n-gram tokenizers (require a working rJava installation).
bigram <- function(x) RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 2, max = 2))
trigram <- function(x) RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 3, max = 3))
# Bar chart of the 30 most frequent terms in a getFreq() table.
makePlot <- function(data, title) {
  ggplot2::ggplot(data[1:30, ], ggplot2::aes(reorder(word, -freq), freq)) +
    ggplot2::geom_bar(stat = "identity", fill = "steelblue") +
    ggplot2::labs(x = title, y = "Frequency") +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1))
}
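The helpers above are never wired together in this draft. For completeness, a hedged sketch of how they would have been used (assuming RWeka and rJava are installed; tdm_bi and freq_bi are illustrative names, not objects from the original analysis):

# Sketch only: build a bigram term-document matrix on the legacy path
# and plot the 30 most frequent bigrams.
tdm_bi <- tm::TermDocumentMatrix(corpus, control = list(tokenize = bigram))
freq_bi <- getFreq(tdm_bi)
makePlot(freq_bi, "Top 30 Bigrams (tm/RWeka)")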