This project (SwiftKey capstone) builds a predictive text model to suggest the next word in a phrase. This report demonstrates successful data loading, basic summaries, exploratory plots, and the plan for the prediction algorithm and Shiny app.
# Packages used by this report: dplyr (counting/sorting), stringi (word
# counts), tm (corpus cleaning), tidytext and tokenizers (tokenisation),
# ggplot2 (plots).
pkgs <- c("dplyr", "stringi", "tm", "tidytext", "ggplot2", "tokenizers")
# system.file() returns "" for a package that is not installed. This avoids
# scanning the whole library with installed.packages(), which its own
# documentation advises against for checking individual packages.
is_installed <- vapply(
  pkgs,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
to_install <- pkgs[!is_installed]
if (length(to_install) > 0) {
  install.packages(to_install, quiet = TRUE)
}
# Attach every package; invisible() suppresses the list lapply() returns.
invisible(lapply(pkgs, library, character.only = TRUE))
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: NLP
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
The data are the three HC Corpora English (en_US) files: blogs, news, and Twitter.
Update `base_path` if the files are stored in a different directory.
# Directory holding the three en_US corpus files, and the full path to each.
base_path <- "~/coursera assignments/"
corpus_files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
corpus_paths <- file.path(base_path, corpus_files)
blogs_path <- corpus_paths[[1]]
news_path <- corpus_paths[[2]]
twitter_path <- corpus_paths[[3]]
# Read every line of the text file at path `p` into a character vector.
#
# The file is read through an explicit binary-mode connection: in text mode
# on Windows an embedded Ctrl-Z (0x1A) byte is treated as end-of-file, which
# silently truncates the result. Embedded NUL bytes are skipped, strings are
# marked as UTF-8, and the missing-final-newline warning is suppressed.
#
# p: path to the file to read.
# Returns a character vector with one element per line.
read_txt <- function(p) {
  con <- file(p, open = "rb")
  # Always release the connection, even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
}
# Load the three sources in one pass (blogs, then news, then twitter) and
# expose each as its own character vector of lines.
corpus_lines <- lapply(
  list(blogs = blogs_path, news = news_path, twitter = twitter_path),
  read_txt
)
blogs <- corpus_lines[["blogs"]]
news <- corpus_lines[["news"]]
twitter <- corpus_lines[["twitter"]]
rm(corpus_lines)
# One row per source file: number of lines, total word count, and on-disk
# size in MB (bytes / 1024^2, rounded to one decimal place).
data_summary <- local({
  sources <- list(blogs, news, twitter)
  n_lines <- vapply(sources, length, integer(1))
  n_words <- vapply(
    sources,
    function(txt) sum(stri_count_words(txt)),
    integer(1)
  )
  size_bytes <- file.size(c(blogs_path, news_path, twitter_path))
  data.frame(
    File = c("Blogs", "News", "Twitter"),
    Lines = n_lines,
    Words = n_words,
    Size_MB = round(size_bytes / 1024^2, 1),
    check.names = FALSE
  )
})
data_summary
## File Lines Words Size_MB
## 1 Blogs 899288 37546250 200.4
## 2 News 1010242 34762395 196.3
## 3 Twitter 2360148 30093413 159.4
The files are large; the approximate magnitudes from the table above are:

- Blogs ≈ 899k lines (~38M words, ~200 MB)
- News ≈ 1,010k lines (~35M words, ~196 MB)
- Twitter ≈ 2,360k lines (~30M words, ~159 MB)

## 4. Sampling and Cleaning
# Fixed seed so the random sample (and every count derived from it) is
# reproducible.
set.seed(123)
sample_frac <- 0.01

# Draw floor(length(lines) * frac) lines at random, but at least one line
# when any are available; an empty input yields an empty sample instead of
# an error from sample().
sample_lines <- function(lines, frac) {
  n_draw <- min(length(lines), max(1, floor(length(lines) * frac)))
  sample(lines, n_draw)
}

# Sampling order (blogs, news, twitter) is kept so the random stream is
# consumed in the same sequence.
sample_data <- c(
  sample_lines(blogs, sample_frac),
  sample_lines(news, sample_frac),
  sample_lines(twitter, sample_frac)
)

# Clean the sample with tm: lower-case, then drop punctuation and digits,
# then collapse runs of whitespace.
corpus <- VCorpus(VectorSource(sample_data))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)

# Each document holds exactly one string, so vapply() can enforce a
# character(1) result per document (sapply() would silently change shape).
clean_text <- vapply(corpus, as.character, character(1))

# One row per word with its frequency, most frequent first.
tidy_words <- data.frame(text = clean_text) |>
  tidytext::unnest_tokens(word, text) |>
  dplyr::count(word, sort = TRUE)
top20 <- head(tidy_words, 20)
print(head(tidy_words, 10))
## word n
## 1 the 47644
## 2 to 27621
## 3 and 24162
## 4 a 23662
## 5 of 20026
## 6 i 16535
## 7 in 16464
## 8 for 11117
## 9 is 10702
## 10 that 10540
# Horizontal bar chart of the 20 most frequent words, with bars ordered by
# frequency (reorder() sorts the word factor by n; coord_flip() turns the
# bars sideways).
ggplot(data = top20, mapping = aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  ggtitle("Top 20 Most Frequent Words") +
  xlab("Word") +
  ylab("Frequency")
Observation: frequent function words dominate (the, to, and, a, of). We will rely more on n-grams and backoff for prediction.

## 6. N-gram Exploration
# Tokenise the sampled lines into 2-word and 3-word sequences and flatten
# the per-line results into one character vector each.
bigrams <- sample_data |>
  tokenizers::tokenize_ngrams(n = 2) |>
  unlist()
trigrams <- sample_data |>
  tokenizers::tokenize_ngrams(n = 3) |>
  unlist()

# Tabulate each vector and keep the ten most frequent entries.
bigram_freq <- head(
  dplyr::arrange(as.data.frame(table(bigrams)), dplyr::desc(Freq)),
  10
)
trigram_freq <- head(
  dplyr::arrange(as.data.frame(table(trigrams)), dplyr::desc(Freq)),
  10
)
bigram_freq
## bigrams Freq
## 1 of the 4240
## 2 in the 4187
## 3 to the 2176
## 4 for the 2093
## 5 on the 1976
## 6 to be 1611
## 7 at the 1333
## 8 and the 1282
## 9 in a 1154
## 10 with the 1061
# Show the ten most frequent trigrams.
print(trigram_freq)
## trigrams Freq
## 1 one of the 359
## 2 a lot of 268
## 3 thanks for the 234
## 4 to be a 180
## 5 going to be 177
## 6 the end of 173
## 7 out of the 151
## 8 i want to 148
## 9 it was a 136
## 10 looking forward to 135
# Convert the analysis script into a knitted report with knitr::spin().
# NOTE(review): if this call lives inside Capstone_Exploratory_Analysis.R
# itself it would re-render the script from within the render; presumably it
# is meant to be run from the console. Confirm before sourcing.
report_script <- "Capstone_Exploratory_Analysis.R"
knitr::spin(hair = report_script, knit = TRUE)
# Afterwards, publish the generated HTML to RPubs via RStudio's "Publish".