The goal of this project is to use a dataset (consisting of excerpts from Twitter, blogs, and news sources) to build a Predictive Text Model that can predict the next word a user intends to type.
I will segment the data into Training, Test, and Validation datasets with an 80/10/10 split.
I will use exploratory data analysis techniques to discover the frequencies of words in the Training dataset. I will further evaluate the frequencies of word pairs (2-grams) and word triplets (3-grams) to reveal how often words appear together.
The model will use these frequencies to predict the next word in a line given the last two words. It will also fall back to the likelihood given only the last word, to account for 3-grams that never appear in the training data.
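As a rough illustration of that backoff strategy, here is a minimal sketch in plain R. It assumes the n-gram counts have been collected into local data frames counts3 (columns w1, w2, w3, n) and counts2 (columns w1, w2, n); these names are placeholders, not the final implementation.
library(dplyr)
# Hypothetical backoff lookup over pre-computed n-gram count tables
predict_next <- function(prev2, prev1, counts3, counts2) {
  # Most frequent third word observed after the exact two-word context
  cand <- counts3 %>%
    filter(w1 == prev2, w2 == prev1) %>%
    slice_max(n, n = 1, with_ties = FALSE)
  if (nrow(cand) == 1) return(cand$w3)
  # Unseen 3-gram: back off to the most frequent word after the last word only
  cand <- counts2 %>%
    filter(w1 == prev1) %>%
    slice_max(n, n = 1, with_ties = FALSE)
  if (nrow(cand) == 1) return(cand$w2)
  NA_character_  # no match in either table
}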
The model will be tested against the Test dataset, and once its accuracy is high enough, a final run against the Validation dataset will be used to score the model. It will then be deployed through shinyapps.io so that real users can try it by typing in a text box while the model makes next-word predictions in real time. This will enable an evaluation of the model’s performance in a real-world setting.
The remainder of this report will show the results of the initial analysis.
# Load required packages and connect to a local Spark instance
# (the connection parameters below are assumed)
library(sparklyr)
library(dplyr)
library(dbplyr)
library(forcats)
library(ggplot2)
sc <- spark_connect(master = "local")
# Read the blogs, Twitter, and news corpora into Spark
corpora_path <- "C:/Users/joshu/OneDrive/Documents/Coursera/DataScience/DataScienceCapstone/en_US"
blog_path <- paste0(corpora_path, "/en_US.blogs.txt")
blog <- spark_read_text(sc, "blog", blog_path)
twit_path <- paste0(corpora_path, "/en_US.twitter.txt")
twit <- spark_read_text(sc, "twit", twit_path)
news_path <- paste0(corpora_path, "/en_US.news.txt")
news <- spark_read_text(sc, "news", news_path)
head(blog)
## # Source: spark<?> [?? x 1]
## line
## <chr>
## 1 In the years thereafter, most of the Oil fields and platforms were named afte…
## 2 We love you Mr. Brown.
## 3 Chad has been awesome with the kids and holding down the fort while I work la…
## 4 so anyways, i am going to share some home decor inspiration that i have been …
## 5 With graduation season right around the corner, Nancy has whipped up a fun se…
## 6 If you have an alternative argument, let's hear it! :)
Next, split each source into Training (80%), Test (10%), and Validation (10%) partitions, as sketched below.
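The chunk that produces blog_parts, twit_parts, and news_parts is not echoed above; a minimal sketch using sparklyr’s sdf_random_split (the seed value is an arbitrary placeholder):
# Split each corpus into named 80/10/10 partitions (seed assumed)
blog_parts <- blog %>% sdf_random_split(training = 0.8, test = 0.1, validate = 0.1, seed = 1234)
twit_parts <- twit %>% sdf_random_split(training = 0.8, test = 0.1, validate = 0.1, seed = 1234)
news_parts <- news %>% sdf_random_split(training = 0.8, test = 0.1, validate = 0.1, seed = 1234)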
trn_lines <- blog_parts$training %>% mutate(source = "blog") %>%
  sdf_bind_rows(twit_parts$training %>% mutate(source = "twit")) %>%
  sdf_bind_rows(news_parts$training %>% mutate(source = "news")) %>%
  filter(nchar(line) > 0) # drop empty lines
trn_lines_stats <- trn_lines %>% group_by(source) %>% summarise(numrows = n())
head(trn_lines_stats)
## # Source: spark<?> [?? x 2]
## source numrows
## <chr> <dbl>
## 1 blog 718840
## 2 news 807432
## 3 twit 1888432
tst_lines <- blog_parts$test %>% mutate(source = "blog") %>%
  sdf_bind_rows(twit_parts$test %>% mutate(source = "twit")) %>%
  sdf_bind_rows(news_parts$test %>% mutate(source = "news")) %>%
  filter(nchar(line) > 0)
tst_lines_stats <- tst_lines %>% group_by(source) %>% summarise(numrows = n())
#head (tst_lines_stats)
val_lines <- blog_parts$validate %>% mutate(source = "blog") %>%
  sdf_bind_rows(twit_parts$validate %>% mutate(source = "twit")) %>%
  sdf_bind_rows(news_parts$validate %>% mutate(source = "news")) %>%
  filter(nchar(line) > 0)
val_lines_stats <- val_lines %>% group_by(source) %>% summarise(numrows = n())
#head (val_lines_stats)
trn_tokens <- trn_lines %>%
  mutate(line = regexp_replace(line, "[\"\']+", "")) %>%           # strip quotes and apostrophes
  mutate(line = regexp_replace(line, "[_():;,.!?*\\-]+", " ")) %>% # replace punctuation with spaces
  mutate(line = regexp_replace(line, "[ ]+", " ")) %>%             # collapse multiple spaces to one
  ft_tokenizer(input_col = "line", output_col = "word_list") %>%
  # Stop words are kept deliberately: the model should be able to predict them too
  #ft_stop_words_remover(input_col = "word_list", output_col = "wo_stop_words") %>%
  ft_ngram(input_col = "word_list", output_col = "twograms", n = 2) %>%
  ft_ngram(input_col = "word_list", output_col = "threegrams", n = 3)
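As a quick sanity check, the three cleaning rules can be reproduced locally with base R’s gsub (an approximation of Spark’s regexp_replace; the sample line is made up):
s <- "Don't stop -- believing; hold on"
s <- gsub("[\"']+", "", s)            # strip quotes/apostrophes: "Dont stop -- believing; hold on"
s <- gsub("[_():;,.!?*\\-]+", " ", s) # punctuation to spaces:    "Dont stop   believing  hold on"
gsub("[ ]+", " ", s)                  # collapse repeated spaces: "Dont stop believing hold on"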
trn_words <- select(trn_tokens,source,word_list) %>%
mutate(word = explode(word_list)) %>%
select(word, source) %>%
filter(nchar(word) > 2) #%>%
#compute("trn_words_comp")
trn_2grams <- select(trn_tokens,source,twograms) %>%
mutate(twogram = explode(twograms)) %>%
select(twogram, source)
trn_3grams <- select(trn_tokens,source,threegrams) %>%
mutate(threegram = explode(threegrams)) %>%
select(threegram, source)
trn_words_stats <- trn_words %>%
group_by(source) %>% summarise(totwords = n() )
trn_2grams_stats <- trn_2grams %>%
group_by(source) %>% summarise(tot2grams = n() )
trn_3grams_stats <- trn_3grams %>%
group_by(source) %>% summarise(tot3grams = n() )
trn_words_stats %>% left_join(trn_2grams_stats, by = "source") %>%
  left_join(trn_3grams_stats, by = "source")
## # Source: spark<?> [?? x 4]
## source totwords tot2grams tot3grams
## <chr> <dbl> <dbl> <dbl>
## 1 blog 23410255 29442125 28730852
## 2 news 22473691 27203399 26398468
## 3 twit 18455494 22407527 20519373
trn_2grams_dstats <- trn_2grams %>%
  group_by(source, twogram) %>% summarise(num_occs = n(), .groups = "drop") %>%
  left_join(trn_2grams_stats, by = "source") %>%
  mutate(cov_pct = num_occs/tot2grams*100) %>%
  # window_order() (dbplyr), not arrange(): Spark ignores ORDER BY in subqueries without LIMIT
  group_by(source) %>% window_order(desc(cov_pct)) %>%
  mutate(cum_cov_pct = cumsum(cov_pct))
trn_3grams_dstats <- trn_3grams %>%
  group_by(source, threegram) %>% summarise(num_occs = n(), .groups = "drop") %>%
  left_join(trn_3grams_stats, by = "source") %>%
  mutate(cov_pct = num_occs/tot3grams*100) %>%
  group_by(source) %>% window_order(desc(cov_pct)) %>%
  mutate(cum_cov_pct = cumsum(cov_pct))
trn_words_dstats <- trn_words %>%
  group_by(source, word) %>% summarise(num_occs = n(), .groups = "drop") %>%
  left_join(trn_words_stats, by = "source") %>%
  mutate(cov_pct = num_occs/totwords*100) %>%
  group_by(source) %>% window_order(desc(cov_pct)) %>%
  mutate(cum_cov_pct = cumsum(cov_pct))
#head (trn_words_dstats)
#head (trn_2grams_dstats)
#head (trn_3grams_dstats)
head(trn_2grams, n=20)
## # Source: spark<?> [?? x 2]
## twogram source
## <chr> <chr>
## 1 " omg" blog
## 2 "omg how" blog
## 3 "how could" blog
## 4 "could i" blog
## 5 "i have" blog
## 6 "have missed" blog
## 7 "missed this" blog
## 8 " are" blog
## 9 "are you" blog
## 10 "you a" blog
## # ℹ more rows
head(trn_3grams, n=20)
## # Source: spark<?> [?? x 2]
## threegram source
## <chr> <chr>
## 1 " omg how" blog
## 2 "omg how could" blog
## 3 "how could i" blog
## 4 "could i have" blog
## 5 "i have missed" blog
## 6 "have missed this" blog
## 7 " are you" blog
## 8 "are you a" blog
## 9 "you a member" blog
## 10 " do these" blog
## # ℹ more rows
Next, determine how many dictionary words are needed to cover 90% and 50% of all word occurrences in each portion of the dataset, and what percentage of the total vocabulary that represents.
trn_words_cum100_stats <- trn_words_dstats %>% summarise(numdwords100 = n()) %>% collect()
trn_words_cum90_stats <- trn_words_dstats %>% filter(cum_cov_pct<=90) %>% summarise(numdwords90=n()) %>% collect()
trn_words_cum50_stats <- trn_words_dstats %>% filter(cum_cov_pct<=50) %>% summarise(numdwords50=n()) %>% collect()
trn_words_cum100_stats %>% left_join(trn_words_cum90_stats, by = "source") %>%
  left_join(trn_words_cum50_stats, by = "source") %>%
  mutate(cov_90pct = numdwords90/numdwords100*100, cov_50pct = numdwords50/numdwords100*100)
## # A tibble: 3 × 6
## source numdwords100 numdwords90 numdwords50 cov_90pct cov_50pct
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 blog 352659 9708 262 2.75 0.0743
## 2 news 257054 10535 433 4.10 0.168
## 3 twit 375333 8198 228 2.18 0.0607
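As a toy illustration of how cum_cov_pct drives these counts (made-up frequencies, not corpus values):
freqs <- c(50, 25, 10, 10, 5)                    # occurrence counts for 5 distinct words
cum_cov_pct <- cumsum(freqs / sum(freqs) * 100)  # 50, 75, 85, 95, 100
sum(cum_cov_pct <= 90)                           # 3 dictionary words stay within 90% coverage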
top50words <- trn_words %>%
  count(word) %>% arrange(-n) %>% head(50) %>% collect() # pull the top 50 into R for plotting
ggplot(top50words, aes(x=n, y=fct_reorder(word,n) )) +
geom_col() +
labs(title="Word Counts", x="Frequency", y="Word") #+theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
top50twograms <- trn_2grams %>%
  count(twogram) %>% arrange(-n) %>% head(50) %>% collect()
ggplot(top50twograms, aes(x=n, y=fct_reorder(twogram,n) )) +
geom_col() +
labs(title="2-gram Counts", x="Frequency", y="2-gram")
top50threegrams <- trn_3grams %>%
  count(threegram) %>% arrange(-n) %>% head(50) %>% collect()
ggplot(top50threegrams, aes(x=n, y=fct_reorder(threegram,n) )) +
geom_col() +
labs(title="3-gram Counts", x="Frequency", y="3-gram")
In the word cloud below, blue words come from Twitter, pink from blogs, and green from news.
wc300 <- trn_words %>% count(source, word) %>%
  arrange(-n) %>% head(300) %>% collect() %>%
  mutate(col = case_when(source == "twit" ~ "#56B4E9",  # blue
                         source == "blog" ~ "#E69FB4",  # pink
                         source == "news" ~ "#B4C67F")) # green
wc300 %>%
with(wordcloud::wordcloud(
word,
n,
ordered.colors = TRUE,
colors = wc300$col
))