Project Description

The goal of the project is to use a dataset (consisting of excerpts from Twitter, blogs, and news sources) to build a Predictive Text Model that suggests the next word as a user types.

Analysis

I will segment the data into Training, Test, and Validate datasets with an 80/10/10 split.

I will use exploratory data analysis techniques to discover the frequencies of words in the Training dataset. I will further evaluate the frequencies of word pairs (2-grams) and word triplets (3-grams) to reveal how often words appear together.

Method

The model will use these frequencies to predict the next word in a line from the last two words typed. When that 3-gram context was never seen in the training data, it will back off and consider the likelihood given only the last word.
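
As a preview of this backoff logic, here is a minimal sketch operating on local frequency tables. The table and column names (grams3 with columns w1, w2, w3, n; grams2 with columns w1, w2, n) are hypothetical, not part of the analysis below:

# Hypothetical backoff lookup: try the 3-gram table first, then fall back to 2-grams.
library(dplyr)

predict_next <- function(last_two, grams3, grams2) {
  # Most frequent 3-gram whose first two words match what the user just typed.
  hit3 <- grams3 %>%
    filter(w1 == last_two[1], w2 == last_two[2]) %>%
    arrange(desc(n)) %>% head(1)
  if (nrow(hit3) > 0) return(hit3$w3)
  # Unseen 3-gram context: back off to the most frequent 2-gram
  # that starts with the last word alone.
  hit2 <- grams2 %>%
    filter(w1 == last_two[2]) %>%
    arrange(desc(n)) %>% head(1)
  if (nrow(hit2) > 0) return(hit2$w2)
  NA_character_  # no match in either table
}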

The model will be tested against the Test dataset, and once its accuracy there is satisfactory, a final run against the Validate dataset will score the model's accuracy. It will then be deployed through shinyapps.io so that real users can type in a text box while the model makes next-word predictions in real time. This will enable an evaluation of the model's performance in a real-world setting.

Analysis Results

The remainder of this report will show the results of the initial analysis.

Start Spark Local
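
The setup chunk is not echoed in the rendered output; a minimal sketch of what it likely contains, assuming a local Spark master (the connection details are an assumption):

library(sparklyr)  # Spark interface
library(dplyr)     # data manipulation verbs
library(dbplyr)    # database/Spark backend for dplyr
library(forcats)   # factor helpers for plotting
library(ggplot2)   # plots

# Connect to a local Spark instance.
sc <- spark_connect(master = "local")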

## 
## Attaching package: 'sparklyr'
## The following object is masked from 'package:stats':
## 
##     filter
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
## Warning: package 'forcats' was built under R version 4.3.3

Get Data

# Import the blog, Twitter, and news files

corpora_path <- "C:/Users/joshu/OneDrive/Documents/Coursera/DataScience/DataScienceCapstone/en_US"

blog_path <- paste0(corpora_path, "/en_US.blogs.txt")
blog <- spark_read_text(sc, "blog", blog_path)
twit_path <- paste0(corpora_path, "/en_US.twitter.txt")
twit <- spark_read_text(sc, "twit", twit_path)
news_path <- paste0(corpora_path, "/en_US.news.txt")
news <- spark_read_text(sc, "news", news_path)


head(blog)
## # Source: spark<?> [?? x 1]
##   line                                                                          
##   <chr>                                                                         
## 1 In the years thereafter, most of the Oil fields and platforms were named afte…
## 2 We love you Mr. Brown.                                                        
## 3 Chad has been awesome with the kids and holding down the fort while I work la…
## 4 so anyways, i am going to share some home decor inspiration that i have been …
## 5 With graduation season right around the corner, Nancy has whipped up a fun se…
## 6 If you have an alternative argument, let's hear it! :)

Partition Dataset to Train, Test, and Validate

80% Train, 10% Test, 10% Validate
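
The split chunk itself is not echoed; a minimal sketch using sparklyr's sdf_random_split(), which returns a named list of Spark tables (the seed value is an assumption):

# 80/10/10 random split for each source (seed assumed for reproducibility).
blog_parts <- sdf_random_split(blog, training = 0.8, test = 0.1, validate = 0.1, seed = 42)
twit_parts <- sdf_random_split(twit, training = 0.8, test = 0.1, validate = 0.1, seed = 42)
news_parts <- sdf_random_split(news, training = 0.8, test = 0.1, validate = 0.1, seed = 42)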

Combine the per-source training partitions into a single dataset for building the model.

trn_lines <- blog_parts$training %>% mutate(source = "blog") %>%
  sdf_bind_rows({ twit_parts$training %>% mutate(source = "twit") }) %>%
  sdf_bind_rows({ news_parts$training %>% mutate(source = "news") }) %>%
  filter(nchar(line) > 0)
trn_lines_stats <- trn_lines %>% group_by(source) %>% summarise(numrows = n())
head(trn_lines_stats)
## # Source: spark<?> [?? x 2]
##   source numrows
##   <chr>    <dbl>
## 1 blog    718840
## 2 news    807432
## 3 twit   1888432
tst_lines <- blog_parts$test %>% mutate(source = "blog") %>%
  sdf_bind_rows({ twit_parts$test %>% mutate(source = "twit") }) %>%
  sdf_bind_rows({ news_parts$test %>% mutate(source = "news") }) %>%
  filter(nchar(line) > 0)
tst_lines_stats <- tst_lines %>% group_by(source) %>% summarise(numrows = n())
#head (tst_lines_stats)

val_lines <- blog_parts$validate %>% mutate(source = "blog") %>%
  sdf_bind_rows({ twit_parts$validate %>% mutate(source = "twit") }) %>%
  sdf_bind_rows({ news_parts$validate %>% mutate(source = "news") }) %>%
  filter(nchar(line) > 0)
val_lines_stats <- val_lines %>% group_by(source) %>% summarise(numrows = n())
#head (val_lines_stats)

Tokenize and Find 2-grams, 3-grams

trn_tokens <- trn_lines %>% mutate(line = regexp_replace(line, "[\"\']+", "")) %>%
  mutate(line = regexp_replace(line, "[_():;,.!?*\\-]+", " ")) %>%
  mutate(line = regexp_replace(line, "[ ]+", " ")) %>% # Collapse runs of spaces to a single space
  ft_tokenizer(input_col = "line", output_col = "word_list") %>%
  # Stop words are kept; the model should be able to predict them too.
  #ft_stop_words_remover(input_col = "word_list", output_col = "wo_stop_words") %>%
  ft_ngram(input_col = "word_list", output_col = "twograms", n = 2) %>%
  ft_ngram(input_col = "word_list", output_col = "threegrams", n = 3)

trn_words <- select(trn_tokens, source, word_list) %>%
  mutate(word = explode(word_list)) %>%
  select(word, source) %>%
  filter(nchar(word) > 2) # Drop one- and two-letter tokens #%>%
  #compute("trn_words_comp")  # optional: cache the result in Spark

trn_2grams <- select(trn_tokens,source,twograms) %>%  
  mutate(twogram = explode(twograms)) %>%
  select(twogram, source) 
trn_3grams <- select(trn_tokens,source,threegrams) %>%  
  mutate(threegram = explode(threegrams)) %>%
  select(threegram, source) 

trn_words_stats <- trn_words %>%
  group_by(source) %>% summarise(totwords = n())

trn_2grams_stats <- trn_2grams %>%
  group_by(source) %>% summarise(tot2grams = n())

trn_3grams_stats <- trn_3grams %>%
  group_by(source) %>% summarise(tot3grams = n())

trn_words_stats %>% left_join(trn_2grams_stats) %>% left_join(trn_3grams_stats)
## Joining with `by = join_by(source)`
## Joining with `by = join_by(source)`
## # Source: spark<?> [?? x 4]
##   source totwords tot2grams tot3grams
##   <chr>     <dbl>     <dbl>     <dbl>
## 1 blog   23410255  29442125  28730852
## 2 news   22473691  27203399  26398468
## 3 twit   18455494  22407527  20519373

trn_2grams_dstats <- trn_2grams %>%
  group_by(source, twogram) %>% summarise(num_occs = n(), .groups = "drop") %>%
  left_join(trn_2grams_stats, by = "source") %>%
  mutate(cov_pct = num_occs / tot2grams * 100) %>%
  # window_order() (from dbplyr), not arrange(), sets the ordering Spark
  # uses for cumsum(); arrange() is ignored in subqueries without LIMIT.
  group_by(source) %>% window_order(desc(cov_pct)) %>%
  mutate(cum_cov_pct = cumsum(cov_pct))
trn_3grams_dstats <- trn_3grams %>%
  group_by(source, threegram) %>% summarise(num_occs = n(), .groups = "drop") %>%
  left_join(trn_3grams_stats, by = "source") %>%
  mutate(cov_pct = num_occs / tot3grams * 100) %>%
  group_by(source) %>% window_order(desc(cov_pct)) %>%
  mutate(cum_cov_pct = cumsum(cov_pct))
trn_words_dstats <- trn_words %>%
  group_by(source, word) %>% summarise(num_occs = n(), .groups = "drop") %>%
  left_join(trn_words_stats, by = "source") %>%
  mutate(cov_pct = num_occs / totwords * 100) %>%
  group_by(source) %>% window_order(desc(cov_pct)) %>%
  mutate(cum_cov_pct = cumsum(cov_pct))
#head (trn_words_dstats)
#head (trn_2grams_dstats)
#head (trn_3grams_dstats)
head(trn_2grams, n=20)
## # Source: spark<?> [?? x 2]
##    twogram       source
##    <chr>         <chr> 
##  1 " omg"        blog  
##  2 "omg how"     blog  
##  3 "how could"   blog  
##  4 "could i"     blog  
##  5 "i have"      blog  
##  6 "have missed" blog  
##  7 "missed this" blog  
##  8 " are"        blog  
##  9 "are you"     blog  
## 10 "you a"       blog  
## # ℹ more rows
head(trn_3grams, n=20)
## # Source: spark<?> [?? x 2]
##    threegram          source
##    <chr>              <chr> 
##  1 " omg how"         blog  
##  2 "omg how could"    blog  
##  3 "how could i"      blog  
##  4 "could i have"     blog  
##  5 "i have missed"    blog  
##  6 "have missed this" blog  
##  7 " are you"         blog  
##  8 "are you a"        blog  
##  9 "you a member"     blog  
## 10 " do these"        blog  
## # ℹ more rows

Dictionary Coverage Calculations

Determine how many distinct dictionary words are needed to cover 90% and 50% of all word occurrences in each portion of the dataset, and what percentage of each source's total vocabulary that represents.

trn_words_cum100_stats <- trn_words_dstats %>% summarise(numdwords100 = n()) %>% collect()
trn_words_cum90_stats <- trn_words_dstats %>% filter(cum_cov_pct <= 90) %>% summarise(numdwords90 = n()) %>% collect()
trn_words_cum50_stats <- trn_words_dstats %>% filter(cum_cov_pct <= 50) %>% summarise(numdwords50 = n()) %>% collect()
trn_words_cum100_stats %>% left_join(trn_words_cum90_stats) %>% left_join(trn_words_cum50_stats) %>%
        mutate(cov_90pct = numdwords90 / numdwords100 * 100, cov_50pct = numdwords50 / numdwords100 * 100)
## Joining with `by = join_by(source)`
## Joining with `by = join_by(source)`
## # A tibble: 3 × 6
##   source numdwords100 numdwords90 numdwords50 cov_90pct cov_50pct
##   <chr>         <dbl>       <dbl>       <dbl>     <dbl>     <dbl>
## 1 blog         352659        9708         262      2.75    0.0743
## 2 news         257054       10535         433      4.10    0.168 
## 3 twit         375333        8198         228      2.18    0.0607
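
In every source, well under 0.2% of the distinct vocabulary covers half of all word occurrences, and roughly 2-4% covers 90%, so a fairly compact dictionary should be sufficient for prediction.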

Top Fifty Words (any source)

top50words <- trn_words %>%
  count(word) %>% arrange(-n) %>% head(50) %>% collect() # Bring the top 50 back to R for plotting

ggplot(top50words, aes(x=n, y=fct_reorder(word,n) )) +
  geom_col() +
  labs(title="Word Counts", x="Frequency", y="Word") #+theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Top Fifty 2-grams (any source)

top50twograms <- trn_2grams %>%
  count(twogram) %>% arrange(-n) %>% head(50) %>% collect()

ggplot(top50twograms, aes(x=n, y=fct_reorder(twogram,n) )) +
  geom_col() +
  labs(title="2-gram Counts", x="Frequency", y="2-gram")

Top Fifty 3-grams (any source)

top50threegrams <- trn_3grams %>%
  count(threegram) %>% arrange(-n) %>% head(50) %>% collect()

ggplot(top50threegrams, aes(x=n, y=fct_reorder(threegram,n) )) +
  geom_col() +
  labs(title="3-gram Counts", x="Frequency", y="3-gram") 

Word Cloud - Top 300 Words, by source

Blue words are from Twitter, pink from blogs, and green from news.

wc300 <- trn_words %>% count(source, word) %>%
  arrange(-n) %>% head(300) %>% collect() %>%
  mutate(col = case_when(source == "twit" ~ "#56B4E9",
                         source == "blog" ~ "#E69FB4",
                         source == "news" ~ "#B4C67F"))
                                               
wc300 %>% 
  with(wordcloud::wordcloud(
    word,
    n,
    ordered.colors = TRUE,
    colors = wc300$col
  ))

Garbage Collection
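
This chunk is not echoed; presumably it calls gc(), which produced the memory report below (an assumption):

# Force a garbage collection and report R's memory use.
gc()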

##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1473563 78.7    2346848 125.4  2346848 125.4
## Vcells 2685610 20.5    8388608  64.0  6624690  50.6

Disconnect Spark and Cleanup
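
This chunk is also not echoed; a sketch, assuming the connection object sc from the setup above:

# Close the Spark connection and reclaim local memory.
spark_disconnect(sc)
gc()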

##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1473670 78.8    2346848 125.4  2346848 125.4
## Vcells 2685696 20.5    8388608  64.0  6624690  50.6