Project Description

The goal of the project is to use a dataset (consisting of excerpts from Twitter, blogs, and news sources) to build a Predictive Text Model that suggests the next word as a user types.

Analysis

I will segment the data into Training, Test, and Validate datasets with an 80/10/10 split.

I will use exploratory data analysis techniques to discover the frequencies of words in the Training dataset. I will further evaluate the frequencies of word pairs (2-grams) and word triplets (3-grams) to reveal how often words appear together.

Method

The model will use these frequencies to predict the next word in a line from the last two words typed. When that 3-gram context was never seen in the training data, it will back off and consider the likelihood given only the last word.
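
As a preview of this backoff logic, here is a minimal sketch operating on local frequency tables. The table and column names (grams3 with columns w1, w2, w3, n; grams2 with columns w1, w2, n) are hypothetical, not part of the analysis below:

# Hypothetical backoff lookup: try the 3-gram table first, then fall back to 2-grams.
library(dplyr)

predict_next <- function(last_two, grams3, grams2) {
  # Most frequent 3-gram whose first two words match what the user just typed.
  hit3 <- grams3 %>%
    filter(w1 == last_two[1], w2 == last_two[2]) %>%
    arrange(desc(n)) %>% head(1)
  if (nrow(hit3) > 0) return(hit3$w3)
  # Unseen 3-gram context: back off to the most frequent 2-gram
  # that starts with the last word alone.
  hit2 <- grams2 %>%
    filter(w1 == last_two[2]) %>%
    arrange(desc(n)) %>% head(1)
  if (nrow(hit2) > 0) return(hit2$w2)
  NA_character_  # no match in either table
}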

The model will be tested against the Test dataset, and once its accuracy there is satisfactory, a final run against the Validate dataset will score the model's accuracy. It will then be deployed through shinyapps.io so that real users can type in a text box while the model makes next-word predictions in real time. This will enable an evaluation of the model's performance in a real-world setting.

Analysis Results

The remainder of this report will show the results of the initial analysis.

Start Spark Local
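
The setup chunk is not echoed in the rendered output; a minimal sketch of what it likely contains, assuming a local Spark master (the connection details are an assumption):

library(sparklyr)  # Spark interface
library(dplyr)     # data manipulation verbs
library(dbplyr)    # database/Spark backend for dplyr
library(forcats)   # factor helpers for plotting
library(ggplot2)   # plots

# Connect to a local Spark instance.
sc <- spark_connect(master = "local")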

## 
## Attaching package: 'sparklyr'
## The following object is masked from 'package:stats':
## 
##     filter
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
## Warning: package 'forcats' was built under R version 4.3.3

Get Data

# Import the blog, Twitter, and news files

corpora_path <- "C:/Users/joshu/OneDrive/Documents/Coursera/DataScience/DataScienceCapstone/en_US"

blog_path <- paste0(corpora_path, "/en_US.blogs.txt")
blog <- spark_read_text(sc, "blog", blog_path)
twit_path <- paste0(corpora_path, "/en_US.twitter.txt")
twit <- spark_read_text(sc, "twit", twit_path)
news_path <- paste0(corpora_path, "/en_US.news.txt")
news <- spark_read_text(sc, "news", news_path)


head(blog)
## # Source: spark<?> [?? x 1]
##   line                                                                          
##   <chr>                                                                         
## 1 In the years thereafter, most of the Oil fields and platforms were named afte…
## 2 We love you Mr. Brown.                                                        
## 3 Chad has been awesome with the kids and holding down the fort while I work la…
## 4 so anyways, i am going to share some home decor inspiration that i have been …
## 5 With graduation season right around the corner, Nancy has whipped up a fun se…
## 6 If you have an alternative argument, let's hear it! :)

Partition Dataset to Train, Test, and Validate

80% Train, 10% Test, 10% Validate
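
The split chunk itself is not echoed; a minimal sketch using sparklyr's sdf_random_split(), which returns a named list of Spark tables (the seed value is an assumption):

# 80/10/10 random split for each source (seed assumed for reproducibility).
blog_parts <- sdf_random_split(blog, training = 0.8, test = 0.1, validate = 0.1, seed = 42)
twit_parts <- sdf_random_split(twit, training = 0.8, test = 0.1, validate = 0.1, seed = 42)
news_parts <- sdf_random_split(news, training = 0.8, test = 0.1, validate = 0.1, seed = 42)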

Combine the per-source training partitions into a single dataset for building the model.

trn_lines <- blog_parts$training %>% mutate(source = "blog") %>%
  sdf_bind_rows({ twit_parts$training %>% mutate(source = "twit") }) %>%
  sdf_bind_rows({ news_parts$training %>% mutate(source = "news") }) %>%
  filter(nchar(line) > 0)
trn_lines_stats <- trn_lines %>% group_by(source) %>% summarise(numrows = n())
head(trn_lines_stats)
## # Source: spark<?> [?? x 2]
##   source numrows
##   <chr>    <dbl>
## 1 blog    718840
## 2 news    807432
## 3 twit   1888432
tst_lines <- blog_parts$test %>% mutate(source = "blog") %>%
  sdf_bind_rows({ twit_parts$test %>% mutate(source = "twit") }) %>%
  sdf_bind_rows({ news_parts$test %>% mutate(source = "news") }) %>%
  filter(nchar(line) > 0)
tst_lines_stats <- tst_lines %>% group_by(source) %>% summarise(numrows = n())
#head (tst_lines_stats)

val_lines <- blog_parts$validate %>% mutate(source = "blog") %>%
  sdf_bind_rows({ twit_parts$validate %>% mutate(source = "twit") }) %>%
  sdf_bind_rows({ news_parts$validate %>% mutate(source = "news") }) %>%
  filter(nchar(line) > 0)
val_lines_stats <- val_lines %>% group_by(source) %>% summarise(numrows = n())
#head (val_lines_stats)

Tokenize and Find 2-grams, 3-grams

trn_tokens <- trn_lines %>% mutate(line = regexp_replace(line, "[\"\']+", "")) %>%
  mutate(line = regexp_replace(line, "[_():;,.!?*\\-]+", " ")) %>%
  mutate(line = regexp_replace(line, "[ ]+", " ")) %>% # Collapse runs of spaces to a single space
  ft_tokenizer(input_col = "line", output_col = "word_list") %>%
  # Stop words are kept; the model should be able to predict them too.
  #ft_stop_words_remover(input_col = "word_list", output_col = "wo_stop_words") %>%
  ft_ngram(input_col = "word_list", output_col = "twograms", n = 2) %>%
  ft_ngram(input_col = "word_list", output_col = "threegrams", n = 3)

trn_words <- select(trn_tokens, source, word_list) %>%
  mutate(word = explode(word_list)) %>%
  select(word, source) %>%
  filter(nchar(word) > 2) # Drop one- and two-letter tokens #%>%
  #compute("trn_words_comp")  # optional: cache the result in Spark

trn_2grams <- select(trn_tokens,source,twograms) %>%  
  mutate(twogram = explode(twograms)) %>%
  select(twogram, source) 
trn_3grams <- select(trn_tokens,source,threegrams) %>%  
  mutate(threegram = explode(threegrams)) %>%
  select(threegram, source) 

trn_words_stats <- trn_words %>%
  group_by(source) %>% summarise(totwords = n())

trn_2grams_stats <- trn_2grams %>%
  group_by(source) %>% summarise(tot2grams = n())

trn_3grams_stats <- trn_3grams %>%
  group_by(source) %>% summarise(tot3grams = n())

trn_words_stats %>% left_join(trn_2grams_stats) %>% left_join(trn_3grams_stats)
## Joining with `by = join_by(source)`
## Joining with `by = join_by(source)`
## # Source: spark<?> [?? x 4]
##   source totwords tot2grams tot3grams
##   <chr>     <dbl>     <dbl>     <dbl>
## 1 blog   23410255  29442125  28730852
## 2 news   22473691  27203399  26398468
## 3 twit   18455494  22407527  20519373

trn_2grams_dstats <- trn_2grams %>%
  group_by(source, twogram) %>% summarise(num_occs = n(), .groups = "drop") %>%
  left_join(trn_2grams_stats, by = "source") %>%
  mutate(cov_pct = num_occs / tot2grams * 100) %>%
  # window_order() (from dbplyr), not arrange(), sets the ordering Spark
  # uses for cumsum(); arrange() is ignored in subqueries without LIMIT.
  group_by(source) %>% window_order(desc(cov_pct)) %>%
  mutate(cum_cov_pct = cumsum(cov_pct))
trn_3grams_dstats <- trn_3grams %>%
  group_by(source, threegram) %>% summarise(num_occs = n(), .groups = "drop") %>%
  left_join(trn_3grams_stats, by = "source") %>%
  mutate(cov_pct = num_occs / tot3grams * 100) %>%
  group_by(source) %>% window_order(desc(cov_pct)) %>%
  mutate(cum_cov_pct = cumsum(cov_pct))
trn_words_dstats <- trn_words %>%
  group_by(source, word) %>% summarise(num_occs = n(), .groups = "drop") %>%
  left_join(trn_words_stats, by = "source") %>%
  mutate(cov_pct = num_occs / totwords * 100) %>%
  group_by(source) %>% window_order(desc(cov_pct)) %>%
  mutate(cum_cov_pct = cumsum(cov_pct))
#head (trn_words_dstats)
#head (trn_2grams_dstats)
#head (trn_3grams_dstats)
head(trn_2grams, n=20)
## # Source: spark<?> [?? x 2]
##    twogram       source
##    <chr>         <chr> 
##  1 " omg"        blog  
##  2 "omg how"     blog  
##  3 "how could"   blog  
##  4 "could i"     blog  
##  5 "i have"      blog  
##  6 "have missed" blog  
##  7 "missed this" blog  
##  8 " are"        blog  
##  9 "are you"     blog  
## 10 "you a"       blog  
## # ℹ more rows
head(trn_3grams, n=20)
## # Source: spark<?> [?? x 2]
##    threegram          source
##    <chr>              <chr> 
##  1 " omg how"         blog  
##  2 "omg how could"    blog  
##  3 "how could i"      blog  
##  4 "could i have"     blog  
##  5 "i have missed"    blog  
##  6 "have missed this" blog  
##  7 " are you"         blog  
##  8 "are you a"        blog  
##  9 "you a member"     blog  
## 10 " do these"        blog  
## # ℹ more rows

Dictionary Coverage Calculations

Determine how many distinct dictionary words are needed to cover 90% and 50% of all word occurrences in each portion of the dataset, and what percentage of each source's total vocabulary that represents.

trn_words_cum100_stats <- trn_words_dstats %>% summarise(numdwords100 = n()) %>% collect()
trn_words_cum90_stats <- trn_words_dstats %>% filter(cum_cov_pct <= 90) %>% summarise(numdwords90 = n()) %>% collect()
trn_words_cum50_stats <- trn_words_dstats %>% filter(cum_cov_pct <= 50) %>% summarise(numdwords50 = n()) %>% collect()
trn_words_cum100_stats %>% left_join(trn_words_cum90_stats) %>% left_join(trn_words_cum50_stats) %>%
        mutate(cov_90pct = numdwords90 / numdwords100 * 100, cov_50pct = numdwords50 / numdwords100 * 100)
## Joining with `by = join_by(source)`
## Joining with `by = join_by(source)`
## # A tibble: 3 × 6
##   source numdwords100 numdwords90 numdwords50 cov_90pct cov_50pct
##   <chr>         <dbl>       <dbl>       <dbl>     <dbl>     <dbl>
## 1 blog         352659        9708         262      2.75    0.0743
## 2 news         257054       10535         433      4.10    0.168 
## 3 twit         375333        8198         228      2.18    0.0607
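
In every source, well under 0.2% of the distinct vocabulary covers half of all word occurrences, and roughly 2-4% covers 90%, so a fairly compact dictionary should be sufficient for prediction.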

Top Fifty Words (any source)

top50words <- trn_words %>%
  count(word) %>% arrange(-n) %>% head(50) %>% collect() # Bring the top 50 back to R for plotting

ggplot(top50words, aes(x=n, y=fct_reorder(word,n) )) +
  geom_col() +
  labs(title="Word Counts", x="Frequency", y="Word") #+theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Top Fifty 2-grams (any source)

top50twograms <- trn_2grams %>%
  count(twogram) %>% arrange(-n) %>% head(50) %>% collect()

ggplot(top50twograms, aes(x=n, y=fct_reorder(twogram,n) )) +
  geom_col() +
  labs(title="2-gram Counts", x="Frequency", y="2-gram")

Top Fifty 3-grams (any source)

top50threegrams <- trn_3grams %>%
  count(threegram) %>% arrange(-n) %>% head(50) %>% collect()

ggplot(top50threegrams, aes(x=n, y=fct_reorder(threegram,n) )) +
  geom_col() +
  labs(title="3-gram Counts", x="Frequency", y="3-gram") 

Word Cloud - Top 300 Words, by source

Blue words are from Twitter, pink from blogs, and green from news.

wc300 <- trn_words %>% count(source, word) %>%
  arrange(-n) %>% head(300) %>% collect() %>%
  mutate(col = case_when(source == "twit" ~ "#56B4E9",
                         source == "blog" ~ "#E69FB4",
                         source == "news" ~ "#B4C67F"))
                                               
wc300 %>% 
  with(wordcloud::wordcloud(
    word,
    n,
    ordered.colors = TRUE,
    colors = wc300$col
  ))

Garbage Collection
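
This chunk is not echoed; presumably it calls gc(), which produced the memory report below (an assumption):

# Force a garbage collection and report R's memory use.
gc()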

##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1473563 78.7    2346848 125.4  2346848 125.4
## Vcells 2685610 20.5    8388608  64.0  6624690  50.6

Disconnect Spark and Cleanup
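
This chunk is also not echoed; a sketch, assuming the connection object sc from the setup above:

# Close the Spark connection and reclaim local memory.
spark_disconnect(sc)
gc()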

##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1473670 78.8    2346848 125.4  2346848 125.4
## Vcells 2685696 20.5    8388608  64.0  6624690  50.6