SwiftKey Milestone Report

This milestone report shows and explains the exploratory analysis carried out for the SwiftKey prediction model project. It also describes the goals and plans for the Shiny app that will be used to demonstrate how the text prediction model works.

Project Overview

Dataset

Capstone Dataset

About the Corpora

3 Files

The three files will be sampled to reduce memory use and processing time: 5% of the lines of each file are sampled. The sampled files will be cleansed and tokenized, first into sentences, then into n-grams (1-5). Each sampled set of n-grams will also be reduced to the most frequent n-grams that together account for the top 80% of cumulative n-gram frequency.

The exploratory data analysis will be based on the sampled data.

Basic Plots - Exploratory Data Analysis - Sampled Data 5%

Blogs Cleanse

# Sample 5% of the lines of the blogs file and wrap them in a one-column tibble.
blog_line_count <- countLines(blogs)
data_blogs <- sample_lines(blogs, blog_line_count * 0.05) # 5% sample size
t <- tibble(text = data_blogs)

# Tokenize into sentences first so that no n-gram spans a sentence boundary.
t_sentence <- t %>% unnest_tokens(sentence, text, token = "sentences")

# Frequency table (columns: ngram, n) for each n-gram order from 1 to 5.
t_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 1) %>%
  count(ngram)
t2_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 2) %>%
  count(ngram)
t3_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 3) %>%
  count(ngram)
t4_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 4) %>%
  count(ngram)
t5_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 5) %>%
  count(ngram)

# Keep the most frequent n-grams whose cumulative count stays within 80% of the
# total; the long tail (bottom 20% of the area under the curve) is dropped.
# NA n-grams (unnest_tokens() can return NA, e.g. for sentences with fewer words
# than the n-gram order) are removed first. The 80% cut-off is computed with
# sum(n) on the NA-free table, so the count of the NA row does not inflate the
# threshold.
t80_ngram <- t_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t2_80_ngram <- t2_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t3_80_ngram <- t3_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t4_80_ngram <- t4_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t5_80_ngram <- t5_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)

Blogs Summary

Line Count, word count, basic data table with frequencies for each of word, 2-gram, 3-gram, 4-gram, and 5-gram.

# compare line and word count
# Build a long table pairing, for every n-gram order (1-5), the number of rows
# in the top-80% n-gram table with the number of sampled blog lines, then draw
# both series as a line chart.
blog_lines <- length(data_blogs)
count_rows <- list(
  c("1-Ngram", nrow(t80_ngram), "Ngram"),
  c("1-Ngram", blog_lines, "Line"),
  c("2-Ngram", nrow(t2_80_ngram), "Ngram"),
  c("2-Ngram", blog_lines, "Line"),
  c("3-Ngram", nrow(t3_80_ngram), "Ngram"),
  c("3-Ngram", blog_lines, "Line"),
  c("4-Ngram", nrow(t4_80_ngram), "Ngram"),
  c("4-Ngram", blog_lines, "Line"),
  c("5-Ngram", nrow(t5_80_ngram), "Ngram"),
  c("5-Ngram", blog_lines, "Line")
)
df <- data.frame(do.call(rbind, count_rows))
names(df) <- c("Ngram", "Count", "Num")
# mixing labels and counts in c() made every value character; restore numbers
df$Count <- as.numeric(df$Count)
ggline(
  df, "Ngram", "Count",
  group = "Num", color = "Num", palette = "simpsons",
  title = "Blogs Top 80% Frequency Line Count and Ngram Count"
)

# Basic data tables: first six rows of each blogs top-80% n-gram table.
# The tables were sorted by descending count, so these are the most frequent
# n-grams; csum is the running total of n. Lines starting with "##" are the
# printed output of the call above them.
# 1-grams (words)
head(t80_ngram)
## # A tibble: 6 × 3
##   ngram     n   csum
##   <chr> <int>  <int>
## 1 the   92242  92242
## 2 and   54213 146455
## 3 to    53411 199866
## 4 a     44675 244541
## 5 of    43303 287844
## 6 i     38457 326301
# 2-grams
head(t2_80_ngram)
## # A tibble: 6 × 3
##   ngram       n  csum
##   <chr>   <int> <int>
## 1 of the   9186  9186
## 2 in the   7693 16879
## 3 to the   4247 21126
## 4 on the   3737 24863
## 5 to be    3410 28273
## 6 for the  2979 31252
# 3-grams
head(t3_80_ngram)
## # A tibble: 6 × 3
##   ngram          n  csum
##   <chr>      <int> <int>
## 1 one of the   722   722
## 2 a lot of     555  1277
## 3 to be a      363  1640
## 4 as well as   339  1979
## 5 it was a     332  2311
## 6 out of the   318  2629
# 4-grams
head(t4_80_ngram)
## # A tibble: 6 × 3
##   ngram                  n  csum
##   <chr>              <int> <int>
## 1 the end of the       179   179
## 2 at the end of        151   330
## 3 the rest of the      135   465
## 4 at the same time     115   580
## 5 for the first time   105   685
## 6 when it comes to     105   790
# 5-grams
head(t5_80_ngram)
## # A tibble: 6 × 3
##   ngram                     n  csum
##   <chr>                 <int> <int>
## 1 at the end of the       101   101
## 2 in the middle of the     41   142
## 3 the end of the day       27   169
## 4 for the first time in    24   193
## 5 on the other side of     23   216
## 6 the other side of the    22   238
# Horizontal bar charts of the ten most frequent blogs n-grams, one chart per
# n-gram order (1-5), bars sorted by descending count.
t80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Blogs 1-Ngram (Word) Top 80% Frequencies"
  )

t2_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Blogs 2-Ngram Top 80% Frequencies"
  )

t3_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Blogs 3-Ngram Top 80% Frequencies"
  )

t4_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Blogs 4-Ngram Top 80% Frequencies"
  )

t5_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Blogs 5-Ngram Top 80% Frequencies"
  )

News Cleanse

# Sample 5% of the lines of the news file and wrap them in a one-column tibble.
news_line_count <- countLines(news)
data_news <- sample_lines(news, news_line_count * 0.05) # 5% sample size
t <- tibble(text = data_news)

# Tokenize into sentences first so that no n-gram spans a sentence boundary.
t_sentence <- t %>% unnest_tokens(sentence, text, token = "sentences")

# Frequency table (columns: ngram, n) for each n-gram order from 1 to 5.
t_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 1) %>%
  count(ngram)
t2_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 2) %>%
  count(ngram)
t3_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 3) %>%
  count(ngram)
t4_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 4) %>%
  count(ngram)
t5_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 5) %>%
  count(ngram)

# Keep the most frequent n-grams whose cumulative count stays within 80% of the
# total; the long tail (bottom 20% of the area under the curve) is dropped.
# NA n-grams (unnest_tokens() can return NA, e.g. for sentences with fewer words
# than the n-gram order) are removed first. The 80% cut-off is computed with
# sum(n) on the NA-free table, so the count of the NA row does not inflate the
# threshold.
t80_ngram <- t_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t2_80_ngram <- t2_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t3_80_ngram <- t3_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t4_80_ngram <- t4_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t5_80_ngram <- t5_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)

News Summary

Line Count, word count, basic data table with frequencies for each of word, 2-gram, 3-gram, 4-gram, and 5-gram.

# compare line and word count
# Compare, for every n-gram order (1-5), the number of rows in the news
# top-80% n-gram table with the number of sampled news lines.
# The "Line" series must use the news sample (data_news); using data_blogs here
# would plot the blogs sample size under a News title.
news_lines <- length(data_news)
df <- data.frame(rbind(
  c("1-Ngram", nrow(t80_ngram), "Ngram"),
  c("1-Ngram", news_lines, "Line"),
  c("2-Ngram", nrow(t2_80_ngram), "Ngram"),
  c("2-Ngram", news_lines, "Line"),
  c("3-Ngram", nrow(t3_80_ngram), "Ngram"),
  c("3-Ngram", news_lines, "Line"),
  c("4-Ngram", nrow(t4_80_ngram), "Ngram"),
  c("4-Ngram", news_lines, "Line"),
  c("5-Ngram", nrow(t5_80_ngram), "Ngram"),
  c("5-Ngram", news_lines, "Line")
))
colnames(df) <- c("Ngram", "Count", "Num")
# mixing labels and counts in c() made every value character; restore numbers
df$Count <- as.numeric(df$Count)
ggline(
  df, "Ngram", "Count",
  group = "Num", color = "Num", palette = "simpsons",
  title = "News Top 80% Frequency Line Count and Ngram Count"
)

# Basic data tables: first six rows of each news top-80% n-gram table.
# The tables were sorted by descending count, so these are the most frequent
# n-grams; csum is the running total of n. Lines starting with "##" are the
# printed output of the call above them.
# 1-grams (words)
head(t80_ngram)
## # A tibble: 6 × 3
##   ngram     n   csum
##   <chr> <int>  <int>
## 1 the   98733  98733
## 2 to    44464 143197
## 3 and   44272 187469
## 4 a     43656 231125
## 5 of    38793 269918
## 6 in    33759 303677
# 2-grams
head(t2_80_ngram)
## # A tibble: 6 × 3
##   ngram       n  csum
##   <chr>   <int> <int>
## 1 of the   9432  9432
## 2 in the   8688 18120
## 3 to the   4052 22172
## 4 on the   3660 25832
## 5 for the  3398 29230
## 6 at the   2920 32150
# 3-grams
head(t3_80_ngram)
## # A tibble: 6 × 3
##   ngram            n  csum
##   <chr>        <int> <int>
## 1 one of the     728   728
## 2 a lot of       566  1294
## 3 as well as     307  1601
## 4 in the first   285  1886
## 5 out of the     285  2171
## 6 some of the    277  2448
# 4-grams
head(t4_80_ngram)
## # A tibble: 6 × 3
##   ngram                   n  csum
##   <chr>               <int> <int>
## 1 for the first time    137   137
## 2 the end of the        136   273
## 3 the rest of the       119   392
## 4 at the end of         118   510
## 5 said in a statement    88   598
## 6 is one of the          77   675
# 5-grams
head(t5_80_ngram)
## # A tibble: 6 × 3
##   ngram                        n  csum
##   <chr>                    <int> <int>
## 1 at the end of the           62    62
## 2 for the first time in       39   101
## 3 by the end of the           33   134
## 4 for the first time since    29   163
## 5 the end of the year         28   191
## 6 in the middle of the        26   217
# Horizontal bar charts of the ten most frequent news n-grams, one chart per
# n-gram order (1-5), bars sorted by descending count.
# Titles carry the "News" prefix so these charts can be told apart from the
# same charts drawn for the other corpora.
t80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram", y = "n", orientation = "horizontal",
    fill = "ngram", sort.val = "desc", sort.by.groups = TRUE,
    palette = "simpsons",
    title = "News 1-Ngram (Word) Top 80% Frequencies"
  )

t2_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram", y = "n", orientation = "horizontal",
    fill = "ngram", sort.val = "desc", sort.by.groups = TRUE,
    palette = "simpsons",
    title = "News 2-Ngram Top 80% Frequencies"
  )

t3_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram", y = "n", orientation = "horizontal",
    fill = "ngram", sort.val = "desc", sort.by.groups = TRUE,
    palette = "simpsons",
    title = "News 3-Ngram Top 80% Frequencies"
  )

t4_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram", y = "n", orientation = "horizontal",
    fill = "ngram", sort.val = "desc", sort.by.groups = TRUE,
    palette = "simpsons",
    title = "News 4-Ngram Top 80% Frequencies"
  )

t5_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram", y = "n", orientation = "horizontal",
    fill = "ngram", sort.val = "desc", sort.by.groups = TRUE,
    palette = "simpsons",
    title = "News 5-Ngram Top 80% Frequencies"
  )

Twitter Cleanse

# Sample 5% of the lines of the twitter file and wrap them in a one-column tibble.
twitter_line_count <- countLines(twitter)
data_twitter <- sample_lines(twitter, twitter_line_count * 0.05) # 5% sample size
t <- tibble(text = data_twitter)

# Tokenize into sentences first so that no n-gram spans a sentence boundary.
t_sentence <- t %>% unnest_tokens(sentence, text, token = "sentences")

# Frequency table (columns: ngram, n) for each n-gram order from 1 to 5.
t_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 1) %>%
  count(ngram)
t2_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 2) %>%
  count(ngram)
t3_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 3) %>%
  count(ngram)
t4_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 4) %>%
  count(ngram)
t5_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 5) %>%
  count(ngram)

# Keep the most frequent n-grams whose cumulative count stays within 80% of the
# total; the long tail (bottom 20% of the area under the curve) is dropped.
# NA n-grams (unnest_tokens() can return NA, e.g. for sentences with fewer words
# than the n-gram order) are removed first. The 80% cut-off is computed with
# sum(n) on the NA-free table, so the count of the NA row does not inflate the
# threshold.
t80_ngram <- t_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t2_80_ngram <- t2_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t3_80_ngram <- t3_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t4_80_ngram <- t4_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t5_80_ngram <- t5_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)

Twitter Summary

Line Count, word count, basic data table with frequencies for each of word, 2-gram, 3-gram, 4-gram, and 5-gram.

# compare line and word count
# Compare, for every n-gram order (1-5), the number of rows in the twitter
# top-80% n-gram table with the number of sampled twitter lines.
# The "Line" series must use the twitter sample (data_twitter); using data_blogs
# here would plot the blogs sample size under a Twitter title.
twitter_lines <- length(data_twitter)
df <- data.frame(rbind(
  c("1-Ngram", nrow(t80_ngram), "Ngram"),
  c("1-Ngram", twitter_lines, "Line"),
  c("2-Ngram", nrow(t2_80_ngram), "Ngram"),
  c("2-Ngram", twitter_lines, "Line"),
  c("3-Ngram", nrow(t3_80_ngram), "Ngram"),
  c("3-Ngram", twitter_lines, "Line"),
  c("4-Ngram", nrow(t4_80_ngram), "Ngram"),
  c("4-Ngram", twitter_lines, "Line"),
  c("5-Ngram", nrow(t5_80_ngram), "Ngram"),
  c("5-Ngram", twitter_lines, "Line")
))
colnames(df) <- c("Ngram", "Count", "Num")
# mixing labels and counts in c() made every value character; restore numbers
df$Count <- as.numeric(df$Count)
ggline(
  df, "Ngram", "Count",
  group = "Num", color = "Num", palette = "simpsons",
  title = "Twitter Top 80% Frequency Line Count and Ngram Count"
)

# Basic data tables: first six rows of each twitter top-80% n-gram table.
# The tables were sorted by descending count, so these are the most frequent
# n-grams; csum is the running total of n. Lines starting with "##" are the
# printed output of the call above them.
# 1-grams (words)
head(t80_ngram)
## # A tibble: 6 × 3
##   ngram     n   csum
##   <chr> <int>  <int>
## 1 the   46891  46891
## 2 to    39501  86392
## 3 i     36261 122653
## 4 a     30496 153149
## 5 you   27413 180562
## 6 and   21836 202398
# 2-grams
head(t2_80_ngram)
## # A tibble: 6 × 3
##   ngram          n  csum
##   <chr>      <int> <int>
## 1 in the      3888  3888
## 2 for the     3711  7599
## 3 of the      2878 10477
## 4 to be       2377 12854
## 5 on the      2315 15169
## 6 thanks for  2104 17273
# 3-grams
head(t3_80_ngram)
## # A tibble: 6 × 3
##   ngram                  n  csum
##   <chr>              <int> <int>
## 1 thanks for the      1177  1177
## 2 i love you           458  1635
## 3 looking forward to   443  2078
## 4 thank you for        440  2518
## 5 can't wait to        391  2909
## 6 going to be          390  3299
# 4-grams
head(t4_80_ngram)
## # A tibble: 6 × 3
##   ngram                     n  csum
##   <chr>                 <int> <int>
## 1 thanks for the follow   270   270
## 2 thanks for the rt       169   439
## 3 can't wait to see       158   597
## 4 thank you for the       148   745
## 5 thank you so much       134   879
## 6 is going to be          111   990
# 5-grams
head(t5_80_ngram)
## # A tibble: 6 × 3
##   ngram                        n  csum
##   <chr>                    <int> <int>
## 1 no no no no no              61    61
## 2 can't wait to see you       47   108
## 3 thank you so much for       43   151
## 4 thanks for the shout out    42   193
## 5 it's going to be a          41   234
## 6 thanks so much for the      40   274
# Horizontal bar charts of the ten most frequent twitter n-grams, one chart per
# n-gram order (1-5), bars sorted by descending count.
# Titles carry the "Twitter" prefix so these charts can be told apart from the
# same charts drawn for the other corpora.
t80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram", y = "n", orientation = "horizontal",
    fill = "ngram", sort.val = "desc", sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Twitter 1-Ngram (Word) Top 80% Frequencies"
  )

t2_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram", y = "n", orientation = "horizontal",
    fill = "ngram", sort.val = "desc", sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Twitter 2-Ngram Top 80% Frequencies"
  )

t3_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram", y = "n", orientation = "horizontal",
    fill = "ngram", sort.val = "desc", sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Twitter 3-Ngram Top 80% Frequencies"
  )

t4_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram", y = "n", orientation = "horizontal",
    fill = "ngram", sort.val = "desc", sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Twitter 4-Ngram Top 80% Frequencies"
  )

t5_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram", y = "n", orientation = "horizontal",
    fill = "ngram", sort.val = "desc", sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Twitter 5-Ngram Top 80% Frequencies"
  )

Shiny App Goals

The goal of the Shiny app will be to demonstrate the text prediction algorithm model. It takes a phrase as input and predicts the next word.

Shiny App Plan

  1. Download documents for text corpus.
  2. Exploratory analysis. Cleanse and tokenize data.
  3. Build prediction model.
  4. Build Shiny app. To be done in future updates.
  5. Create R presentation to pitch algorithm and app to investor or boss. To be done in future updates.