This milestone report presents and explains the exploratory analysis carried out for the SwiftKey prediction model project. It also outlines the goals and plans for the Shiny app that will be used to demonstrate how the text prediction model works.
Each of the three files (blogs, news, and Twitter) is sampled at 5% to reduce memory use and processing time. The sampled text is cleansed and tokenized, first into sentences and then into n-grams (n = 1 to 5). Each sampled set of n-grams is then reduced to the most frequent n-grams that together account for the top 80% of the cumulative n-gram frequency.
The exploratory data analysis will be based on the sampled data.
# Draw a 5% random sample of the blogs file to limit memory use and run time.
blog_line_count <- countLines(blogs)
data_blogs <- sample_lines(blogs, blog_line_count * 0.05) # 5% sample size
t <- tibble(text = data_blogs)

# Tokenize into sentences first so that no n-gram crosses a sentence boundary.
t_sentence <- t %>%
  unnest_tokens(sentence, text, token = "sentences")

# Count every distinct n-gram for n = 1..5.
t_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 1) %>%
  count(ngram)
t2_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 2) %>%
  count(ngram)
t3_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 3) %>%
  count(ngram)
t4_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 4) %>%
  count(ngram)
t5_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 5) %>%
  count(ngram)

# Keep the most frequent n-grams whose cumulative count stays within 80% of
# all n-gram occurrences; the low-frequency 20% tail is discarded.
# NA n-grams (presumably from sentences shorter than n words) are dropped
# first, and the 80% threshold is computed from the remaining counts
# (`sum(n)` inside the pipeline) so the NA tally cannot inflate the cutoff.
t80_ngram <- t_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t2_80_ngram <- t2_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t3_80_ngram <- t3_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t4_80_ngram <- t4_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t5_80_ngram <- t5_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
The following compares the line count with the n-gram counts for the blogs sample, followed by basic frequency tables for words (1-grams), 2-grams, 3-grams, 4-grams, and 5-grams.
# Compare the number of sampled blog lines with the number of distinct
# n-grams retained (top 80%) for each n-gram order.
# Rows alternate "Ngram" / "Line" for every order so that ggline can draw
# one series per value of `Num`.
df <- data.frame(
  Ngram = rep(
    c("1-Ngram", "2-Ngram", "3-Ngram", "4-Ngram", "5-Ngram"),
    each = 2
  ),
  # Stored as numeric so the y axis is continuous.
  Count = as.numeric(c(
    nrow(t80_ngram), length(data_blogs),
    nrow(t2_80_ngram), length(data_blogs),
    nrow(t3_80_ngram), length(data_blogs),
    nrow(t4_80_ngram), length(data_blogs),
    nrow(t5_80_ngram), length(data_blogs)
  )),
  Num = rep(c("Ngram", "Line"), times = 5),
  stringsAsFactors = FALSE
)
ggline(
  df,
  x = "Ngram",
  y = "Count",
  group = "Num",
  color = "Num",
  palette = "simpsons",
  title = "Blogs Top 80% Frequency Line Count and Ngram Count"
)
# Basic data tables: first six rows of each retained blogs n-gram table
# (n = 1..5). Columns: ngram = the n-gram text, n = its count in the sample,
# csum = running total of n in descending-frequency order.
# The "##" lines after each call appear to be the rendered console output.
head(t80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 the 92242 92242
## 2 and 54213 146455
## 3 to 53411 199866
## 4 a 44675 244541
## 5 of 43303 287844
## 6 i 38457 326301
head(t2_80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 of the 9186 9186
## 2 in the 7693 16879
## 3 to the 4247 21126
## 4 on the 3737 24863
## 5 to be 3410 28273
## 6 for the 2979 31252
head(t3_80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 one of the 722 722
## 2 a lot of 555 1277
## 3 to be a 363 1640
## 4 as well as 339 1979
## 5 it was a 332 2311
## 6 out of the 318 2629
head(t4_80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 the end of the 179 179
## 2 at the end of 151 330
## 3 the rest of the 135 465
## 4 at the same time 115 580
## 5 for the first time 105 685
## 6 when it comes to 105 790
head(t5_80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 at the end of the 101 101
## 2 in the middle of the 41 142
## 3 the end of the day 27 169
## 4 for the first time in 24 193
## 5 on the other side of 23 216
## 6 the other side of the 22 238
# Horizontal bar charts of the ten most frequent blogs n-grams for each
# order (n = 1..5), sorted by descending count.
t80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Blogs 1-Ngram (Word) Top 80% Frequencies"
  )
t2_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Blogs 2-Ngram Top 80% Frequencies"
  )
t3_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Blogs 3-Ngram Top 80% Frequencies"
  )
t4_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Blogs 4-Ngram Top 80% Frequencies"
  )
t5_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Blogs 5-Ngram Top 80% Frequencies"
  )
# Draw a 5% random sample of the news file to limit memory use and run time.
news_line_count <- countLines(news)
data_news <- sample_lines(news, news_line_count * 0.05) # 5% sample size
t <- tibble(text = data_news)

# Tokenize into sentences first so that no n-gram crosses a sentence boundary.
t_sentence <- t %>%
  unnest_tokens(sentence, text, token = "sentences")

# Count every distinct n-gram for n = 1..5.
t_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 1) %>%
  count(ngram)
t2_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 2) %>%
  count(ngram)
t3_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 3) %>%
  count(ngram)
t4_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 4) %>%
  count(ngram)
t5_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 5) %>%
  count(ngram)

# Keep the most frequent n-grams whose cumulative count stays within 80% of
# all n-gram occurrences; the low-frequency 20% tail is discarded.
# NA n-grams (presumably from sentences shorter than n words) are dropped
# first, and the 80% threshold is computed from the remaining counts
# (`sum(n)` inside the pipeline) so the NA tally cannot inflate the cutoff.
t80_ngram <- t_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t2_80_ngram <- t2_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t3_80_ngram <- t3_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t4_80_ngram <- t4_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t5_80_ngram <- t5_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
The following compares the line count with the n-gram counts for the news sample, followed by basic frequency tables for words (1-grams), 2-grams, 3-grams, 4-grams, and 5-grams.
# Compare the number of sampled news lines with the number of distinct
# n-grams retained (top 80%) for each n-gram order.
# The "Line" series uses the news sample (`data_news`), matching the data
# set named in the plot title.
# Rows alternate "Ngram" / "Line" for every order so that ggline can draw
# one series per value of `Num`.
df <- data.frame(
  Ngram = rep(
    c("1-Ngram", "2-Ngram", "3-Ngram", "4-Ngram", "5-Ngram"),
    each = 2
  ),
  # Stored as numeric so the y axis is continuous.
  Count = as.numeric(c(
    nrow(t80_ngram), length(data_news),
    nrow(t2_80_ngram), length(data_news),
    nrow(t3_80_ngram), length(data_news),
    nrow(t4_80_ngram), length(data_news),
    nrow(t5_80_ngram), length(data_news)
  )),
  Num = rep(c("Ngram", "Line"), times = 5),
  stringsAsFactors = FALSE
)
ggline(
  df,
  x = "Ngram",
  y = "Count",
  group = "Num",
  color = "Num",
  palette = "simpsons",
  title = "News Top 80% Frequency Line Count and Ngram Count"
)
# Basic data tables: first six rows of each retained news n-gram table
# (n = 1..5). Columns: ngram = the n-gram text, n = its count in the sample,
# csum = running total of n in descending-frequency order.
# The "##" lines after each call appear to be the rendered console output.
head(t80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 the 98733 98733
## 2 to 44464 143197
## 3 and 44272 187469
## 4 a 43656 231125
## 5 of 38793 269918
## 6 in 33759 303677
head(t2_80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 of the 9432 9432
## 2 in the 8688 18120
## 3 to the 4052 22172
## 4 on the 3660 25832
## 5 for the 3398 29230
## 6 at the 2920 32150
head(t3_80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 one of the 728 728
## 2 a lot of 566 1294
## 3 as well as 307 1601
## 4 in the first 285 1886
## 5 out of the 285 2171
## 6 some of the 277 2448
head(t4_80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 for the first time 137 137
## 2 the end of the 136 273
## 3 the rest of the 119 392
## 4 at the end of 118 510
## 5 said in a statement 88 598
## 6 is one of the 77 675
head(t5_80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 at the end of the 62 62
## 2 for the first time in 39 101
## 3 by the end of the 33 134
## 4 for the first time since 29 163
## 5 the end of the year 28 191
## 6 in the middle of the 26 217
# Horizontal bar charts of the ten most frequent news n-grams for each
# order (n = 1..5), sorted by descending count.
# Titles carry the "News" prefix so these plots can be told apart from the
# corresponding blogs and Twitter plots.
t80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "News 1-Ngram (Word) Top 80% Frequencies"
  )
t2_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "News 2-Ngram Top 80% Frequencies"
  )
t3_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "News 3-Ngram Top 80% Frequencies"
  )
t4_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "News 4-Ngram Top 80% Frequencies"
  )
t5_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "News 5-Ngram Top 80% Frequencies"
  )
# Draw a 5% random sample of the Twitter file to limit memory use and run time.
twitter_line_count <- countLines(twitter)
data_twitter <- sample_lines(twitter, twitter_line_count * 0.05) # 5% sample size
t <- tibble(text = data_twitter)

# Tokenize into sentences first so that no n-gram crosses a sentence boundary.
t_sentence <- t %>%
  unnest_tokens(sentence, text, token = "sentences")

# Count every distinct n-gram for n = 1..5.
t_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 1) %>%
  count(ngram)
t2_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 2) %>%
  count(ngram)
t3_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 3) %>%
  count(ngram)
t4_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 4) %>%
  count(ngram)
t5_ngram <- t_sentence %>%
  unnest_tokens(ngram, sentence, token = "ngrams", n = 5) %>%
  count(ngram)

# Keep the most frequent n-grams whose cumulative count stays within 80% of
# all n-gram occurrences; the low-frequency 20% tail is discarded.
# NA n-grams (presumably from sentences shorter than n words) are dropped
# first, and the 80% threshold is computed from the remaining counts
# (`sum(n)` inside the pipeline) so the NA tally cannot inflate the cutoff.
t80_ngram <- t_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t2_80_ngram <- t2_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t3_80_ngram <- t3_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t4_80_ngram <- t4_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
t5_80_ngram <- t5_ngram %>%
  filter(!is.na(ngram)) %>%
  arrange(desc(n)) %>%
  mutate(csum = cumsum(n)) %>%
  filter(csum <= sum(n) * 0.8)
The following compares the line count with the n-gram counts for the Twitter sample, followed by basic frequency tables for words (1-grams), 2-grams, 3-grams, 4-grams, and 5-grams.
# Compare the number of sampled Twitter lines with the number of distinct
# n-grams retained (top 80%) for each n-gram order.
# The "Line" series uses the Twitter sample (`data_twitter`), matching the
# data set named in the plot title.
# Rows alternate "Ngram" / "Line" for every order so that ggline can draw
# one series per value of `Num`.
df <- data.frame(
  Ngram = rep(
    c("1-Ngram", "2-Ngram", "3-Ngram", "4-Ngram", "5-Ngram"),
    each = 2
  ),
  # Stored as numeric so the y axis is continuous.
  Count = as.numeric(c(
    nrow(t80_ngram), length(data_twitter),
    nrow(t2_80_ngram), length(data_twitter),
    nrow(t3_80_ngram), length(data_twitter),
    nrow(t4_80_ngram), length(data_twitter),
    nrow(t5_80_ngram), length(data_twitter)
  )),
  Num = rep(c("Ngram", "Line"), times = 5),
  stringsAsFactors = FALSE
)
ggline(
  df,
  x = "Ngram",
  y = "Count",
  group = "Num",
  color = "Num",
  palette = "simpsons",
  title = "Twitter Top 80% Frequency Line Count and Ngram Count"
)
# Basic data tables: first six rows of each retained Twitter n-gram table
# (n = 1..5). Columns: ngram = the n-gram text, n = its count in the sample,
# csum = running total of n in descending-frequency order.
# The "##" lines after each call appear to be the rendered console output.
head(t80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 the 46891 46891
## 2 to 39501 86392
## 3 i 36261 122653
## 4 a 30496 153149
## 5 you 27413 180562
## 6 and 21836 202398
head(t2_80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 in the 3888 3888
## 2 for the 3711 7599
## 3 of the 2878 10477
## 4 to be 2377 12854
## 5 on the 2315 15169
## 6 thanks for 2104 17273
head(t3_80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 thanks for the 1177 1177
## 2 i love you 458 1635
## 3 looking forward to 443 2078
## 4 thank you for 440 2518
## 5 can't wait to 391 2909
## 6 going to be 390 3299
head(t4_80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 thanks for the follow 270 270
## 2 thanks for the rt 169 439
## 3 can't wait to see 158 597
## 4 thank you for the 148 745
## 5 thank you so much 134 879
## 6 is going to be 111 990
head(t5_80_ngram)
## # A tibble: 6 × 3
## ngram n csum
## <chr> <int> <int>
## 1 no no no no no 61 61
## 2 can't wait to see you 47 108
## 3 thank you so much for 43 151
## 4 thanks for the shout out 42 193
## 5 it's going to be a 41 234
## 6 thanks so much for the 40 274
# Horizontal bar charts of the ten most frequent Twitter n-grams for each
# order (n = 1..5), sorted by descending count.
# Titles carry the "Twitter" prefix so these plots can be told apart from
# the corresponding blogs and news plots.
t80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Twitter 1-Ngram (Word) Top 80% Frequencies"
  )
t2_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Twitter 2-Ngram Top 80% Frequencies"
  )
t3_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Twitter 3-Ngram Top 80% Frequencies"
  )
t4_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Twitter 4-Ngram Top 80% Frequencies"
  )
t5_80_ngram %>%
  top_n(10, n) %>%
  ggbarplot(
    x = "ngram",
    y = "n",
    orientation = "horizontal",
    fill = "ngram",
    sort.val = "desc",
    sort.by.groups = TRUE,
    palette = "simpsons",
    title = "Twitter 5-Ngram Top 80% Frequencies"
  )
The goal of the Shiny app will be to demonstrate the text prediction algorithm model. It takes a phrase as input and predicts the next word.