library(stringi)
library(ggplot2)
library(dplyr)
library(tidytext)
library(readr)
# Fetch the Coursera-SwiftKey corpus archive (once) and make sure the
# en_US text files read further below are actually present on disk.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"
corpus_files <- c(
  "final/en_US/en_US.blogs.txt",
  "final/en_US/en_US.news.txt",
  "final/en_US/en_US.twitter.txt"
)

if (!file.exists(zip_file)) {
  # The archive is binary, so request a binary transfer explicitly rather
  # than relying on the platform default.
  download.file(url, zip_file, method = "auto", mode = "wb")
}

# Extraction is checked separately from the download: an archive that is
# already present (e.g. from an interrupted earlier run) must still be
# unpacked when any of the expected text files is missing.
if (!all(file.exists(corpus_files))) {
  unzip(zip_file)
}
# Read a whole text file into a character vector, one element per line,
# with the strings marked as UTF-8.
#
# The file is opened through an explicit binary-mode connection instead of
# handing the path to readLines(), which would open it in text mode. In text
# mode on Windows a Ctrl-Z (0x1A) byte inside the data can be treated as
# end-of-file and silently cut the read short; binary mode reads every line
# on all platforms. readLines() accepts LF, CRLF and CR line endings
# regardless of the connection mode.
#
# @param path Path to the text file.
# @return Character vector of lines.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", warn = FALSE)
}

blogs <- read_corpus("final/en_US/en_US.blogs.txt")
news <- read_corpus("final/en_US/en_US.news.txt")
twitter <- read_corpus("final/en_US/en_US.twitter.txt")
# Overview of the three corpora: number of lines read, total word count
# (stringi word-boundary counting) and on-disk file size in MiB.
summary_table <- data.frame(
  Source = c("Blogs", "News", "Twitter"),
  Lines = lengths(list(blogs, news, twitter)),
  Words = vapply(
    list(blogs, news, twitter),
    function(lines) sum(stri_count_words(lines)),
    integer(1)
  ),
  # file.size() is vectorised over paths and reports bytes.
  File_Size_MB = file.size(
    c(
      "final/en_US/en_US.blogs.txt",
      "final/en_US/en_US.news.txt",
      "final/en_US/en_US.twitter.txt"
    )
  ) / 1024^2
)
summary_table
## Source Lines Words File_Size_MB
## 1 Blogs 899288 37546250 200.4242
## 2 News 1010242 34762395 196.2775
## 3 Twitter 2360148 30093372 159.3641
# Draw a reproducible sample of 5000 lines from each corpus (blogs, then
# news, then twitter, so the random draws happen in a fixed order) and put
# the pooled lines into a one-column data frame for tokenisation.
set.seed(123)
sample_text <- unlist(
  lapply(list(blogs, news, twitter), sample, size = 5000)
)
text_df <- data.frame(text = sample_text)
# Single-word frequencies: tokenise the sampled text into words, drop the
# entries listed in tidytext's `stop_words`, and count what remains, most
# frequent first.
words <- count(
  anti_join(
    unnest_tokens(text_df, word, text),
    stop_words,
    by = "word"
  ),
  word,
  sort = TRUE
)
head(words, n = 20)
## word n
## 1 time 1007
## 2 people 709
## 3 day 634
## 4 love 516
## 5 2 414
## 6 1 412
## 7 home 376
## 8 3 371
## 9 school 343
## 10 week 337
## 11 life 334
## 12 world 306
## 13 it’s 282
## 14 game 280
## 15 night 274
## 16 city 268
## 17 10 267
## 18 5 264
## 19 4 246
## 20 feel 242
# Horizontal bar chart of the 20 highest word counts (ties at the cut-off
# are kept by top_n); bars are ordered by frequency.
ggplot(
  top_n(words, 20, n),
  aes(x = reorder(word, n), y = n)
) +
  geom_col() +
  coord_flip() +
  ggtitle("Top 20 Most Frequent Words") +
  xlab("Words") +
  ylab("Frequency")

# Bigram frequencies over the sampled text (stop words are kept here),
# most frequent first.
bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # Lines with fewer than two tokens can yield an NA bigram depending on the
  # tidytext/tokenizers version; drop those so NA is never counted as a
  # phrase.
  dplyr::filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)
head(bigrams, 20)
## bigram n
## 1 of the 2019
## 2 in the 1875
## 3 to the 1027
## 4 on the 921
## 5 for the 845
## 6 to be 749
## 7 at the 605
## 8 and the 594
## 9 in a 570
## 10 with the 479
## 11 from the 441
## 12 is a 440
## 13 it was 432
## 14 for a 411
## 15 it is 394
## 16 will be 390
## 17 i was 386
## 18 and i 382
## 19 with a 382
## 20 i have 371
# Horizontal bar chart of the 20 highest bigram counts (ties at the cut-off
# are kept by top_n); bars are ordered by frequency.
ggplot(
  top_n(bigrams, 20, n),
  aes(x = reorder(bigram, n), y = n)
) +
  geom_col() +
  coord_flip() +
  ggtitle("Top 20 Most Frequent Bigrams") +
  xlab("Bigram") +
  ylab("Frequency")

# Trigram frequencies over the sampled text (stop words are kept here),
# most frequent first.
trigrams <- text_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  # Lines with fewer than three tokens can yield an NA trigram depending on
  # the tidytext/tokenizers version; drop those so NA is never counted as a
  # phrase.
  dplyr::filter(!is.na(trigram)) %>%
  count(trigram, sort = TRUE)
head(trigrams, 20)
## trigram n
## 1 one of the 160
## 2 a lot of 126
## 3 going to be 83
## 4 to be a 73
## 5 i want to 66
## 6 it was a 66
## 7 as well as 65
## 8 be able to 65
## 9 some of the 64
## 10 i have to 57
## 11 thanks for the 56
## 12 the end of 56
## 13 you want to 55
## 14 this is a 54
## 15 the first time 52
## 16 a couple of 51
## 17 out of the 49
## 18 the rest of 49
## 19 there is a 49
## 20 it is a 46
# Horizontal bar chart of the 20 highest trigram counts (ties at the cut-off
# are kept by top_n); bars are ordered by frequency.
ggplot(
  top_n(trigrams, 20, n),
  aes(x = reorder(trigram, n), y = n)
) +
  geom_col() +
  coord_flip() +
  ggtitle("Top 20 Most Frequent Trigrams") +
  xlab("Trigram") +
  ylab("Frequency")
