1. Download data-set
# Fetch the Coursera-SwiftKey archive into the working directory and extract
# it there. Both steps are skipped when their result is already present, so
# re-running the script does not download and unpack the archive again.
fileURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
f <- file.path(getwd(), "Coursera-SwiftKey.zip")
if (!file.exists(f)) {
  # mode = "wb" requests a binary transfer so the zip archive stays byte-exact.
  download.file(fileURL, f, mode = "wb")
}
if (!dir.exists(file.path(getwd(), "final", "en_US"))) {
  # Extract from the path that was downloaded to, not a separate literal name.
  unzip(zipfile = f, exdir = getwd())
}
2. Load data-set
# Read each English corpus file into a character vector, one element per line.
# Every connection is opened explicitly and closed as soon as its file has
# been read, so no connection object is left behind.
# NOTE(review): binary mode ("rb") is meant to keep platform-specific
# text-mode handling of control characters from cutting a read short; confirm
# the line counts against the summary table after running.
con_twitter <- file("final/en_US/en_US.twitter.txt", open = "rb")
twitter <- readLines(con_twitter, encoding = "UTF-8", skipNul = TRUE)
close(con_twitter)

con_blogs <- file("final/en_US/en_US.blogs.txt", open = "rb")
blogs <- readLines(con_blogs, encoding = "UTF-8", skipNul = TRUE)
close(con_blogs)

con_news <- file("final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con_news, encoding = "UTF-8", skipNul = TRUE)
close(con_news)
3. Install and load stringi package
## install.packages("stringi")
library(stringi)
4. Explore the basic characteristics of the data set
# Summarise each corpus: size on disk in megabytes (one decimal), number of
# lines, and total number of words.
# The sizes are read from the same final/en_US/ paths the corpora are loaded
# from; bare file names only resolve if the working directory holds copies.
corpus_paths <- file.path(
  "final", "en_US",
  c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
)
data.frame(
  File = c("Blogs", "News", "Twitter"),
  Size_Mb = round(file.size(corpus_paths) / 1024^2, 1),
  Lines = c(length(blogs), length(news), length(twitter)),
  Words = c(
    sum(stri_count_words(blogs)),
    sum(stri_count_words(news)),
    sum(stri_count_words(twitter))
  )
)
## File Size_Mb Lines Words
## 1 Blogs 200.4 899288 37546250
## 2 News 196.3 1010242 34762395
## 3 Twitter 159.4 2360148 30093413
5. Explore the data by sub-setting it
# Fix the RNG seed so the three samples are reproducible, then draw 1000
# lines without replacement from each corpus. The draw order (blogs, news,
# twitter) matters because all three share the same random stream.
set.seed(seed = 20230426)
subset_blogs <- sample(x = blogs, size = 1000, replace = FALSE)
subset_news <- sample(x = news, size = 1000, replace = FALSE)
subset_twitter <- sample(x = twitter, size = 1000, replace = FALSE)
6. Count words per sub-set data
# Total number of words in each 1000-line sample, one row per sample.
data.frame(
  Name = c("subset_blog", "subset_news", "subset_twitter"),
  Words_Count = vapply(
    list(subset_blogs, subset_news, subset_twitter),
    function(sampled_lines) sum(stri_count_words(sampled_lines)),
    integer(1)
  )
)
## Name Words_Count
## 1 subset_blog 39591
## 2 subset_news 34605
## 3 subset_twitter 12653
7. Install “tidyverse” and “tidytext” packages
# install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# install.packages("tidytext")
library(tidytext)
8. Use tibble to make a data frame for the further analysis
# Wrap each sample in a tibble with one row per sampled line:
#   line - position of the line within the sample
#   text - the line itself
# seq_along() derives the numbering from the sample, so the tibbles stay
# valid if the sample size is changed.
subset_blogs_df <- tibble(line = seq_along(subset_blogs), text = subset_blogs)
subset_news_df <- tibble(line = seq_along(subset_news), text = subset_news)
subset_twitter_df <- tibble(
  line = seq_along(subset_twitter),
  text = subset_twitter
)
9. Clean data with “unnest_tokens” function
# Split the `text` column of each sample into single-word tokens, producing
# one row per word in a new `word` column.
subset_blogs_text_df <- unnest_tokens(
  tbl = subset_blogs_df, output = word, input = text
)
subset_news_text_df <- unnest_tokens(
  tbl = subset_news_df, output = word, input = text
)
subset_twitter_text_df <- unnest_tokens(
  tbl = subset_twitter_df, output = word, input = text
)
After applying “unnest_tokens”, each row has been split so that there is
one token per row of the new data frame.
Tokenizing with the unnest_tokens() function has the following
effects:
- Other columns, such as the line number each word came from, are
retained.
- Punctuation has been stripped.
- Tokens are converted to lowercase.
10. Plot the top 20 words (including stop-words) for each sub-setted
data set
# Bar charts of the 20 most frequent words (stop-words included) in each
# sample. slice_max() names the ranking column explicitly and, like top_n(),
# keeps ties, so the selection is the same without the "Selecting by n" note.
subset_blogs_text_df %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  # Order the factor by count so the bars are drawn in frequency order.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col(col = "blue", fill = "white") +
  labs(x = "count", y = "Word", title = "Most Frequent words (Blogs)")

subset_news_text_df %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col(col = "orange", fill = "white") +
  labs(x = "count", y = "words", title = "Most Frequent words (News)")

subset_twitter_text_df %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col(col = "red", fill = "white") +
  labs(x = "count", y = "words", title = "Most Frequent words (Twitter)")

We notice that the top 20 words of each data set are mostly stop-words,
such as “a,” “the,” “is,” “are,” etc. We need to remove the stop-words
to get a meaningful word count.
11. Strip off the stop-words and numbers
# Bar charts of the 20 most frequent words in each sample after dropping
# stop-words (rows matching tidytext's `stop_words` on `word`) and any token
# that contains a digit.
# The join key is spelled out and slice_max() names the ranking column, so
# neither step relies on an implicit column choice. Ties are kept, as with
# top_n().
subset_blogs_text_df %>%
  count(word, sort = TRUE) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, pattern = "\\d")) %>%
  slice_max(n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col(col = "blue", fill = "white") +
  labs(x = "count", y = "Word", title = "Most Frequent words (Blogs)")

subset_news_text_df %>%
  count(word, sort = TRUE) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, pattern = "\\d")) %>%
  slice_max(n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col(col = "orange", fill = "white") +
  labs(x = "count", y = "words", title = "Most Frequent words (News)")

subset_twitter_text_df %>%
  count(word, sort = TRUE) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, pattern = "\\d")) %>%
  slice_max(n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col(col = "red", fill = "white") +
  labs(x = "count", y = "words", title = "Most Frequent words (Twitter)")

After removing the stop-words and numbers, we can see that each data
set has its own distinct set of 20 most frequent words.
12. Tokenization with bi-gram
# Split the `text` column of each sample into bigrams (pairs of consecutive
# words), one bigram per row in a new `bigram` column.
# A line with fewer than two words has no bigram and comes back as NA; those
# rows are dropped so NA is never counted as if it were a bigram.
subset_blogs_bigram <- subset_blogs_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))
subset_news_bigram <- subset_news_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))
subset_twitter_bigram <- subset_twitter_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))
13. Examine the top 20 most common bi-grams (including stop-words)
# Bar charts of the 20 most frequent bigrams (stop-words included) in each
# sample. slice_max() names the ranking column explicitly and, like top_n(),
# keeps ties, so the selection is the same without the "Selecting by n" note.
subset_blogs_bigram %>%
  count(bigram, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  # Order the factor by count so the bars are drawn in frequency order.
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(n, bigram)) +
  geom_col(col = "blue", fill = "white") +
  labs(x = "count", y = "Word", title = "Most Frequent bigram (Blogs)")

subset_news_bigram %>%
  count(bigram, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(n, bigram)) +
  geom_col(col = "orange", fill = "white") +
  labs(x = "count", y = "Word", title = "Most Frequent bigram (News)")

subset_twitter_bigram %>%
  count(bigram, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(n, bigram)) +
  geom_col(col = "red", fill = "white") +
  labs(x = "count", y = "Word", title = "Most Frequent bigram (Twitter)")

As in the single-token analysis, the most frequent bigrams consist
largely of stop-words. We will remove the stop-words and numbers.
14. Remove stop-words and numbers and re-examine the top 20 common
bi-grams
# For each sample: split every bigram into its two words, keep only the pairs
# in which neither word is a stop-word or contains a digit, then join the two
# words back into a single `bigram` column.

# Blogs
subset_blogs_bigram_separated <- separate(
  subset_blogs_bigram,
  col = bigram, into = c("word1", "word2"), sep = " "
)
subset_blogs_bigram_filtered <- filter(
  subset_blogs_bigram_separated,
  !(word1 %in% stop_words$word),
  !(word2 %in% stop_words$word),
  !str_detect(word1, pattern = "\\d"),
  !str_detect(word2, pattern = "\\d")
)
subset_blogs_bigram_united <- unite(
  subset_blogs_bigram_filtered,
  col = bigram, word1, word2, sep = " "
)

# News
subset_news_bigram_separated <- separate(
  subset_news_bigram,
  col = bigram, into = c("word1", "word2"), sep = " "
)
subset_news_bigram_filtered <- filter(
  subset_news_bigram_separated,
  !(word1 %in% stop_words$word),
  !(word2 %in% stop_words$word),
  !str_detect(word1, pattern = "\\d"),
  !str_detect(word2, pattern = "\\d")
)
subset_news_bigram_united <- unite(
  subset_news_bigram_filtered,
  col = bigram, word1, word2, sep = " "
)

# Twitter
subset_twitter_bigram_separated <- separate(
  subset_twitter_bigram,
  col = bigram, into = c("word1", "word2"), sep = " "
)
subset_twitter_bigram_filtered <- filter(
  subset_twitter_bigram_separated,
  !(word1 %in% stop_words$word),
  !(word2 %in% stop_words$word),
  !str_detect(word1, pattern = "\\d"),
  !str_detect(word2, pattern = "\\d")
)
subset_twitter_bigram_united <- unite(
  subset_twitter_bigram_filtered,
  col = bigram, word1, word2, sep = " "
)
# Bar charts of the first 20 rows of the count-sorted bigram table for each
# sample, using the bigrams left after stop-word and digit filtering.
subset_blogs_bigram_united %>%
  count(bigram, sort = TRUE) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  head(n = 20) %>%
  ggplot(aes(x = n, y = bigram)) +
  geom_col(col = "blue", fill = "white") +
  labs(x = "count", y = "Word", title = "Most Frequent bigram (Blogs)")

subset_news_bigram_united %>%
  count(bigram, sort = TRUE) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  head(n = 20) %>%
  ggplot(aes(x = n, y = bigram)) +
  geom_col(col = "orange", fill = "white") +
  labs(x = "count", y = "Word", title = "Most Frequent bigram (News)")

subset_twitter_bigram_united %>%
  count(bigram, sort = TRUE) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  head(n = 20) %>%
  ggplot(aes(x = n, y = bigram)) +
  geom_col(col = "red", fill = "white") +
  labs(x = "count", y = "Word", title = "Most Frequent bigram (Twitter)")

15. Findings
* The data files are large, so processing them in full is expensive;
the exploration above therefore works on 1000-line samples.
* The Blogs, News and Twitter data sets all contain many stop-words
and numbers.
* Stop-words and numbers dominate the most frequent tokens, both as
single words and in n-grams.
* Stop-words and numbers carry little meaning when ranking the most
frequent words. I needed to remove them to reveal the “real” most
frequent words and word pairs.