1. Download data-set

# Source archive for the SwiftKey corpus and the local path it is saved to.
fileURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

f <- file.path(getwd(), "Coursera-SwiftKey.zip")

# Only download when the archive is not already on disk, so re-running the
# report does not fetch the whole file again.
if (!file.exists(f)) {
  # mode = "wb" writes the zip as binary regardless of platform.
  download.file(fileURL, f, mode = "wb")
}

# Extract from the path the archive was saved to; skip the extraction when the
# en_US folder read by the later steps already exists.
if (!dir.exists(file.path("final", "en_US"))) {
  unzip(zipfile = f)
}

2. Load data-set

# Read each English corpus file into a character vector (one element per
# line). skipNul = TRUE drops embedded nul bytes instead of truncating lines.
# The connections are opened in binary mode to avoid text-mode translation
# while reading, and are closed explicitly so they are not left allocated.
con_twitter <- file("final/en_US/en_US.twitter.txt", open = "rb")
twitter <- readLines(con_twitter, encoding = "UTF-8", skipNul = TRUE)
close(con_twitter)

con_blogs <- file("final/en_US/en_US.blogs.txt", open = "rb")
blogs <- readLines(con_blogs, encoding = "UTF-8", skipNul = TRUE)
close(con_blogs)

con_news <- file("final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con_news, encoding = "UTF-8", skipNul = TRUE)
close(con_news)

3. Install and load stringi package

## install.packages("stringi")

library(stringi) 

4. Explore the basic characteristics of data-set

# Summarise each corpus file: size on disk (MB), number of lines and number of
# words. Sizes are taken from the same final/en_US/ paths the files were read
# from, so they do not come back as NA when the working directory only holds
# the unzipped archive.
data.frame(
  File = c("Blogs", "News", "Twitter"),
  Size_Mb = round(
    file.size(c(
      "final/en_US/en_US.blogs.txt",
      "final/en_US/en_US.news.txt",
      "final/en_US/en_US.twitter.txt"
    )) / 1024^2,
    1
  ),
  Lines = c(length(blogs), length(news), length(twitter)),
  Words = c(
    sum(stri_count_words(blogs)),
    sum(stri_count_words(news)),
    sum(stri_count_words(twitter))
  )
)
##      File Size_Mb   Lines    Words
## 1   Blogs   200.4  899288 37546250
## 2    News   196.3 1010242 34762395
## 3 Twitter   159.4 2360148 30093413

5. Explore the data by sub-setting it

# Fix the RNG state so the three 1000-line samples are reproducible.
set.seed(20230426)

# Draw 1000 line indices without replacement from each corpus, in the order
# blogs, news, twitter, and keep the selected lines.
subset_blogs <- blogs[sample.int(length(blogs), 1000)]
subset_news <- news[sample.int(length(news), 1000)]
subset_twitter <- twitter[sample.int(length(twitter), 1000)]

6. Count words per sub-set data

# Total number of words in each 1000-line sample.
data.frame(
  Name = c("subset_blog", "subset_news", "subset_twitter"),
  Words_Count = vapply(
    list(subset_blogs, subset_news, subset_twitter),
    function(lines) sum(stri_count_words(lines)),
    integer(1)
  )
)
##             Name Words_Count
## 1    subset_blog       39591
## 2    subset_news       34605
## 3 subset_twitter       12653

7. Install and load the “tidyverse” and “tidytext” packages

# install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# install.packages("tidytext")
library(tidytext)

8. Use tibble to make a data frame for the further analysis

# One row per sampled line: `line` is the position of the line within its
# sample and `text` is the line itself. seq_along() takes the row numbers from
# the sample, so they stay correct if the sample size is changed.
subset_blogs_df <- tibble(line = seq_along(subset_blogs), text = subset_blogs)
subset_news_df <- tibble(line = seq_along(subset_news), text = subset_news)
subset_twitter_df <- tibble(line = seq_along(subset_twitter), text = subset_twitter)

9. Clean data with “unnest_tokens” function

# Split the `text` column of each sample into one `word` token per row.
subset_blogs_text_df <- unnest_tokens(subset_blogs_df, output = word, input = text)
subset_news_text_df <- unnest_tokens(subset_news_df, output = word, input = text)
subset_twitter_text_df <- unnest_tokens(subset_twitter_df, output = word, input = text)

After using “unnest_tokens”, we’ve split each row so that there is one token in each row of the new data frame.

Tokenizing with the unnest_tokens() function has the following effects:

- Columns, such as the line number each word came from, are retained.

- Punctuation has been stripped.

- Tokens are converted to lowercase.

10. Draw top 20 words (including stop-words) per subsetted data-set

# Bar charts of the 20 most frequent words in each sample, stop-words
# included. slice_max() ranks by the count column `n` explicitly and keeps
# ties, replacing the superseded top_n(); reorder() then sorts the bars by
# count.
subset_blogs_text_df %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col(colour = "blue", fill = "white") +
  labs(x = "count", y = "word", title = "Most Frequent words (Blogs)")

subset_news_text_df %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col(colour = "orange", fill = "white") +
  labs(x = "count", y = "word", title = "Most Frequent words (News)")

subset_twitter_text_df %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col(colour = "red", fill = "white") +
  labs(x = "count", y = "word", title = "Most Frequent words (Twitter)")

We notice that the top 20 words in each data-set are mostly stop-words, such as “a,” “the,” “is,” “are,” etc. We need to remove the stop-words to get a meaningful word count.

11. Strip off the stop-words and numbers

# Bar charts of the 20 most frequent words in each sample after removing
# stop-words (anti_join against tidytext's stop_words, keyed explicitly on
# `word`) and any token containing a digit. slice_max() ranks by the count
# column `n` and keeps ties, replacing the superseded top_n().
subset_blogs_text_df %>%
  count(word, sort = TRUE) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, pattern = "\\d")) %>%
  slice_max(n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col(colour = "blue", fill = "white") +
  labs(x = "count", y = "word", title = "Most Frequent words (Blogs)")

subset_news_text_df %>%
  count(word, sort = TRUE) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, pattern = "\\d")) %>%
  slice_max(n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col(colour = "orange", fill = "white") +
  labs(x = "count", y = "word", title = "Most Frequent words (News)")

subset_twitter_text_df %>%
  count(word, sort = TRUE) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, pattern = "\\d")) %>%
  slice_max(n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col(colour = "red", fill = "white") +
  labs(x = "count", y = "word", title = "Most Frequent words (Twitter)")

By removing the stop-words and numbers, we can see that each data set has its own top 20 frequent words.

12. Tokenization with bi-gram

# Split the `text` column of each sample into overlapping two-word tokens,
# one `bigram` per row.
subset_blogs_bigram <- unnest_tokens(
  subset_blogs_df,
  output = bigram, input = text, token = "ngrams", n = 2
)

subset_news_bigram <- unnest_tokens(
  subset_news_df,
  output = bigram, input = text, token = "ngrams", n = 2
)

subset_twitter_bigram <- unnest_tokens(
  subset_twitter_df,
  output = bigram, input = text, token = "ngrams", n = 2
)

13. Examine the top 20 common bi-grams (including stop-words)

# Bar charts of the 20 most frequent bigrams in each sample, stop-words
# included. Lines with fewer than two words can yield an NA bigram; those rows
# are dropped first so NA is not counted as a bigram. slice_max() ranks by the
# count column `n` and keeps ties, replacing the superseded top_n().
subset_blogs_bigram %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(n, bigram)) +
  geom_col(colour = "blue", fill = "white") +
  labs(x = "count", y = "bigram", title = "Most Frequent bigram (Blogs)")

subset_news_bigram %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(n, bigram)) +
  geom_col(colour = "orange", fill = "white") +
  labs(x = "count", y = "bigram", title = "Most Frequent bigram (News)")

subset_twitter_bigram %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(n, bigram)) +
  geom_col(colour = "red", fill = "white") +
  labs(x = "count", y = "bigram", title = "Most Frequent bigram (Twitter)")

Similar to the single token analysis, bigrams have lots of stop-words. We will remove the stop-words and numbers.

14. Remove stop-words and numbers and re-examine the top 20 common bi-grams

# Split every bigram at the space into its two component words so each word
# can be tested on its own.
subset_blogs_bigram_separated <- separate(
  subset_blogs_bigram,
  col = bigram, into = c("word1", "word2"), sep = " "
)

subset_news_bigram_separated <- separate(
  subset_news_bigram,
  col = bigram, into = c("word1", "word2"), sep = " "
)

subset_twitter_bigram_separated <- separate(
  subset_twitter_bigram,
  col = bigram, into = c("word1", "word2"), sep = " "
)

# Keep a bigram only when neither of its words is a stop-word and neither
# contains a digit.
subset_blogs_bigram_filtered <- filter(
  subset_blogs_bigram_separated,
  !(word1 %in% stop_words$word),
  !(word2 %in% stop_words$word),
  !str_detect(word1, pattern = "\\d"),
  !str_detect(word2, pattern = "\\d")
)

subset_news_bigram_filtered <- filter(
  subset_news_bigram_separated,
  !(word1 %in% stop_words$word),
  !(word2 %in% stop_words$word),
  !str_detect(word1, pattern = "\\d"),
  !str_detect(word2, pattern = "\\d")
)

subset_twitter_bigram_filtered <- filter(
  subset_twitter_bigram_separated,
  !(word1 %in% stop_words$word),
  !(word2 %in% stop_words$word),
  !str_detect(word1, pattern = "\\d"),
  !str_detect(word2, pattern = "\\d")
)

# Join the two surviving words back into a single space-separated `bigram`
# column.
subset_blogs_bigram_united <- unite(
  subset_blogs_bigram_filtered,
  col = bigram, word1, word2, sep = " "
)

subset_news_bigram_united <- unite(
  subset_news_bigram_filtered,
  col = bigram, word1, word2, sep = " "
)

subset_twitter_bigram_united <- unite(
  subset_twitter_bigram_filtered,
  col = bigram, word1, word2, sep = " "
)

# Bar charts of the 20 most frequent bigrams in each sample after stop-words
# and numbers were removed. The sorted counts are cut to the first 20 rows
# before reorder(), so the factor is built only from the rows that are
# plotted; the selected rows and their order are the same as before.
subset_blogs_bigram_united %>%
  count(bigram, sort = TRUE) %>%
  head(20) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(n, bigram)) +
  geom_col(colour = "blue", fill = "white") +
  labs(x = "count", y = "bigram", title = "Most Frequent bigram (Blogs)")

subset_news_bigram_united %>%
  count(bigram, sort = TRUE) %>%
  head(20) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(n, bigram)) +
  geom_col(colour = "orange", fill = "white") +
  labs(x = "count", y = "bigram", title = "Most Frequent bigram (News)")

subset_twitter_bigram_united %>%
  count(bigram, sort = TRUE) %>%
  head(20) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(n, bigram)) +
  geom_col(colour = "red", fill = "white") +
  labs(x = "count", y = "bigram", title = "Most Frequent bigram (Twitter)")

15. Findings

* The data files are very large to process in full, which is why the analysis works on 1000-line samples.

* Blogs, News and Twitter data set have a lot of stop-words and numbers.

* Stop-words and numbers are the dominant tokens in both the single-word and the n-gram counts.

* Stop-words and numbers carry no meaning when identifying the most frequent words, so I needed to remove them to reveal the “real” frequent words and word pairs.