Task 2: Exploratory Data Analysis

Turki 2022-06-17

Introduction

This script uses the tidy data principles applied to text mining, as outlined in Text Mining with R: A Tidy Approach.

Data Loading and Summarizing

English Repository Files

blogs_file   <- "./data/final/en_US/en_US.blogs.txt"
news_file    <- "./data/final/en_US/en_US.news.txt"
twitter_file <- "./data/final/en_US/en_US.twitter.txt"  

File Sizes (Mb)

blogs_size   <- file.size(blogs_file) / (2^20)
news_size    <- file.size(news_file) / (2^20)
twitter_size <- file.size(twitter_file) / (2^20)

Read the data files

blogs   <- readLines(blogs_file, skipNul = TRUE)
news    <- readLines(news_file,  skipNul = TRUE)
twitter <- readLines(twitter_file, skipNul = TRUE)

Number of Lines per file

blogs_lines   <- length(blogs)
news_lines    <- length(news)
twitter_lines <- length(twitter)
total_lines   <- blogs_lines + news_lines + twitter_lines

Distibution of characters per line, by file

blogs_nchar   <- nchar(blogs)
news_nchar    <- nchar(news)
twitter_nchar <- nchar(twitter)

boxplot(blogs_nchar, news_nchar, twitter_nchar, log = "y",
        names = c("blogs", "news", "twitter"),
        ylab = "log(Number of Characters)", xlab = "File Name") 
title("Comparing Distributions of Chracters per Line")

Total characters per file

blogs_nchar_sum   <- sum(blogs_nchar)
news_nchar_sum    <- sum(news_nchar)
twitter_nchar_sum <- sum(twitter_nchar)

Total words per file

blogs_words <- wordcount(blogs, sep = " ")
news_words  <- wordcount(news,  sep = " ")
twitter_words <- wordcount(twitter, sep = " ")

Create summary of repo stats

repo_summary <- data.frame(f_names = c("blogs", "news", "twitter"),
                           f_size  = c(blogs_size, news_size, twitter_size),
                           f_lines = c(blogs_lines, news_lines, twitter_lines),
                           n_char =  c(blogs_nchar_sum, news_nchar_sum, twitter_nchar_sum),
                           n_words = c(blogs_words, news_words, twitter_words))
repo_summary <- repo_summary %>% mutate(pct_n_char = round(n_char/sum(n_char), 2))
repo_summary <- repo_summary %>% mutate(pct_lines = round(f_lines/sum(f_lines), 2))
repo_summary <- repo_summary %>% mutate(pct_words = round(n_words/sum(n_words), 2))
kable(repo_summary)
f_names f_size f_lines n_char n_words pct_n_char pct_lines pct_words
blogs 200.4242 899288 206824505 37334131 0.36 0.21 0.37
news 196.2775 1010242 203223159 34372530 0.36 0.24 0.34
twitter 159.3641 2360148 162096241 30373583 0.28 0.55 0.30
saveRDS(repo_summary, "./data/final/en_US/repo_sample.rds")

Read the data files into dataframes

blogs   <- data_frame(text = blogs)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
news    <- data_frame(text = news)
twitter <- data_frame(text = twitter)

Data Sampling and Cleaning

set.seed(1001)
sample_pct <- 0.1

blogs_sample <- blogs %>%
  sample_n(., nrow(blogs)*sample_pct)
news_sample <- news %>%
  sample_n(., nrow(news)*sample_pct)
twitter_sample <- twitter %>%
  sample_n(., nrow(twitter)*sample_pct)

Create tidy repository

repo_sample <- bind_rows(mutate(blogs_sample, source = "blogs"),
                         mutate(news_sample,  source = "news"),
                         mutate(twitter_sample, source = "twitter")) 
repo_sample$source <- as.factor(repo_sample$source)

Create filters: stopwords, profanity, non-alphanumeric’s, url’s, repeated letters(+3x)

data("stop_words")
swear_words <- read_delim("./data/final/en_US/swearWords.csv", delim = "\n", col_names = FALSE)
## Rows: 452 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\001"
## chr (1): X1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
swear_words <- unnest_tokens(swear_words, word, X1)
replace_reg <- "[^[:alpha:][:space:]]*"
replace_url <- "http[^[:space:]]*"
replace_aaa <- "\\b(?=\\w*(\\w)\\1)\\w+\\b"  

Clean the sample. Cleaning is separted from tidying so unnest_tokens function can be used for words, and ngrams.

clean_sample <-  repo_sample %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  mutate(text = str_replace_all(text, replace_url, "")) %>%
  mutate(text = str_replace_all(text, replace_aaa, "")) %>% 
  mutate(text = iconv(text, "ASCII//TRANSLIT"))

Clean up

rm(blogs, blogs_nchar, news, news_nchar, twitter, twitter_nchar, replace_reg, replace_url, replace_aaa)

Create tidy dataframe for repo sample

tidy_repo <- clean_sample %>%
  unnest_tokens(word, text) %>%
  anti_join(swear_words) %>%
  anti_join(stop_words)
## Joining, by = "word"
## Joining, by = "word"

Most frequent words and word distributions

Word counts: Number of unique words in repo

(repo_count <- tidy_repo %>%
    summarise(keys = n_distinct(word)))
## # A tibble: 1 × 1
##     keys
##    <int>
## 1 155484

Number of words to attain 50% and 90% coverage of all words in repo

cover_50 <- tidy_repo %>%
  count(word) %>%  
  mutate(proportion = n / sum(n)) %>%
  arrange(desc(proportion)) %>%  
  mutate(coverage = cumsum(proportion)) %>%
  filter(coverage <= 0.5)
nrow(cover_50)
## [1] 1274
cover_90 <- tidy_repo %>%
  count(word) %>%  
  mutate(proportion = n / sum(n)) %>%
  arrange(desc(proportion)) %>%  
  mutate(coverage = cumsum(proportion)) %>%
  filter(coverage <= 0.9)
nrow(cover_90)
## [1] 16935

Word distributions

Word distribution

cover_90 %>%
  top_n(20, proportion) %>%
  mutate(word = reorder(word, proportion)) %>%
  ggplot(aes(word, proportion)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

Word distribution by source

freq <- tidy_repo %>%
  count(source, word) %>%
  group_by(source) %>%
  mutate(proportion = n / sum(n)) %>%
  spread(source, proportion) %>%
  gather(source, proportion, `blogs`:`twitter`) %>%
  arrange(desc(proportion), desc(n))

freq %>%
  filter(proportion > 0.002) %>% 
  mutate(word = reorder(word, proportion)) %>% 
  ggplot(aes(word, proportion)) +
  geom_col() + 
  xlab(NULL) + 
  coord_flip() +
  facet_grid(~source, scales = "free")

Word cloud

cover_90 %>%
  with(wordcloud(word, n, max.words = 100, 
                 colors = brewer.pal(6, 'Dark2'), random.order = FALSE))

saveRDS(tidy_repo, "./data/final/tidy_repo.rds")
saveRDS(cover_90, "./data/final/cover_90.rds")
rm(tidy_repo, cover_50, cover_90)

Bigrams

Create bigrams by source using unnest_tokens

bigram_repo <- clean_sample  %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

Number of bigrams to attain 90% coverage of all bigrams in repo

bigram_cover_90 <- bigram_repo %>%
  count(bigram) %>%  
  mutate(proportion = n / sum(n)) %>%
  arrange(desc(proportion)) %>%  
  mutate(coverage = cumsum(proportion)) %>%
  filter(coverage <= 0.9)
nrow(bigram_cover_90)
## [1] 1244584

Bigram distribution

bigram_cover_90 %>%
  top_n(20, proportion) %>%
  mutate(bigram = reorder(bigram, proportion)) %>%
  ggplot(aes(bigram, proportion)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

saveRDS(bigram_cover_90, "./data/final/cover_90.rds")

Trigrams

Create Trigrams by source using unnest_tokens

trigram_repo <- clean_sample  %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3)

Number of trigrams to attain 90% coverage of all trigrams in repo

trigram_cover_90 <- trigram_repo %>%
  count(trigram) %>%  
  mutate(proportion = n / sum(n)) %>%
  arrange(desc(proportion)) %>%  
  mutate(coverage = cumsum(proportion)) %>%
  filter(coverage <= 0.9)
nrow(trigram_cover_90)
## [1] 4446207

trigram distribution

trigram_cover_90 %>%
  top_n(20, proportion) %>%
  mutate(trigram = reorder(trigram, proportion)) %>%
  ggplot(aes(trigram, proportion)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

saveRDS(trigram_cover_90, "./data/final/cover_90.rds")

Quadgrams

Create quadgrams by source using unnest_tokens

quadgram_repo <- clean_sample  %>%
  unnest_tokens(quadgram, text, token = "ngrams", n = 4)

Number of quadgrams to attain 90% coverage of all quadgrams in repo

quadgram_cover_90 <- quadgram_repo %>%
  count(quadgram) %>%  
  mutate(proportion = n / sum(n)) %>%
  arrange(desc(proportion)) %>%  
  mutate(coverage = cumsum(proportion)) %>%
  filter(coverage <= 0.9)
nrow(quadgram_cover_90)
## [1] 6083973

quadgram distribution

quadgram_cover_90 %>%
  top_n(20, proportion) %>%
  mutate(quadgram = reorder(quadgram, proportion)) %>%
  ggplot(aes(quadgram, proportion)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

quadgrams_separated <- quadgram_cover_90 %>%
  separate(quadgram, c("word1", "word2", "word3", "word4"), sep = " ")
quadgrams_separated
## # A tibble: 6,083,973 × 7
##    word1 word2 word3 word4     n proportion coverage
##    <chr> <chr> <chr> <chr> <int>      <dbl>    <dbl>
##  1 <NA>  <NA>  <NA>  <NA>  43978  0.00578    0.00578
##  2 the   end   of    the     740  0.0000973  0.00588
##  3 the   rest  of    the     658  0.0000865  0.00597
##  4 for   the   first time    623  0.0000819  0.00605
##  5 at    the   end   of      616  0.0000810  0.00613
##  6 at    the   same  time    495  0.0000651  0.00619
##  7 is    going to    be      458  0.0000602  0.00625
##  8 is    one   of    the     421  0.0000553  0.00631
##  9 one   of    the   most    401  0.0000527  0.00636
## 10 when  it    comes to      388  0.0000510  0.00641
## # … with 6,083,963 more rows
saveRDS(quadgram_cover_90, "./data/final/cover_90.rds")

end <- Sys.time()

(run_time <- end - start_time)
## Time difference of 18.07368 mins

Session info

sessionInfo()       
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ngram_3.2.1        wordcloud_2.6      RColorBrewer_1.1-3 knitr_1.39        
##  [5] forcats_0.5.1      stringr_1.4.0      dplyr_1.0.9        purrr_0.3.4       
##  [9] readr_2.1.2        tidyr_1.2.0        tibble_3.1.7       ggplot2_3.3.6     
## [13] tidyverse_1.3.1    tidytext_0.3.3    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.8.3      lubridate_1.8.0   lattice_0.20-45   assertthat_0.2.1 
##  [5] digest_0.6.29     utf8_1.2.2        R6_2.5.1          cellranger_1.1.0 
##  [9] backports_1.4.1   reprex_2.0.1      evaluate_0.15     httr_1.4.3       
## [13] highr_0.9         pillar_1.7.0      rlang_1.0.2       readxl_1.4.0     
## [17] rstudioapi_0.13   Matrix_1.4-1      rmarkdown_2.14    labeling_0.4.2   
## [21] bit_4.0.4         munsell_0.5.0     broom_0.8.0       compiler_4.2.0   
## [25] janeaustenr_0.1.5 modelr_0.1.8      xfun_0.31         pkgconfig_2.0.3  
## [29] htmltools_0.5.2   tidyselect_1.1.2  fansi_1.0.3       crayon_1.5.1     
## [33] tzdb_0.3.0        dbplyr_2.2.0      withr_2.5.0       SnowballC_0.7.0  
## [37] grid_4.2.0        jsonlite_1.8.0    gtable_0.3.0      lifecycle_1.0.1  
## [41] DBI_1.1.2         magrittr_2.0.3    scales_1.2.0      tokenizers_0.2.1 
## [45] vroom_1.5.7       cli_3.3.0         stringi_1.7.6     farver_2.1.0     
## [49] fs_1.5.2          xml2_1.3.3        ellipsis_0.3.2    generics_0.1.2   
## [53] vctrs_0.4.1       tools_4.2.0       bit64_4.0.5       glue_1.6.2       
## [57] hms_1.1.1         parallel_4.2.0    fastmap_1.1.0     yaml_2.3.5       
## [61] colorspace_2.0-3  rvest_1.0.2       haven_2.5.0