Task 2: Exploratory Data Analysis

Turki 2022-06-17

Introduction
Data Loading and Summarizing
Data Sampling and Cleaning
Most frequent words and word distributions
Word distributions
Bigrams
Trigrams
Quadgrams
Session info

Introduction

This script uses the tidy data principles applied to text mining, as outlined in Text Mining with R: A Tidy Approach.

Data Loading and Summarizing

English Repository Files

blogs_file   <- "./data/final/en_US/en_US.blogs.txt"
news_file    <- "./data/final/en_US/en_US.news.txt"
twitter_file <- "./data/final/en_US/en_US.twitter.txt"

File Sizes (Mb)

blogs_size   <- file.size(blogs_file) / (2^20)
news_size    <- file.size(news_file) / (2^20)
twitter_size <- file.size(twitter_file) / (2^20)

Read the data files

blogs   <- readLines(blogs_file, skipNul = TRUE)
news    <- readLines(news_file,  skipNul = TRUE)
twitter <- readLines(twitter_file, skipNul = TRUE)

Number of Lines per file

blogs_lines   <- length(blogs)
news_lines    <- length(news)
twitter_lines <- length(twitter)
total_lines   <- blogs_lines + news_lines + twitter_lines

Distibution of characters per line, by file

blogs_nchar   <- nchar(blogs)
news_nchar    <- nchar(news)
twitter_nchar <- nchar(twitter)

boxplot(blogs_nchar, news_nchar, twitter_nchar, log = "y",
        names = c("blogs", "news", "twitter"),
        ylab = "log(Number of Characters)", xlab = "File Name") 
title("Comparing Distributions of Chracters per Line")

Total characters per file

blogs_nchar_sum   <- sum(blogs_nchar)
news_nchar_sum    <- sum(news_nchar)
twitter_nchar_sum <- sum(twitter_nchar)

Total words per file

blogs_words <- wordcount(blogs, sep = " ")
news_words  <- wordcount(news,  sep = " ")
twitter_words <- wordcount(twitter, sep = " ")

Create summary of repo stats

repo_summary <- data.frame(f_names = c("blogs", "news", "twitter"),
                           f_size  = c(blogs_size, news_size, twitter_size),
                           f_lines = c(blogs_lines, news_lines, twitter_lines),
                           n_char =  c(blogs_nchar_sum, news_nchar_sum, twitter_nchar_sum),
                           n_words = c(blogs_words, news_words, twitter_words))
repo_summary <- repo_summary %>% mutate(pct_n_char = round(n_char/sum(n_char), 2))
repo_summary <- repo_summary %>% mutate(pct_lines = round(f_lines/sum(f_lines), 2))
repo_summary <- repo_summary %>% mutate(pct_words = round(n_words/sum(n_words), 2))
kable(repo_summary)

f_names	f_size	f_lines	n_char	n_words	pct_n_char	pct_lines	pct_words
blogs	200.4242	899288	206824505	37334131	0.36	0.21	0.37
news	196.2775	1010242	203223159	34372530	0.36	0.24	0.34
twitter	159.3641	2360148	162096241	30373583	0.28	0.55	0.30

saveRDS(repo_summary, "./data/final/en_US/repo_sample.rds")

Read the data files into dataframes

blogs   <- data_frame(text = blogs)

## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

news    <- data_frame(text = news)
twitter <- data_frame(text = twitter)

Data Sampling and Cleaning

set.seed(1001)
sample_pct <- 0.1

blogs_sample <- blogs %>%
  sample_n(., nrow(blogs)*sample_pct)
news_sample <- news %>%
  sample_n(., nrow(news)*sample_pct)
twitter_sample <- twitter %>%
  sample_n(., nrow(twitter)*sample_pct)

Create tidy repository

repo_sample <- bind_rows(mutate(blogs_sample, source = "blogs"),
                         mutate(news_sample,  source = "news"),
                         mutate(twitter_sample, source = "twitter")) 
repo_sample$source <- as.factor(repo_sample$source)

Create filters: stopwords, profanity, non-alphanumeric’s, url’s, repeated letters(+3x)

data("stop_words")
swear_words <- read_delim("./data/final/en_US/swearWords.csv", delim = "\n", col_names = FALSE)

## Rows: 452 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\001"
## chr (1): X1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

swear_words <- unnest_tokens(swear_words, word, X1)
replace_reg <- "[^[:alpha:][:space:]]*"
replace_url <- "http[^[:space:]]*"
replace_aaa <- "\\b(?=\\w*(\\w)\\1)\\w+\\b"

Clean the sample. Cleaning is separted from tidying so unnest_tokens function can be used for words, and ngrams.

clean_sample <-  repo_sample %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  mutate(text = str_replace_all(text, replace_url, "")) %>%
  mutate(text = str_replace_all(text, replace_aaa, "")) %>% 
  mutate(text = iconv(text, "ASCII//TRANSLIT"))

Clean up

rm(blogs, blogs_nchar, news, news_nchar, twitter, twitter_nchar, replace_reg, replace_url, replace_aaa)

Create tidy dataframe for repo sample

tidy_repo <- clean_sample %>%
  unnest_tokens(word, text) %>%
  anti_join(swear_words) %>%
  anti_join(stop_words)

## Joining, by = "word"
## Joining, by = "word"

Most frequent words and word distributions

Word counts: Number of unique words in repo

(repo_count <- tidy_repo %>%
    summarise(keys = n_distinct(word)))

## # A tibble: 1 × 1
##     keys
##    <int>
## 1 155484

Number of words to attain 50% and 90% coverage of all words in repo

cover_50 <- tidy_repo %>%
  count(word) %>%  
  mutate(proportion = n / sum(n)) %>%
  arrange(desc(proportion)) %>%  
  mutate(coverage = cumsum(proportion)) %>%
  filter(coverage <= 0.5)
nrow(cover_50)

## [1] 1274

cover_90 <- tidy_repo %>%
  count(word) %>%  
  mutate(proportion = n / sum(n)) %>%
  arrange(desc(proportion)) %>%  
  mutate(coverage = cumsum(proportion)) %>%
  filter(coverage <= 0.9)
nrow(cover_90)

## [1] 16935

Word distributions

Word distribution

cover_90 %>%
  top_n(20, proportion) %>%
  mutate(word = reorder(word, proportion)) %>%
  ggplot(aes(word, proportion)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

Word distribution by source

freq <- tidy_repo %>%
  count(source, word) %>%
  group_by(source) %>%
  mutate(proportion = n / sum(n)) %>%
  spread(source, proportion) %>%
  gather(source, proportion, `blogs`:`twitter`) %>%
  arrange(desc(proportion), desc(n))

freq %>%
  filter(proportion > 0.002) %>% 
  mutate(word = reorder(word, proportion)) %>% 
  ggplot(aes(word, proportion)) +
  geom_col() + 
  xlab(NULL) + 
  coord_flip() +
  facet_grid(~source, scales = "free")

Word cloud

cover_90 %>%
  with(wordcloud(word, n, max.words = 100, 
                 colors = brewer.pal(6, 'Dark2'), random.order = FALSE))

saveRDS(tidy_repo, "./data/final/tidy_repo.rds")
saveRDS(cover_90, "./data/final/cover_90.rds")
rm(tidy_repo, cover_50, cover_90)

Bigrams

Create bigrams by source using unnest_tokens

bigram_repo <- clean_sample  %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

Number of bigrams to attain 90% coverage of all bigrams in repo

bigram_cover_90 <- bigram_repo %>%
  count(bigram) %>%  
  mutate(proportion = n / sum(n)) %>%
  arrange(desc(proportion)) %>%  
  mutate(coverage = cumsum(proportion)) %>%
  filter(coverage <= 0.9)
nrow(bigram_cover_90)

## [1] 1244584

Bigram distribution

bigram_cover_90 %>%
  top_n(20, proportion) %>%
  mutate(bigram = reorder(bigram, proportion)) %>%
  ggplot(aes(bigram, proportion)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

saveRDS(bigram_cover_90, "./data/final/cover_90.rds")

Trigrams

Create Trigrams by source using unnest_tokens

trigram_repo <- clean_sample  %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3)

Number of trigrams to attain 90% coverage of all trigrams in repo

trigram_cover_90 <- trigram_repo %>%
  count(trigram) %>%  
  mutate(proportion = n / sum(n)) %>%
  arrange(desc(proportion)) %>%  
  mutate(coverage = cumsum(proportion)) %>%
  filter(coverage <= 0.9)
nrow(trigram_cover_90)

## [1] 4446207

trigram distribution

trigram_cover_90 %>%
  top_n(20, proportion) %>%
  mutate(trigram = reorder(trigram, proportion)) %>%
  ggplot(aes(trigram, proportion)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

saveRDS(trigram_cover_90, "./data/final/cover_90.rds")

Quadgrams

Create quadgrams by source using unnest_tokens

quadgram_repo <- clean_sample  %>%
  unnest_tokens(quadgram, text, token = "ngrams", n = 4)

Number of quadgrams to attain 90% coverage of all quadgrams in repo

quadgram_cover_90 <- quadgram_repo %>%
  count(quadgram) %>%  
  mutate(proportion = n / sum(n)) %>%
  arrange(desc(proportion)) %>%  
  mutate(coverage = cumsum(proportion)) %>%
  filter(coverage <= 0.9)
nrow(quadgram_cover_90)

## [1] 6083973

quadgram distribution

quadgram_cover_90 %>%
  top_n(20, proportion) %>%
  mutate(quadgram = reorder(quadgram, proportion)) %>%
  ggplot(aes(quadgram, proportion)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

quadgrams_separated <- quadgram_cover_90 %>%
  separate(quadgram, c("word1", "word2", "word3", "word4"), sep = " ")
quadgrams_separated

## # A tibble: 6,083,973 × 7
##    word1 word2 word3 word4     n proportion coverage
##    <chr> <chr> <chr> <chr> <int>      <dbl>    <dbl>
##  1 <NA>  <NA>  <NA>  <NA>  43978  0.00578    0.00578
##  2 the   end   of    the     740  0.0000973  0.00588
##  3 the   rest  of    the     658  0.0000865  0.00597
##  4 for   the   first time    623  0.0000819  0.00605
##  5 at    the   end   of      616  0.0000810  0.00613
##  6 at    the   same  time    495  0.0000651  0.00619
##  7 is    going to    be      458  0.0000602  0.00625
##  8 is    one   of    the     421  0.0000553  0.00631
##  9 one   of    the   most    401  0.0000527  0.00636
## 10 when  it    comes to      388  0.0000510  0.00641
## # … with 6,083,963 more rows

saveRDS(quadgram_cover_90, "./data/final/cover_90.rds")

end <- Sys.time()

(run_time <- end - start_time)

## Time difference of 18.07368 mins

Session info

sessionInfo()

## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ngram_3.2.1        wordcloud_2.6      RColorBrewer_1.1-3 knitr_1.39        
##  [5] forcats_0.5.1      stringr_1.4.0      dplyr_1.0.9        purrr_0.3.4       
##  [9] readr_2.1.2        tidyr_1.2.0        tibble_3.1.7       ggplot2_3.3.6     
## [13] tidyverse_1.3.1    tidytext_0.3.3    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.8.3      lubridate_1.8.0   lattice_0.20-45   assertthat_0.2.1 
##  [5] digest_0.6.29     utf8_1.2.2        R6_2.5.1          cellranger_1.1.0 
##  [9] backports_1.4.1   reprex_2.0.1      evaluate_0.15     httr_1.4.3       
## [13] highr_0.9         pillar_1.7.0      rlang_1.0.2       readxl_1.4.0     
## [17] rstudioapi_0.13   Matrix_1.4-1      rmarkdown_2.14    labeling_0.4.2   
## [21] bit_4.0.4         munsell_0.5.0     broom_0.8.0       compiler_4.2.0   
## [25] janeaustenr_0.1.5 modelr_0.1.8      xfun_0.31         pkgconfig_2.0.3  
## [29] htmltools_0.5.2   tidyselect_1.1.2  fansi_1.0.3       crayon_1.5.1     
## [33] tzdb_0.3.0        dbplyr_2.2.0      withr_2.5.0       SnowballC_0.7.0  
## [37] grid_4.2.0        jsonlite_1.8.0    gtable_0.3.0      lifecycle_1.0.1  
## [41] DBI_1.1.2         magrittr_2.0.3    scales_1.2.0      tokenizers_0.2.1 
## [45] vroom_1.5.7       cli_3.3.0         stringi_1.7.6     farver_2.1.0     
## [49] fs_1.5.2          xml2_1.3.3        ellipsis_0.3.2    generics_0.1.2   
## [53] vctrs_0.4.1       tools_4.2.0       bit64_4.0.5       glue_1.6.2       
## [57] hms_1.1.1         parallel_4.2.0    fastmap_1.1.0     yaml_2.3.5       
## [61] colorspace_2.0-3  rvest_1.0.2       haven_2.5.0