Required packages

start <- Sys.time()
suppressMessages(library(tm))
suppressMessages(library(data.table))
suppressMessages(library(ngram))
suppressMessages(library(dplyr))
suppressMessages(library(tidytext))
suppressMessages(library(SnowballC))
suppressMessages(library(wordcloud))
suppressMessages(library(tidyr))
suppressMessages(library(stringi))
suppressMessages(library(ggplot2))
sample.dir <- "./final/en_US/sample_5pct//"

sample.5pct <- VCorpus(DirSource(sample.dir))
rm(sample.dir)
# convert to lower cases
corpus <- tm_map(sample.5pct, content_transformer(tolower))

# remove Urls
removeUrl <- function(x) gsub("http[^[:space:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(removeUrl))

# remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(removeNumPunct))

# remove what would be emojis
corpus <- tm_map(corpus, content_transformer(gsub), pattern = "\\W", replace = " ")

# remove extra white space
corpus <- tm_map(corpus, stripWhitespace)

# remove stop words
corpus <- tm_map(corpus, removeWords, stopwords("english"))

# remove punctuations
corpus <- tm_map(corpus, removePunctuation)

# remove numbers
corpus <- tm_map(corpus, removeNumbers)

# remove profanity
profanity <- read.table("./final/en_US/list.txt", header = F, sep = "\n")
corpus <- tm_map(corpus, removeWords, profanity$V1)
corpus_blog <- tibble(text = corpus[["blogs_sample.txt"]][["content"]], source = "blog")
corpus_news <- tibble(text = corpus[["news_sample.txt"]][["content"]], source = "news")
corpus_twitter <- tibble(text = corpus[["twitters_sample.txt"]][["content"]], source = "twitter")

clean_sample <- bind_rows(corpus_blog, corpus_news, corpus_twitter)
clean_sample$source <- as.factor(clean_sample$source)
rm(profanity, sample.1pct, corpus_blog, corpus_news, corpus_twitter)
## Warning in rm(profanity, sample.1pct, corpus_blog, corpus_news, corpus_twitter):
## object 'sample.1pct' not found

n-grams tokenization, see renference

unigram <- clean_sample %>%
  unnest_tokens(word, text)

bigrams <- clean_sample %>%
  unnest_tokens(bigram, text, token = "ngrams", n=2)

trigrams <-  clean_sample %>%
  unnest_tokens(trigram, text, token = "ngrams", n=3)

quadgrams <- clean_sample %>%
  unnest_tokens(quadgram, text, token = "ngrams", n = 4)

Analysis the tokens

n-gram = 2 top 25 2-words

bigram_repo <- bigrams %>%
  count(bigram) %>%
  drop_na(bigram) %>%
  filter(n >= 1) %>% # Reduce n-gram files
  arrange(desc(n))

bigrams_separated <- bigram_repo %>%
  separate(bigram, c("word1", "word2", sep = " ")) %>%
  select_if(~sum(!is.na(.)) > 0)
## Warning: Expected 3 pieces. Additional pieces discarded in 10 rows [829939,
## 884562, 950937, 1103036, 1105158, 1245723, 1391263, 1449547, 1506676, 1561817].
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1635431 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Top 10 2-gram words
bigrams_separated[1:10, ]
## # A tibble: 10 x 4
##    word1 word2  ` `       n
##    <chr> <chr>  <chr> <int>
##  1 new   york   <NA>    926
##  2 last  year   <NA>    871
##  3 dont  know   <NA>    805
##  4 right now    <NA>    792
##  5 years ago    <NA>    684
##  6 high  school <NA>    656
##  7 last  week   <NA>    594
##  8 cant  wait   <NA>    521
##  9 im    going  <NA>    520
## 10 feel  like   <NA>    518
# Top 25 two-words token
bigram_repo %>%
  top_n(25, n) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(x = n, y = bigram)) + geom_col()

n-gram = 3. Top 25 3-words

trigram_repo <- trigrams %>%
  count(trigram) %>%
  drop_na(trigram) %>%
  filter(n >= 1) %>% # Reduce n-gram files
  arrange(desc(n))

trigrams_separated <- trigram_repo %>%
  separate(trigram, c("word1", "word2", "word3", sep = " ")) %>%
  select_if(~sum(!is.na(.)) > 0)
## Warning: Expected 4 pieces. Additional pieces discarded in 14 rows [453616,
## 850611, 890278, 914489, 969366, 1088576, 1222944, 1300903, 1303714, 1495120,
## 1871346, 1910870, 1945971, 2073086].
## Warning: Expected 4 pieces. Missing pieces filled with `NA` in 2073025 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Top 10 3-gram words
trigrams_separated[1:10, ]
## # A tibble: 10 x 5
##    word1     word2   word3 ` `       n
##    <chr>     <chr>   <chr> <chr> <int>
##  1 new       york    city  <NA>    123
##  2 cant      wait    see   <NA>    100
##  3 happy     mothers day   <NA>     80
##  4 president barack  obama <NA>     78
##  5 two       years   ago   <NA>     76
##  6 let       us      know  <NA>     71
##  7 new       york    times <NA>     67
##  8 im        pretty  sure  <NA>     62
##  9 happy     new     year  <NA>     54
## 10 world     war     ii    <NA>     54
# Top 25 tjree-words token
trigram_repo %>%
  top_n(25, n) %>%
  mutate(trigram = reorder(trigram, n)) %>%
  ggplot(aes(x = n, y = trigram)) + geom_col()

n-gram = 4 top 25 4-words

quadgram_repo <- quadgrams %>%
  count(quadgram) %>%
  drop_na(quadgram) %>%
  filter(n >= 1) %>% # Reduce n-gram files
  arrange(desc(n))

quadgram_separated <- quadgram_repo %>%
  separate(quadgram, c("word1", "word2", "word3", "word4", sep = " ")) %>%
  select_if(~sum(!is.na(.)) > 0)
## Warning: Expected 5 pieces. Additional pieces discarded in 16 rows [169709,
## 410332, 514601, 648218, 794524, 833767, 910763, 1157739, 1160884, 1233440,
## 1236150, 1420822, 1462475, 1785438, 1838747, 1981288].
## Warning: Expected 5 pieces. Missing pieces filled with `NA` in 1981226 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Top 10 3-gram words
quadgram_separated[1:10, ]
## # A tibble: 10 x 6
##    word1        word2   word3        word4        ` `       n
##    <chr>        <chr>   <chr>        <chr>        <chr> <int>
##  1 incorporated item    c            pp           <NA>     22
##  2 dow          jones   industrial   average      <NA>     21
##  3 g            fat     g            saturated    <NA>     19
##  4 martin       luther  king         jr           <NA>     19
##  5 new          york    stock        exchange     <NA>     16
##  6 g            protein g            carbohydrate <NA>     15
##  7 protein      g       carbohydrate g            <NA>     15
##  8 rock         roll    hall         fame         <NA>     14
##  9 centers      disease control      prevention   <NA>     13
## 10 love         u       love         u            <NA>     13
# Top 25 tjree-words token
quadgram_repo %>%
  top_n(25, n) %>%
  mutate(quadgram = reorder(quadgram, n)) %>%
  ggplot(aes(x = n, y = quadgram)) + geom_col()

Save n-gram files

saveRDS(bigrams_separated, "./final/en_US/dictionary_5pct/bi_words.rds")
saveRDS(trigrams_separated, "./final/en_US/dictionary_5pct/tri_words.rds")
saveRDS(quadgram_separated, "./final/en_US/dictionary_5pct/quad_words.rds")
end <- Sys.time()
end - start
## Time difference of 6.089633 mins

SessionInfo

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggplot2_3.3.2      stringi_1.5.3      tidyr_1.1.2        wordcloud_2.6     
##  [5] RColorBrewer_1.1-2 SnowballC_0.7.0    tidytext_0.3.0     dplyr_1.0.2       
##  [9] ngram_3.0.4        data.table_1.13.4  tm_0.7-8           NLP_0.2-1         
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.5        compiler_3.6.3    pillar_1.4.7      tokenizers_0.2.1 
##  [5] tools_3.6.3       digest_0.6.27     gtable_0.3.0      evaluate_0.14    
##  [9] lifecycle_0.2.0   tibble_3.0.4      lattice_0.20-41   pkgconfig_2.0.3  
## [13] rlang_0.4.9       Matrix_1.2-18     cli_2.2.0         yaml_2.2.1       
## [17] parallel_3.6.3    xfun_0.19         withr_2.3.0       janeaustenr_0.1.5
## [21] stringr_1.4.0     xml2_1.3.2        knitr_1.30        generics_0.1.0   
## [25] vctrs_0.3.5       grid_3.6.3        tidyselect_1.1.0  glue_1.4.2       
## [29] R6_2.5.0          fansi_0.4.1       rmarkdown_2.6     farver_2.0.3     
## [33] purrr_0.3.4       magrittr_2.0.1    scales_1.1.1      ellipsis_0.3.1   
## [37] htmltools_0.5.0   assertthat_0.2.1  colorspace_2.0-0  labeling_0.4.2   
## [41] utf8_1.1.4        munsell_0.5.0     slam_0.1-48       crayon_1.3.4