For markdown writing, credit to reference

Getting the data

Load the data

To save knitting time, I downloaded the data manually

start <- Sys.time()
enUSnews.path <- "./final/en_US/en_US.news.txt"
enUStwitter.path <- "./final/en_US/en_US.twitter.txt"
enUSblogs.path <- "./final/en_US/en_US.blogs.txt"

enUSnews <- readLines(enUSnews.path, skipNul = TRUE)
enUStwitter <- readLines(enUStwitter.path, skipNul = TRUE)
enUSblogs <- readLines(enUSblogs.path, skipNul = TRUE)

Check file sizes in Mb

news.info <- file.info(enUSnews.path)
twitter.info <- file.info(enUStwitter.path)
blog.info <- file.info(enUSblogs.path)

news.Mb <- round(news.info$size/1024^2)
twitters.Mb <- round(twitter.info$size/1024^2)
blogs.Mb <- round(blog.info$size/1024^2)

data.frame(news_Mb = news.Mb, twitters_Mb = twitters.Mb, blogs_Mb = blogs.Mb)
##   news_Mb twitters_Mb blogs_Mb
## 1     196         159      200

Number of lines in each file

news.lines <- length(enUSnews)
twitters.lines <- length(enUStwitter)
blogs.lines <- length(enUSblogs)
data.frame(news_line = news.lines,
           twitters_line = twitters.lines,
           blogs_line = blogs.lines)
##   news_line twitters_line blogs_line
## 1   1010242       2360148     899288

Number of characters per line

news.nchar <- nchar(enUSnews)
twitter.nchar <- nchar(enUStwitter)
blog.nchar <- nchar(enUSblogs)
boxplot(log2(news.nchar), log2(twitter.nchar), log2(blog.nchar),
        ylab = "nchar per line in log2 transform",
        names = c("news", "twitters", "blogs"),
        main = "en_US_dataset")

## Total characters per file

news.nchar.sum <- sum(nchar(enUSnews))
twitter.nchar.sum <- sum(nchar(enUStwitter))
blog.nchar.sum <- sum(nchar(enUSblogs))
data.frame(news_nchar_sum = news.nchar.sum,
           twitter_nchar_sum = twitter.nchar.sum,
           blog_nchar_sum = blog.nchar.sum)
##   news_nchar_sum twitter_nchar_sum blog_nchar_sum
## 1      203223159         162096241      206824505

Number of total words in each file

suppressMessages(library(ngram))
news.words <- wordcount(enUSnews, sep = " ")
twitters.words <- wordcount(enUStwitter, sep = " ")
blogs.words <- wordcount(enUSblogs, sep = " ")

data.frame(news_words = news.words,
           twitters_words = twitters.words,
           blogs_words = blogs.words)
##   news_words twitters_words blogs_words
## 1   34372530       30373583    37334131

Summarize above stats

enUSsumm <- data.frame(file_names = c("news", "twitters", "blogs"),
                       file_size = c(news.Mb, twitters.Mb, blogs.Mb),
                       file_lines = c(news.lines, twitters.lines, blogs.lines),
                       n_character = c(news.nchar.sum, twitter.nchar.sum, blog.nchar.sum),
                       n_words = c(news.words, twitters.words, blogs.words)
                       )

# Add stats columns
suppressMessages(library(dplyr))
enUSsumm <- enUSsumm %>% mutate(pct_n_character = round(n_character/sum(n_character), 2))
enUSsumm <- enUSsumm %>% mutate(pct_n_words = round(n_words/sum(n_words), 2))
enUSsumm
##   file_names file_size file_lines n_character  n_words pct_n_character
## 1       news       196    1010242   203223159 34372530            0.36
## 2   twitters       159    2360148   162096241 30373583            0.28
## 3      blogs       200     899288   206824505 37334131            0.36
##   pct_n_words
## 1        0.34
## 2        0.30
## 3        0.37

Fractionize 5% of the raw files

frac = 0.05
set.seed(2021)

news.sample <- sample(enUSnews, round(news.lines * frac))
twitters.sample <- sample(enUStwitter, round(news.lines * frac))
blogs.sample <- sample(enUSblogs, round(news.lines * frac))

combined.sample <- c(news.sample, twitters.sample, blogs.sample)

Save the sampled files

writeLines(news.sample, "./final/en_US/en_US.news.sample.txt")
writeLines(twitters.sample, "./final/en_US/en_US.twitters.sample.txt")
writeLines(blogs.sample, "./final/en_US/en_US.blogs.sample.txt")
writeLines(combined.sample, "./final/en_US/en_US.combined.sample.txt")

Cleaning the data

Clean the combined sample data using tm

suppressMessages(library(tm))
combined.sample.clean <- Corpus(VectorSource(combined.sample))

Remove URLs,emojis,non-english words,punctuations,numbers,whitespace, stop words and profanity.

# convert to lower cases
combined.sample.clean <- tm_map(combined.sample.clean, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(combined.sample.clean,
## content_transformer(tolower)): transformation drops documents
# remove Urls
removeUrl <- function(x) gsub("http[^[:space:]]*", "", x)
combined.sample.clean <- tm_map(combined.sample.clean, content_transformer(removeUrl))
## Warning in tm_map.SimpleCorpus(combined.sample.clean,
## content_transformer(removeUrl)): transformation drops documents
# remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
combined.sample.clean <- tm_map(combined.sample.clean, content_transformer(removeNumPunct))
## Warning in tm_map.SimpleCorpus(combined.sample.clean,
## content_transformer(removeNumPunct)): transformation drops documents
# remove what would be emojis
combined.sample.clean <- tm_map(combined.sample.clean, content_transformer(gsub), pattern = "\\W", replace = " ")
## Warning in tm_map.SimpleCorpus(combined.sample.clean,
## content_transformer(gsub), : transformation drops documents
# remove extra white space
combined.sample.clean <- tm_map(combined.sample.clean, stripWhitespace)
## Warning in tm_map.SimpleCorpus(combined.sample.clean, stripWhitespace):
## transformation drops documents
# remove stop words
combined.sample.clean <- tm_map(combined.sample.clean, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(combined.sample.clean, removeWords,
## stopwords("english")): transformation drops documents
# remove punctuations
combined.sample.clean <- tm_map(combined.sample.clean, removePunctuation)
## Warning in tm_map.SimpleCorpus(combined.sample.clean, removePunctuation):
## transformation drops documents
# remove numbers
combined.sample.clean <- tm_map(combined.sample.clean, removeNumbers)
## Warning in tm_map.SimpleCorpus(combined.sample.clean, removeNumbers):
## transformation drops documents
# remove profanity
profanity <- read.table("./final/en_US/list.txt", header = F, sep = "\n")
combined.sample.clean <- tm_map(combined.sample.clean, removeWords, profanity$V1)
## Warning in tm_map.SimpleCorpus(combined.sample.clean, removeWords,
## profanity$V1): transformation drops documents

Save clean Corpus

saveRDS(combined.sample.clean, file = "./final/en_US/clean_sample.rds")

Exploratory Data Analysis

To analyze the textual data, we need a Document-Term Matrix (DTM) format: documents are as the rows, terms/words as the columns, frequency of the term in the document as the entries.

sample.clean <- readRDS("./final/en_US/clean_sample.rds")
sample.clean.dtm <- DocumentTermMatrix(sample.clean)
sample.clean.dtm
## <<DocumentTermMatrix (documents: 151536, terms: 126698)>>
## Non-/sparse entries: 2205188/19197102940
## Sparsity           : 100%
## Maximal term length: 91
## Weighting          : term frequency (tf)
inspect(sample.clean.dtm[100:105, 100:105])
## <<DocumentTermMatrix (documents: 6, terms: 6)>>
## Non-/sparse entries: 0/36
## Sparsity           : 100%
## Maximal term length: 14
## Weighting          : term frequency (tf)
## Sample             :
##      Terms
## Docs  meet networks ocs octastefestcom okeefe olhats
##   100    0        0   0              0      0      0
##   101    0        0   0              0      0      0
##   102    0        0   0              0      0      0
##   103    0        0   0              0      0      0
##   104    0        0   0              0      0      0
##   105    0        0   0              0      0      0

To reduce the dimension of DTM, remove the less frequent terms such that the sparsity is less than 0.95

sample.clean.dtm <- removeSparseTerms(sample.clean.dtm, sparse = 0.99)
inspect(sample.clean.dtm[100:105, 100:105])
## <<DocumentTermMatrix (documents: 6, terms: 6)>>
## Non-/sparse entries: 1/35
## Sparsity           : 97%
## Maximal term length: 6
## Weighting          : term frequency (tf)
## Sample             :
##      Terms
## Docs  enough lot really set sure yet
##   100      0   0      0   1    0   0
##   101      0   0      0   0    0   0
##   102      0   0      0   0    0   0
##   103      0   0      0   0    0   0
##   104      0   0      0   0    0   0
##   105      0   0      0   0    0   0

Most frequent and least frequent words

suppressMessages(library(data.table))
freq <- data.frame(sort(colSums(as.matrix(sample.clean.dtm)), decreasing = TRUE))
colnames(freq) <- "counts"
freq$words <- rownames(freq)
rownames(freq) <- NULL
# 10 most frequent words
head(freq, 10)
##    counts  words
## 1   14914   said
## 2   13590   will
## 3   12940    one
## 4   11187   just
## 5   10569   like
## 6   10172    can
## 7    9060   time
## 8    8661    get
## 9    7921    new
## 10   6836 people
# 10 least frequent words
tail(freq, 10)
##     counts    words
## 132   1718   second
## 133   1695    point
## 134   1684   making
## 135   1671     done
## 136   1659 everyone
## 137   1642      yet
## 138   1626   called
## 139   1597  already
## 140   1589      lol
## 141   1574    least

Plot a simple word cloud

suppressMessages(library(SnowballC))
suppressMessages(library(wordcloud))

wordcloud(freq$words, freq$counts, max.words = 100, colors = brewer.pal(1, "Accent"),
          random.order = FALSE)
## Warning in brewer.pal(1, "Accent"): minimal value for n is 3, returning requested palette with 3 different levels

Plot most frequent words

suppressMessages(library(ggplot2))

ggplot(freq[freq$counts > 7000, ], aes(x = reorder(words, -counts), y = counts)) + 
         geom_bar(stat = "identity", fill = "steelblue") + 
         labs(title = "Most frequent wrods (occurance > 7000)",
          x = "words")

Calculate time spent

end <- Sys.time()
ellapsed <- end - start

Session Info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.3.2      wordcloud_2.6      RColorBrewer_1.1-2 SnowballC_0.7.0   
## [5] data.table_1.13.4  tm_0.7-8           NLP_0.2-1          dplyr_1.0.2       
## [9] ngram_3.0.4       
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.5       pillar_1.4.7     compiler_3.6.3   tools_3.6.3     
##  [5] digest_0.6.27    evaluate_0.14    lifecycle_0.2.0  tibble_3.0.4    
##  [9] gtable_0.3.0     pkgconfig_2.0.3  rlang_0.4.9      yaml_2.2.1      
## [13] parallel_3.6.3   xfun_0.19        withr_2.3.0      stringr_1.4.0   
## [17] knitr_1.30       xml2_1.3.2       generics_0.1.0   vctrs_0.3.5     
## [21] grid_3.6.3       tidyselect_1.1.0 glue_1.4.2       R6_2.5.0        
## [25] rmarkdown_2.6    farver_2.0.3     purrr_0.3.4      magrittr_2.0.1  
## [29] scales_1.1.1     ellipsis_0.3.1   htmltools_0.5.0  colorspace_2.0-0
## [33] labeling_0.4.2   stringi_1.5.3    munsell_0.5.0    slam_0.1-48     
## [37] crayon_1.3.4