Credit for the markdown write-up goes to the references. To save knitting time, I downloaded the data manually instead of fetching it during the knit.
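For reproducibility, the download could also be scripted. A minimal sketch, assuming the usual Coursera SwiftKey capstone URL is still live:

# Sketch: fetch and unpack the dataset only if ./final is missing
zip.url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!dir.exists("./final")) {
  download.file(zip.url, "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")  # creates ./final/en_US/ among others
}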
# Record the start time so total run time can be reported at the end
start <- Sys.time()
enUSnews.path <- "./final/en_US/en_US.news.txt"
enUStwitter.path <- "./final/en_US/en_US.twitter.txt"
enUSblogs.path <- "./final/en_US/en_US.blogs.txt"
# Read the three en_US corpora; skipNul drops embedded NUL characters
enUSnews <- readLines(enUSnews.path, skipNul = TRUE)
enUStwitter <- readLines(enUStwitter.path, skipNul = TRUE)
enUSblogs <- readLines(enUSblogs.path, skipNul = TRUE)
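readLines with the default native encoding can garble non-ASCII characters on some platforms. A more defensive variant (a sketch, not what was run here) declares the files as UTF-8 explicitly:

# Sketch: read one file through a connection that re-encodes from UTF-8
con <- file(enUSnews.path, open = "r", encoding = "UTF-8")
enUSnews <- readLines(con, skipNul = TRUE)
close(con)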
# File sizes on disk, converted from bytes to megabytes
news.info <- file.info(enUSnews.path)
twitter.info <- file.info(enUStwitter.path)
blog.info <- file.info(enUSblogs.path)
news.Mb <- round(news.info$size/1024^2)
twitters.Mb <- round(twitter.info$size/1024^2)
blogs.Mb <- round(blog.info$size/1024^2)
data.frame(news_Mb = news.Mb, twitters_Mb = twitters.Mb, blogs_Mb = blogs.Mb)
## news_Mb twitters_Mb blogs_Mb
## 1 196 159 200
# Number of lines per file
news.lines <- length(enUSnews)
twitters.lines <- length(enUStwitter)
blogs.lines <- length(enUSblogs)
data.frame(news_line = news.lines,
twitters_line = twitters.lines,
blogs_line = blogs.lines)
## news_line twitters_line blogs_line
## 1 1010242 2360148 899288
# Characters per line, shown as boxplots on the log2 scale
news.nchar <- nchar(enUSnews)
twitter.nchar <- nchar(enUStwitter)
blog.nchar <- nchar(enUSblogs)
boxplot(log2(news.nchar), log2(twitter.nchar), log2(blog.nchar),
        ylab = "Characters per line (log2)",
        names = c("news", "twitters", "blogs"),
        main = "en_US dataset")
# Total characters per file, reusing the per-line counts computed above
news.nchar.sum <- sum(news.nchar)
twitter.nchar.sum <- sum(twitter.nchar)
blog.nchar.sum <- sum(blog.nchar)
data.frame(news_nchar_sum = news.nchar.sum,
twitter_nchar_sum = twitter.nchar.sum,
blog_nchar_sum = blog.nchar.sum)
## news_nchar_sum twitter_nchar_sum blog_nchar_sum
## 1 203223159 162096241 206824505
suppressMessages(library(ngram))
# Word counts per file (space-separated tokens)
news.words <- wordcount(enUSnews, sep = " ")
twitters.words <- wordcount(enUStwitter, sep = " ")
blogs.words <- wordcount(enUSblogs, sep = " ")
data.frame(news_words = news.words,
twitters_words = twitters.words,
blogs_words = blogs.words)
## news_words twitters_words blogs_words
## 1 34372530 30373583 37334131
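Since the ngram package is loaded anyway, here is a glimpse of how n-gram counts will eventually be built. This is only a sketch on a tiny slice; feeding a whole corpus to ngram() as one string would exhaust memory.

# Sketch: bigram frequencies on a small slice of the news corpus
tiny <- concatenate(enUSnews[1:100])     # ngram() expects a single string
bigrams <- ngram(tiny, n = 2)
head(get.phrasetable(bigrams), 5)        # bigrams with counts and proportions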
# Combine the per-file statistics into one summary table
enUSsumm <- data.frame(file_names = c("news", "twitters", "blogs"),
file_size = c(news.Mb, twitters.Mb, blogs.Mb),
file_lines = c(news.lines, twitters.lines, blogs.lines),
n_character = c(news.nchar.sum, twitter.nchar.sum, blog.nchar.sum),
n_words = c(news.words, twitters.words, blogs.words)
)
# Add each file's share of the total character and word counts
suppressMessages(library(dplyr))
enUSsumm <- enUSsumm %>%
  mutate(pct_n_character = round(n_character/sum(n_character), 2),
         pct_n_words = round(n_words/sum(n_words), 2))
enUSsumm
## file_names file_size file_lines n_character n_words pct_n_character
## 1 news 196 1010242 203223159 34372530 0.36
## 2 twitters 159 2360148 162096241 30373583 0.28
## 3 blogs 200 899288 206824505 37334131 0.36
## pct_n_words
## 1 0.34
## 2 0.30
## 3 0.37
# Draw a 5% random sample of lines from each file
# (each sample is sized by its own file's line count)
frac <- 0.05
set.seed(2021)
news.sample <- sample(enUSnews, round(news.lines * frac))
twitters.sample <- sample(enUStwitter, round(twitters.lines * frac))
blogs.sample <- sample(enUSblogs, round(blogs.lines * frac))
combined.sample <- c(news.sample, twitters.sample, blogs.sample)
writeLines(news.sample, "./final/en_US/en_US.news.sample.txt")
writeLines(twitters.sample, "./final/en_US/en_US.twitters.sample.txt")
writeLines(blogs.sample, "./final/en_US/en_US.blogs.sample.txt")
writeLines(combined.sample, "./final/en_US/en_US.combined.sample.txt")
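As an aside, sample() draws a fixed-size subset without replacement; an alternative often suggested for this task is an independent coin flip per line, which streams naturally but yields a sample of random size. A sketch:

# Sketch: keep each line independently with probability frac
keep <- rbinom(news.lines, size = 1, prob = frac) == 1
news.sample.alt <- enUSnews[keep]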
suppressMessages(library(tm))
combined.sample.clean <- Corpus(VectorSource(combined.sample))
# convert to lower case
combined.sample.clean <- tm_map(combined.sample.clean, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(combined.sample.clean,
## content_transformer(tolower)): transformation drops documents
tm emits this harmless "transformation drops documents" warning for every tm_map call on a SimpleCorpus; the identical warnings from the remaining cleaning steps are omitted below.
# remove URLs
removeUrl <- function(x) gsub("http[^[:space:]]*", "", x)
combined.sample.clean <- tm_map(combined.sample.clean, content_transformer(removeUrl))
# remove anything other than letters and whitespace
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
combined.sample.clean <- tm_map(combined.sample.clean, content_transformer(removeNumPunct))
# replace any remaining non-word characters (emoji residue and other symbols) with spaces
combined.sample.clean <- tm_map(combined.sample.clean, content_transformer(gsub), pattern = "\\W", replacement = " ")
# remove extra white space
combined.sample.clean <- tm_map(combined.sample.clean, stripWhitespace)
# remove stop words (note: because punctuation was stripped earlier, contractions
# such as "don't" have already become "dont" and will not match the stopword list)
combined.sample.clean <- tm_map(combined.sample.clean, removeWords, stopwords("english"))
# remove punctuation (largely redundant after removeNumPunct above, kept as a safeguard)
combined.sample.clean <- tm_map(combined.sample.clean, removePunctuation)
# remove numbers (likewise mostly redundant, kept as a safeguard)
combined.sample.clean <- tm_map(combined.sample.clean, removeNumbers)
# remove profanity, using a one-word-per-line blacklist file
profanity <- read.table("./final/en_US/list.txt", header = FALSE, sep = "\n",
                        stringsAsFactors = FALSE)
combined.sample.clean <- tm_map(combined.sample.clean, removeWords, profanity$V1)
# Cache the cleaned corpus so later chunks can reload it quickly
saveRDS(combined.sample.clean, file = "./final/en_US/clean_sample.rds")
sample.clean <- readRDS("./final/en_US/clean_sample.rds")
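Saving and immediately reloading works, but the expensive cleaning still reruns on every knit. A guard like this sketch would skip it whenever the cache already exists:

# Sketch: rebuild the cleaned corpus only when no cached copy exists
rds.path <- "./final/en_US/clean_sample.rds"
if (!file.exists(rds.path)) {
  # ... run the tm_map cleaning pipeline above, then:
  saveRDS(combined.sample.clean, file = rds.path)
}
sample.clean <- readRDS(rds.path)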
# Build the document-term matrix from the cleaned sample
sample.clean.dtm <- DocumentTermMatrix(sample.clean)
sample.clean.dtm
## <<DocumentTermMatrix (documents: 151536, terms: 126698)>>
## Non-/sparse entries: 2205188/19197102940
## Sparsity : 100%
## Maximal term length: 91
## Weighting : term frequency (tf)
inspect(sample.clean.dtm[100:105, 100:105])
## <<DocumentTermMatrix (documents: 6, terms: 6)>>
## Non-/sparse entries: 0/36
## Sparsity : 100%
## Maximal term length: 14
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs meet networks ocs octastefestcom okeefe olhats
## 100 0 0 0 0 0 0
## 101 0 0 0 0 0 0
## 102 0 0 0 0 0 0
## 103 0 0 0 0 0 0
## 104 0 0 0 0 0 0
## 105 0 0 0 0 0 0
# Drop terms absent from 99% or more of documents (i.e., keep terms in over 1% of docs)
sample.clean.dtm <- removeSparseTerms(sample.clean.dtm, sparse = 0.99)
inspect(sample.clean.dtm[100:105, 100:105])
## <<DocumentTermMatrix (documents: 6, terms: 6)>>
## Non-/sparse entries: 1/35
## Sparsity : 97%
## Maximal term length: 6
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs enough lot really set sure yet
## 100 0 0 0 1 0 0
## 101 0 0 0 0 0 0
## 102 0 0 0 0 0 0
## 103 0 0 0 0 0 0
## 104 0 0 0 0 0 0
## 105 0 0 0 0 0 0
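For intuition, sparse = 0.99 keeps a term only if it appears in more than 1% of documents, which is why even the "least frequent" surviving terms below still have counts in the thousands. The implied cutoff can be computed directly (a sketch):

# Sketch: document-frequency cutoff implied by sparse = 0.99
n.docs <- nDocs(sample.clean.dtm)          # removeSparseTerms keeps all documents
min.docs <- ceiling((1 - 0.99) * n.docs)   # roughly the minimum docs a term must appear in
min.docs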
suppressMessages(library(data.table))
# Term frequencies, sorted from most to least common
freq <- data.frame(sort(colSums(as.matrix(sample.clean.dtm)), decreasing = TRUE))
colnames(freq) <- "counts"
freq$words <- rownames(freq)
rownames(freq) <- NULL
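Converting the DTM with as.matrix() is only feasible because the vocabulary was just cut to about 141 terms; densifying the full 126,698-term matrix would take well over 100 GB. The slam package (a tm dependency) can sum the sparse representation directly; a sketch:

# Sketch: column sums without densifying the DTM; works on the full matrix too
freq.sparse <- sort(slam::col_sums(sample.clean.dtm), decreasing = TRUE)
head(freq.sparse)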
# 10 most frequent words
head(freq, 10)
## counts words
## 1 14914 said
## 2 13590 will
## 3 12940 one
## 4 11187 just
## 5 10569 like
## 6 10172 can
## 7 9060 time
## 8 8661 get
## 9 7921 new
## 10 6836 people
# 10 least frequent of the terms that survived the sparsity filter
tail(freq, 10)
## counts words
## 132 1718 second
## 133 1695 point
## 134 1684 making
## 135 1671 done
## 136 1659 everyone
## 137 1642 yet
## 138 1626 called
## 139 1597 already
## 140 1589 lol
## 141 1574 least
suppressMessages(library(SnowballC))
suppressMessages(library(wordcloud))
# Word cloud of the 100 most frequent terms (brewer.pal requires n >= 3)
wordcloud(freq$words, freq$counts, max.words = 100, colors = brewer.pal(3, "Accent"),
          random.order = FALSE)
suppressMessages(library(ggplot2))
ggplot(freq[freq$counts > 7000, ], aes(x = reorder(words, -counts), y = counts)) +
  geom_col(fill = "steelblue") +
  labs(title = "Most frequent words (occurrence > 7000)",
       x = "words")
end <- Sys.time()
# Total run time of the analysis
elapsed <- end - start
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_3.3.2 wordcloud_2.6 RColorBrewer_1.1-2 SnowballC_0.7.0
## [5] data.table_1.13.4 tm_0.7-8 NLP_0.2-1 dplyr_1.0.2
## [9] ngram_3.0.4
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.5 pillar_1.4.7 compiler_3.6.3 tools_3.6.3
## [5] digest_0.6.27 evaluate_0.14 lifecycle_0.2.0 tibble_3.0.4
## [9] gtable_0.3.0 pkgconfig_2.0.3 rlang_0.4.9 yaml_2.2.1
## [13] parallel_3.6.3 xfun_0.19 withr_2.3.0 stringr_1.4.0
## [17] knitr_1.30 xml2_1.3.2 generics_0.1.0 vctrs_0.3.5
## [21] grid_3.6.3 tidyselect_1.1.0 glue_1.4.2 R6_2.5.0
## [25] rmarkdown_2.6 farver_2.0.3 purrr_0.3.4 magrittr_2.0.1
## [29] scales_1.1.1 ellipsis_0.3.1 htmltools_0.5.0 colorspace_2.0-0
## [33] labeling_0.4.2 stringi_1.5.3 munsell_0.5.0 slam_0.1-48
## [37] crayon_1.3.4