For markdown writing, credit to reference

Getting the data

Load the data

To save knitting time, I downloaded the data manually

start <- Sys.time()
enUSnews.path <- "./final/en_US/en_US.news.txt"
enUStwitter.path <- "./final/en_US/en_US.twitter.txt"
enUSblogs.path <- "./final/en_US/en_US.blogs.txt"

enUSnews <- readLines(enUSnews.path, skipNul = TRUE)
enUStwitter <- readLines(enUStwitter.path, skipNul = TRUE)
enUSblogs <- readLines(enUSblogs.path, skipNul = TRUE)

Check file sizes in Mb

news.info <- file.info(enUSnews.path)
twitter.info <- file.info(enUStwitter.path)
blog.info <- file.info(enUSblogs.path)

news.Mb <- round(news.info$size/1024^2)
twitters.Mb <- round(twitter.info$size/1024^2)
blogs.Mb <- round(blog.info$size/1024^2)

data.frame(news_Mb = news.Mb, twitters_Mb = twitters.Mb, blogs_Mb = blogs.Mb)

##   news_Mb twitters_Mb blogs_Mb
## 1     196         159      200

Number of lines in each file

news.lines <- length(enUSnews)
twitters.lines <- length(enUStwitter)
blogs.lines <- length(enUSblogs)
data.frame(news_line = news.lines,
           twitters_line = twitters.lines,
           blogs_line = blogs.lines)

##   news_line twitters_line blogs_line
## 1   1010242       2360148     899288

Number of characters per line

news.nchar <- nchar(enUSnews)
twitter.nchar <- nchar(enUStwitter)
blog.nchar <- nchar(enUSblogs)
boxplot(log2(news.nchar), log2(twitter.nchar), log2(blog.nchar),
        ylab = "nchar per line in log2 transform",
        names = c("news", "twitters", "blogs"),
        main = "en_US_dataset")

## Total characters per file

news.nchar.sum <- sum(nchar(enUSnews))
twitter.nchar.sum <- sum(nchar(enUStwitter))
blog.nchar.sum <- sum(nchar(enUSblogs))
data.frame(news_nchar_sum = news.nchar.sum,
           twitter_nchar_sum = twitter.nchar.sum,
           blog_nchar_sum = blog.nchar.sum)

##   news_nchar_sum twitter_nchar_sum blog_nchar_sum
## 1      203223159         162096241      206824505

Number of total words in each file

suppressMessages(library(ngram))
news.words <- wordcount(enUSnews, sep = " ")
twitters.words <- wordcount(enUStwitter, sep = " ")
blogs.words <- wordcount(enUSblogs, sep = " ")

data.frame(news_words = news.words,
           twitters_words = twitters.words,
           blogs_words = blogs.words)

##   news_words twitters_words blogs_words
## 1   34372530       30373583    37334131

Summarize above stats

enUSsumm <- data.frame(file_names = c("news", "twitters", "blogs"),
                       file_size = c(news.Mb, twitters.Mb, blogs.Mb),
                       file_lines = c(news.lines, twitters.lines, blogs.lines),
                       n_character = c(news.nchar.sum, twitter.nchar.sum, blog.nchar.sum),
                       n_words = c(news.words, twitters.words, blogs.words)
                       )

# Add stats columns
suppressMessages(library(dplyr))
enUSsumm <- enUSsumm %>% mutate(pct_n_character = round(n_character/sum(n_character), 2))
enUSsumm <- enUSsumm %>% mutate(pct_n_words = round(n_words/sum(n_words), 2))
enUSsumm

##   file_names file_size file_lines n_character  n_words pct_n_character
## 1       news       196    1010242   203223159 34372530            0.36
## 2   twitters       159    2360148   162096241 30373583            0.28
## 3      blogs       200     899288   206824505 37334131            0.36
##   pct_n_words
## 1        0.34
## 2        0.30
## 3        0.37

Fractionize 5% of the raw files

frac = 0.05
set.seed(2021)

news.sample <- sample(enUSnews, round(news.lines * frac))
twitters.sample <- sample(enUStwitter, round(news.lines * frac))
blogs.sample <- sample(enUSblogs, round(news.lines * frac))

combined.sample <- c(news.sample, twitters.sample, blogs.sample)

Save the sampled files

writeLines(news.sample, "./final/en_US/en_US.news.sample.txt")
writeLines(twitters.sample, "./final/en_US/en_US.twitters.sample.txt")
writeLines(blogs.sample, "./final/en_US/en_US.blogs.sample.txt")
writeLines(combined.sample, "./final/en_US/en_US.combined.sample.txt")

Cleaning the data

Clean the combined sample data using tm

suppressMessages(library(tm))
combined.sample.clean <- Corpus(VectorSource(combined.sample))

Remove URLs,emojis,non-english words,punctuations,numbers,whitespace, stop words and profanity.

# convert to lower cases
combined.sample.clean <- tm_map(combined.sample.clean, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(combined.sample.clean,
## content_transformer(tolower)): transformation drops documents

# remove Urls
removeUrl <- function(x) gsub("http[^[:space:]]*", "", x)
combined.sample.clean <- tm_map(combined.sample.clean, content_transformer(removeUrl))

## Warning in tm_map.SimpleCorpus(combined.sample.clean,
## content_transformer(removeUrl)): transformation drops documents

# remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
combined.sample.clean <- tm_map(combined.sample.clean, content_transformer(removeNumPunct))

## Warning in tm_map.SimpleCorpus(combined.sample.clean,
## content_transformer(removeNumPunct)): transformation drops documents

# remove what would be emojis
combined.sample.clean <- tm_map(combined.sample.clean, content_transformer(gsub), pattern = "\\W", replace = " ")

## Warning in tm_map.SimpleCorpus(combined.sample.clean,
## content_transformer(gsub), : transformation drops documents

# remove extra white space
combined.sample.clean <- tm_map(combined.sample.clean, stripWhitespace)

## Warning in tm_map.SimpleCorpus(combined.sample.clean, stripWhitespace):
## transformation drops documents

# remove stop words
combined.sample.clean <- tm_map(combined.sample.clean, removeWords, stopwords("english"))

## Warning in tm_map.SimpleCorpus(combined.sample.clean, removeWords,
## stopwords("english")): transformation drops documents

# remove punctuations
combined.sample.clean <- tm_map(combined.sample.clean, removePunctuation)

## Warning in tm_map.SimpleCorpus(combined.sample.clean, removePunctuation):
## transformation drops documents

# remove numbers
combined.sample.clean <- tm_map(combined.sample.clean, removeNumbers)

## Warning in tm_map.SimpleCorpus(combined.sample.clean, removeNumbers):
## transformation drops documents

# remove profanity
profanity <- read.table("./final/en_US/list.txt", header = F, sep = "\n")
combined.sample.clean <- tm_map(combined.sample.clean, removeWords, profanity$V1)

## Warning in tm_map.SimpleCorpus(combined.sample.clean, removeWords,
## profanity$V1): transformation drops documents

Save clean Corpus

saveRDS(combined.sample.clean, file = "./final/en_US/clean_sample.rds")

Exploratory Data Analysis

To analyze the textual data, we need a Document-Term Matrix (DTM) format: documents are as the rows, terms/words as the columns, frequency of the term in the document as the entries.

sample.clean <- readRDS("./final/en_US/clean_sample.rds")
sample.clean.dtm <- DocumentTermMatrix(sample.clean)
sample.clean.dtm

## <<DocumentTermMatrix (documents: 151536, terms: 126698)>>
## Non-/sparse entries: 2205188/19197102940
## Sparsity           : 100%
## Maximal term length: 91
## Weighting          : term frequency (tf)

inspect(sample.clean.dtm[100:105, 100:105])

## <<DocumentTermMatrix (documents: 6, terms: 6)>>
## Non-/sparse entries: 0/36
## Sparsity           : 100%
## Maximal term length: 14
## Weighting          : term frequency (tf)
## Sample             :
##      Terms
## Docs  meet networks ocs octastefestcom okeefe olhats
##   100    0        0   0              0      0      0
##   101    0        0   0              0      0      0
##   102    0        0   0              0      0      0
##   103    0        0   0              0      0      0
##   104    0        0   0              0      0      0
##   105    0        0   0              0      0      0

To reduce the dimension of DTM, remove the less frequent terms such that the sparsity is less than 0.95

sample.clean.dtm <- removeSparseTerms(sample.clean.dtm, sparse = 0.99)
inspect(sample.clean.dtm[100:105, 100:105])

## <<DocumentTermMatrix (documents: 6, terms: 6)>>
## Non-/sparse entries: 1/35
## Sparsity           : 97%
## Maximal term length: 6
## Weighting          : term frequency (tf)
## Sample             :
##      Terms
## Docs  enough lot really set sure yet
##   100      0   0      0   1    0   0
##   101      0   0      0   0    0   0
##   102      0   0      0   0    0   0
##   103      0   0      0   0    0   0
##   104      0   0      0   0    0   0
##   105      0   0      0   0    0   0

Most frequent and least frequent words

suppressMessages(library(data.table))
freq <- data.frame(sort(colSums(as.matrix(sample.clean.dtm)), decreasing = TRUE))
colnames(freq) <- "counts"
freq$words <- rownames(freq)
rownames(freq) <- NULL
# 10 most frequent words
head(freq, 10)

##    counts  words
## 1   14914   said
## 2   13590   will
## 3   12940    one
## 4   11187   just
## 5   10569   like
## 6   10172    can
## 7    9060   time
## 8    8661    get
## 9    7921    new
## 10   6836 people

# 10 least frequent words
tail(freq, 10)

##     counts    words
## 132   1718   second
## 133   1695    point
## 134   1684   making
## 135   1671     done
## 136   1659 everyone
## 137   1642      yet
## 138   1626   called
## 139   1597  already
## 140   1589      lol
## 141   1574    least

Plot a simple word cloud

suppressMessages(library(SnowballC))
suppressMessages(library(wordcloud))

wordcloud(freq$words, freq$counts, max.words = 100, colors = brewer.pal(1, "Accent"),
          random.order = FALSE)

## Warning in brewer.pal(1, "Accent"): minimal value for n is 3, returning requested palette with 3 different levels

Plot most frequent words

suppressMessages(library(ggplot2))

ggplot(freq[freq$counts > 7000, ], aes(x = reorder(words, -counts), y = counts)) + 
         geom_bar(stat = "identity", fill = "steelblue") + 
         labs(title = "Most frequent wrods (occurance > 7000)",
          x = "words")

Calculate time spent

end <- Sys.time()
ellapsed <- end - start

Session Info

sessionInfo()

## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.3.2      wordcloud_2.6      RColorBrewer_1.1-2 SnowballC_0.7.0   
## [5] data.table_1.13.4  tm_0.7-8           NLP_0.2-1          dplyr_1.0.2       
## [9] ngram_3.0.4       
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.5       pillar_1.4.7     compiler_3.6.3   tools_3.6.3     
##  [5] digest_0.6.27    evaluate_0.14    lifecycle_0.2.0  tibble_3.0.4    
##  [9] gtable_0.3.0     pkgconfig_2.0.3  rlang_0.4.9      yaml_2.2.1      
## [13] parallel_3.6.3   xfun_0.19        withr_2.3.0      stringr_1.4.0   
## [17] knitr_1.30       xml2_1.3.2       generics_0.1.0   vctrs_0.3.5     
## [21] grid_3.6.3       tidyselect_1.1.0 glue_1.4.2       R6_2.5.0        
## [25] rmarkdown_2.6    farver_2.0.3     purrr_0.3.4      magrittr_2.0.1  
## [29] scales_1.1.1     ellipsis_0.3.1   htmltools_0.5.0  colorspace_2.0-0
## [33] labeling_0.4.2   stringi_1.5.3    munsell_0.5.0    slam_0.1-48     
## [37] crayon_1.3.4

Week2 - Milestone Report

Tianming

1/24/2021

Getting the data

Load the data

Check file sizes in Mb

Number of lines in each file

Number of characters per line

Number of total words in each file

Summarize above stats

Fractionize 5% of the raw files

Save the sampled files

Cleaning the data

Clean the combined sample data using tm

Remove URLs,emojis,non-english words,punctuations,numbers,whitespace, stop words and profanity.

Save clean Corpus

Exploratory Data Analysis

To analyze the textual data, we need a Document-Term Matrix (DTM) format: documents are as the rows, terms/words as the columns, frequency of the term in the document as the entries.

To reduce the dimension of DTM, remove the less frequent terms such that the sparsity is less than 0.95

Most frequent and least frequent words

Plot a simple word cloud

Plot most frequent words

Calculate time spent

Session Info