Required packages
start <- Sys.time()
suppressMessages(library(tm))
suppressMessages(library(data.table))
suppressMessages(library(ngram))
suppressMessages(library(dplyr))
suppressMessages(library(tidytext))
suppressMessages(library(SnowballC))
suppressMessages(library(wordcloud))
suppressMessages(library(tidyr))
suppressMessages(library(stringi))
suppressMessages(library(ggplot2))
sample.dir <- "./final/en_US/sample_5pct//"
sample.5pct <- VCorpus(DirSource(sample.dir))
rm(sample.dir)
# convert to lower cases
corpus <- tm_map(sample.5pct, content_transformer(tolower))
# remove Urls
removeUrl <- function(x) gsub("http[^[:space:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(removeUrl))
# remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(removeNumPunct))
# remove what would be emojis
corpus <- tm_map(corpus, content_transformer(gsub), pattern = "\\W", replace = " ")
# remove extra white space
corpus <- tm_map(corpus, stripWhitespace)
# remove stop words
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# remove punctuations
corpus <- tm_map(corpus, removePunctuation)
# remove numbers
corpus <- tm_map(corpus, removeNumbers)
# remove profanity
profanity <- read.table("./final/en_US/list.txt", header = F, sep = "\n")
corpus <- tm_map(corpus, removeWords, profanity$V1)
corpus_blog <- tibble(text = corpus[["blogs_sample.txt"]][["content"]], source = "blog")
corpus_news <- tibble(text = corpus[["news_sample.txt"]][["content"]], source = "news")
corpus_twitter <- tibble(text = corpus[["twitters_sample.txt"]][["content"]], source = "twitter")
clean_sample <- bind_rows(corpus_blog, corpus_news, corpus_twitter)
clean_sample$source <- as.factor(clean_sample$source)
rm(profanity, sample.1pct, corpus_blog, corpus_news, corpus_twitter)
## Warning in rm(profanity, sample.1pct, corpus_blog, corpus_news, corpus_twitter):
## object 'sample.1pct' not found
Analysis the tokens
n-gram = 2 top 25 2-words
bigram_repo <- bigrams %>%
count(bigram) %>%
drop_na(bigram) %>%
filter(n >= 1) %>% # Reduce n-gram files
arrange(desc(n))
bigrams_separated <- bigram_repo %>%
separate(bigram, c("word1", "word2", sep = " ")) %>%
select_if(~sum(!is.na(.)) > 0)
## Warning: Expected 3 pieces. Additional pieces discarded in 10 rows [829939,
## 884562, 950937, 1103036, 1105158, 1245723, 1391263, 1449547, 1506676, 1561817].
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1635431 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Top 10 2-gram words
bigrams_separated[1:10, ]
## # A tibble: 10 x 4
## word1 word2 ` ` n
## <chr> <chr> <chr> <int>
## 1 new york <NA> 926
## 2 last year <NA> 871
## 3 dont know <NA> 805
## 4 right now <NA> 792
## 5 years ago <NA> 684
## 6 high school <NA> 656
## 7 last week <NA> 594
## 8 cant wait <NA> 521
## 9 im going <NA> 520
## 10 feel like <NA> 518
# Top 25 two-words token
bigram_repo %>%
top_n(25, n) %>%
mutate(bigram = reorder(bigram, n)) %>%
ggplot(aes(x = n, y = bigram)) + geom_col()

n-gram = 3. Top 25 3-words
trigram_repo <- trigrams %>%
count(trigram) %>%
drop_na(trigram) %>%
filter(n >= 1) %>% # Reduce n-gram files
arrange(desc(n))
trigrams_separated <- trigram_repo %>%
separate(trigram, c("word1", "word2", "word3", sep = " ")) %>%
select_if(~sum(!is.na(.)) > 0)
## Warning: Expected 4 pieces. Additional pieces discarded in 14 rows [453616,
## 850611, 890278, 914489, 969366, 1088576, 1222944, 1300903, 1303714, 1495120,
## 1871346, 1910870, 1945971, 2073086].
## Warning: Expected 4 pieces. Missing pieces filled with `NA` in 2073025 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Top 10 3-gram words
trigrams_separated[1:10, ]
## # A tibble: 10 x 5
## word1 word2 word3 ` ` n
## <chr> <chr> <chr> <chr> <int>
## 1 new york city <NA> 123
## 2 cant wait see <NA> 100
## 3 happy mothers day <NA> 80
## 4 president barack obama <NA> 78
## 5 two years ago <NA> 76
## 6 let us know <NA> 71
## 7 new york times <NA> 67
## 8 im pretty sure <NA> 62
## 9 happy new year <NA> 54
## 10 world war ii <NA> 54
# Top 25 tjree-words token
trigram_repo %>%
top_n(25, n) %>%
mutate(trigram = reorder(trigram, n)) %>%
ggplot(aes(x = n, y = trigram)) + geom_col()

n-gram = 4 top 25 4-words
quadgram_repo <- quadgrams %>%
count(quadgram) %>%
drop_na(quadgram) %>%
filter(n >= 1) %>% # Reduce n-gram files
arrange(desc(n))
quadgram_separated <- quadgram_repo %>%
separate(quadgram, c("word1", "word2", "word3", "word4", sep = " ")) %>%
select_if(~sum(!is.na(.)) > 0)
## Warning: Expected 5 pieces. Additional pieces discarded in 16 rows [169709,
## 410332, 514601, 648218, 794524, 833767, 910763, 1157739, 1160884, 1233440,
## 1236150, 1420822, 1462475, 1785438, 1838747, 1981288].
## Warning: Expected 5 pieces. Missing pieces filled with `NA` in 1981226 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Top 10 3-gram words
quadgram_separated[1:10, ]
## # A tibble: 10 x 6
## word1 word2 word3 word4 ` ` n
## <chr> <chr> <chr> <chr> <chr> <int>
## 1 incorporated item c pp <NA> 22
## 2 dow jones industrial average <NA> 21
## 3 g fat g saturated <NA> 19
## 4 martin luther king jr <NA> 19
## 5 new york stock exchange <NA> 16
## 6 g protein g carbohydrate <NA> 15
## 7 protein g carbohydrate g <NA> 15
## 8 rock roll hall fame <NA> 14
## 9 centers disease control prevention <NA> 13
## 10 love u love u <NA> 13
# Top 25 tjree-words token
quadgram_repo %>%
top_n(25, n) %>%
mutate(quadgram = reorder(quadgram, n)) %>%
ggplot(aes(x = n, y = quadgram)) + geom_col()
