First, we set the path to each of the three source files (blogs, news, and Twitter).
# Locate the three en_US source files. list.files() returns paths sorted
# alphabetically, so positions 1-3 are taken to be blogs, news and twitter.
# NOTE(review): this relies on the directory holding those files in that
# order -- confirm, or match on file names instead.
en_us_files <- list.files("final/en_US/", full.names = TRUE)
if (length(en_us_files) < 3) {
  # Fail early with a clear message instead of passing NA paths downstream.
  stop(
    "Expected at least 3 files in final/en_US/, found ",
    length(en_us_files),
    call. = FALSE
  )
}
blogs_path <- en_us_files[1]
news_path <- en_us_files[2]
twitter_path <- en_us_files[3]
# Read a text file as UTF-8 lines through a binary-mode connection.
#
# A text-mode read on Windows treats an embedded Ctrl-Z (0x1A) byte as
# end-of-file and silently drops the rest of the file; opening the
# connection with "rb" avoids that. readLines() accepts LF, CRLF or CR line
# endings regardless of the connection mode.
# NOTE(review): the News document count recorded in the summary output looks
# truncated by exactly this problem -- re-run and confirm.
read_utf8_lines <- function(path) {
  con <- file(path, open = "rb")
  # Always release the connection, even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, warn = FALSE, encoding = "UTF-8")
}

blogs <- read_utf8_lines(blogs_path)
news <- read_utf8_lines(news_path)
twitter <- read_utf8_lines(twitter_path)
# Wrap each character vector in a corpus object (one document per line read).
blogs_corpus <- corpus(blogs)
news_corpus <- corpus(news)
twitter_corpus <- corpus(twitter)
# Overview table with one row per source: in-memory size of the raw text
# (object.size, formatted in MB), number of corpus documents, and word count.
# data.frame() turns the "File Size" label into the column name File.Size.
# NOTE(review): the name `summary` shadows base::summary(); kept as-is because
# code outside this view may refer to it.
summary <- data.frame(
  "File" = c("Blogs", "News", "Twitter"),
  "File Size" = sapply(
    list(blogs, news, twitter),
    function(txt) format(object.size(txt), "MB")
  ),
  "Documents" = sapply(list(blogs_corpus, news_corpus, twitter_corpus), ndoc),
  "Words" = sapply(list(blogs, news, twitter), wordcount)
)
summary
## File File.Size Documents Words
## 1 Blogs 255.4 Mb 899288 37334131
## 2 News 19.8 Mb 77259 2643969
## 3 Twitter 319 Mb 2360148 30373543
# Free memory: drop the raw character vectors and the blogs/news corpora.
# twitter_corpus is kept because the hashtag analysis below still uses it.
rm(list = c("blogs", "news", "twitter", "blogs_corpus", "news_corpus"))
# Hashtag frequencies for the Twitter corpus: tokenise with punctuation
# removed, keep only tokens matching the pattern "#*", build a
# document-feature matrix, and tabulate the most frequent features.
toks_tweets <- tokens_keep(
  tokens(twitter_corpus, remove_punct = TRUE),
  pattern = "#*"
)
dfmat_tweets <- dfm(toks_tweets)
tstat_freq <- textstat_frequency(dfmat_tweets, n = 5)
# NOTE(review): n = 5 above caps the table at five features, so asking head()
# for 20 rows can never show more than five -- confirm which limit is intended.
head(tstat_freq, 20)
## feature frequency rank docfreq group
## 1 #ff 5035 1 4976 all
## 2 #1 1751 2 1726 all
## 3 #39 1365 3 1030 all
## 4 #brewers 1121 4 1103 all
## 5 #np 851 5 848 all
# Dot plot of the 15 most frequent hashtag features, ordered by frequency,
# with the feature names on the vertical axis (via coord_flip).
ggplot(
  textstat_frequency(dfmat_tweets, n = 15),
  aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()

# Word cloud of the same document-feature matrix, limited to 100 words.
dfmat_tweets %>%
  textplot_wordcloud(max_words = 100)