Path

We set the path to each of the three input files (blogs, news, and Twitter).

# List the en_US directory once and pick the three input files by position.
# list.files() returns paths in alphabetical order, so the positions are
# assumed to be blogs, news, twitter -- TODO confirm no other files live here.
en_us_files <- list.files("final/en_US/", full.names = TRUE)
# Fail early with a clear message instead of handing NA paths to readLines().
if (length(en_us_files) < 3) {
  stop(
    "Expected at least 3 files in final/en_US/, found ",
    length(en_us_files),
    call. = FALSE
  )
}
blogs_path <- en_us_files[1]
news_path <- en_us_files[2]
twitter_path <- en_us_files[3]

Reading files

# Read each file into a character vector, one element per line.
# warn = FALSE silences readLines() warnings (missing final EOL, embedded
# nuls); encoding = "UTF-8" marks the resulting strings as UTF-8.
blogs <- readLines(blogs_path, warn = FALSE, encoding = "UTF-8")
news <- readLines(news_path, warn = FALSE, encoding = "UTF-8")
twitter <- readLines(twitter_path, warn = FALSE, encoding = "UTF-8")

# Turn each character vector into a corpus object; each element of the
# vector becomes one document.
blogs_corpus <- corpus(blogs)
news_corpus <- corpus(news)
twitter_corpus <- corpus(twitter)

Summary

# Overview table with one row per source: in-memory size, number of
# documents, and total word count. vapply() is used instead of sapply() so
# every column has a guaranteed type and one value per source.
# NOTE(review): the variable name `summary` masks base::summary() as a
# value; it is kept unchanged because code outside this block may use it.
summary <- data.frame(
  "File" = c("Blogs", "News", "Twitter"),
  # object.size() measures the character vector in memory, not the file on
  # disk.
  "File Size" = vapply(
    list(blogs, news, twitter),
    function(x) format(object.size(x), units = "MB"),
    character(1)
  ),
  "Documents" = vapply(
    list(blogs_corpus, news_corpus, twitter_corpus),
    ndoc,
    integer(1)
  ),
  # numeric(1) accepts both integer and double counts from wordcount().
  "Words" = vapply(list(blogs, news, twitter), wordcount, numeric(1))
)

summary
##      File File.Size Documents    Words
## 1   Blogs  255.4 Mb    899288 37334131
## 2    News   19.8 Mb     77259  2643969
## 3 Twitter    319 Mb   2360148 30373543

Tokens: Twitter example

# Drop the raw text vectors and the two corpora that the code below does not
# use; twitter_corpus is the only one kept.
rm(list = c("blogs", "news", "twitter", "blogs_corpus", "news_corpus"))

# Tokenise the tweets without punctuation, then keep only the tokens that
# match the pattern "#*" (tokens beginning with "#").
# NOTE(review): the frequency output contains "#39", presumably a leftover of
# the HTML entity "&#39;" rather than a real hashtag -- confirm.
toks_tweets <- tokens_keep(
  tokens(twitter_corpus, remove_punct = TRUE),
  pattern = "#*"
)
# Document-feature matrix built from the retained tokens.
dfmat_tweets <- dfm(toks_tweets)

Plotting the frequency

# Frequency table of the 5 most frequent features in dfmat_tweets.
tstat_freq <- textstat_frequency(dfmat_tweets, n = 5)
# n = 5 already caps the table at five rows, so it is printed as is; taking
# head(..., 20) of it could never show more.
tstat_freq
##    feature frequency rank docfreq group
## 1      #ff      5035    1    4976   all
## 2       #1      1751    2    1726   all
## 3      #39      1365    3    1030   all
## 4 #brewers      1121    4    1103   all
## 5      #np       851    5     848   all
# Dot plot of the 15 most frequent features. Features are ordered by
# frequency and the axes are flipped, so the most frequent one is at the top.
ggplot(
  textstat_frequency(dfmat_tweets, n = 15),
  aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()

Word cloud

# Word cloud of the features in dfmat_tweets, limited to 100 words.
dfmat_tweets %>%
  textplot_wordcloud(max_words = 100)