Overview

This report explores the Twitter, blogs, and news text files. Specifically, I summarize the size, line count, and character count of each file, then generate 1-grams and 2-grams for each corpus and visualize the top 100 features with barplots and word clouds.

Load the required packages and the data

Load the required packages, then import the Twitter, blogs, and news files into R and build a quanteda corpus from each.

library(quanteda)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)

# Read each file line by line and build a quanteda corpus from it
Twitter_dataset <- file("en_US.twitter.txt", "r")
Twitter_line <- readLines(Twitter_dataset)
close(Twitter_dataset)
Twitter_corpus <- corpus(Twitter_line)

Blogs_dataset <- file("en_US.blogs.txt", "r")
Blogs_line <- readLines(Blogs_dataset)
close(Blogs_dataset)
Blogs_corpus <- corpus(Blogs_line)

News_dataset <- file("en_US.news.txt", "r")
News_line <- readLines(News_dataset)
close(News_dataset)
News_corpus <- corpus(News_line)
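
The three import blocks follow the same pattern, so they could optionally be wrapped in a small helper. This is only a sketch; read_corpus() is a hypothetical function, not part of the original code.

# Hypothetical helper (sketch only): read a text file and return a quanteda corpus
read_corpus <- function(path) {
  con <- file(path, "r")
  on.exit(close(con))   # close the connection even if readLines() fails
  corpus(readLines(con))
}

# Equivalent to the three blocks above:
# Twitter_corpus <- read_corpus("en_US.twitter.txt")
# Blogs_corpus   <- read_corpus("en_US.blogs.txt")
# News_corpus    <- read_corpus("en_US.news.txt")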

Summarize the three files

Summarize the in-memory size (in MB), the line count, and the character count of each corpus.

data.frame(FileName = c("Twitter", "Blog", "News"),
           FileSize_MB = sapply(list(Twitter_corpus, Blogs_corpus, News_corpus),
                                function(x) {object.size(x) / (1024^2)}),  # in-memory size of each corpus object
           LineCount = sapply(list(Twitter_corpus, Blogs_corpus, News_corpus), length),
           CharCount = sapply(list(Twitter_corpus, Blogs_corpus, News_corpus),
                              function(x) {sum(nchar(x))})  # characters, not words
           )
##   FileName FileSize_MB LineCount CharCount
## 1  Twitter    823.1544   2360148 162096026
## 2     Blog    447.4456    899288 206824319
## 3     News    473.1343   1010242 203223153
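
Note that FileSize_MB above is the in-memory size of each corpus object, and CharCount is a character count (nchar() counts characters, not words). If the size on disk and an approximate word count are of interest, they could be obtained roughly as sketched below, using base R's file.size() and quanteda's ntoken(); tokenizing the full corpora this way can be slow.

# Sketch: disk size (MB) and approximate word count for the Twitter file
file.size("en_US.twitter.txt") / (1024^2)   # size of the raw file on disk, in MB
sum(ntoken(tokens(Twitter_corpus)))         # total tokens, a closer proxy for word count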

Processing the Twitter file

Tokenize the Twitter corpus and remove all numbers, punctuation, and symbols. After converting the tokens to lowercase and removing stop words, I generate the 1-grams and 2-grams, along with the corresponding document-feature matrices. The top 100 features are then selected for visualization.

# Tokenize, lowercase, and remove English stop words
Twitter_corpus_token <- tokens(Twitter_corpus, what = "word",
                               remove_numbers = TRUE, remove_punct = TRUE,
                               remove_symbols = TRUE)
Twitter_corpus_token <- tokens_tolower(Twitter_corpus_token)
Twitter_corpus_token <- tokens_select(Twitter_corpus_token, stopwords(),
                                      selection = "remove")

# Build 1-grams and 2-grams and their document-feature matrices
Twitter_1gram <- tokens_ngrams(Twitter_corpus_token, n = 1)
Twitter_2gram <- tokens_ngrams(Twitter_corpus_token, n = 2)

Twitter_1gram_dfm <- dfm(Twitter_1gram, tolower = FALSE)
Twitter_2gram_dfm <- dfm(Twitter_2gram, tolower = FALSE)

# Keep the 100 most frequent features for plotting
Twitter_1gram_top100 <- topfeatures(Twitter_1gram_dfm, 100)
Twitter_2gram_top100 <- topfeatures(Twitter_2gram_dfm, 100)

Twitter_1gram_top100 <- data.frame(Text = names(Twitter_1gram_top100),
                                   Frequency = unname(Twitter_1gram_top100))
Twitter_2gram_top100 <- data.frame(Text = names(Twitter_2gram_top100),
                                   Frequency = unname(Twitter_2gram_top100))

Processing the Blogs file

Tokenize the Blogs corpus and remove all numbers, punctuation, and symbols. After converting the tokens to lowercase and removing stop words, I generate the 1-grams and 2-grams, along with the corresponding document-feature matrices. The top 100 features are then selected for visualization.

# Tokenize, lowercase, and remove English stop words
Blogs_corpus_token <- tokens(Blogs_corpus, what = "word",
                             remove_numbers = TRUE, remove_punct = TRUE,
                             remove_symbols = TRUE)
Blogs_corpus_token <- tokens_tolower(Blogs_corpus_token)
Blogs_corpus_token <- tokens_select(Blogs_corpus_token, stopwords(),
                                    selection = "remove")

# Build 1-grams and 2-grams and their document-feature matrices
Blogs_1gram <- tokens_ngrams(Blogs_corpus_token, n = 1)
Blogs_2gram <- tokens_ngrams(Blogs_corpus_token, n = 2)

Blogs_1gram_dfm <- dfm(Blogs_1gram, tolower = FALSE)
Blogs_2gram_dfm <- dfm(Blogs_2gram, tolower = FALSE)

# Keep the 100 most frequent features for plotting
Blogs_1gram_top100 <- topfeatures(Blogs_1gram_dfm, 100)
Blogs_2gram_top100 <- topfeatures(Blogs_2gram_dfm, 100)

Blogs_1gram_top100 <- data.frame(Text = names(Blogs_1gram_top100),
                                 Frequency = unname(Blogs_1gram_top100))
Blogs_2gram_top100 <- data.frame(Text = names(Blogs_2gram_top100),
                                 Frequency = unname(Blogs_2gram_top100))

Processing the News file

Tokenize the News corpus and remove all numbers, punctuation, and symbols. After converting the tokens to lowercase and removing stop words, I generate the 1-grams and 2-grams, along with the corresponding document-feature matrices. The top 100 features are then selected for visualization.

# Tokenize, lowercase, and remove English stop words
News_corpus_token <- tokens(News_corpus, what = "word",
                            remove_numbers = TRUE, remove_punct = TRUE,
                            remove_symbols = TRUE)
News_corpus_token <- tokens_tolower(News_corpus_token)
News_corpus_token <- tokens_select(News_corpus_token, stopwords(),
                                   selection = "remove")

# Build 1-grams and 2-grams and their document-feature matrices
News_1gram <- tokens_ngrams(News_corpus_token, n = 1)
News_2gram <- tokens_ngrams(News_corpus_token, n = 2)

News_1gram_dfm <- dfm(News_1gram, tolower = FALSE)
News_2gram_dfm <- dfm(News_2gram, tolower = FALSE)

# Keep the 100 most frequent features for plotting
News_1gram_top100 <- topfeatures(News_1gram_dfm, 100)
News_2gram_top100 <- topfeatures(News_2gram_dfm, 100)

News_1gram_top100 <- data.frame(Text = names(News_1gram_top100),
                                Frequency = unname(News_1gram_top100))
News_2gram_top100 <- data.frame(Text = names(News_2gram_top100),
                                Frequency = unname(News_2gram_top100))
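
The three processing sections run the same pipeline on different corpora, so the steps above could also be collected into one function. The sketch below is a hypothetical refactoring (top_ngram_features() is not part of the original analysis) and should yield the same data frames:

# Hypothetical helper: tokenize a corpus, build n-grams, and return the
# top `k` features as a data frame (mirrors the per-corpus blocks above)
top_ngram_features <- function(corp, n = 1, k = 100) {
  toks <- tokens(corp, what = "word",
                 remove_numbers = TRUE, remove_punct = TRUE,
                 remove_symbols = TRUE)
  toks <- tokens_tolower(toks)
  toks <- tokens_select(toks, stopwords(), selection = "remove")
  top <- topfeatures(dfm(tokens_ngrams(toks, n = n), tolower = FALSE), k)
  data.frame(Text = names(top), Frequency = unname(top))
}

# For example:
# Twitter_1gram_top100 <- top_ngram_features(Twitter_corpus, n = 1)
# Twitter_2gram_top100 <- top_ngram_features(Twitter_corpus, n = 2)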

Visualization of the Twitter file

Barplots and word clouds of the top 100 1-gram and 2-gram features from the Twitter corpus.

# Barplot of the top 100 1-grams
ggplot(Twitter_1gram_top100, aes(x = reorder(Text, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab("Text (Twitter)")

# Word cloud of the top 100 1-grams
wordcloud(words = Twitter_1gram_top100$Text, freq = Twitter_1gram_top100$Frequency,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"),
          rot.per = 0.35, scale = c(2.5, 0.2))

# Barplot of the top 100 2-grams
ggplot(Twitter_2gram_top100, aes(x = reorder(Text, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab("Text (Twitter)")

# Word cloud of the top 100 2-grams
wordcloud(words = Twitter_2gram_top100$Text, freq = Twitter_2gram_top100$Frequency,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"),
          rot.per = 0.35, scale = c(1, 0.01))

Visualization of the Blogs file

Barplots and word clouds of the top 100 1-gram and 2-gram features from the Blogs corpus.

# Barplot of the top 100 1-grams
ggplot(Blogs_1gram_top100, aes(x = reorder(Text, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab("Text (Blogs)")

# Word cloud of the top 100 1-grams
wordcloud(words = Blogs_1gram_top100$Text, freq = Blogs_1gram_top100$Frequency,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"),
          rot.per = 0.35, scale = c(2.5, 0.2))

# Barplot of the top 100 2-grams
ggplot(Blogs_2gram_top100, aes(x = reorder(Text, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab("Text (Blogs)")

# Word cloud of the top 100 2-grams
wordcloud(words = Blogs_2gram_top100$Text, freq = Blogs_2gram_top100$Frequency,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"),
          rot.per = 0.35, scale = c(1, 0.01))

Visualization of the News file

Barplots and word clouds of the top 100 1-gram and 2-gram features from the News corpus.

# Barplot of the top 100 1-grams
ggplot(News_1gram_top100, aes(x = reorder(Text, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab("Text (News)")

# Word cloud of the top 100 1-grams
wordcloud(words = News_1gram_top100$Text, freq = News_1gram_top100$Frequency,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"),
          rot.per = 0.35, scale = c(2.5, 0.2))

# Barplot of the top 100 2-grams
ggplot(News_2gram_top100, aes(x = reorder(Text, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab("Text (News)")

# Word cloud of the top 100 2-grams
wordcloud(words = News_2gram_top100$Text, freq = News_2gram_top100$Frequency,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"),
          rot.per = 0.35, scale = c(1, 0.01))
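
One final note on the word clouds: wordcloud() places words at random positions (random.order = FALSE only fixes the plotting order), so the layouts above can change between runs. If reproducible figures are needed, a seed can be set before each call, for example:

set.seed(1234)   # fix the random layout so the cloud is reproducible
wordcloud(words = News_2gram_top100$Text, freq = News_2gram_top100$Frequency,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"),
          rot.per = 0.35, scale = c(1, 0.01))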