This report explores the English Twitter, blogs, and news text files. Specifically, I summarize the size, line count, and character count of each file, then generate 1-grams and 2-grams for each corpus and visualize the top 100 features using both barplots and word clouds.
Import the Twitter, blogs, and news files into R and build a corpus from each.
# Load the required packages
library(quanteda)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)

# Read each file line by line, then build a quanteda corpus
# (one document per line)
Twitter_dataset <- file("en_US.twitter.txt", "r")
Twitter_line <- readLines(Twitter_dataset)
close(Twitter_dataset)
Twitter_corpus <- corpus(Twitter_line)

Blogs_dataset <- file("en_US.blogs.txt", "r")
Blogs_line <- readLines(Blogs_dataset)
close(Blogs_dataset)
Blogs_corpus <- corpus(Blogs_line)

News_dataset <- file("en_US.news.txt", "r")
News_line <- readLines(News_dataset)
close(News_dataset)
News_corpus <- corpus(News_line)
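One caveat with reading these files in text mode: en_US.news.txt reportedly contains embedded NUL bytes that can silently truncate readLines() on some platforms. A more defensive read, sketched below with a helper name of my own (read_lines_safe), opens the connection in binary mode and skips NULs:
# Hypothetical helper: binary mode plus skipNul = TRUE avoids truncated
# reads when a file contains embedded NUL bytes
read_lines_safe <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con))
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}
# e.g. News_line <- read_lines_safe("en_US.news.txt")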
Summarize the size, line count, and character count of each corpus. Note that object.size() measures the corpus object in memory rather than the file on disk, and sum(nchar()) counts characters rather than words, so the columns are labeled accordingly.
data.frame(FileName = c("Twitter", "Blog", "News"),
           ObjectSize_MB = sapply(list(Twitter_corpus, Blogs_corpus, News_corpus),
                                  function(x) {object.size(x) / (1024^2)}),
           LineCount = sapply(list(Twitter_corpus, Blogs_corpus, News_corpus), ndoc),
           CharCount = sapply(list(Twitter_corpus, Blogs_corpus, News_corpus),
                              function(x) {sum(nchar(x))})
)
## FileName ObjectSize_MB LineCount CharCount
## 1 Twitter 823.1544 2360148 162096026
## 2 Blog 447.4456 899288 206824319
## 3 News 473.1343 1010242 203223153
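If actual word counts and on-disk file sizes are of interest instead, base R's file.size() and quanteda's ntoken() give them directly. This is a sketch, not run here, and it assumes the three files sit in the working directory as above:
# Sketch: on-disk sizes in MB and token (word) counts per corpus
data.frame(FileName = c("Twitter", "Blog", "News"),
           FileSize_MB = file.size(c("en_US.twitter.txt", "en_US.blogs.txt",
                                     "en_US.news.txt")) / (1024^2),
           WordCount = sapply(list(Twitter_corpus, Blogs_corpus, News_corpus),
                              function(x) {sum(ntoken(tokens(x)))}))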
Tokenize the Twitter corpus and remove all numbers, punctuation, and symbols. After converting the tokens to lowercase and removing stop words, I generate the 1-gram and 2-gram tokens, as well as the corresponding document-feature matrices. The top 100 features are then selected for visualization.
# Tokenize, lowercase, and drop stop words
Twitter_corpus_token <- tokens(Twitter_corpus, what = "word",
                               remove_numbers = TRUE, remove_punct = TRUE,
                               remove_symbols = TRUE)
Twitter_corpus_token <- tokens_tolower(Twitter_corpus_token)
Twitter_corpus_token <- tokens_select(Twitter_corpus_token, stopwords(),
                                      selection = "remove")

# Build the 1-grams and 2-grams and their document-feature matrices
Twitter_1gram <- tokens_ngrams(Twitter_corpus_token, n = 1)
Twitter_2gram <- tokens_ngrams(Twitter_corpus_token, n = 2)
Twitter_1gram_dfm <- dfm(Twitter_1gram, tolower = FALSE)
Twitter_2gram_dfm <- dfm(Twitter_2gram, tolower = FALSE)

# Keep the 100 most frequent features for plotting
Twitter_1gram_top100 <- topfeatures(Twitter_1gram_dfm, 100)
Twitter_2gram_top100 <- topfeatures(Twitter_2gram_dfm, 100)
Twitter_1gram_top100 <- data.frame(Text = names(Twitter_1gram_top100),
                                   Frequency = unname(Twitter_1gram_top100))
Twitter_2gram_top100 <- data.frame(Text = names(Twitter_2gram_top100),
                                   Frequency = unname(Twitter_2gram_top100))
Tokenize the blogs corpus in the same way: remove numbers, punctuation, and symbols, convert the tokens to lowercase, remove stop words, generate the 1-gram and 2-gram tokens and their document-feature matrices, and select the top 100 features for visualization.
Blogs_corpus_token <- tokens(Blogs_corpus, what = "word",
                             remove_numbers = TRUE, remove_punct = TRUE,
                             remove_symbols = TRUE)
Blogs_corpus_token <- tokens_tolower(Blogs_corpus_token)
Blogs_corpus_token <- tokens_select(Blogs_corpus_token, stopwords(),
                                    selection = "remove")
Blogs_1gram <- tokens_ngrams(Blogs_corpus_token, n = 1)
Blogs_2gram <- tokens_ngrams(Blogs_corpus_token, n = 2)
Blogs_1gram_dfm <- dfm(Blogs_1gram, tolower = FALSE)
Blogs_2gram_dfm <- dfm(Blogs_2gram, tolower = FALSE)
Blogs_1gram_top100 <- topfeatures(Blogs_1gram_dfm, 100)
Blogs_2gram_top100 <- topfeatures(Blogs_2gram_dfm, 100)
Blogs_1gram_top100 <- data.frame(Text = names(Blogs_1gram_top100),
                                 Frequency = unname(Blogs_1gram_top100))
Blogs_2gram_top100 <- data.frame(Text = names(Blogs_2gram_top100),
                                 Frequency = unname(Blogs_2gram_top100))
Tokenize the news corpus in the same way: remove numbers, punctuation, and symbols, convert the tokens to lowercase, remove stop words, generate the 1-gram and 2-gram tokens and their document-feature matrices, and select the top 100 features for visualization.
News_corpus_token <- tokens(News_corpus, what = "word",
                            remove_numbers = TRUE, remove_punct = TRUE,
                            remove_symbols = TRUE)
News_corpus_token <- tokens_tolower(News_corpus_token)
News_corpus_token <- tokens_select(News_corpus_token, stopwords(),
                                   selection = "remove")
News_1gram <- tokens_ngrams(News_corpus_token, n = 1)
News_2gram <- tokens_ngrams(News_corpus_token, n = 2)
News_1gram_dfm <- dfm(News_1gram, tolower = FALSE)
News_2gram_dfm <- dfm(News_2gram, tolower = FALSE)
News_1gram_top100 <- topfeatures(News_1gram_dfm, 100)
News_2gram_top100 <- topfeatures(News_2gram_dfm, 100)
News_1gram_top100 <- data.frame(Text = names(News_1gram_top100),
                                Frequency = unname(News_1gram_top100))
News_2gram_top100 <- data.frame(Text = names(News_2gram_top100),
                                Frequency = unname(News_2gram_top100))
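The three tokenization blocks above run the identical pipeline on different corpora. A small helper, sketched here under a name of my own (build_top100), would collapse them into one function and six calls:
# Hypothetical helper reproducing the per-corpus pipeline above
build_top100 <- function(corp, n) {
  toks <- tokens(corp, what = "word",
                 remove_numbers = TRUE, remove_punct = TRUE,
                 remove_symbols = TRUE)
  toks <- tokens_tolower(toks)
  toks <- tokens_select(toks, stopwords(), selection = "remove")
  top <- topfeatures(dfm(tokens_ngrams(toks, n = n), tolower = FALSE), 100)
  data.frame(Text = names(top), Frequency = unname(top))
}
# e.g. News_2gram_top100 <- build_top100(News_corpus, 2)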
Barplots and word clouds of the top 100 1-grams and 2-grams from the Twitter corpus.
ggplot(Twitter_1gram_top100, aes(x = reorder(Text, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab("Text (Twitter)")
wordcloud(words = Twitter_1gram_top100$Text, freq = Twitter_1gram_top100$Frequency,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"),
          rot.per = 0.35, scale = c(2.5, 0.2))
ggplot(Twitter_2gram_top100, aes(x = reorder(Text, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab("Text (Twitter)")
wordcloud(words = Twitter_2gram_top100$Text, freq = Twitter_2gram_top100$Frequency,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"),
          rot.per = 0.35, scale = c(1, 0.01))
Barplots and word clouds of the top 100 1-grams and 2-grams from the blogs corpus.
ggplot(Blogs_1gram_top100, aes(x = reorder(Text, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab("Text (Blogs)")
wordcloud(words = Blogs_1gram_top100$Text, freq = Blogs_1gram_top100$Frequency,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"),
          rot.per = 0.35, scale = c(2.5, 0.2))
ggplot(Blogs_2gram_top100, aes(x = reorder(Text, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab("Text (Blogs)")
wordcloud(words = Blogs_2gram_top100$Text, freq = Blogs_2gram_top100$Frequency,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"),
          rot.per = 0.35, scale = c(1, 0.01))
Barplots and word clouds of the top 100 1-grams and 2-grams from the news corpus.
ggplot(News_1gram_top100, aes(x = reorder(Text, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab("Text (News)")
wordcloud(words = News_1gram_top100$Text, freq = News_1gram_top100$Frequency,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"),
          rot.per = 0.35, scale = c(2.5, 0.2))
ggplot(News_2gram_top100, aes(x = reorder(Text, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab("Text (News)")
wordcloud(words = News_2gram_top100$Text, freq = News_2gram_top100$Frequency,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"),
          rot.per = 0.35, scale = c(1, 0.01))
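The plotting code is likewise repeated for every corpus and n-gram size. A helper along these lines (plot_top100 is a hypothetical name, not part of the original analysis) would shrink the six blocks above to six calls:
# Hypothetical helper mirroring the barplot + word cloud pairs above
plot_top100 <- function(df, label, scale = c(2.5, 0.2)) {
  print(ggplot(df, aes(x = reorder(Text, -Frequency), y = Frequency)) +
          geom_bar(stat = "identity") +
          theme_bw() +
          theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
          xlab(label))
  wordcloud(words = df$Text, freq = df$Frequency, random.order = FALSE,
            colors = brewer.pal(8, "Dark2"), rot.per = 0.35, scale = scale)
}
# e.g. plot_top100(Twitter_2gram_top100, "Text (Twitter)", scale = c(1, 0.01))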