First, I will load the libraries that are most helpful for the text mining process.
library(quanteda)
library(tm)
library(ggplot2)
library(stringi)
library(wordcloud)
library(readtext)
Now, we load the three data files: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt.
#setting the connections
con_blog <- file("en_US.blogs.txt", "r")
con_news <- file("en_US.news.txt", "r")
con_twitter <- file("en_US.twitter.txt", "r")
# Reading the text files using readLines
blogs<-readLines(con_blog, skipNul = T, encoding = "UTF-8")
news<-readLines(con_news, skipNul = T, encoding = "UTF-8")
twitter<-readLines(con_twitter,skipNul = T, encoding = "UTF-8")
#Closing the connections
close(con_blog)
close(con_news)
close(con_twitter)
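As an aside, the readtext package loaded above offers an alternative way to read the files; unlike readLines, it returns one document per file rather than one per line (a quick sketch, not used in the rest of this report):
# Alternative: read all three files in one call (each file becomes a single document)
raw_texts <- readtext("en_US.*.txt", encoding = "UTF-8")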
Now, we look at some summary statistics of the data.
# In-memory size of each of the three text objects
object.size(blogs)
## 260564320 bytes
object.size(news)
## 20111392 bytes
object.size(twitter)
## 316037600 bytes
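The raw byte counts above are easier to read when formatted as megabytes (a quick sketch; output not shown):
# Report the in-memory sizes in megabytes instead of raw bytes
format(object.size(blogs), units = "MB")
format(object.size(news), units = "MB")
format(object.size(twitter), units = "MB")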
# Summary of the number of characters per line in each file
summary(nchar(blogs))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 47 156 230 329 40833
summary(nchar(news))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 111.0 186.0 202.4 270.0 5760.0
summary(nchar(twitter))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 37.00 64.00 68.68 100.00 140.00
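For a fuller picture, line and word counts per source can be computed with the stringi package loaded above (a short sketch; output not shown):
# Line and word counts for each source
data.frame(source = c("blogs", "news", "twitter"),
  lines = c(length(blogs), length(news), length(twitter)),
  words = c(sum(stri_count_words(blogs)),
            sum(stri_count_words(news)),
            sum(stri_count_words(twitter))))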
Since the documents are very large, we sample 0.5% of each using the sample function before converting them to corpora.
set.seed(763)
blogs<-sample(blogs,length(blogs)*0.005)
news<-sample(news,length(news)*0.005)
twitter<-sample(twitter,length(twitter)*0.005)
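A quick sanity check of how many lines remain after sampling roughly 0.5% of each source (output omitted):
# Number of lines retained in each sample
c(blogs = length(blogs), news = length(news), twitter = length(twitter))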
# Making corpora from the sampled text
blogs<-corpus(blogs)
news<-corpus(news)
twitter<-corpus(twitter)
# Tokenizing and removing numbers, punctuation, symbols, and hyphens
blogs <- tokens(blogs, what = "word",
remove_numbers = TRUE, remove_punct = TRUE,
remove_symbols = TRUE, remove_hyphens = TRUE)
news<- tokens(news, what = "word",
remove_numbers = TRUE, remove_punct = TRUE,
remove_symbols = TRUE, remove_hyphens = TRUE)
twitter <- tokens(twitter, what = "word",
remove_numbers = TRUE, remove_punct = TRUE,
remove_symbols = TRUE, remove_hyphens = TRUE)
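As a toy illustration (not part of the analysis data), the same options applied to a made-up sentence show what gets stripped out:
# Toy example: numbers, punctuation, and symbols are dropped during tokenization
tokens("Call me at 555-1234: it's 100% #awesome!", what = "word",
  remove_numbers = TRUE, remove_punct = TRUE,
  remove_symbols = TRUE, remove_hyphens = TRUE)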
Now we convert all tokens to lowercase and remove the stop words.
blogs <- tokens_tolower(blogs)
blogs<-tokens_select(blogs,stopwords(),selection = "remove")
news <- tokens_tolower(news)
news<-tokens_select(news,stopwords(),selection = "remove")
twitter<- tokens_tolower(twitter)
twitter<-tokens_select(twitter,stopwords(),selection = "remove")
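The stopwords() call uses quanteda's default English stop-word list; a quick way to peek at it (output not shown):
# First few of the default English stop words being removed
head(stopwords(), 10)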
Next, we perform stemming, again using the quanteda package.
blogs<- tokens_wordstem(blogs, language = "english")
news <- tokens_wordstem(news, language = "english")
twitter <- tokens_wordstem(twitter, language = "english")
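As a toy illustration of what stemming does (made-up tokens, not the analysis data), related word forms collapse to a common stem:
# Toy example: "connected", "connection", and "connects" all reduce to the stem "connect"
toy <- tokens(c("running runs runner", "connected connection connects"))
tokens_wordstem(toy, language = "english")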
Converting the tokens into document-feature matrices (dfm), i.e., the "bag of words" model.
blogs<-dfm(blogs,tolower = F)
news<-dfm(news,tolower = F)
twitter<-dfm(twitter,tolower = F)
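Before plotting, the most frequent stems in each dfm can be inspected with topfeatures (output not shown):
# Ten most frequent stems in each document-feature matrix
topfeatures(blogs, 10)
topfeatures(news, 10)
topfeatures(twitter, 10)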
set.seed(100)
The word clouds below show word frequency: the more frequent a word, the larger it is drawn.
# Word cloud of the blogs data
textplot_wordcloud(blogs)
# Word cloud of the news data
textplot_wordcloud(news)
#Word cloud of the twitter data
textplot_wordcloud(twitter)
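If the installed quanteda version supports the max_words argument of textplot_wordcloud, each cloud can be capped at the most frequent stems to keep it readable (a sketch, not run here):
# Limit each cloud to the 100 most frequent stems (argument availability depends on the quanteda version)
textplot_wordcloud(blogs, max_words = 100)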
Frequency plot of the blogs data
features_dfm_inaug <- textstat_frequency(blogs, n = 100)
# Sort by reverse frequency order
features_dfm_inaug$feature <- with(features_dfm_inaug, reorder(feature, -frequency))
ggplot(features_dfm_inaug, aes(x = feature, y = frequency)) +
geom_point() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Frequency plot of the news data
features_dfm_inaug <- textstat_frequency(news, n = 100)
# Sort by reverse frequency order
features_dfm_inaug$feature <- with(features_dfm_inaug, reorder(feature, -frequency))
ggplot(features_dfm_inaug, aes(x = feature, y = frequency)) +
geom_point() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Frequency plot of the twitter data
features_dfm_inaug <- textstat_frequency(twitter, n = 100)
features_dfm_inaug$feature <- with(features_dfm_inaug, reorder(feature, -frequency))
ggplot(features_dfm_inaug, aes(x = feature, y = frequency)) +
geom_point() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
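Since the same plotting code is repeated for all three sources, it could be wrapped in a small helper function; freq_plot is my own name for it, not part of the original analysis:
# Helper that reproduces the frequency plots above for any dfm
freq_plot <- function(x, n = 100) {
  freq <- textstat_frequency(x, n = n)
  freq$feature <- reorder(freq$feature, -freq$frequency)
  ggplot(freq, aes(x = feature, y = frequency)) +
    geom_point() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
# Usage: freq_plot(blogs); freq_plot(news); freq_plot(twitter)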