In this report I will demonstrate the following steps:

First, I will load the libraries that are most helpful for the text mining process.

library(quanteda)
library(tm)
library(ggplot2)
library(stringi)
library(wordcloud)
library(readtext)
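# Note (assumption about your setup): on quanteda >= 3.0, the textstat_*/textplot_*
# functions used below live in companion packages
library(quanteda.textstats)   # textstat_frequency()
library(quanteda.textplots)   # textplot_wordcloud()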

Now we load the three data files: en_US.blogs, en_US.news, and en_US.twitter.

# Setting the connections

con_blog <- file("en_US.blogs.txt", "r")
con_news <- file("en_US.news.txt", "r")
con_twitter <- file("en_US.twitter.txt", "r")

# Reading the text files with readLines()
blogs   <- readLines(con_blog, skipNul = TRUE, encoding = "UTF-8")
news    <- readLines(con_news, skipNul = TRUE, encoding = "UTF-8")
twitter <- readLines(con_twitter, skipNul = TRUE, encoding = "UTF-8")

# Closing the connections
close(con_blog)
close(con_news)
close(con_twitter)

Now we look at some summary statistics of the data.

# Size of the three files in memory
object.size(blogs)
## 260564320 bytes
object.size(news)
## 20111392 bytes
object.size(twitter)
## 316037600 bytes
# Distribution of the number of characters per line in each file
summary(nchar(blogs))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1      47     156     230     329   40833
summary(nchar(news))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0   111.0   186.0   202.4   270.0  5760.0
summary(nchar(twitter))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   37.00   64.00   68.68  100.00  140.00
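
As an additional compact summary (a sketch; it assumes stri_count_words() from the stringi package loaded above is an acceptable word counter), the line counts, word counts, and in-memory sizes can be gathered into one table:

# Per-file line counts, total word counts, and sizes in MB
file_stats <- data.frame(
  file    = c("blogs", "news", "twitter"),
  lines   = c(length(blogs), length(news), length(twitter)),
  words   = c(sum(stri_count_words(blogs)),
              sum(stri_count_words(news)),
              sum(stri_count_words(twitter))),
  size_MB = round(c(as.numeric(object.size(blogs)),
                    as.numeric(object.size(news)),
                    as.numeric(object.size(twitter))) / 1024^2)
)
file_stats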

Since the files are too large to process in full, we sample 0.5% of the lines with sample() before converting them to corpus objects.

set.seed(763)
blogs   <- sample(blogs, round(length(blogs) * 0.005))
news    <- sample(news, round(length(news) * 0.005))
twitter <- sample(twitter, round(length(twitter) * 0.005))
# Making the corpus and removing the unnecessary elements from the text

blogs<-corpus(blogs)
news<-corpus(news)
twitter<-corpus(twitter)

# Tokenizing and removing numbers, punctuation, symbols, and hyphens
blogs <- tokens(blogs, what = "word", 
                       remove_numbers = TRUE, remove_punct = TRUE,
                       remove_symbols = TRUE, remove_hyphens = TRUE)
news<- tokens(news, what = "word", 
                       remove_numbers = TRUE, remove_punct = TRUE,
                       remove_symbols = TRUE, remove_hyphens = TRUE)
twitter <- tokens(twitter, what = "word", 
                       remove_numbers = TRUE, remove_punct = TRUE,
                       remove_symbols = TRUE, remove_hyphens = TRUE)
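# Note (version assumption): on quanteda >= 2.0 the remove_hyphens argument is
# deprecated; the equivalent option is split_hyphens = TRUE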

Now we convert all words to lower case and remove the stop words.

blogs   <- tokens_tolower(blogs)
blogs   <- tokens_select(blogs, stopwords(), selection = "remove")
news    <- tokens_tolower(news)
news    <- tokens_select(news, stopwords(), selection = "remove")
twitter <- tokens_tolower(twitter)
twitter <- tokens_select(twitter, stopwords(), selection = "remove")
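# Optional check (a sketch using quanteda's ntoken()): total tokens remaining
# in each sample after lower-casing and stop-word removal
sum(ntoken(blogs))
sum(ntoken(news))
sum(ntoken(twitter))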

Next, we perform stemming using the same quanteda package.

blogs<- tokens_wordstem(blogs, language = "english")
news <- tokens_wordstem(news, language = "english")
twitter <- tokens_wordstem(twitter, language = "english")

Now we convert the tokens to a dfm (the "bag of words" model).

blogs   <- dfm(blogs, tolower = FALSE)
news    <- dfm(news, tolower = FALSE)
twitter <- dfm(twitter, tolower = FALSE)
set.seed(100)
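
As a quick sanity check on the document-feature matrices (a sketch using quanteda's topfeatures()), we can list the most frequent stems in each dfm:

# Ten most frequent stems per dfm
topfeatures(blogs, 10)
topfeatures(news, 10)
topfeatures(twitter, 10)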

Now we do some exploratory data analysis:

A word cloud shows the frequency of a word by drawing it larger and more prominently than less frequent words.

# Word cloud of the blog data
textplot_wordcloud(blogs)

# Word cloud of the news data
textplot_wordcloud(news)

# Word cloud of the Twitter data
textplot_wordcloud(twitter)
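
If the clouds are too dense to read, textplot_wordcloud() accepts a max_words argument; a sketch capping each cloud at its 100 most frequent stems:

# Optional: limit each cloud to the 100 most frequent stems
textplot_wordcloud(blogs, max_words = 100)
textplot_wordcloud(news, max_words = 100)
textplot_wordcloud(twitter, max_words = 100)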

Now we plot the frequency of the top 100 words in each file. I have used the quanteda package and ggplot2 for this:

Frequency plot of the blogs data

features_blogs <- textstat_frequency(blogs, n = 100)
# Sort features by decreasing frequency
features_blogs$feature <- with(features_blogs, reorder(feature, -frequency))

ggplot(features_blogs, aes(x = feature, y = frequency)) +
    geom_point() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

Frequency plot of the news data

features_news <- textstat_frequency(news, n = 100)
# Sort features by decreasing frequency
features_news$feature <- with(features_news, reorder(feature, -frequency))

ggplot(features_news, aes(x = feature, y = frequency)) +
    geom_point() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

Frequency plot of twitter data

features_twitter <- textstat_frequency(twitter, n = 100)
# Sort features by decreasing frequency
features_twitter$feature <- with(features_twitter, reorder(feature, -frequency))

ggplot(features_twitter, aes(x = feature, y = frequency)) +
    geom_point() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

In this way we can see the word frequencies of the three files.

Next plans: