Exploratory Data Analysis

Introduction

This project consists of creating an application to aid in writing text by suggesting the next word in a phrase or sentence.

In this project, a US English data set of news, blog, and Twitter text will be used.

To understand the structure of the three files, the data will be summarized by number of lines, number of words, and number of characters.

In this exploratory data analysis, the variables of interest will be compared across the three files.

The variables of interest:

  • size of the three files

  • number of lines

  • number of total words

  • number of words per line

  • maximum words per line

  • number of characters

  • number of characters per word

Dataset files used:

  • “en_US.blogs.txt”

  • “en_US.news.txt”

  • “en_US.twitter.txt”

Libraries used for the exploratory analysis

suppressPackageStartupMessages({
  library(tidytext)
  library(ggplot2)
  library(tidyverse)
  library(stringr)
  library(R.utils)
  library(ngram)
  library(dplyr)
  library(stringi)
  library(RWeka)
  library(tm)
  library(plotly)
  library(tmap)
  library(knitr)
  library(wordcloud)
  library(ggraph)
  library(igraph)
  library(readr)
  
})

Data Exploration

Downloading data

The compressed archive is available at “https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip”.
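
If the text files are not already in the working directory, the archive can be downloaded and the US English files extracted. The snippet below is a minimal sketch, assuming the usual final/en_US/ layout inside the zip; the local file name Coursera-SwiftKey.zip is an arbitrary choice.

# Download and extract the US English files only if they are not already present
zip_url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"   # assumed local file name
if (!file.exists("en_US.blogs.txt")) {
  download.file(zip_url, destfile = zip_file, mode = "wb")
  unzip(zip_file,
        files = c("final/en_US/en_US.blogs.txt",
                  "final/en_US/en_US.news.txt",
                  "final/en_US/en_US.twitter.txt"),
        junkpaths = TRUE)
}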

# Load the Data
blogs_file <- "en_US.blogs.txt"
news_file <- "en_US.news.txt"
twitter_file <- "en_US.twitter.txt"

Size of the files

size<-file.info("en_US.blogs.txt")
size<-file.info("en_US.news.txt")
size<-file.info("en_US.twitter.txt")

kb<-size$size/1024
kb<-size$size/1024
kb<-size$size/1024

sizeBlogs<-kb/1024
sizeNews<-kb/1024
sizeTwitter<-kb/1024

Reading the lines of each dataset

#blogs
blogs <- readLines(blogs_file, encoding = "UTF-8", skipNul = TRUE)
#twitter
twitter <- readLines(twitter_file, encoding = "UTF-8", skipNul = TRUE)
#news
con <- file(news_file, open="rb")
news <- readLines(con, encoding = "UTF-8",  skipNul = TRUE)
close(con)
rm(con)

Counting the number of words in each dataset

wordsBlogs<-wordcount(blogs, sep = " ", count.function = sum)
wordsNews<-wordcount(news, sep = " ", count.function = sum)
wordsTwitter<-wordcount(twitter, sep = " ", count.function = sum)

Number of lines of each dataset

nlinesBlogs <- countLines("en_US.blogs.txt")
nlinesTwitter <- countLines("en_US.twitter.txt")
nlinesNews <- countLines("en_US.news.txt")

Number of characters

statsBlogs <- stri_stats_general(blogs)
statsTwitter <- stri_stats_general(twitter)
statsNews <- stri_stats_general(news)

# The third element returned by stri_stats_general() is the total character count
charactersBlogs <- statsBlogs[3]
charactersTwitter <- statsTwitter[3]
charactersNews <- statsNews[3]
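
The individual quantities computed above can be gathered into a single comparison table. This is a minimal sketch (column names are illustrative) using kable() from knitr:

# Side-by-side summary of the three datasets
summaryTable <- data.frame(
  Dataset    = c("Blogs", "News", "Twitter"),
  SizeMB     = round(c(sizeBlogs, sizeNews, sizeTwitter), 1),
  Lines      = c(nlinesBlogs, nlinesNews, nlinesTwitter),
  Words      = c(wordsBlogs, wordsNews, wordsTwitter),
  Characters = c(charactersBlogs, charactersNews, charactersTwitter)
)
kable(summaryTable)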

Histograms of number of words

WordsBlogs <- stri_count_words(blogs)
WordsTwitter <- stri_count_words(twitter)
WordsNews <- stri_count_words(news)

hist(WordsBlogs, main="Histogram of words in blogs", xlab="No. of words per blog post", col="lightblue", breaks=100, xlim=c(0, 500))

hist(WordsNews, main="Histogram of words in news", xlab="No. of words per news post", col="lightblue", breaks=100, xlim=c(0, 500))

hist(WordsTwitter, main="Histogram of words in twitter", xlab="No. of words per twitter post", col="lightblue", breaks=20, xlim=c(0, 50))

Words/Lines Ratio

plot <- tibble(counts = c(wordsBlogs, wordsTwitter, wordsNews, nlinesBlogs, nlinesTwitter, nlinesNews),
    class = as.factor(c(rep("words",3), rep("lines", 3))), 
    medium = as.factor(c(rep(c("blogs", "twitter", "news"),2 ))))

plot$names <- paste(plot$class, plot$medium)
WordsLinesRatio <- tibble(
    WordsLinesRatio = c(wordsBlogs/nlinesBlogs, wordsNews/nlinesNews, wordsTwitter/nlinesTwitter),
    MediaType = as.factor(c("Blogs","News","Twitter")))
ggplot(data = WordsLinesRatio, aes(x=MediaType, y= WordsLinesRatio, fill = MediaType)) + geom_bar(stat="identity") + ggtitle("Words/Lines Ratio") + ylab("Words/Lines")

samplesize = 0.2

blogs_sample <- sample(blogs, samplesize*length(blogs))
news_sample <- sample(news, samplesize*length(news))
twitter_sample <- sample(twitter, samplesize*length(twitter))
blog_file <- tibble(line = 1:length(blogs_sample), text=blogs_sample)
news_file <- tibble(line = 1:length(news_sample), text=news_sample)
twitter_file <- tibble(line = 1:length(twitter_sample), text=twitter_sample)

Tokenize the text into consecutive sequences of words, called n-grams

# Blogs
blog_token <- blog_file %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
blog_token
## # A tibble: 7,345,640 x 2
##     line bigram     
##    <int> <chr>      
##  1     1 thank you  
##  2     1 you two    
##  3     1 two ton    
##  4     1 ton strap  
##  5     2 the other  
##  6     2 other day  
##  7     2 day because
##  8     2 because of 
##  9     2 of the     
## 10     2 the stupid 
## # … with 7,345,630 more rows
# News 
news_token <- news_file %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
news_token
## # A tibble: 6,743,327 x 2
##     line bigram              
##    <int> <chr>               
##  1     1 rock ohio           
##  2     1 ohio spokeswoman    
##  3     1 spokeswoman jennifer
##  4     1 jennifer kulczycki  
##  5     1 kulczycki declined  
##  6     1 declined to         
##  7     1 to comment          
##  8     1 comment the         
##  9     1 the company         
## 10     1 company has         
## # … with 6,743,317 more rows
# Twitter 
twitter_token <- twitter_file %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
twitter_token
## # A tibble: 5,545,369 x 2
##     line bigram       
##    <int> <chr>        
##  1     1 dm me        
##  2     1 me your      
##  3     1 your email   
##  4     1 email address
##  5     1 address and  
##  6     1 and i'll     
##  7     1 i'll send    
##  8     1 send you     
##  9     1 you details  
## 10     2 so can       
## # … with 5,545,359 more rows
sample_file <- rbind(blog_file, news_file, twitter_file)
sampletoken <- sample_file %>% unnest_tokens(word, text)
commontokens <- sampletoken %>% count(word, sort = TRUE)
commontokens_nostopwords <- commontokens %>% anti_join(stop_words)
## Joining, by = "word"
totalwords <- commontokens_nostopwords %>%
    summarize(total=sum(n))
totalwords <- as.integer(totalwords)
totalwords <- rep(totalwords, dim(commontokens_nostopwords)[1])
commontokens_nostopwords <- cbind(commontokens_nostopwords, totalwords)
commontokens_nostopwords <- commontokens_nostopwords %>% mutate(tf=n/totalwords)
head(commontokens_nostopwords, 10)
##      word     n totalwords          tf
## 1    time 44638    8681955 0.005141469
## 2     day 35393    8681955 0.004076616
## 3    love 32426    8681955 0.003734873
## 4  people 31583    8681955 0.003637775
## 5       2 21270    8681955 0.002449909
## 6       3 20916    8681955 0.002409135
## 7       1 18827    8681955 0.002168521
## 8    life 18359    8681955 0.002114616
## 9      rt 17798    8681955 0.002049999
## 10   home 16589    8681955 0.001910745

Visualization of the network

Generating bigrams, separating each bigram into its two words, and filtering out stop words

bigrams <- sample_file %>%
    unnest_tokens(bigram, text, token="ngrams", n = 2)


bigrams <- bigrams %>% count(bigram, sort=TRUE)


bigrams_separated <- bigrams %>% 
    separate(bigram, c("word1", "word2"), sep=" ")


bigrams_filtered <- bigrams_separated %>%
    filter(!word1 %in% stop_words$word) %>%
    filter(!word2 %in% stop_words$word)


totalbigrams <- bigrams_filtered %>%
    summarize(total=sum(n))
totalbigrams <- as.integer(totalbigrams)
totalbigrams <- rep(totalbigrams, dim(bigrams_filtered)[1])


bigrams <- cbind(bigrams_filtered, totalbigrams)
bigrams <- bigrams %>% mutate(tf=n/totalbigrams)

Generate the network graph using ggraph

bigram_graph <- bigrams %>%
    filter (n > 300) %>%
    graph_from_data_frame()

set.seed(3226)
#png(filename="ggraph_bigram.png")
arrow <- grid::arrow(type="closed", length=unit(.10, "inches"))
ggraph(bigram_graph, layout="fr") +
    geom_edge_link(aes(edge_alpha=n), show.legend = FALSE, arrow=arrow, color="blue") +
    geom_node_point(color="green", size=1.5) +
    geom_node_text(aes(label=name), vjust=1, hjust=1) +
    theme_void()

#dev.off()
set.seed(3693)
SampleBlog <- sample(blogs, length(blogs) * 0.001)
SampleNews <- sample(news, length(news) * 0.001)
SampleTwitter <- sample(twitter, length(twitter) * 0.001)
data_sample <- c(SampleBlog, SampleNews, SampleTwitter)

Create corpus

corpus <- VCorpus(VectorSource(data_sample))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

Clean the created data sample

corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)

Create corpus and clean the created blogs data sample

corpusBlogs <- VCorpus(VectorSource(SampleBlog))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpusBlogs <- tm_map(corpusBlogs, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpusBlogs <- tm_map(corpusBlogs, toSpace, "@[^\\s]+")
corpusBlogs <- tm_map(corpusBlogs, tolower)
corpusBlogs <- tm_map(corpusBlogs, removeWords, stopwords("en"))
corpusBlogs <- tm_map(corpusBlogs, removePunctuation)
corpusBlogs <- tm_map(corpusBlogs, removeNumbers)
corpusBlogs <- tm_map(corpusBlogs, stripWhitespace)
corpusBlogs <- tm_map(corpusBlogs, PlainTextDocument)

Create corpus and clean the created news data sample

corpusNews <- VCorpus(VectorSource(SampleNews))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpusNews <- tm_map(corpusNews, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpusNews <- tm_map(corpusNews, toSpace, "@[^\\s]+")
corpusNews <- tm_map(corpusNews, tolower)
corpusNews <- tm_map(corpusNews, removeWords, stopwords("en"))
corpusNews <- tm_map(corpusNews, removePunctuation)
corpusNews <- tm_map(corpusNews, removeNumbers)
corpusNews <- tm_map(corpusNews, stripWhitespace)
corpusNews <- tm_map(corpusNews, PlainTextDocument)

Create corpus and clean the created twitter data sample

corpusTwitter <- VCorpus(VectorSource(SampleTwitter))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpusTwitter <- tm_map(corpusTwitter, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpusTwitter <- tm_map(corpusTwitter, toSpace, "@[^\\s]+")
corpusTwitter <- tm_map(corpusTwitter, tolower)
corpusTwitter <- tm_map(corpusTwitter, removeWords, stopwords("en"))
corpusTwitter <- tm_map(corpusTwitter, removePunctuation)
corpusTwitter <- tm_map(corpusTwitter, removeNumbers)
corpusTwitter <- tm_map(corpusTwitter, stripWhitespace)
corpusTwitter <- tm_map(corpusTwitter, PlainTextDocument)

Calculating frequency - unigram

Blogs

options(mc.cores=1)

# Unigram
tdmUniBlog <- removeSparseTerms(TermDocumentMatrix(corpusBlogs), 0.9999)
FreqUniBlog <- rowSums(as.matrix(tdmUniBlog))
FreqUniBlog <- sort(FreqUniBlog, decreasing=TRUE)
FreqUniBlog <- data.frame(word=names(FreqUniBlog), freq=FreqUniBlog)

News

tdmUniNews <- removeSparseTerms(TermDocumentMatrix(corpusNews), 0.9999)
FreqUniNews <- rowSums(as.matrix(tdmUniNews))
FreqUniNews <- sort(FreqUniNews, decreasing=TRUE)
FreqUniNews <- data.frame(word=names(FreqUniNews), freq=FreqUniNews)

Twitter

tdmUniTwitter <- removeSparseTerms(TermDocumentMatrix(corpusTwitter), 0.9999)
FreqUniTwitter <- rowSums(as.matrix(tdmUniTwitter))
FreqUniTwitter <- sort(FreqUniTwitter, decreasing=TRUE)
FreqUniTwitter <- data.frame(word=names(FreqUniTwitter), freq=FreqUniTwitter)

Calculating frequency - bigram

Blogs

bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdmBiBlog <- removeSparseTerms(TermDocumentMatrix(corpusBlogs, control = list(tokenize = bigram)), 0.9999)
FreqBiBlog <- rowSums(as.matrix(tdmBiBlog))
FreqBiBlog <- sort(FreqBiBlog, decreasing=TRUE)
FreqBiBlog <- data.frame(word=names(FreqBiBlog), freq=FreqBiBlog)

News

bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdmBiNews <- removeSparseTerms(TermDocumentMatrix(corpusNews, control = list(tokenize = bigram)), 0.9999)
FreqBiNews <- rowSums(as.matrix(tdmBiNews))
FreqBiNews <- sort(FreqBiNews, decreasing=TRUE)
FreqBiNews <- data.frame(word=names(FreqBiNews), freq=FreqBiNews)

Twitter

bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdmBiTwitter <- removeSparseTerms(TermDocumentMatrix(corpusTwitter, control = list(tokenize = bigram)), 0.9999)
FreqBiTwitter <- rowSums(as.matrix(tdmBiTwitter))
FreqBiTwitter <- sort(FreqBiTwitter, decreasing=TRUE)
FreqBiTwitter <- data.frame(word=names(FreqBiTwitter), freq=FreqBiTwitter)

Calculating frequency - trigram

Blogs

trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdmTriBlog <- removeSparseTerms(TermDocumentMatrix(corpusBlogs, control = list(tokenize = trigram)), 0.9999)
FreqTriBlog <- rowSums(as.matrix(tdmTriBlog))
FreqTriBlog <- sort(FreqTriBlog, decreasing=TRUE)
FreqTriBlog <- data.frame(word=names(FreqTriBlog), freq=FreqTriBlog)

News

trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdmTriNews <- removeSparseTerms(TermDocumentMatrix(corpusNews, control = list(tokenize = trigram)), 0.9999)
FreqTriNews <- rowSums(as.matrix(tdmTriNews))
FreqTriNews <- sort(FreqTriNews, decreasing=TRUE)
FreqTriNews <- data.frame(word=names(FreqTriNews), freq=FreqTriNews)

Twitter

trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdmTriTwitter <- removeSparseTerms(TermDocumentMatrix(corpusTwitter, control = list(tokenize = trigram)), 0.9999)
FreqTriTwitter <- rowSums(as.matrix(tdmTriTwitter))
FreqTriTwitter <- sort(FreqTriTwitter, decreasing=TRUE)
FreqTriTwitter <- data.frame(word=names(FreqTriTwitter), freq=FreqTriTwitter)

Calculating frequency - quadgram

Blogs

quadgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
tdmQuadBlog <- removeSparseTerms(TermDocumentMatrix(corpusBlogs, control = list(tokenize = quadgram)), 0.9999)
FreqQuadBlog <- rowSums(as.matrix(tdmQuadBlog))
FreqQuadBlog <- sort(FreqQuadBlog, decreasing=TRUE)
FreqQuadBlog <- data.frame(word=names(FreqQuadBlog), freq=FreqQuadBlog)

News

quadgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
tdmQuadNews <- removeSparseTerms(TermDocumentMatrix(corpusNews, control = list(tokenize = quadgram)), 0.9999)
FreqQuadNews <- rowSums(as.matrix(tdmQuadNews))
FreqQuadNews <- sort(FreqQuadNews, decreasing=TRUE)
FreqQuadNews <- data.frame(word=names(FreqQuadNews), freq=FreqQuadNews)

Twitter

quadgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
tdmQuadTwitter <- removeSparseTerms(TermDocumentMatrix(corpusTwitter, control = list(tokenize = quadgram)), 0.9999)
FreqQuadTwitter <- rowSums(as.matrix(tdmQuadTwitter))
FreqQuadTwitter <- sort(FreqQuadTwitter, decreasing=TRUE)
FreqQuadTwitter <- data.frame(word=names(FreqQuadTwitter), freq=FreqQuadTwitter)

Bar plots of frequency - unigram

ggplot(FreqUniBlog[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("Blogs") +
         theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("green"))

ggplot(FreqUniNews[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("News") +
         theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("blue"))

ggplot(FreqUniTwitter[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("Twitter") +
         theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("orange"))

Bar plots of frequency - bigram

ggplot(FreqBiBlog[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("Blogs") +
         theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("green"))

ggsave("Frequency_bigram_blog.png")
## Saving 7 x 5 in image
ggplot(FreqBiNews[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("News") +
         theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("blue"))

ggsave("Frequency_bigram_news.png")
## Saving 7 x 5 in image
ggplot(FreqBiTwitter[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("Twitter") +
         theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("orange"))

ggsave("Frequency_bigram_twitter.png")
## Saving 7 x 5 in image

Bar plots of frequency - trigram

ggplot(FreqTriBlog[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("Blogs") +
         theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("green"))

ggplot(FreqTriNews[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("News") +
         theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("blue"))

ggplot(FreqTriTwitter[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("Twitter") +
         theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("orange"))

Bar plots of frequency - quadgram

ggplot(FreqQuadBlog[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("Blogs") +
         theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("green"))

ggplot(FreqQuadNews[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("News") +
         theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("blue"))

ggplot(FreqQuadTwitter[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("Twitter") +
         theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("orange"))

Frequency of words in all datasets

Calculating frequency - unigram

tdmUni <- removeSparseTerms(TermDocumentMatrix(corpus), 0.9999)
FreqUni <- rowSums(as.matrix(tdmUni))
FreqUni <- sort(FreqUni, decreasing=TRUE)
FreqUni <- data.frame(word=names(FreqUni), freq=FreqUni)

Calculating frequency - bigram

bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdmBi <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.9999)
FreqBi <- rowSums(as.matrix(tdmBi))
FreqBi <- sort(FreqBi, decreasing=TRUE)
FreqBi <- data.frame(word=names(FreqBi), freq=FreqBi)

Calculating frequency - trigram

trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdmTri <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999)
FreqTri <- rowSums(as.matrix(tdmTri))
FreqTri <- sort(FreqTri, decreasing=TRUE)
FreqTri <- data.frame(word=names(FreqTri), freq=FreqTri)

Calculating frequency - quadgram

quadgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
tdmQuad <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = quadgram)), 0.9999)
FreqQuad <- rowSums(as.matrix(tdmQuad))
FreqQuad <- sort(FreqQuad, decreasing=TRUE)
FreqQuad <- data.frame(word=names(FreqQuad), freq=FreqQuad)

Bar plots of n-gram frequency in all datasets

Unigram

ggplot(FreqUni[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("Unigram") +
         theme(axis.text.x = element_text(angle = 60, size = 8, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("green3"))

ggsave("unigram.png")
## Saving 7 x 5 in image

Bigram

ggplot(FreqBi[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("Bigram") +
         theme(axis.text.x = element_text(angle = 60, size = 8, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("blue4"))

ggsave("bigram.png")
## Saving 7 x 5 in image

Trigram

ggplot(FreqTri[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("Trigram") +
         theme(axis.text.x = element_text(angle = 60, size = 8, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("red"))

ggsave("trigram.png")
## Saving 7 x 5 in image

Quadgram

ggplot(FreqQuad[1:40,], aes(reorder(word, freq), freq)) +
         labs(x="Words", y = "Frequency") + ggtitle("Quadgram") +
         theme(axis.text.x = element_text(angle = 60, size = 8, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("magenta2"))

ggsave("quadgram.png")
## Saving 7 x 5 in image

Conclusion

The initial exploratory analysis of the datasets showed that the blogs have the smallest number of lines but the largest number of words, total characters, and words per line.

The news articles have the largest number of characters per word, indicating that the language used in news text is more complex than that in the blogs and Twitter text.
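
As a minimal sketch using the variables defined above, the two derived ratios behind these observations can be computed directly; characters per word is approximated here as total characters (including whitespace) divided by total words.

# Words per line and characters per word for each dataset
ratios <- data.frame(
  Dataset      = c("Blogs", "News", "Twitter"),
  WordsPerLine = c(wordsBlogs/nlinesBlogs, wordsNews/nlinesNews, wordsTwitter/nlinesTwitter),
  CharsPerWord = c(charactersBlogs/wordsBlogs, charactersNews/wordsNews, charactersTwitter/wordsTwitter)
)
kable(ratios, digits = 2)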