Project summary

The data is downloaded and loaded in as a whole. Summary statistics are derived (a table and plots). Extensive filtering is then applied to the dataset based on the discrepancies found (i.e. profanity, first names, numbers, punctuation, heavy misspellings, etc.). The frequencies of single words are computed and plotted. Based on these frequencies, a further filter is applied that covers almost 90% of the dataset while retaining far fewer unique words. Frequent n-grams are then computed. Discarding the non-frequent words in the previous step does not affect the 2-grams, 3-grams, etc., since for a set of words to be frequent, all of its member words must also be frequent.
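The last claim holds because an n-gram can occur at most as often as its least frequent member word, so dropping infrequent words cannot discard any frequent n-gram. A toy illustration in R (the sentence below is made up purely for demonstration):

words   <- c("the", "cat", "sat", "on", "the", "cat")
bigrams <- paste(head(words, -1), tail(words, -1))
## The count of a bigram is bounded by the count of its rarest member word
as.numeric(table(bigrams)["the cat"]) <= min(table(words)[c("the", "cat")])
## evaluates to TRUE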

Before anything else, set the working directory to the location of the source file.

Demonstration of downloading the data and successfully loading it in.

##Set to source file location
setwd("D:/Education/Data Science/Johns Hopkins - Specialization certificate/Capstone Project")
##Get working directory 
working_directory <- getwd()
## Assign the URL from where the data will be downloaded

fileURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

## Download the zip file containing the data, if this zip file has not already been downloaded.

file <- paste(working_directory,"/data.zip",sep="")
if(!file.exists(file)){ 
       download.file(fileURL, file)
}

## Unzip the downloaded zip file.

unzip(file, exdir = working_directory)           # unzip the file into the working directory

## Delete the zip file (if it has not already been deleted!), since this is no longer needed.
if (file.exists(file)) file.remove(file)
## [1] TRUE
##Remove unnecessary stuff
rm(file);rm(fileURL);rm(working_directory)
##Load the data
library(tm)

##Let's read the text. Instead of reading each file separately, all three files are read in one pass with lapply.

text <- as.list(c("final/en_US/en_US.twitter.txt","final/en_US/en_US.blogs.txt","final/en_US/en_US.news.txt"))
text <- lapply(text,readLines)
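## Note: readLines may warn about an incomplete final line or embedded nuls in these
## files. A more defensive reader could be used instead (a sketch, not part of the
## original pipeline; the function name is an assumption):
read_text <- function(path) {
  con <- file(path, open = "rb")          # binary mode avoids early truncation on some platforms
  on.exit(close(con))
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}
## usage: replace readLines with read_text in the lapply call above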

Creation of a basic report of summary statistics about the data sets.

##Count the lines of each text
twitter_lines <- length(text[[1]])
blogs_lines <- length(text[[2]])
news_lines <- length(text[[3]])
##Build a function that takes a list of texts as input (an analogous function could be written in the same spirit for a single text; see the sketch after the function below) and returns a tokenized version of each.
tokenization <- function(y) {
##Let's unify the string elements of each text into one string, collapsing the line breaks
y <- lapply(y,function(x) paste(x, collapse= " "))
##Let's convert all letters to lower case
y <- lapply(y,tolower)
##The regular expression below splits each string wherever a word ends, so the following extracts all words from the text. 
y <- lapply(y, function(x) strsplit(x, "\\W"))
##In order to further process the extracted words, they should be detached from the list structure into a vector. Let's do that.
y <- lapply(y, unlist)
y
}
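## As noted above, an analogous tokenizer for a single text (rather than a list of
## texts) could look like this (a minimal sketch with a hypothetical name, not used below):
tokenize_one <- function(x) {
  x <- paste(x, collapse = " ")           # collapse the lines into one string
  x <- tolower(x)                         # convert all letters to lower case
  unlist(strsplit(x, "\\W"))              # split on non-word characters
}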
#Let's apply the function to the texts.
text <- tokenization(text)
#Let's count the words 
twitter_words <- length(text[[1]])
blogs_words <- length(text[[2]])
news_words <- length(text[[3]])

##Let's count unique words
unique <- lapply(text, unique)
twitter_unique_words <- length(unique[[1]])
blogs_unique_words <- length(unique[[2]])
news_unique_words <- length(unique[[3]])
lines = c(twitter_lines, blogs_lines, news_lines) 
words = c(twitter_words, blogs_words, news_words) 
uniques = c(twitter_unique_words, blogs_unique_words, news_unique_words) 

##Let's create the statistics dataframe
statistics = data.frame(lines,words, uniques)
rownames(statistics) <- c("twitter","blogs","news")
##Let's remove unnecessary stuff
rm(lines);rm(words);rm(uniques)
rm(twitter_lines);rm(blogs_lines);rm(news_lines)
rm(twitter_words);rm(blogs_words);rm(news_words)
rm(twitter_unique_words);rm(blogs_unique_words);rm(news_unique_words)
statistics
##           lines    words uniques
## twitter 2360148 38585883  336489
## blogs    899288 44676636  304449
## news      77259  3208673   78947
##Some plots
barL=barplot(statistics$lines, beside=FALSE, main="Number of lines" , col="red",axes=FALSE)
axis(1, labels = rownames(statistics), at = barL, las = 1, cex.axis = 1.5)
axis(2, at=statistics$lines, las = 2, cex.axis = 1)

barW=barplot(statistics$words, beside=FALSE, main="Number of words" , col="gold",axes=FALSE)
axis(1, labels = rownames(statistics), at = barW, las = 1, cex.axis = 1.5)
axis(2, at=statistics$words, las = 2, cex.axis = 1)

barU=barplot(statistics$uniques, beside=FALSE, main="Number of unique words" , col="gold",axes=FALSE)
axis(1, labels = rownames(statistics), at = barU, las = 1, cex.axis = 1.5)
axis(2, at=statistics$uniques, las = 2, cex.axis = 1)

Report on any interesting findings amassed so far.

Based on exploratory data analysis of the raw data, the following discrepancies were observed:

  • Numbers are mixed with words.
  • Greek characters are mixed with words.
  • Non-alphabetical characters exist (e.g. _ or $).
  • Single-character words exist, besides the meaningful “a” and “I”.
  • Some words are meaningful but carry extra characters used to stress a point (e.g. peeeerfect).
  • Profanity words exist.
  • First names exist.

Bearing in mind the project’s scope, it is reasonable to filter such words.
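A quick way to quantify these discrepancies on the tokenized texts is to count the tokens matching a few simple patterns (a sketch; the patterns and function name are illustrative assumptions and only approximate the filters applied below):

discrepancy_counts <- function(tokens) {
  c(contain_digits    = sum(grepl("[0-9]", tokens)),
    non_alphabetical  = sum(grepl("[^a-z0-9_]", tokens)),
    single_character  = sum(nchar(tokens) == 1 & !tokens %in% c("a", "i")),
    stretched_letters = sum(grepl("([a-z])\\1\\1", tokens)))   # three or more repeated letters
}
## e.g. discrepancy_counts(unlist(unique[1]))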

## Handling all the words is time consuming, so a better idea is to check data quality on the unique words only. Hence all filtering is applied to the unique words; the remaining unique words per text are then used as a screening guide to filter the full texts.
#unique <- lapply(text, unique) 
## 1st step: Discard numbers, Greek letters, and other non-alphabetical characters 
unique <- lapply(unique, function(x) gsub("[0-9]+|_|α|β|γ|δ|ε|ζ|η|θ|ι|κ|λ|μ|ν|ξ|ο|π|ρ|σ|τ|υ|φ|χ|ψ|ω|ά|έ|ή|ί|ϊ|ό|ύ|ϋ|ΰ|ώ|³|²","", x))

##Let's discard one-letter strings, other meaningless tokens and some stopwords (e.g. "the")
words <- as.vector(c("b","c","d","e","f","g","h","j","k","l","m","n","o","p","r","s","t","u","v","w","x","y","z")) 
words <- c(words,"ff","fri","ga","im","ha","the")
unique <- lapply(unique, function (x) removeWords(x,words))
unique <- lapply(unique, function(x) x[which(x != "")])
## Let's discard non-words and words that contain heavy misspellings (e.g. aaallll)
misspelling_pattern <- "aa|bbb|ccc|dd|eee|fff|ggg|hh|ii|jj|kk|lll|mmm|nnn|oo|pp|qq|rr|sss|ttt|uu|vvv|www|xx|yy|zz|haha|hehe"
drop_misspelled <- function(x) x[!grepl(misspelling_pattern, x, ignore.case = TRUE)]

hold1 <- drop_misspelled(unlist(unique[1]))
hold2 <- drop_misspelled(unlist(unique[2]))
hold3 <- drop_misspelled(unlist(unique[3]))

rm(misspelling_pattern)
rm(drop_misspelled)

###Profanity and first-name filtering. In order to do so, such words have been collected from the web and stored in txt files, which are processed as follows:
library(tm)
a <- as.list(c("names1.txt","names2.txt","profanity.txt"))
a <- lapply(a,readLines)
a <- lapply(a,function(x) paste(x, collapse= " "))
a <- lapply(a,tolower)
a <- lapply(a, function(x) strsplit(x, "\\W"))
a <- lapply(a, unlist)
a <- lapply(a, unique)
a <- lapply(a, function(x) x[which(x != "")])

##Remove first names

#The original names file has been split in two because the "removeWords" command cannot handle very large vectors at once.
names1 <-unlist(a[1])
names2 <-unlist(a[2])
hold1 <- removeWords(hold1,names1)
hold1 <- removeWords(hold1,names2)

hold2 <- removeWords(hold2,names1)
hold2 <- removeWords(hold2,names2)

hold3 <- removeWords(hold3,names1)
hold3 <- removeWords(hold3,names2)
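## An alternative to splitting the names file by hand would be a helper that feeds
## removeWords the word list in chunks (a sketch; the helper name and chunk size
## are assumptions, not part of the original code):
remove_words_chunked <- function(x, words, chunk_size = 1000) {
  for (chunk in split(words, ceiling(seq_along(words) / chunk_size))) {
    x <- removeWords(x, chunk)            # tm::removeWords applied to a manageable chunk
  }
  x
}
## e.g. hold1 <- remove_words_chunked(hold1, c(names1, names2))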

## Profanity filtering
profanity  <- unlist(a[3])

hold1 <- removeWords(hold1,profanity)
hold2 <- removeWords(hold2,profanity)
hold3 <- removeWords(hold3,profanity)

##Remove empty strings
hold1 <- hold1[which(hold1 != "")]
hold2 <- hold2[which(hold2 != "")]
hold3 <- hold3[which(hold3 != "")]

##Let's remove all unnecessary stuff from memory
rm(a);rm(names1);rm(names2);rm(profanity);rm(unique);rm(words);rm(tokenization)


##Let's filter the full texts, discarding every word in the main vectors that is not found in the corresponding quality-approved unique vector.
twitter <- unlist(text[1])
twitter <- twitter[twitter %in% hold1]

blogs <- unlist(text[2])
blogs <- blogs[blogs %in% hold2]

news <- unlist(text[3])
news <- news[news %in% hold3]
##Having concluded the filtering pipeline, let's count again the words and the unique words of what is left. 
words = c(length(twitter), length(blogs), length(news)) 
uniques = c(length(unique(twitter)), length(unique(blogs)), length(unique(news))) 

##Let's create the statistics dataframe
statistics.f = data.frame(words, uniques)
rownames(statistics.f) <- c("twitter","blogs","news")
##Let's remove unnecessary stuff
rm(words);rm(uniques)

statistics.f
##            words uniques
## twitter 25513708  256679
## blogs   31980874  224087
## news     2224339   65112
##Some plots

barW=barplot(statistics.f$words, beside=FALSE, main="Number of words" , col="gold",axes=FALSE)
axis(1, labels = rownames(statistics.f), at = barW, las = 1, cex.axis = 1.5)
axis(2, at=statistics.f$words, las = 2, cex.axis = 1)

barU=barplot(statistics.f$uniques, beside=FALSE, main="Number of unique words" , col="gold",axes=FALSE)
axis(1, labels = rownames(statistics.f), at = barU, las = 1, cex.axis = 1.5)
axis(2, at=statistics.f$uniques, las = 2, cex.axis = 1)

meanw <- mean((statistics$words-statistics.f$words)/statistics$words)
meanu <- mean((statistics$uniques-statistics.f$uniques)/statistics$uniques)
  • The filtering approach is considered successful: it yields an average decrease of about 31% in the number of words and about 22.5% in the number of unique words.
rm(text)
rm(meanu);rm(meanw);rm(hold1);rm(hold2);rm(hold3);rm(statistics);rm(statistics.f);rm(barU);rm(barW);rm(barL)
### Frequencies of single-word occurrences in the texts
## Frequencies of the unique words
twitter.freqs <- table(twitter)
blogs.freqs <- table(blogs)
news.freqs <- table(news)

##Let's sort them in a decreasing order
twitter.freqs <- sort(twitter.freqs, decreasing=TRUE)
blogs.freqs <- sort(blogs.freqs, decreasing=TRUE)
news.freqs <- sort(news.freqs, decreasing=TRUE)
##Let's compute the frequencies of the words expressed as relative percentages. 
twitter.rel.freqs <- 100*(twitter.freqs/sum(twitter.freqs))
blogs.rel.freqs <- 100*(blogs.freqs/sum(blogs.freqs))
news.rel.freqs <- 100*(news.freqs/sum(news.freqs))

##Let's plot, in barplot style, the sorted relative frequencies of the words.

##Twitter text
barplot(twitter.rel.freqs[1:30], beside=T, col="grey",xlab="Top 30 Words", ylab="Percentage of Full Text", main="The 30 most frequent words in tweets")

##Blogs text
barplot(blogs.rel.freqs[1:30], beside=T, col="grey",xlab="Top 30 Words", ylab="Percentage of Full Text", main="The 30 most frequent words in blogs")

##News text
barplot(news.rel.freqs[1:30], beside=T, col="grey",xlab="Top 30 Words", ylab="Percentage of Full Text", main="The 30 most frequent words in news")

  • The twitter text consists of 25,513,708 words, of which 256,679 are unique. Let's select the most frequent words and compute the percentage of the full text they account for. The 100 most frequent words account for roughly half of the twitter text (really amazing, if one thinks that the remaining 256,579 unique words account for the other half), while the 4,000 most frequent words account for about 90% of the text.
##Let's select the 100 most frequent words
data_twitter <- as.data.frame(twitter.freqs[1:100])
data_twitter$words <- rownames(data_twitter)
colnames(data_twitter) <- c("frequency","words")
rownames(data_twitter) <- c(1:100)
sum(data_twitter[,1])/length(twitter)
## [1] 0.4886544
#Let's select the 4000 most frequent words
data_twitter <- as.data.frame(twitter.freqs[1:4000])
data_twitter$words <- rownames(data_twitter)
colnames(data_twitter) <- c("frequency","words")
rownames(data_twitter) <- c(1:4000)
sum(data_twitter[,1])/length(twitter)
## [1] 0.8951463
  • The blogs text consists of 31,980,874 words, of which 224,087 are unique. Let's select the most frequent words and compute the percentage of the full text they account for. The 90 most frequent words account for roughly half of the blogs text (again really amazing, if one thinks that the remaining 223,997 unique words account for the other half), while the 5,100 most frequent words account for just under 90% of the text.
##Blogs
##Let's select the 90 most frequent words
data_blogs <- as.data.frame(blogs.freqs[1:90])
data_blogs$words<-rownames(data_blogs)
colnames(data_blogs) <- c("frequency","words")
rownames(data_blogs) <- c(1:90)
sum(data_blogs[,1])/length(blogs)
## [1] 0.4734444
##Let's select the 5100 most frequent words 
data_blogs <- as.data.frame(blogs.freqs[1:5100])
data_blogs$words<-rownames(data_blogs)
colnames(data_blogs) <- c("frequency","words")
rownames(data_blogs) <- c(1:5100)
sum(data_blogs[,1])/length(blogs)
## [1] 0.8937786
  • The news text consists of 2,224,339 words, of which 65,112 are unique. Let's select the most frequent words and compute the percentage of the full text they account for. The 150 most frequent words account for roughly half of the news text (again really amazing, if one thinks that the remaining 64,962 unique words account for the other half), while the 6,000 most frequent words account for slightly less than 90% of the text.
##News
##Let's select the 150 most frequent words
data_news <- as.data.frame(news.freqs[1:150])
data_news$words<-rownames(data_news)
colnames(data_news) <- c("frequency","words")
rownames(data_news) <- c(1:150)
sum(data_news[,1])/length(news)
## [1] 0.4665296
##Let's select the 6000 most frequent words
data_news <- as.data.frame(news.freqs[1:6000])
data_news$words<-rownames(data_news)
colnames(data_news) <- c("frequency","words")
rownames(data_news) <- c(1:6000)
sum(data_news[,1])/length(news)
## [1] 0.8906803
hold4000twitter <- twitter[twitter %in% data_twitter$words]
hold5100blogs <- blogs[blogs %in% data_blogs$words]
hold6000news <- news[news %in% data_news$words]
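## Instead of trying cut-offs such as 100 or 4,000 by hand, a small helper could
## locate the number of top words needed for a target coverage (a sketch; the
## function name is an assumption of this write-up):
coverage_cutoff <- function(freqs, target = 0.9) {
  ## freqs: a word-frequency table already sorted in decreasing order, as above
  cum_share <- cumsum(as.numeric(freqs)) / sum(freqs)
  which(cum_share >= target)[1]           # index of the first word reaching the target
}
## e.g. coverage_cutoff(twitter.freqs, 0.5)   # top words covering half of the twitter text
## e.g. coverage_cutoff(twitter.freqs, 0.9)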
  • Spelling errors. The exploratory data analysis shows that spelling errors exist. Now that the number of unique words has been reduced, the elegant qdap library can be applied. The interactive spelling check below is commented out so that it does not run when the code is sourced; it was applied once to locate the errors, which are then corrected with the car library.
#v <- unique(hold4000twitter)
#v <- paste(v, collapse= " ")

#library(qdap)
#check_spelling_interactive(v, range = 2, assume.first.correct = TRUE,
#  click = TRUE, method = "jw",
#  dictionary = qdapDictionaries::GradyAugmented, parallel = TRUE,
#  cores = parallel::detectCores()/2, n.suggests = 8)
##The car library is then used to recode all misspelled words located above.


library(car)

recodes <- '"ll" = "will";"rt" = "retweet"; "re" = "are";"aren"="are not";"atleast"="at least";"btw"="between";"chillin"="chilling";"comin"="coming";"congrats"="congratulations";"couldn"="could not";"couldnt"="could not";"didn"="did not";"didnt"="did not";"doesn"="does not";"doesnt"="does not";"doin"="doing";"dont"="do not";"dunno"="do not know";"feelin"="feeling";"thx"="thanks";"tryin"="trying";"ve"="have";"wasn"="was not";"wasnt"="was not";"watchin"="watching";"weren"="were not";"wknd"="weekend";"workin"="working";"wouldn"="would not";"xmas"="christmas";"youre"="you are"'

hold4000twitter <- recode(hold4000twitter, recodes)
hold5100blogs <- recode(hold5100blogs, recodes)
hold6000news <- recode(hold6000news, recodes)

rm(recodes)
rm(news)
rm(twitter)
rm(blogs)
rm(news.freqs)
rm(blogs.freqs)
rm(twitter.freqs)
rm(twitter.rel.freqs)
rm(blogs.rel.freqs)
rm(news.rel.freqs)
rm(data_twitter)
rm(data_blogs)
rm(data_news)
  • The remaining texts will be used to locate frequent bigrams, trigrams and fourgrams, meaning frequent sets of two, three and four words. The samples used below are 100,000-word slices of the twitter and blogs texts and a 10,000-word slice of the news text (a randomly positioned slice could be drawn instead, as sketched below). The samples are large, but the algorithm runs within satisfactory time limits (less than one minute per n-gram finder) thanks to the successful compression of the data during the preprocessing stage.
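If a randomly positioned slice is preferred, one could be drawn as follows; word order must be preserved for n-gram counting, so individual words should not be sampled independently (the seed and window size are illustrative assumptions):

set.seed(123)
window_size <- 100000
start <- sample(length(hold4000twitter) - window_size + 1, 1)   # random start position
twitter_window <- hold4000twitter[start:(start + window_size - 1)]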

Twitter n-grams frequencies

library("RWeka")
library("tm")
## Loading required package: NLP
BigramTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
FourgramTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))


##First we shall investigate twitter text 
twitter <- hold4000twitter[5000001:5100000]
twitter <- paste(twitter,collapse =" ")
sample_twitter_corpus = VCorpus(VectorSource(twitter))

##2-grams

bigram_dtm_twitter = DocumentTermMatrix(sample_twitter_corpus, control = list(tokenize = BigramTokenizer))

freq_bigram_twitter = sort(colSums(as.matrix(bigram_dtm_twitter)), decreasing = TRUE)

bigram_twitter_df = data.frame(word = names(freq_bigram_twitter), frequency = freq_bigram_twitter)
rownames(bigram_twitter_df) = NULL

# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(bigram_twitter_df$frequency[1:30],
              xaxt="n", xlab="",
              ylab="Frequency",
              main = "The 30 most frequent 2-grams for twitter",
              col=rainbow(10))

axis(1, labels = bigram_twitter_df$word[1:30], at = bar,
     las = 2, cex.axis = 0.6)

##3-grams
trigram_dtm_twitter = DocumentTermMatrix(sample_twitter_corpus, control = list(tokenize = TrigramTokenizer))


freq_trigram_twitter = sort(colSums(as.matrix(trigram_dtm_twitter)), decreasing = TRUE)

trigram_twitter_df = data.frame(word = names(freq_trigram_twitter), frequency = freq_trigram_twitter)
rownames(trigram_twitter_df) = NULL

# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(trigram_twitter_df$frequency[1:30],
              xaxt="n", xlab="",
              ylab="Frequency",
              main = "The 30 most frequent 3-grams for twitter",
              col=rainbow(10))

axis(1, labels = trigram_twitter_df$word[1:30], at = bar,
     las = 2, cex.axis = 0.6)

##4-grams
fourgram_dtm_twitter = DocumentTermMatrix(sample_twitter_corpus, control = list(tokenize = FourgramTokenizer))

freq_fourgram_twitter = sort(colSums(as.matrix(fourgram_dtm_twitter)), decreasing = TRUE)

fourgram_twitter_df = data.frame(word = names(freq_fourgram_twitter), frequency = freq_fourgram_twitter)
rownames(fourgram_twitter_df) = NULL

# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(fourgram_twitter_df$frequency[1:30],
              xaxt="n", xlab="",
              ylab="Frequency",
              main = "The 30 most frequent 4-grams for twitter",
              col=rainbow(10))
axis(1, labels = fourgram_twitter_df$word[1:30], at = bar,
     las = 2, cex.axis = 0.6)

Blogs n-grams frequencies

##Then we shall investigate blogs text 
blogs <- hold5100blogs[5000001:5100000]
blogs <- paste(blogs,collapse =" ")
sample_blogs_corpus = VCorpus(VectorSource(blogs))
##2-grams

bigram_dtm_blogs = DocumentTermMatrix(sample_blogs_corpus, control = list(tokenize = BigramTokenizer))

freq_bigram_blogs = sort(colSums(as.matrix(bigram_dtm_blogs)), decreasing = TRUE)

bigram_blogs_df = data.frame(word = names(freq_bigram_blogs), frequency = freq_bigram_blogs)
rownames(bigram_blogs_df) = NULL

# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(bigram_blogs_df$frequency[1:30],
              xaxt="n", xlab="",
              ylab="Frequency",
              main = "The 30 most frequent 2-grams for blogs",
              col=rainbow(10))

axis(1, labels = bigram_blogs_df$word[1:30], at = bar,
     las = 2, cex.axis = 0.6)

##3-grams
trigram_dtm_blogs = DocumentTermMatrix(sample_blogs_corpus, control = list(tokenize = TrigramTokenizer))


freq_trigram_blogs = sort(colSums(as.matrix(trigram_dtm_blogs)), decreasing = TRUE)

trigram_blogs_df = data.frame(word = names(freq_trigram_blogs), frequency = freq_trigram_blogs)
rownames(trigram_blogs_df) = NULL

# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(trigram_blogs_df$frequency[1:30],
              xaxt="n", xlab="",
              ylab="Frequency",
              main = "The 30 most frequent 3-grams for blogs",
              col=rainbow(10))

axis(1, labels = trigram_blogs_df$word[1:30], at = bar,
     las = 2, cex.axis = 0.6)

##4-grams
fourgram_dtm_blogs = DocumentTermMatrix(sample_blogs_corpus, control = list(tokenize = FourgramTokenizer))

freq_fourgram_blogs = sort(colSums(as.matrix(fourgram_dtm_blogs)), decreasing = TRUE)

fourgram_blogs_df = data.frame(word = names(freq_fourgram_blogs), frequency = freq_fourgram_blogs)
rownames(fourgram_blogs_df) = NULL

# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(fourgram_blogs_df$frequency[1:30],
              xaxt="n", xlab="",
              ylab="Frequency",
              main = "The 30 most frequent 4-grams for blogs",
              col=rainbow(10))
axis(1, labels = fourgram_blogs_df$word[1:30], at = bar,
     las = 2, cex.axis = 0.6)

News n-grams frequencies

##Then we shall investigate news text 
news <- hold6000news[500001:510000]
news <- paste(news,collapse =" ")
sample_news_corpus = VCorpus(VectorSource(news))
##2-grams

bigram_dtm_news = DocumentTermMatrix(sample_news_corpus, control = list(tokenize = BigramTokenizer))

freq_bigram_news = sort(colSums(as.matrix(bigram_dtm_news)), decreasing = TRUE)

bigram_news_df = data.frame(word = names(freq_bigram_news), frequency = freq_bigram_news)
rownames(bigram_news_df) = NULL

# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(bigram_news_df$frequency[1:30],
              xaxt="n", xlab="",
              ylab="Frequency",
              main = "The 30 most frequent 2-grams for news",
              col=rainbow(10))

axis(1, labels = bigram_news_df$word[1:30], at = bar,
     las = 2, cex.axis = 0.6)

##3-grams
trigram_dtm_news = DocumentTermMatrix(sample_news_corpus, control = list(tokenize = TrigramTokenizer))


freq_trigram_news = sort(colSums(as.matrix(trigram_dtm_news)), decreasing = TRUE)

trigram_news_df = data.frame(word = names(freq_trigram_news), frequency = freq_trigram_news)
rownames(trigram_news_df) = NULL

# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(trigram_news_df$frequency[1:30],
              xaxt="n", xlab="",
              ylab="Frequency",
              main = "The 30 most frequent 3-grams for news",
              col=rainbow(10))

axis(1, labels = trigram_news_df$word[1:30], at = bar,
     las = 2, cex.axis = 0.6)

##4-grams
fourgram_dtm_news = DocumentTermMatrix(sample_news_corpus, control = list(tokenize = FourgramTokenizer))

freq_fourgram_news = sort(colSums(as.matrix(fourgram_dtm_news)), decreasing = TRUE)

fourgram_news_df = data.frame(word = names(freq_fourgram_news), frequency = freq_fourgram_news)
rownames(fourgram_news_df) = NULL

# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(fourgram_news_df$frequency[1:30],
              xaxt="n", xlab="",
              ylab="Frequency",
              main = "The 30 most frequent 4-grams for news",
              col=rainbow(10))
axis(1, labels = fourgram_news_df$word[1:30], at = bar,
     las = 2, cex.axis = 0.6)
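The three blocks above repeat the same steps for each text and each n. A small helper along the following lines (a sketch built from the same RWeka/tm calls used above, with a hypothetical name) could avoid the duplication:

top_ngrams <- function(words, n, top = 30) {
  corpus <- VCorpus(VectorSource(paste(words, collapse = " ")))   # one-document corpus
  tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  dtm <- DocumentTermMatrix(corpus, control = list(tokenize = tokenizer))
  freqs <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
  data.frame(word = names(freqs), frequency = unname(freqs))[seq_len(min(top, length(freqs))), ]
}
## e.g. top_ngrams(hold6000news[500001:510000], n = 2)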

Get feedback on your plans for creating a prediction algorithm and Shiny app.