The data are downloaded and loaded in full. Summary statistics are derived (a table and plots). The dataset is then filtered extensively based on discrepancies found during exploration (e.g. profanity, first names, numbers, punctuation, heavy misspellings). The frequencies of single words are computed and plotted, and based on these frequencies a further filter keeps only the most frequent words, which still cover almost 90% of each text while leaving far fewer unique words. Frequent n-grams are then computed. Discarding infrequent words in the previous step does not affect the frequent 2-grams, 3-grams, etc., because an n-gram can only be frequent if every word it contains is at least as frequent as the n-gram itself.
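As a quick toy illustration of that last point (a hypothetical two-word example, not part of the pipeline below): the count of a 2-gram is bounded by the count of each word it contains.
##Toy illustration (hypothetical data): a 2-gram's count can never exceed the count of either of its words
toy <- c("the", "cat", "sat", "the", "cat", "ran")
word_counts <- table(toy)
bigrams <- paste(head(toy, -1), tail(toy, -1)) # consecutive word pairs
bigram_counts <- table(bigrams)
bigram_counts["the cat"] # 2
min(word_counts["the"], word_counts["cat"]) # 2, an upper bound on the 2-gram count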
Before anything else, the working directory is set to the location of the source file.
##Set to source file location
setwd("D:/Education/Data Science/Johns Hopkins - Specialization certificate/Capstone Project")
##Get working directory
working_directory <- getwd()
## Assign the URL from where the data will be downloaded
fileURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
## Download the zip file containing the data, if this zip file has not already been downloaded.
file <- paste(working_directory,"/data.zip",sep="")
if(!file.exists(file)){
download.file(fileURL, file)
}
## Unzip the downloaded zip file.
unzip(file, exdir = working_directory) # unzip file to data folder
## Delete the zip file (if it has not already been deleted!), since this is no longer needed.
if (file.exists(file)) file.remove(file)
## [1] TRUE
##Remove unnecessary stuff
rm(file);rm(fileURL);rm(working_directory)
##Load the data
library(tm)
##Let's read the texts. Instead of reading each file separately, all three are read in one pass by applying readLines over a list of file paths.
text <- as.list(c("final/en_US/en_US.twitter.txt","final/en_US/en_US.blogs.txt","final/en_US/en_US.news.txt"))
text <- lapply(text,readLines)
##Count the lines of each text
twitter_lines <- length(text[[1]])
blogs_lines <- length(text[[2]])
news_lines <- length(text[[3]])
##Build a function that takes a list of texts as input (an analogous function could easily be written for a single text) and returns a tokenized version.
tokenization <- function(y) {
##Let's unify the line elements of each text into one string by collapsing the line breaks
y <- lapply(y, function(x) paste(x, collapse = " "))
##Let's convert all letters to lower case
y <- lapply(y, tolower)
##The regular expression splits each string at every non-word character, so the following extracts all words from the text
y <- lapply(y, function(x) strsplit(x, "\\W"))
##In order to further process the extracted words, they should be detached from the list structure into a vector. Let's do that.
y <- lapply(y, unlist)
##Return the list of tokenized texts
y
}
#Let's apply the function to the texts.
text <- tokenization(text)
#Let's count the words
twitter_words <- length(text[[1]])
blogs_words <- length(text[[2]])
news_words <- length(text[[3]])
##Let's count unique words
unique <- lapply(text, unique)
twitter_unique_words <- length(unique[[1]])
blogs_unique_words <- length(unique[[2]])
news_unique_words <- length(unique[[3]])
lines = c(twitter_lines, blogs_lines, news_lines)
words = c(twitter_words, blogs_words, news_words)
uniques = c(twitter_unique_words, blogs_unique_words, news_unique_words)
##Let's create the statistics dataframe
statistics = data.frame(lines,words, uniques)
rownames(statistics) <- c("twitter","blogs","news")
##Let's remove unnecessary stuff
rm(lines);rm(words);rm(uniques)
rm(twitter_lines);rm(blogs_lines);rm(news_lines)
rm(twitter_words);rm(blogs_words);rm(news_words)
rm(twitter_unique_words);rm(blogs_unique_words);rm(news_unique_words)
statistics
## lines words uniques
## twitter 2360148 38585883 336489
## blogs 899288 44676636 304449
## news 77259 3208673 78947
##Some plots
barL=barplot(statistics$lines, beside=FALSE, main="Number of lines" , col="red",axes=FALSE)
axis(1, labels = rownames(statistics), at = barL, las = 1, cex.axis = 1.5)
axis(2, at=statistics$lines, las = 2, cex.axis = 1)
barW=barplot(statistics$words, beside=FALSE, main="Number of words" , col="gold",axes=FALSE)
axis(1, labels = rownames(statistics), at = barW, las = 1, cex.axis = 1.5)
axis(2, at=statistics$words, las = 2, cex.axis = 1)
barU=barplot(statistics$uniques, beside=FALSE, main="Number of unique words" , col="gold",axes=FALSE)
axis(1, labels = rownames(statistics), at = barU, las = 1, cex.axis = 1.5)
axis(2, at=statistics$uniques, las = 2, cex.axis = 1)
Based on exploratory analysis of the raw data, the following discrepancies were observed: numbers and other non-alphabetic tokens, profanity, first names, one-letter strings, and heavily misspelled non-words (e.g. "aaallll").
Bearing in mind the project’s scope, it is reasonable to filter such words.
## Handling all the words is time consuming, so it is more efficient to assess data quality on the unique words only. Hence all filtering is applied to the unique words, and the unique words that survive filtering are then used as a screening list to filter the full texts.
#unique <- lapply(text, unique)
## 1st step: Discard numbers, Greek letters, and other non-alphabetical characters
unique <- lapply(unique, function(x) gsub("[0-9]+|_|α|β|γ|δ|ε|ζ|η|θ|ι|κ|λ|μ|ν|ξ|ο|π|ρ|σ|τ|υ|φ|χ|ψ|ω|ά|έ|ή|ί|ϊ|ό|ύ|ϋ|ΰ|ώ|³|²|\\?","", x))
##Let's discard one-letter strings, other meaningless tokens, and some stopwords (e.g. "the")
words <- as.vector(c("b","c","d","e","f","g","h","j","k","l","m","n","o","p","r","s","t","u","v","w","x","y","z"))
words <- c(words,"ff","fri","ga","im","ha","the")
unique <- lapply(unique, function (x) removeWords(x,words))
unique <- lapply(unique, function(x) x[which(x != "")])
## Let's discard non-words and words that contain heavy misspellings (e.g. "aaallll")
misspelling_pattern <- "aa|bbb|ccc|dd|eee|fff|ggg|hh|ii|jj|kk|lll|mmm|nnn|oo|pp|qq|rr|sss|ttt|uu|vvv|www|xx|yy|zz|haha|hehe"
drop_misspellings <- function(x) x[!grepl(misspelling_pattern, x, ignore.case = TRUE)]
hold1 <- drop_misspellings(unlist(unique[1]))
hold2 <- drop_misspellings(unlist(unique[2]))
hold3 <- drop_misspellings(unlist(unique[3]))
rm(misspelling_pattern); rm(drop_misspellings)
###Profanity and first-name filtering. Lists of such words were collected from the web and stored in txt files, which are read and tokenized as follows.
library(tm)
a <- as.list(c("names1.txt","names2.txt","profanity.txt"))
a <- lapply(a,readLines)
a <- lapply(a,function(x) paste(x, collapse= " "))
a <- lapply(a,tolower)
a <- lapply(a, function(x) strsplit(x, "\\W"))
a <- lapply(a, unlist)
a <- lapply(a, unique)
a <- lapply(a, function(x) x[which(x != "")])
##Remove first names
#The original names file has been split in two because the removeWords command cannot handle very large word vectors (a more general chunked alternative is sketched after this filtering step).
names1 <-unlist(a[1])
names2 <-unlist(a[2])
hold1 <- removeWords(hold1,names1)
hold1 <- removeWords(hold1,names2)
hold2 <- removeWords(hold2,names1)
hold2 <- removeWords(hold2,names2)
hold3 <- removeWords(hold3,names1)
hold3 <- removeWords(hold3,names2)
## Profanity filtering
profanity <- unlist(a[3])
hold1 <- removeWords(hold1,profanity)
hold2 <- removeWords(hold2,profanity)
hold3 <- removeWords(hold3,profanity)
##Remove the empty strings left behind by removeWords
hold1 <- hold1[which(hold1 != "")]
hold2 <- hold2[which(hold2 != "")]
hold3 <- hold3[which(hold3 != "")]
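As an aside, a more general way around the removeWords size limit would be to split any long stop-list into fixed-size chunks and remove them chunk by chunk. The sketch below (with a hypothetical remove_words_chunked helper and chunk_size argument) is only an illustration and is not used in the pipeline above.
remove_words_chunked <- function(text, stoplist, chunk_size = 1000) {
##Split the stop-list into chunks of at most chunk_size words
chunks <- split(stoplist, ceiling(seq_along(stoplist) / chunk_size))
##Remove each chunk in turn with tm's removeWords
for (chunk in chunks) text <- removeWords(text, chunk)
text
}
##Example usage (hypothetical): hold1 <- remove_words_chunked(hold1, c(names1, names2))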
##Let's remove all unnecessary objects from memory
rm(a);rm(names1);rm(names2);rm(profanity);rm(unique);rm(words);rm(tokenization)
##Let's filter the full texts, discarding every word in the main vectors that is not found in the quality-approved unique-word vectors.
twitter <- unlist(text[1])
twitter <- twitter[twitter %in% hold1]
blogs <- unlist(text[2])
blogs <- blogs[blogs %in% hold2]
news <- unlist(text[3])
news <- news[news %in% hold3]
##Having concluded the filtering pipeline, let's count again the words and unique words of what is left.
words = c(length(twitter), length(blogs), length(news))
uniques = c(length(unique(twitter)), length(unique(blogs)), length(unique(news)))
##Let's create the statistics dataframe
statistics.f = data.frame(words, uniques)
rownames(statistics.f) <- c("twitter","blogs","news")
##Let's remove unnecessary stuff
rm(words);rm(uniques)
statistics.f
## words uniques
## twitter 25513708 256679
## blogs 31980874 224087
## news 2224339 65112
##Some plots
barW=barplot(statistics.f$words, beside=FALSE, main="Number of words" , col="gold",axes=FALSE)
axis(1, labels = rownames(statistics.f), at = barW, las = 1, cex.axis = 1.5)
axis(2, at=statistics.f$words, las = 2, cex.axis = 1)
barU=barplot(statistics.f$uniques, beside=FALSE, main="Number of unique words" , col="gold",axes=FALSE)
axis(1, labels = rownames(statistics.f), at = barU, las = 1, cex.axis = 1.5)
axis(2, at=statistics.f$uniques, las = 2, cex.axis = 1)
##Mean relative reduction in total words and in unique words due to filtering
meanw <- mean((statistics$words-statistics.f$words)/statistics$words)
meanu <- mean((statistics$uniques-statistics.f$uniques)/statistics$uniques)
rm(text)
rm(meanu);rm(meanw);rm(hold1);rm(hold2);rm(hold3);rm(statistics);rm(statistics.f);rm(barU);rm(barW);rm(barL)
### Frequencies of single-word occurrences in the texts
## Unique words frequencies
twitter.freqs <- table(twitter)
blogs.freqs <- table(blogs)
news.freqs <- table(news)
##Let's sort them in a decreasing order
twitter.freqs <- sort(twitter.freqs, decreasing=TRUE)
blogs.freqs <- sort(blogs.freqs, decreasing=TRUE)
news.freqs <- sort(news.freqs, decreasing=TRUE)
##Let's compute the word frequencies expressed as relative percentages.
twitter.rel.freqs <- 100*(twitter.freqs/sum(twitter.freqs))
blogs.rel.freqs <- 100*(blogs.freqs/sum(blogs.freqs))
news.rel.freqs <- 100*(news.freqs/sum(news.freqs))
##Let's plot, in barplot style, the sorted relative frequencies of the words.
##Twitter text
barplot(twitter.rel.freqs[1:30], beside=T, col="grey",xlab="Top 30 Words", ylab="Percentage of Full Text", main="The 30 most frequent words in tweets")
##Blogs text
barplot(blogs.rel.freqs[1:30], beside=T, col="grey",xlab="Top 30 Words", ylab="Percentage of Full Text", main="The 30 most frequent words in blogs")
##News text
barplot(news.rel.freqs[1:30], beside=T, col="grey",xlab="Top 30 Words", ylab="Percentage of Full Text", main="The 30 most frequent words in news")
##Let's select the 100 most frequent words and compute their share of the full text
data_twitter <- as.data.frame(twitter.freqs[1:100])
data_twitter$words <- rownames(data_twitter)
colnames(data_twitter) <- c("frequency","words")
rownames(data_twitter) <- c(1:100)
sum(data_twitter[,1])/length(twitter)
## [1] 0.4886544
#Let's select the 4000 most frequent words
data_twitter <- as.data.frame(twitter.freqs[1:4000])
data_twitter$words <- rownames(data_twitter)
colnames(data_twitter) <- c("frequency","words")
rownames(data_twitter) <- c(1:4000)
sum(data_twitter[,1])/length(twitter)
## [1] 0.8951463
##Blogs
##Let's select the 90 most frequent words
data_blogs <- as.data.frame(blogs.freqs[1:90])
data_blogs$words<-rownames(data_blogs)
colnames(data_blogs) <- c("frequency","words")
rownames(data_blogs) <- c(1:90)
sum(data_blogs[,1])/length(blogs)
## [1] 0.4734444
##Let's select the 5100 most frequent words
data_blogs <- as.data.frame(blogs.freqs[1:5100])
data_blogs$words<-rownames(data_blogs)
colnames(data_blogs) <- c("frequency","words")
rownames(data_blogs) <- c(1:5100)
sum(data_blogs[,1])/length(blogs)
## [1] 0.8937786
##News
##Let's select the 150 most frequent words
data_news <- as.data.frame(news.freqs[1:150])
data_news$words<-rownames(data_news)
colnames(data_news) <- c("frequency","words")
rownames(data_news) <- c(1:150)
sum(data_news[,1])/length(news)
## [1] 0.4665296
##Let's select the 6000 most frequent words
data_news <- as.data.frame(news.freqs[1:6000])
data_news$words<-rownames(data_news)
colnames(data_news) <- c("frequency","words")
rownames(data_news) <- c(1:6000)
sum(data_news[,1])/length(news)
## [1] 0.8906803
hold4000twitter <- twitter[twitter %in% data_twitter$words]
hold5100blogs <- blogs[blogs %in% data_blogs$words]
hold6000news <- news[news %in% data_news$words]
#v <- unique(hold4000twitter)
#v <- paste(v, collapse= " ")
#library(qdap)
#check_spelling_interactive(v, range = 2, assume.first.correct = TRUE,
# click = TRUE, method = "jw",
# dictionary = qdapDictionaries::GradyAugmented, parallel = TRUE,
# cores = parallel::detectCores()/2, n.suggests = 8)
##The car library's recode function is then used to map common misspellings, contraction fragments, and abbreviations to standard words.
library(car)
recodes <- '"ll" = "will";"rt" = "retweet"; "re" = "are";"aren"="are not";"atleast"="at least";"btw"="between";"chillin"="chilling";"comin"="coming";"congrats"="congratulations";"couldn"="could not";"couldnt"="could not";"didn"="did not";"didnt"="did not";"doesn"="does not";"doesnt"="does not";"doin"="doing";"dont"="do not";"dunno"="do not know";"feelin"="feeling";"thx"="thanks";"tryin"="trying";"ve"="have";"wasn"="was not";"wasnt"="was not";"watchin"="watching";"weren"="were not";"wknd"="weekend";"workin"="working";"wouldn"="would not";"xmas"="christmas";"youre"="you are"'
hold4000twitter <- recode(hold4000twitter, recodes)
hold5100blogs <- recode(hold5100blogs, recodes)
hold6000news <- recode(hold6000news, recodes)
rm(recodes)
rm(news)
rm(twitter)
rm(blogs)
rm(news.freqs)
rm(blogs.freqs)
rm(twitter.freqs)
rm(twitter.rel.freqs)
rm(blogs.rel.freqs)
rm(news.rel.freqs)
rm(data_twitter)
rm(data_blogs)
rm(data_news)
library("RWeka")
library("tm")
## Loading required package: NLP
BigramTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
FourgramTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
##First we shall investigate the twitter text. A 100,000-word slice is used so that the n-gram document-term matrices stay manageable.
twitter <- hold4000twitter[5000001:5100000]
twitter <- paste(twitter,collapse =" ")
sample_twitter_corpus = VCorpus(VectorSource(twitter))
##2-grams
bigram_dtm_twitter = DocumentTermMatrix(sample_twitter_corpus, control = list(tokenize = BigramTokenizer))
freq_bigram_twitter = sort(colSums(as.matrix(bigram_dtm_twitter)), decreasing = TRUE)
bigram_twitter_df = data.frame(word = names(freq_bigram_twitter), frequency = freq_bigram_twitter)
rownames(bigram_twitter_df) = NULL
# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(bigram_twitter_df$frequency[1:30],
xaxt="n", xlab="",
ylab="Frequency",
main = "The 30 most frequent 2-grams for twitter",
col=rainbow(10))
axis(1, labels = bigram_twitter_df$word[1:30], at = bar,
las = 2, cex.axis = 0.6)
##3-grams
trigram_dtm_twitter = DocumentTermMatrix(sample_twitter_corpus, control = list(tokenize = TrigramTokenizer))
freq_trigram_twitter = sort(colSums(as.matrix(trigram_dtm_twitter)), decreasing = TRUE)
trigram_twitter_df = data.frame(word = names(freq_trigram_twitter), frequency = freq_trigram_twitter)
rownames(trigram_twitter_df) = NULL
# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(trigram_twitter_df$frequency[1:30],
xaxt="n", xlab="",
ylab="Frequency",
main = "The 30 most frequent 3-grams for twitter",
col=rainbow(10))
axis(1, labels = trigram_twitter_df$word[1:30], at = bar,
las = 2, cex.axis = 0.6)
##4-grams
fourgram_dtm_twitter = DocumentTermMatrix(sample_twitter_corpus, control = list(tokenize = FourgramTokenizer))
freq_fourgram_twitter = sort(colSums(as.matrix(fourgram_dtm_twitter)), decreasing = TRUE)
fourgram_twitter_df = data.frame(word = names(freq_fourgram_twitter), frequency = freq_fourgram_twitter)
rownames(fourgram_twitter_df) = NULL
# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(fourgram_twitter_df$frequency[1:30],
xaxt="n", xlab="",
ylab="Frequency",
main = "The 30 most frequent 4-grams for twitter",
col=rainbow(10))
axis(1, labels = fourgram_twitter_df$word[1:30], at = bar,
las = 2, cex.axis = 0.6)
##Then we shall investigate the blogs text, again on a 100,000-word slice.
blogs <- hold5100blogs[5000001:5100000]
blogs <- paste(blogs,collapse =" ")
sample_blogs_corpus = VCorpus(VectorSource(blogs))
##2-grams
bigram_dtm_blogs = DocumentTermMatrix(sample_blogs_corpus, control = list(tokenize = BigramTokenizer))
freq_bigram_blogs = sort(colSums(as.matrix(bigram_dtm_blogs)), decreasing = TRUE)
bigram_blogs_df = data.frame(word = names(freq_bigram_blogs), frequency = freq_bigram_blogs)
rownames(bigram_blogs_df) = NULL
# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(bigram_blogs_df$frequency[1:30],
xaxt="n", xlab="",
ylab="Frequency",
main = "The 30 most frequent 2-grams for blogs",
col=rainbow(10))
axis(1, labels = bigram_blogs_df$word[1:30], at = bar,
las = 2, cex.axis = 0.6)
##3-grams
trigram_dtm_blogs = DocumentTermMatrix(sample_blogs_corpus, control = list(tokenize = TrigramTokenizer))
freq_trigram_blogs = sort(colSums(as.matrix(trigram_dtm_blogs)), decreasing = TRUE)
trigram_blogs_df = data.frame(word = names(freq_trigram_blogs), frequency = freq_trigram_blogs)
rownames(trigram_blogs_df) = NULL
# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(trigram_blogs_df$frequency[1:30],
xaxt="n", xlab="",
ylab="Frequency",
main = "The 30 most frequent 3-grams for blogs",
col=rainbow(10))
axis(1, labels = trigram_blogs_df$word[1:30], at = bar,
las = 2, cex.axis = 0.6)
##4-grams
fourgram_dtm_blogs = DocumentTermMatrix(sample_blogs_corpus, control = list(tokenize = FourgramTokenizer))
freq_fourgram_blogs = sort(colSums(as.matrix(fourgram_dtm_blogs)), decreasing = TRUE)
fourgram_blogs_df = data.frame(word = names(freq_fourgram_blogs), frequency = freq_fourgram_blogs)
rownames(fourgram_blogs_df) = NULL
# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(fourgram_blogs_df$frequency[1:30],
xaxt="n", xlab="",
ylab="Frequency",
main = "The 30 most frequent 4-grams for blogs",
col=rainbow(10))
axis(1, labels = fourgram_blogs_df$word[1:30], at = bar,
las = 2, cex.axis = 0.6)
##Then we shall investigate the news text, on a 10,000-word slice.
news <- hold6000news[500001:510000]
news <- paste(news,collapse =" ")
sample_news_corpus = VCorpus(VectorSource(news))
##2-grams
bigram_dtm_news = DocumentTermMatrix(sample_news_corpus, control = list(tokenize = BigramTokenizer))
freq_bigram_news = sort(colSums(as.matrix(bigram_dtm_news)), decreasing = TRUE)
bigram_news_df = data.frame(word = names(freq_bigram_news), frequency = freq_bigram_news)
rownames(bigram_news_df) = NULL
# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(bigram_news_df$frequency[1:30],
xaxt="n", xlab="",
ylab="Frequency",
main = "The 30 most frequent 2-grams for news",
col=rainbow(10))
axis(1, labels = bigram_news_df$word[1:30], at = bar,
las = 2, cex.axis = 0.6)
##3-grams
trigram_dtm_news = DocumentTermMatrix(sample_news_corpus, control = list(tokenize = TrigramTokenizer))
freq_trigram_news = sort(colSums(as.matrix(trigram_dtm_news)), decreasing = TRUE)
trigram_news_df = data.frame(word = names(freq_trigram_news), frequency = freq_trigram_news)
rownames(trigram_news_df) = NULL
# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(trigram_news_df$frequency[1:30],
xaxt="n", xlab="",
ylab="Frequency",
main = "The 30 most frequent 3-grams for news",
col=rainbow(10))
axis(1, labels = trigram_news_df$word[1:30], at = bar,
las = 2, cex.axis = 0.6)
##4-grams
fourgram_dtm_news = DocumentTermMatrix(sample_news_corpus, control = list(tokenize = FourgramTokenizer))
freq_fourgram_news = sort(colSums(as.matrix(fourgram_dtm_news)), decreasing = TRUE)
fourgram_news_df = data.frame(word = names(freq_fourgram_news), frequency = freq_fourgram_news)
rownames(fourgram_news_df) = NULL
# op <- par(mar= c(15,4,4,2) + 0.1)
bar = barplot(fourgram_news_df$frequency[1:30],
xaxt="n", xlab="",
ylab="Frequency",
main = "The 30 most frequent 4-grams for news",
col=rainbow(10))
axis(1, labels = fourgram_news_df$word[1:30], at = bar,
las = 2, cex.axis = 0.6)