The data is downloaded from the course website, then extracted and loaded. We then perform exploratory data analysis and present the modelling work done so far. Throughout, only the en_US data set is used.
Download, unzip and load the training data.
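The download and extraction step was run once beforehand; a minimal sketch of it is shown below, assuming the standard Coursera-SwiftKey dataset URL and extraction into the local working directory (the file and variable names here are illustrative).

# one-time download and extraction (URL assumed to be the Coursera-SwiftKey dataset)
zipUrl  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipFile <- "Coursera-SwiftKey.zip"
if (!file.exists(zipFile)) {
  download.file(zipUrl, destfile = zipFile, mode = "wb")
}
if (!dir.exists("final")) {
  unzip(zipFile)   # extracts into final/en_US, final/de_DE, etc.
}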
# libraries for data manipulation and text mining
library(plyr)
library(dplyr)
library(tm)
setwd("F:/Data Science/Capstone/en_US")
# read the three en_US source files line by line
blogs   <- readLines("en_US.blogs.txt",   warn = FALSE, encoding = "UTF-8")
news    <- readLines("en_US.news.txt",    warn = FALSE, encoding = "UTF-8")
twitter <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
We now summarize the loaded data: file sizes, line counts, and word counts.
library(stringi)
# file size in megabytes
blogsSize   <- file.info("en_US.blogs.txt")$size / 2^20
newsSize    <- file.info("en_US.news.txt")$size / 2^20
twitterSize <- file.info("en_US.twitter.txt")$size / 2^20
# word count per line
blogWords    <- stri_count_words(blogs)
newsWords    <- stri_count_words(news)
twitterWords <- stri_count_words(twitter)
# summary table
data.frame(source         = c("blogs", "news", "twitter"),
           file.size.MB   = c(blogsSize, newsSize, twitterSize),
           num.lines      = c(length(blogs), length(news), length(twitter)),
           num.words      = c(sum(blogWords), sum(newsWords), sum(twitterWords)),
           mean.num.words = c(mean(blogWords), mean(newsWords), mean(twitterWords)))
## source file.size.MB num.lines num.words mean.num.words
## 1 blogs NA 899288 37546239 41.75107
## 2 news NA 77259 2674536 34.61779
## 3 twitter NA 2360148 30093372 12.75063
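Two things stand out in this summary: the file sizes come back as NA, which suggests file.info() did not find the files at the paths used when the report was knitted, and the news line count (77,259) is far lower than the roughly one million lines the full en_US.news.txt reportedly contains. On Windows, readLines() can stop early at an embedded control character in this file. A possible workaround, assuming that is the cause, is to re-read the file in binary mode:

# re-read the news file in binary mode so an embedded control character
# does not truncate the input (a known quirk of this file on Windows)
con  <- file("en_US.news.txt", open = "rb")
news <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)
length(news)   # should now report the full line count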
To get good predictions, the data must first be cleaned. This includes removing URLs, excess white space, special characters, numbers, and so on. Since the full data set is too large to process in memory, we work with a small sample of it.
library(tm)
# sample 0.15% of each source; a small fraction keeps memory usage manageable
set.seed(11)
data.sample <- c(sample(blogs,   length(blogs)   * 0.0015),
                 sample(news,    length(news)    * 0.0015),
                 sample(twitter, length(twitter) * 0.0015))
# build a corpus from the sampled lines and clean it
corpus <- VCorpus(VectorSource(data.sample))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")   # strip URLs
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")                       # strip @mentions
# content_transformer(tolower) keeps each document a PlainTextDocument,
# so no conversion back with PlainTextDocument() is needed afterwards
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))              # drop stop words
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
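A quick way to verify the cleaning behaved as expected is to inspect one of the transformed documents, for example:

# peek at the first cleaned document
writeLines(as.character(corpus[[1]]))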
In this section, we visualize the most frequently occurring n-grams (for n = 1, 2, 3).
library(RWeka)
library(ggplot2)
options(mc.cores = 1)
# helper: term frequencies from a term-document matrix, sorted in decreasing order
getFrequency <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
# RWeka tokenizers for bigrams and trigrams
bigram  <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# helper: bar chart of the 30 most frequent terms
makePlot <- function(data, label) {
  ggplot(data[1:30, ], aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
    geom_bar(stat = "identity", fill = I("blue"))
}
# frequencies of the most common n-grams (sparse terms removed)
freq1 <- getFrequency(removeSparseTerms(TermDocumentMatrix(corpus), 0.9999))
freq2 <- getFrequency(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.9999))
freq3 <- getFrequency(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999))
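Before plotting, it can be helpful to peek at the head of each frequency table, for example:

# ten most frequent unigrams and bigrams in the sample
head(freq1, 10)
head(freq2, 10)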
Below is a bar chart of the 30 most common unigrams in the data sample.
makePlot(freq1, "30 Most Common Uni-grams")
Below is a bar chart of the 30 most common bigrams in the data sample.
makePlot(freq2, "30 Most Common Bi-grams")
Below is a bar chart of the 30 most common trigrams in the data sample.
makePlot(freq3, "30 Most Common Tri-grams")