The aim of this project is to develop a better understanding of the text data set through a number of basic statistical properties that will later inform the prediction model behind the final data product. The data set is provided in the course as a zip file; this analysis uses the en_US data, which consists of three files: blogs, news, and twitter. The general procedures used in this project are:
1. Loading the Required Packages
2. Getting the data
3. Basic Statistics of the Data
4. Data Cleaning
5. Exploratory Data Analysis
6. Next Steps
library(tm); library(knitr); library(dplyr); library(ggplot2); library(stringi)
library(SnowballC); library(rJava); library(RWeka); library(RWekajars)
library(NLP); library(openNLP); library(wordcloud); library(RColorBrewer)
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
DataFile <- "Data/Coursera-SwiftKey.zip"
if (!file.exists('Data')) {
dir.create('Data')
}
if (!file.exists("Data/final/en_US")) {
tempFile <- tempfile()
download.file(url, tempFile)
unzip(tempFile, exdir = "Data")
unlink(tempFile)
}
# blogs
blogsFile <- "Data/final/en_US/en_US.blogs.txt"
con <- file(blogsFile, open = "r")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
# news
newsFile<- "Data/final/en_US/en_US.news.txt"
con <- file(newsFile, open = "r")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
# twitter
twitterFile <- "Data/final/en_US/en_US.twitter.txt"
con <- file(twitterFile, open = "r")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
rm(con) # remove the connection object
The following table summarizes basic statistics for the three text corpora: file size (MB), in-memory object size (MB), line count, word count (millions), and character count (millions).
create.summary.table <-function(blogs, news, twitter){
stats<- data.frame(source = c("blogs", "news", "twitter"),
File.size_MB = c(file.info("Data/final/en_US/en_US.blogs.txt")$size/1024^2,
file.info("Data/final/en_US/en_US.news.txt")$size/1024^2,
file.info("Data/final/en_US/en_US.twitter.txt")$size/1024^2),
Obj.size_MB = c(object.size(blogs)/1024^2,
object.size(news)/1024^2,
object.size(twitter)/1024^2),
Line.count = c(length(blogs), length(news), length(twitter)),
Word.length_Mil = c(sum(stri_count_words(blogs))/10^6,
sum(stri_count_words(news))/10^6,
sum(stri_count_words(twitter))/10^6),
Chars.count_Mil = c(stri_stats_general(blogs)[3]/10^6,
stri_stats_general(news)[3]/10^6,
stri_stats_general(twitter)[3]/10^6)
)
print(stats)
}
create.summary.table(blogs, news, twitter)
source File.size_MB Obj.size_MB Line.count Word.length_Mil Chars.count_Mil
1 blogs 200.4242 255.35453 899288 37.546239 206.82438
2 news 196.2775 19.76917 77259 2.674536 15.63941
3 twitter 159.3641 318.98975 2360148 30.093413 162.09624
As the table shows, blogs has the largest file size, followed by news and then twitter. Blogs also average the most words per line, followed by news, while twitter has the fewest words per line. Twitter has by far the largest line count, followed by blogs and then news.
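The words-per-line comparison can be verified directly from the objects already in memory (a minimal sketch, assuming blogs, news, and twitter are still loaded and stringi is attached):
# Average words per line for each source
round(c(blogs   = sum(stri_count_words(blogs))   / length(blogs),
        news    = sum(stri_count_words(news))    / length(news),
        twitter = sum(stri_count_words(twitter)) / length(twitter)), 1)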
Because all three text files are relatively large, as the table above shows, a 1% random sample of each file is used for this analysis.
set.seed(1234567)
sample.data <- c(sample(blogs, round(0.01 * length(blogs))),
                 sample(news, round(0.01 * length(news))),
                 sample(twitter, round(0.01 * length(twitter))))
Before performing the exploratory analysis, the data must be cleaned. This involves replacing non-printable characters, removing English stopwords, punctuation, numbers, and extra whitespace, and converting all text to lower case.
corpus <- VCorpus(VectorSource(sample.data))
toSpace <- content_transformer(function(x, pattern){
  return(gsub(pattern, " ", x))
})
# Replace all non-printable characters with spaces
corpus <- tm_map(corpus, toSpace, "[^[:graph:]]")
# Transform all text to lower case
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove all English stopwords and any stray single letters
corpus <- tm_map(corpus, removeWords, c(stopwords("english"), letters))
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
# Collapse extra whitespace
corpus <- tm_map(corpus, stripWhitespace)
# Convert documents to plain text documents
corpus <- tm_map(corpus, PlainTextDocument)
Several techniques are used to explore the properties of the data set: tokenization into unigrams, bigrams, and trigrams; identification of the most frequent terms; and visualization with word clouds and frequency plots built with ggplot2.
Create a unigram TDM
Tokenizer1 <- function(x) {
  NGramTokenizer(x, Weka_control(min = 1, max = 1))
}
unigramMatrix <- TermDocumentMatrix(corpus,
                                    control = list(tokenize = Tokenizer1,
                                                   wordLengths = c(1, Inf)))
unigramMatrixFreq<-sort(rowSums(as.matrix(removeSparseTerms(unigramMatrix, 0.9999))),
decreasing = TRUE) # sparse should be less than 1
unigramMatrixFreq <- data.frame(Word = names(unigramMatrixFreq),
Freq = unigramMatrixFreq)
unigramMatrixFreq[1:5, ]
## Word Freq
## just just 2535
## like like 2292
## one one 2138
## will will 2050
## can can 1979
Create a bigram TDM
Tokenizer2 <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
bigramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = Tokenizer2))
bigramMatrixFreq<-sort(rowSums(as.matrix(removeSparseTerms(bigramMatrix, 0.9999))),
decreasing = TRUE)
bigramMatrixFreq <- data.frame(Word = names(bigramMatrixFreq),
Freq = bigramMatrixFreq)
bigramMatrixFreq[1:5, ]
## Word Freq
## right now right now 199
## last night last night 134
## looks like looks like 115
## feel like feel like 113
## looking forward looking forward 96
Create a trigram TDM
Tokenizer3 <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}
trigramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = Tokenizer3))
trigramMatrixFreq<-sort(rowSums(as.matrix(removeSparseTerms(trigramMatrix, 0.9999))),
decreasing = TRUE)
trigramMatrixFreq <- data.frame(Word = names(trigramMatrixFreq),
Freq = trigramMatrixFreq)
trigramMatrixFreq[1:5,]
## Word Freq
## let us know let us know 32
## happy new year happy new year 18
## happy mothers day happy mothers day 16
## new york city new york city 14
## cinco de mayo cinco de mayo 13
Unigram word cloud
wordcloud(words = unigramMatrixFreq$Word,
freq = unigramMatrixFreq$Freq,
min.freq = 1,
max.words = 20,
random.order = FALSE,
rot.per = 0.35,
colors=brewer.pal(8, "Dark2"))
Bigram word cloud
wordcloud(words = bigramMatrixFreq$Word,
freq = bigramMatrixFreq$Freq,
min.freq = 1,
max.words = 20,
random.order = FALSE,
rot.per = 0.35,
colors=brewer.pal(8, "Dark2"))
Trigram word cloud
wordcloud(words = trigramMatrixFreq$Word,
freq = trigramMatrixFreq$Freq,
min.freq = 1,
max.words = 20,
random.order = FALSE,
rot.per = 0.35,
colors=brewer.pal(8, "Dark2"))
Unigram plot
g <- ggplot(unigramMatrixFreq[1:20,], aes(x = reorder(Word, -Freq), y = Freq))
g <- g + geom_bar(stat = "identity", fill = I("lightblue"))
g <- g + xlab("Unigrams")+ ylab("Frequency")+
ggtitle("20 Most Common Unigrams")
unigramPlot <- g + theme(plot.title = element_text(size = 14, hjust = 0.5, vjust = 0.5),
axis.text.x = element_text(hjust = 1, angle = 45, size = 12),
axis.text.y = element_text(hjust = 0.5, vjust = 0.5))
unigramPlot
Bigram plot
g <- ggplot(bigramMatrixFreq[1:20,], aes(x = reorder(Word, -Freq), y = Freq))
g <- g + geom_bar(stat = "identity", fill = I("blue"))
g <- g + xlab("Bigrams")+ ylab("Frequency")+
ggtitle("20 Most Common Bigrams")
bigramPlot <- g + theme(plot.title = element_text(size = 14, hjust = 0.5, vjust = 0.5),
axis.text.x = element_text(hjust = 1, angle = 45, size = 12),
axis.text.y = element_text(hjust = 0.5, vjust = 0.5))
bigramPlot
Trigram plot
g <- ggplot(trigramMatrixFreq[1:20,], aes(x = reorder(Word, -Freq), y = Freq))
g <- g + geom_bar(stat = "identity", fill = I("purple"))
g <- g + xlab("Trigrams")+ ylab("Frequency")+
ggtitle("20 Most Common Trigrams")
trigramPlot <- g + theme(plot.title = element_text(size = 14, hjust = 0.5, vjust = 0.5),
axis.text.x = element_text(hjust = 1, angle = 45, size = 12),
axis.text.y = element_text(hjust = 0.5, vjust = 0.5))
trigramPlot
The word clouds and plots above show that "just" is the most frequent unigram, followed by "like". The most frequent bigram is "right now", followed by "last night", and the most frequent trigram is "let us know", followed by "happy new year".
The next step is to build a predictive algorithm and deploy it as a Shiny app. The algorithm will be based on an n-gram model with a word frequency lookup: given the last word or words typed, the app will suggest the words that most frequently follow them in the observed n-grams.
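As a rough illustration of the planned frequency-lookup approach, the bigram table built above can already be used to suggest the next word after a single input word. This is a minimal sketch, not the final algorithm, and the helper name predict.next.word is hypothetical:
# Minimal sketch: predict the next word from the bigram frequency table.
# Assumes bigramMatrixFreq (columns Word, Freq) from the analysis above.
predict.next.word <- function(word, bigram.freq, n = 3) {
  # keep bigrams whose first token matches the input word
  matches <- bigram.freq[grepl(paste0("^", tolower(word), " "),
                               bigram.freq$Word), ]
  if (nrow(matches) == 0) return(character(0))
  # return the second token of the n most frequent matching bigrams
  candidates <- head(matches[order(-matches$Freq), "Word"], n)
  sapply(strsplit(as.character(candidates), " "), `[`, 2)
}
predict.next.word("right", bigramMatrixFreq)  # "now" should rank highly given the table above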