Load the libraries that will be used for this analysis
library(tm)
library(wordcloud)
The dataset was downloaded as a zipped file from the Coursera site and extracted to the local disk. Build the path to the files and list them.
fp <- file.path(".", "data", "capstone", "final", "en_US", fsep = "/")
fn <- list.files(fp)
Read the files into memory and review the data structure
ustweets <- readLines(file.path(fp,"en_US.twitter.txt", fsep = "/"))
usnews <- readLines(file.path(fp, "en_US.news.txt", fsep = "/"))
usblogs <- readLines(file.path(fp, "en_US.blogs.txt", fsep = "/"))
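Note that the news file can trigger readLines() warnings about embedded nulls or an incomplete final line, which may silently truncate the data. A more defensive read is sketched below; it is optional and assumes the same file path as above.
# Optional sketch: read the news file defensively, skipping embedded nulls
# and declaring the encoding (both arguments are supported by base readLines)
usnews <- readLines(file.path(fp, "en_US.news.txt", fsep = "/"),
                    encoding = "UTF-8", skipNul = TRUE)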
Review file sizes
file.info(file.path(fp, fn, fsep = "/"))[1]
## size
## ./data/capstone/final/en_US/en_US.blogs.txt 210160014
## ./data/capstone/final/en_US/en_US.news.txt 205811889
## ./data/capstone/final/en_US/en_US.twitter.txt 167105338
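The raw byte counts are hard to read at a glance; a quick conversion to megabytes (a small sketch, not part of the original output) makes the comparison clearer.
# Sketch: express the file sizes in MB, rounded to one decimal place
round(file.info(file.path(fp, fn, fsep = "/"))$size / 1024^2, 1)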
After reviewing the str() of each file, check the number of lines and the maximum line length in each file
print(paste("tweets - no of lines :", length(ustweets), "; max line length :", max(nchar(ustweets)) ))
## [1] "tweets - no of lines : 2360148 ; max line length : 140"
print(paste("news - no of lines :", length(usnews), "; max line length :", max(nchar(usnews)) ))
## [1] "news - no of lines : 1010242 ; max line length : 11384"
print(paste("blogs - no of lines :", length(usblogs), " ; max line length :", max(nchar(usblogs)) ))
## [1] "blogs - no of lines : 899288 ; max line length : 40833"
Clearly, there’s a lot of data. We’ll restrict our working set to a sample of 10K lines in total: 5,000 tweets, 2,500 news lines and 2,500 blog lines
ustweets <- sample(ustweets, 5000)
usnews <- sample(usnews, 2500)
usblogs <- sample(usblogs, 2500)
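Because sample() draws at random, the exact frequencies reported below will vary from run to run. Setting a seed before the sample() calls would make the report reproducible; the value below is purely illustrative and was not used for the output shown.
# Illustrative only: set a seed before the sample() calls for reproducibility
set.seed(2024)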
Create a corpus from each sample.
cps.tweets <- Corpus(VectorSource(ustweets))
cps.news <- Corpus(VectorSource(usnews))
cps.blogs <- Corpus(VectorSource(usblogs))
Remove the raw character vectors, which are no longer needed, to free memory
rm(ustweets, usnews, usblogs)
Transform the corpora by applying cleaning options (lowercasing, removal of punctuation, numbers and stop words, whitespace stripping) while building the term-document matrices
term.doc.control <- list(removePunctuation = TRUE,
                         stopwords = stopwords("en"),
                         removeNumbers = TRUE,
                         tolower = TRUE,
                         stripWhitespace = TRUE)
term.doc.tweets <- as.matrix(TermDocumentMatrix(cps.tweets, control = term.doc.control))
term.doc.news <- as.matrix(TermDocumentMatrix(cps.news, control = term.doc.control))
term.doc.blogs <- as.matrix(TermDocumentMatrix(cps.blogs, control = term.doc.control))
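Calling as.matrix() on a TermDocumentMatrix expands it into a dense matrix, which is only practical because these samples are small. For larger samples, the term frequencies can be computed on the sparse representation directly; the sketch below assumes the slam package (a dependency of tm) is available.
# Sketch: term frequencies without densifying the matrix, via slam's sparse row sums
library(slam)
tdm.tweets <- TermDocumentMatrix(cps.tweets, control = term.doc.control)
freq.tweets.sparse <- sort(row_sums(tdm.tweets), decreasing = TRUE)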
Review the most used words from the Tweets sample
freq.tweets <- sort(rowSums(term.doc.tweets), decreasing=TRUE)
df.tweets <- data.frame(word=names(freq.tweets), freq=freq.tweets)
head(df.tweets,10)
## word freq
## just just 316
## can can 272
## get get 261
## like like 259
## will will 237
## love love 234
## good good 215
## thanks thanks 198
## know know 193
## now now 188
Review the most used words from the News sample
freq.news <- sort(rowSums(term.doc.news), decreasing=TRUE)
df.news <- data.frame(word=names(freq.news), freq=freq.news)
head(df.news,10)
## word freq
## said said 611
## will will 271
## year year 219
## one one 214
## can can 182
## new new 180
## two two 156
## first first 153
## also also 138
## last last 135
Review the most used words from the Blogs sample
freq.blogs <- sort(rowSums(term.doc.blogs), decreasing=TRUE)
df.blogs <- data.frame(word=names(freq.blogs), freq=freq.blogs)
head(df.blogs,10)
## word freq
## one one 321
## will will 309
## can can 298
## just just 284
## like like 281
## time time 232
## get get 199
## know know 198
## now now 180
## people people 171
Compare the datasets visually by drawing a word cloud for each sample side by side
par(mfrow=c(1,3))
wordcloud(df.tweets$word, df.tweets$freq, random.order=FALSE)
text(x = 0.5, y = 0, "Tweets")
wordcloud(df.news$word, df.news$freq, random.order=FALSE)
text(x = 0.5, y = 0, "News")
wordcloud(df.blogs$word, df.blogs$freq, random.order=FALSE)
text(x = 0.5, y = 0, "Blogs")
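With several thousand distinct terms per sample the clouds can get crowded; capping the number of words drawn keeps them legible. The sketch below reuses the same data frames and simply adds the max.words and scale arguments that wordcloud() provides.
# Sketch: limit a cloud to 100 terms and shrink the text scale to fit the panel
wordcloud(df.tweets$word, df.tweets$freq, max.words = 100,
          scale = c(3, 0.5), random.order = FALSE)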
It’s interesting to note the differences in the most frequently used words of each of the three datasets, which are perhaps in line with what one might expect from the tone and language of the respective sources.
To build toward the predictive model, I’ll consider:

* reviewing more preprocessing options to make the model simpler and cleaner
* improving performance, perhaps by scaling down the sample dataset further
* combining the three samples into one corpus
* reviewing n-grams for sentence prediction (see the bigram sketch below)
* building a predictive model
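As a first step toward the n-gram work, the sketch below counts bigrams in a character vector using only base R; the helper name and the toy input are illustrative and not part of the analysis above.
# Sketch: count bigrams within each line using base R only
bigram_counts <- function(lines) {
  per.line <- lapply(strsplit(tolower(lines), "[^a-z']+"), function(w) {
    w <- w[nzchar(w)]                        # drop empty tokens left by the split
    if (length(w) < 2) return(character(0))
    paste(head(w, -1), tail(w, -1))          # adjacent word pairs within the line
  })
  sort(table(unlist(per.line)), decreasing = TRUE)
}
head(bigram_counts(c("the quick brown fox", "a quick brown dog")), 3)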