This document contains the milestone report for week 2 of the Coursera Data Science Specialization. In it I explore the basic features of the datasets and try some text mining techniques on a small sample of each dataset to get acquainted with the basic ideas behind the project.
library(tm)
library(ggplot2)
library(SnowballC)
library(wordcloud)
library(stringi)
library(ngram)
library(gridExtra)
Here I will explore some basic characteristics of the datasets, namely the number of lines and the number of unique words in each.
#fully read all data from the files; ds.*.fh holds the path to each data file
blogs.full <- readLines(ds.blogs.fh)
twitter.full <- readLines(ds.twitter.fh)
news.full <- readLines(ds.news.fh)
#create a corpus and then a dtm of all data
blogs.corpusfull <- Corpus(VectorSource(blogs.full))
twitter.corpusfull <- Corpus(VectorSource(twitter.full))
news.corpusfull <- Corpus(VectorSource(news.full))
blogs.dtmfull <- DocumentTermMatrix(blogs.corpusfull)
twitter.dtmfull <- DocumentTermMatrix(twitter.corpusfull)
news.dtmfull <- DocumentTermMatrix(news.corpusfull)
#create a table of characteristics for presentation
df <- data.frame(Corpus = c("Blogs", "Twitter", "News"),
                 Lines = c(nrow(blogs.dtmfull), nrow(twitter.dtmfull), nrow(news.dtmfull)),
                 UniqueWords = c(ncol(blogs.dtmfull), ncol(twitter.dtmfull), ncol(news.dtmfull)))
df
##    Corpus   Lines UniqueWords
## 1   Blogs  899288      345327
## 2 Twitter 2360148      341136
## 3    News 1010242      252164
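Building a DocumentTermMatrix over the full files is memory intensive. As a lighter-weight cross-check (a sketch that is not part of the original workflow; basic_stats is a hypothetical helper), the line counts and total word counts can also be obtained directly with stringi, which is already loaded. Note that this counts every word occurrence, whereas the DTM-based table above reports unique terms.
#hypothetical cross-check: line and total word counts via stringi,
#without building a full DocumentTermMatrix
basic_stats <- function(lines) {
  c(Lines = length(lines), Words = sum(stri_count_words(lines)))
}
rbind(Blogs = basic_stats(blogs.full),
      Twitter = basic_stats(twitter.full),
      News = basic_stats(news.full))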
Below I will take a small number of lines from each dataset and clean up the data.
#Open file connections
ds.blogs.fh <- file(ds.blogs.fileref, "r")
ds.twitter.fh <- file(ds.twitter.fileref, "r")
ds.news.fh <- file(ds.news.fileref, "r")
#work with the first n lines of each file to save time while still showing the basic workflow
blogs.sample <- readLines(ds.blogs.fh, n=2000)
twitter.sample <- readLines(ds.twitter.fh, n=2000)
news.sample <- readLines(ds.news.fh, n=2000)
#Close file connections
close(ds.blogs.fh)
close(ds.twitter.fh)
close(ds.news.fh)
blogs.corpus <- Corpus(VectorSource(blogs.sample))
twitter.corpus <- Corpus(VectorSource(twitter.sample))
news.corpus <- Corpus(VectorSource(news.sample))
#apply the following transformations to the data: convert all characters to lowercase and remove numbers, stopwords, punctuation and extra whitespace;
#then stem the corpus (reduce words to their root form)
#helper that applies the full cleaning pipeline to a corpus
clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  tm_map(corpus, stemDocument)
}
blogs.corpus   <- clean_corpus(blogs.corpus)
twitter.corpus <- clean_corpus(twitter.corpus)
news.corpus    <- clean_corpus(news.corpus)
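To sanity-check the transformations, one could peek at a couple of the cleaned documents (an illustrative check that was not part of the original report):
#illustrative check: print the first two cleaned blog documents
inspect(blogs.corpus[1:2])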
The code below creates three DTMs as well as tables of uni-, bi- and trigrams for use in the plots.
#create DTMs and uni-, bi- and trigram frequency tables
blogs.dtm <- DocumentTermMatrix(blogs.corpus)
blogs.str <- concatenate(lapply(blogs.corpus, "[", 1))
blogs.1gram <- ngram(blogs.str, 1)
blogs.2gram <- ngram(blogs.str, 2)
blogs.3gram <- ngram(blogs.str, 3)
blogs.1gram.table <- get.phrasetable(blogs.1gram)
blogs.2gram.table <- get.phrasetable(blogs.2gram)
blogs.3gram.table <- get.phrasetable(blogs.3gram)
twitter.dtm <- DocumentTermMatrix(twitter.corpus)
twitter.str <- concatenate(lapply(twitter.corpus, "[", 1))
twitter.1gram <- ngram(twitter.str, 1)
twitter.2gram <- ngram(twitter.str, 2)
twitter.3gram <- ngram(twitter.str, 3)
twitter.1gram.table <- get.phrasetable(twitter.1gram)
twitter.2gram.table <- get.phrasetable(twitter.2gram)
twitter.3gram.table <- get.phrasetable(twitter.3gram)
news.dtm <- DocumentTermMatrix(news.corpus)
news.str <- concatenate(lapply(news.corpus, "[", 1))
news.1gram <- ngram(news.str, 1)
news.2gram <- ngram(news.str, 2)
news.3gram <- ngram(news.str, 3)
news.1gram.table <- get.phrasetable(news.1gram)
news.2gram.table <- get.phrasetable(news.2gram)
news.3gram.table <- get.phrasetable(news.3gram)
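For reference, get.phrasetable() returns a data frame with the columns ngrams, freq and prop; the ngrams and freq columns are what the plots below use. A quick, purely illustrative peek at the top blog trigrams:
#illustrative peek at the structure of a phrase table
head(blogs.3gram.table, 5)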
Here I gather single-word frequencies for use in the wordclouds below.
#find word frequencies
blogs.freq <- sort(colSums(as.matrix(blogs.dtm)), decreasing=TRUE)
blogs.wordfreq <- data.frame(word=names(blogs.freq), freq = blogs.freq)
twitter.freq <- sort(colSums(as.matrix(twitter.dtm)), decreasing=TRUE)
twitter.wordfreq <- data.frame(word=names(twitter.freq), freq = twitter.freq)
news.freq <- sort(colSums(as.matrix(news.dtm)), decreasing=TRUE)
news.wordfreq <- data.frame(word=names(news.freq), freq = news.freq)
The following section shows a wordcloud of the top 100 words in the sample corpus, as well as two bar charts showing the top 20 bi- and trigrams. Only the code for the Blogs dataset is shown; the code for the Twitter and News datasets has been omitted.
#wordcloud of the 100 most frequently used words + two barplots showing the 20 most used bigrams and trigrams
blogs.top20words <- head(blogs.wordfreq, 20)
blogs.top100words <- head(blogs.wordfreq, 100)
p1 <- ggplot(blogs.2gram.table[1:20, ], aes(reorder(ngrams, freq), freq, fill = freq)) +
  geom_bar(stat = "identity") + ggtitle("Top 20 Bigrams") + xlab("Bigrams") + ylab("count") +
  coord_flip() + theme(legend.position = "none")
p2 <- ggplot(blogs.3gram.table[1:20, ], aes(reorder(ngrams, freq), freq, fill = freq)) +
  geom_bar(stat = "identity") + ggtitle("Top 20 Trigrams") + xlab("Trigrams") + ylab("count") +
  coord_flip() + theme(legend.position = "none")
wordcloud(blogs.top100words$word, blogs.top100words$freq, min.freq=3, colors=brewer.pal(5, "Dark2"), random.order = FALSE, scale= c(3,1))
grid.arrange(p1, p2, ncol=2)
This concludes a first exploration of the datasets. The next steps will be to build predictive models that predict the next word based on user input, and eventually to develop a data product (a Shiny app). Note: some further cleaning of the data will be necessary, e.g. removing certain non-Latin characters as well as meaningless n-grams (such as the ‘u u u’ trigram in the news dataset).
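As an illustration of the direction the predictive model could take (a minimal sketch under the assumptions stated in the comments, not the final approach; the helper predict_next_word is hypothetical), the trigram table built above can already serve as a crude next-word lookup: given the last two words typed, return the most frequent trigram that starts with them.
#minimal sketch of a trigram-based next-word lookup (illustrative only,
#not the final model); assumes a phrase table as returned by
#get.phrasetable(), with an 'ngrams' column of space-separated trigrams
predict_next_word <- function(last_two_words, trigram.table) {
  prefix <- paste0(last_two_words, " ")
  matches <- trigram.table[startsWith(trigram.table$ngrams, prefix), ]
  if (nrow(matches) == 0) return(NA_character_)
  matches <- matches[order(-matches$freq), ]  #most frequent match first
  tail(strsplit(trimws(matches$ngrams[1]), " ")[[1]], 1)
}
#example (the result depends on the sample; the corpus was stemmed and
#stopword-filtered, so the input should match that form)
predict_next_word("look forward", blogs.3gram.table)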