This document contains the milestone report for week 2 of the Coursera Data Science Specialization. In it I explore the basic features of the datasets and try some text mining techniques on a small sample of each dataset to get acquainted with the basic ideas behind the project.
library(tm)
library(ggplot2)
library(SnowballC)
library(wordcloud)
library(stringi)
library(ngram)
library(gridExtra)
Here I will explore some basic characteristics of the datasets, namely the number of lines and the number of unique words in each.
#fully read all data from the files; ds.*.fh holds the path to each data file
blogs.full <- readLines(ds.blogs.fh)
twitter.full <- readLines(ds.twitter.fh)
news.full <- readLines(ds.news.fh)
#create a corpus and then a dtm of all data
blogs.corpusfull <- Corpus(VectorSource(blogs.full))
twitter.corpusfull <- Corpus(VectorSource(twitter.full))
news.corpusfull <- Corpus(VectorSource(news.full))
blogs.dtmfull <- DocumentTermMatrix(blogs.corpusfull)
twitter.dtmfull <- DocumentTermMatrix(twitter.corpusfull)
news.dtmfull <- DocumentTermMatrix(news.corpusfull)
#create a table of characteristics for presentation
df <- data.frame(Corpus = c("Blogs", "Twitter", "News"),
                 Lines = c(nrow(blogs.dtmfull), nrow(twitter.dtmfull), nrow(news.dtmfull)),
                 UniqueWords = c(ncol(blogs.dtmfull), ncol(twitter.dtmfull), ncol(news.dtmfull)))
df
##    Corpus   Lines UniqueWords
## 1   Blogs  899288      345327
## 2 Twitter 2360148      341136
## 3    News 1010242      252164
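Building a DocumentTermMatrix over the full files is memory intensive. As a lighter-weight cross-check (a sketch that is not part of the original workflow; basic_stats is a hypothetical helper), the line counts and total word counts can also be obtained directly with stringi, which is already loaded. Note that this counts every word occurrence, whereas the DTM-based table above reports unique terms.
#hypothetical cross-check: line and total word counts via stringi,
#without building a full DocumentTermMatrix
basic_stats <- function(lines) {
  c(Lines = length(lines), Words = sum(stri_count_words(lines)))
}
rbind(Blogs = basic_stats(blogs.full),
      Twitter = basic_stats(twitter.full),
      News = basic_stats(news.full))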
Below I will take a small number of lines from each dataset and clean up the data.
#Open file connections
ds.blogs.fh <- file(ds.blogs.fileref, "r")
ds.twitter.fh <- file(ds.twitter.fileref, "r")
ds.news.fh <- file(ds.news.fileref, "r")
#work with the first n lines of each file to save time while still showing the basic workflow
blogs.sample <- readLines(ds.blogs.fh, n=2000)
twitter.sample <- readLines(ds.twitter.fh, n=2000)
news.sample <- readLines(ds.news.fh, n=2000)
#Close file connections
close(ds.blogs.fh)
close(ds.twitter.fh)
close(ds.news.fh)
blogs.corpus <- Corpus(VectorSource(blogs.sample))
twitter.corpus <- Corpus(VectorSource(twitter.sample))
news.corpus <- Corpus(VectorSource(news.sample))
#apply the following transformations to the data: convert all characters to lowercase and remove numbers, stopwords, punctuation and extra whitespace;
#then stem the corpus (reduce words to their root form)
#helper that applies the full cleaning pipeline to a corpus
clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  tm_map(corpus, stemDocument)
}
blogs.corpus   <- clean_corpus(blogs.corpus)
twitter.corpus <- clean_corpus(twitter.corpus)
news.corpus    <- clean_corpus(news.corpus)
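To sanity-check the transformations, one could peek at a couple of the cleaned documents (an illustrative check that was not part of the original report):
#illustrative check: print the first two cleaned blog documents
inspect(blogs.corpus[1:2])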
The code below creates three DTMs as well as tables of uni-, bi- and trigrams for use in the plots.
#create DTMs and uni-, bi- and trigram frequency tables
blogs.dtm <- DocumentTermMatrix(blogs.corpus)
blogs.str <- concatenate(lapply(blogs.corpus, "[", 1))
blogs.1gram <- ngram(blogs.str, 1)
blogs.2gram <- ngram(blogs.str, 2)
blogs.3gram <- ngram(blogs.str, 3)
blogs.1gram.table <- get.phrasetable(blogs.1gram)
blogs.2gram.table <- get.phrasetable(blogs.2gram)
blogs.3gram.table <- get.phrasetable(blogs.3gram)
twitter.dtm <- DocumentTermMatrix(twitter.corpus)
twitter.str <- concatenate(lapply(twitter.corpus, "[", 1))
twitter.1gram <- ngram(twitter.str, 1)
twitter.2gram <- ngram(twitter.str, 2)
twitter.3gram <- ngram(twitter.str, 3)
twitter.1gram.table <- get.phrasetable(twitter.1gram)
twitter.2gram.table <- get.phrasetable(twitter.2gram)
twitter.3gram.table <- get.phrasetable(twitter.3gram)
news.dtm <- DocumentTermMatrix(news.corpus)
news.str <- concatenate(lapply(news.corpus, "[", 1))
news.1gram <- ngram(news.str, 1)
news.2gram <- ngram(news.str, 2)
news.3gram <- ngram(news.str, 3)
news.1gram.table <- get.phrasetable(news.1gram)
news.2gram.table <- get.phrasetable(news.2gram)
news.3gram.table <- get.phrasetable(news.3gram)
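For reference, get.phrasetable() returns a data frame with the columns ngrams, freq and prop; the ngrams and freq columns are what the plots below use. A quick, purely illustrative peek at the top blog trigrams:
#illustrative peek at the structure of a phrase table
head(blogs.3gram.table, 5)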
Here I gather single-word frequencies for use in the wordclouds below.
#find word frequencies
blogs.freq <- sort(colSums(as.matrix(blogs.dtm)), decreasing=TRUE)
blogs.wordfreq <- data.frame(word=names(blogs.freq), freq = blogs.freq)
twitter.freq <- sort(colSums(as.matrix(twitter.dtm)), decreasing=TRUE)
twitter.wordfreq <- data.frame(word=names(twitter.freq), freq = twitter.freq)
news.freq <- sort(colSums(as.matrix(news.dtm)), decreasing=TRUE)
news.wordfreq <- data.frame(word=names(news.freq), freq = news.freq)
The following section shows a wordcloud of the top 100 words in the sample corpus, as well as two bar charts showing the top 20 bi- and trigrams. Only the code for the Blogs dataset is shown; the code for the Twitter and News datasets has been omitted.
#wordcloud of the 100 most frequently used words + two barplots showing the 20 most used bigrams and trigrams
blogs.top20words <- head(blogs.wordfreq, 20)
blogs.top100words <- head(blogs.wordfreq, 100)
p1 <- ggplot(blogs.2gram.table[1:20, ], aes(reorder(ngrams, freq), freq, fill = freq)) +
  geom_bar(stat = "identity") + ggtitle("Top 20 Bigrams") + xlab("Bigrams") + ylab("count") +
  coord_flip() + theme(legend.position = "none")
p2 <- ggplot(blogs.3gram.table[1:20, ], aes(reorder(ngrams, freq), freq, fill = freq)) +
  geom_bar(stat = "identity") + ggtitle("Top 20 Trigrams") + xlab("Trigrams") + ylab("count") +
  coord_flip() + theme(legend.position = "none")
wordcloud(blogs.top100words$word, blogs.top100words$freq, min.freq=3, colors=brewer.pal(5, "Dark2"), random.order = FALSE, scale= c(3,1))
grid.arrange(p1, p2, ncol=2)
This concludes a first exploration of the datasets. The next steps will be to build predictive models that predict the next word based on user input, and eventually to develop a data product (a Shiny app). Note: some further cleaning of the data will be necessary, e.g. removing certain non-Latin characters as well as meaningless n-grams (such as the ‘u u u’ trigram in the news dataset).
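As an illustration of the direction the predictive model could take (a minimal sketch under the assumptions stated in the comments, not the final approach; the helper predict_next_word is hypothetical), the trigram table built above can already serve as a crude next-word lookup: given the last two words typed, return the most frequent trigram that starts with them.
#minimal sketch of a trigram-based next-word lookup (illustrative only,
#not the final model); assumes a phrase table as returned by
#get.phrasetable(), with an 'ngrams' column of space-separated trigrams
predict_next_word <- function(last_two_words, trigram.table) {
  prefix <- paste0(last_two_words, " ")
  matches <- trigram.table[startsWith(trigram.table$ngrams, prefix), ]
  if (nrow(matches) == 0) return(NA_character_)
  matches <- matches[order(-matches$freq), ]  #most frequent match first
  tail(strsplit(trimws(matches$ngrams[1]), " ")[[1]], 1)
}
#example (the result depends on the sample; the corpus was stemmed and
#stopword-filtered, so the input should match that form)
predict_next_word("look forward", blogs.3gram.table)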