The goal of this milestone report is to demonstrate that we have become familiar with the data, marking the first step towards building a prediction algorithm. We explore the English-language text data, which consists of lines collected from blogs, news articles and Twitter.
We start by downloading and importing the data.
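If the files are not already on disk, they can be fetched first. The sketch below assumes the standard Coursera SwiftKey download URL and archive layout; adjust both if your copy lives elsewhere.
## Download and unzip the corpus if it is not already on disk
## (URL and folder layout are assumed to match the Coursera SwiftKey dataset)
if (!dir.exists("Coursera-SwiftKey")) {
  zip.url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  download.file(zip.url, destfile="Coursera-SwiftKey.zip", mode="wb")
  unzip("Coursera-SwiftKey.zip", exdir="Coursera-SwiftKey")
}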
file.blog <- "Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
file.news <- "Coursera-SwiftKey/final/en_US/en_US.news.txt"
file.twitter <- "Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
data.blog <- readLines(file.blog, warn=FALSE, encoding="UTF-8")
data.news <- readLines(file.news, warn=FALSE, encoding="UTF-8")
data.twitter <- readLines(file.twitter, warn=FALSE, encoding="UTF-8")
To better understand the data, we summarise some basic statistics for all three text sets: file size, number of lines, and the maximum and average line length in characters.
size.blog <- file.size(file.blog)
size.news <- file.size(file.news)
size.twitter <- file.size(file.twitter)
filesize <- data.frame(TYPE=c("Blog", "News", "Twitter"), SIZE=c(size.blog, size.news, size.twitter))
barplot(filesize$SIZE/1024^2, ylab="File Size (MB)", names.arg=c("Blog", "News", "Twitter"))
lines.blog <- length(data.blog)
lines.news <- length(data.news)
lines.twitter <- length(data.twitter)
numberoflines <- data.frame(TYPE=c("Blog", "News", "Twitter"), LINES=c(lines.blog, lines.news, lines.twitter))
barplot(numberoflines$LINES, ylab="Number of Lines", names.arg=c("Blog", "News", "Twitter"))
maxchar.blog <- max(nchar(data.blog))
maxchar.news <- max(nchar(data.news))
maxchar.twitter <- max(nchar(data.twitter))
maxchar <- data.frame(TYPE=c("Blog", "News", "Twitter"), MAX_CHAR=c(maxchar.blog, maxchar.news, maxchar.twitter))
barplot(maxchar$MAX_CHAR, ylab="Max Number of Characters per Line", names.arg=c("Blog", "News", "Twitter"))
avgchar.blog <- mean(nchar(data.blog))
avgchar.news <- mean(nchar(data.news))
avgchar.twitter <- mean(nchar(data.twitter))
avgchar <- data.frame(TYPE=c("Blog", "News", "Twitter"), AVG_CHAR=c(avgchar.blog, avgchar.news, avgchar.twitter))
barplot(avgchar$AVG_CHAR, ylab="Average Number of Characters per Line", names.arg=c("Blog", "News", "Twitter"))
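The statistics computed above can also be collected into a single table for side-by-side comparison. The summarystats name below is just a convenience; sizes are converted to megabytes.
## Combine the statistics above into one summary table (sizes in MB)
summarystats <- data.frame(TYPE=c("Blog", "News", "Twitter"),
                           SIZE_MB=round(c(size.blog, size.news, size.twitter)/1024^2, 1),
                           LINES=c(lines.blog, lines.news, lines.twitter),
                           MAX_CHAR=c(maxchar.blog, maxchar.news, maxchar.twitter),
                           AVG_CHAR=round(c(avgchar.blog, avgchar.news, avgchar.twitter), 1))
summarystats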
Given that the dataset is fairly large, we do not need the entire dataset to build our algorithms. Relatively few randomly selected lines often yield an accurate approximation of the results that would be obtained from all the data, so we work with a random sample of 1% of the lines in each file. In addition, the data may contain non-ASCII characters, which we remove. The word clouds below then show the most frequently used words in each data set.
## Subsetting sample data
set.seed(1234)
sample.blog <- sample(data.blog, 0.01*length(data.blog), replace=FALSE)
sample.news <- sample(data.news, 0.01*length(data.news), replace=FALSE)
sample.twitter <- sample(data.twitter, 0.01*length(data.twitter), replace=FALSE)
## Removing non-ASCII characters
sample.blog <- iconv(sample.blog, "UTF-8", "ASCII", sub="")
sample.news <- iconv(sample.news, "UTF-8", "ASCII", sub="")
sample.twitter <- iconv(sample.twitter, "UTF-8", "ASCII", sub="")
## Wordcloud
library(wordcloud)
library(RColorBrewer)
library(tm)
wordcloud(sample.blog, max.words= 50, colors=brewer.pal(8, "Dark2"))
wordcloud(sample.news, max.words= 50, colors=brewer.pal(8, "Dark2"))
wordcloud(sample.twitter, max.words= 50, colors=brewer.pal(8, "Dark2"))
Common data-cleaning tasks in text mining include converting text to lower case and removing punctuation marks, numbers, stop words and extra white space. We use these as guidelines for cleaning the data set.
## Load data into a Corpus (collection of documents)
sample.data <- c(sample.blog, sample.news, sample.twitter)
corpus.data <- VCorpus(VectorSource(sample.data))
## Converting to lower case, then removing stop words, punctuation marks, numbers and extra white space
## (lower-casing and stop-word removal come first so capitalised stop words and contractions are caught)
corpus.data <- tm_map(corpus.data, content_transformer(tolower))
corpus.data <- tm_map(corpus.data, removeWords, stopwords("english"))
corpus.data <- tm_map(corpus.data, removePunctuation)
corpus.data <- tm_map(corpus.data, removeNumbers)
corpus.data <- tm_map(corpus.data, stripWhitespace)
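A quick spot check confirms that the transformations behaved as expected; for example, we can print a few cleaned documents.
## Inspect the first few cleaned documents
lapply(corpus.data[1:3], as.character)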
An n-gram model predicts the next word from the preceding one or two words, so we look at single words (unigrams), word pairs (bigrams) and word triples (trigrams).
## Tokenization - splitting the cleaned text into 1-, 2- and 3-word tokens (n-grams)
one.tokenizer <- function(x) unlist(lapply(ngrams(words(x), 1), paste, collapse=" "), use.names=FALSE)
two.tokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse=" "), use.names=FALSE)
tre.tokenizer <- function(x) unlist(lapply(ngrams(words(x), 3), paste, collapse=" "), use.names=FALSE)
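As a quick sanity check, a tokenizer can be applied directly to one document of the corpus built above. This step is optional and only illustrates the output format.
## Example: the first few bigrams from the first cleaned document
head(two.tokenizer(corpus.data[[1]]), 5)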
## Building term-document matrices of n-gram counts
one.matrix <- TermDocumentMatrix(corpus.data, control=list(tokenize=one.tokenizer))
two.matrix <- TermDocumentMatrix(corpus.data, control=list(tokenize=two.tokenizer))
tre.matrix <- TermDocumentMatrix(corpus.data, control=list(tokenize=tre.tokenizer))
## Find frequent terms in the term document matrix
one.freq <- findFreqTerms(one.matrix, lowfreq=20)
two.freq <- findFreqTerms(two.matrix, lowfreq=20)
tre.freq <- findFreqTerms(tre.matrix, lowfreq=20)
## Counts of how often the frequent terms appear
one.freq <- rowSums(as.matrix(one.matrix[one.freq,]))
two.freq <- rowSums(as.matrix(two.matrix[two.freq,]))
tre.freq <- rowSums(as.matrix(tre.matrix[tre.freq,]))
## Ranking of words based on frequency
one.freq <- sort(one.freq, decreasing=TRUE)
two.freq <- sort(two.freq, decreasing=TRUE)
tre.freq <- sort(tre.freq, decreasing=TRUE)
## Some interesting charts
barplot(one.freq[1:30], las=2, main="Top 30 Most Frequent Unigrams")
barplot(two.freq[1:30], las=2, main="Top 30 Most Frequent Bigrams")
barplot(tre.freq[1:30], las=2, main="Top 30 Most Frequent Trigrams")
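To illustrate how these frequency tables feed into the prediction step described earlier, here is a minimal sketch based on the two.freq table built above. The predict.next helper is hypothetical and not the final algorithm; a fuller model could consult the trigram table first and fall back to bigrams and unigrams when no match is found.
## Minimal sketch: predict the next word as the most frequent bigram completion
## (two.freq is already sorted by decreasing frequency, so the first match wins)
predict.next <- function(word, bigram.freq=two.freq) {
  word <- tolower(word)
  ## Keep only bigrams that start with the given word
  candidates <- bigram.freq[grepl(paste0("^", word, " "), names(bigram.freq))]
  if (length(candidates) == 0) return(NA_character_)
  ## Return the second token of the most frequent matching bigram
  strsplit(names(candidates)[1], " ")[[1]][2]
}
predict.next("last")  ## returns the most frequent follower of "last" in this sample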