Introduction

This is the milestone report for the Coursera Data Science Capstone project. It describes the major features of the training data through exploratory data analysis and summarizes our plans for building the predictive model.

Loading data sets

The data was downloaded from this link: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
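
If the extracted files are not already present locally, the archive can be downloaded and unpacked directly from R. A minimal sketch (the destination file name Coursera-SwiftKey.zip is our own choice, not part of the original report):

# Download and unpack the corpus only if the extracted files are missing
if (!dir.exists("final/en_US")) {
  url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")
}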

suppressPackageStartupMessages(suppressWarnings(library(plyr)))
suppressPackageStartupMessages(suppressWarnings(library(dplyr)))
suppressPackageStartupMessages(suppressWarnings(library(data.table)))
suppressPackageStartupMessages(suppressWarnings(library(stringi)))
suppressPackageStartupMessages(suppressWarnings(library(tm)))
suppressPackageStartupMessages(suppressWarnings(library(RWeka)))
suppressPackageStartupMessages(suppressWarnings(library(ggplot2)))
suppressPackageStartupMessages(suppressWarnings(library(grid)))
suppressPackageStartupMessages(suppressWarnings(library(gridExtra)))
# Read the blogs and Twitter data into R
blogs <- readLines("final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("final/en_US/en_US.news.txt", encoding =
## "UTF-8", skipNul = TRUE): incomplete final line found on 'final/en_US/
## en_US.news.txt'
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

Data sets

Brief summary

Our second step is to examine some basic characteristics of the data sets: file size, number of lines, total number of words and average words per line.

# File size in MB
blogs.size <- file.info("final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
news.size <- file.info("final/en_US/en_US.news.txt")$size / 1024 ^ 2
twitter.size <- file.info("final/en_US/en_US.twitter.txt")$size / 1024 ^ 2

# Word counting
blogs.words <- stri_count_words(blogs)
news.words <- stri_count_words(news)
twitter.words <- stri_count_words(twitter)
# Brief summary
brief <- data.table(source = c("twitter", "news", "blogs"),
           file.size = c(twitter.size, news.size, blogs.size),
           count.lines = c(length(twitter), length(news),length(blogs)),
           count.words = c(sum(twitter.words), sum(news.words), sum(blogs.words)),
           words.per.line = c(mean(twitter.words), mean(news.words), mean(blogs.words))
           )
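
For reference, the summary table itself can be displayed in the report, for example with knitr::kable (a presentation choice on our part; the column labels below are illustrative):

# Display the brief summary as a table
knitr::kable(brief, digits = 2,
             col.names = c("Source", "File size (MB)", "Lines", "Words", "Avg. words per line"))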

g1 <- ggplot(brief) + geom_col(aes(x = source, y = file.size), fill = "blue") +
  theme_bw() + labs(x = "", y = "MB")
g2 <- ggplot(brief) + geom_col(aes(x = source, y = count.lines / 1000000), fill = "green") +
  theme_bw() + labs(x = "", y = "Lines (Millions)") + scale_y_continuous(breaks = seq(0, 3, 0.5), limits = c(0, 3))
g3 <- ggplot(brief) + geom_col(aes(x = source, y = count.words / 1000000), fill = "red") +
  theme_bw() + labs(x = "", y = "Words (Millions)") + scale_y_continuous(breaks = seq(0, 40, 5), limits = c(0, 40))
g4 <- ggplot(brief) + geom_col(aes(x = source, y = words.per.line), fill = "purple") +
  theme_bw() + labs(x = "", y = "Average words per line")

grid.arrange(g1,g2,g3,g4,nrow=2,ncol=2,
             top=textGrob("Brief Summary of Data Sets", vjust = 1,gp = gpar(fontface = "bold", cex = 1.0)))

Data cleaning

Since the data sets are large (around 556 MB in total), we first randomly sample 1% of each one. We then clean this sample by removing URLs, Twitter handles, punctuation, numbers and English stop words, and, as is typically done, converting the text to lower case.

# Sample the data
set.seed(970812)
data.sample <- c(sample(blogs, length(blogs) * 0.01),
                 sample(news, length(news) * 0.01),
                 sample(twitter, length(twitter) * 0.01))

# Create corpus and clean the data
corpus <- VCorpus(VectorSource(data.sample))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)

Now we are ready to perform some exploratory analysis. In this part we look at the 20 most common unigrams, bigrams and trigrams in the sampled text.

options(mc.cores=1)

getFreq <- function(corpora) {
  freq <- sort(rowSums(as.matrix(corpora)), decreasing = TRUE)
  return(data.table(word = names(freq), freq = freq))
}

unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

makePlot <- function(data, label) {
  ggplot(data[1:20, ], aes(reorder(word, -freq), freq)) +
    geom_bar(stat = "identity", fill = "blue") +
    labs(x = label, y = "Frequency") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, size = 10, hjust = 1))
}

# Get most common n-grams in our text sample
unigrams <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = unigram)), 0.9999))
bigrams <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.9999))
trigrams <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999))

makePlot(unigrams, "20 Most Common Unigrams")

makePlot(bigrams, "20 Most Common Bigrams")

makePlot(trigrams, "20 Most Common Trigrams")

Further steps

In the next steps we will implement a predictive algorithm based on these n-gram frequencies and evaluate its prediction accuracy. Finally, a data product in the form of a Shiny app will be delivered; the app will predict the next word from the text previously entered by the user.
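
As a preview of how such a predictor could use the n-gram tables computed above, the sketch below implements a simple frequency-based backoff lookup. The function name predict_next and the backoff rule are illustrative assumptions rather than the final algorithm.

# Illustrative backoff next-word lookup over the n-gram frequency tables (sketch only)
predict_next <- function(text, trigrams, bigrams, unigrams, n = 3) {
  tokens <- tail(unlist(strsplit(tolower(text), "\\s+")), 2)
  # 1. Try trigrams that start with the last two words of the input
  if (length(tokens) == 2) {
    hits <- trigrams[grepl(paste0("^", tokens[1], " ", tokens[2], " "), word)]
    if (nrow(hits) > 0) return(head(sapply(strsplit(hits$word, " "), `[`, 3), n))
  }
  # 2. Back off to bigrams that start with the last word
  hits <- bigrams[grepl(paste0("^", tail(tokens, 1), " "), word)]
  if (nrow(hits) > 0) return(head(sapply(strsplit(hits$word, " "), `[`, 2), n))
  # 3. Fall back to the most frequent unigrams
  head(unigrams$word, n)
}

predict_next("happy new", trigrams, bigrams, unigrams)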