Outline

The goal of this project is simply to show that you have become comfortable working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. The document should be concise, explain only the major features of the data you have identified, and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data-scientist manager. You should make use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to:

1. Demonstrate that you have downloaded the data and successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings that you have amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

library(stringi)
library(tm)
## Loading required package: NLP
library(NLP)
library(RWeka)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

Loading the dataset into R

destfile <- "./Coursera-SwiftKey.zip"
if (!file.exists(destfile)) {
  url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  # Download the archive once and extract it into the working directory
  download.file(url, destfile, method = "curl")
  unzip(destfile)
}
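
As a quick sanity check, we can list the extracted files before reading them in. This is a minimal sketch that assumes the archive unpacks into the final/en_US/ folder used throughout this report.

# Confirm the three English-language files were extracted as expected
list.files("final/en_US")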

Summary statistics of the dataset

# Blogs (English)
blogs <- readLines("final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
blogs_size <- file.info("final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
blogs_words <- stri_count_words(blogs)

# News (English)
news <- readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
news_size <- file.info("final/en_US/en_US.news.txt")$size / 1024 ^ 2
news_words <- stri_count_words(news)

# Twitter (English)
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
twitter_size <- file.info("final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
twitter_words <- stri_count_words(twitter)

summary_en <- data.frame(data_source = c("blogs", "news", "twitter"),
           file_size_MB = c(blogs_size, news_size, twitter_size),
           line_counts = c(length(blogs), length(news), length(twitter)),
           words_counts = c(sum(blogs_words), sum(news_words), sum(twitter_words)),
           num_of_words_per_line_mean = c(mean(blogs_words), mean(news_words), mean(twitter_words)))

summary_en
##   data_source file_size_MB line_counts words_counts num_of_words_per_line_mean
## 1       blogs     200.4242      899288     37546250                   41.75109
## 2        news     196.2775     1010242     34762395                   34.40997
## 3     twitter     159.3641     2360148     30093413                   12.75065
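
Character counts per source could be added to this table in the same way; the following is only a sketch using base nchar() and is not part of the summary above.

# Optional: total character counts per source
blogs_chars   <- sum(nchar(blogs))
news_chars    <- sum(nchar(news))
twitter_chars <- sum(nchar(twitter))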

Interesting Findings

Create the corpus

Because the full dataset is quite large, we will sample just 1% of each source to speed things up, and then build a corpus from that sample using the tm package.

set.seed(111)
data_sample <- c(sample(blogs, length(blogs) * 0.01),
                 sample(news, length(news) * 0.01),
                 sample(twitter, length(twitter) * 0.01))

# Create the corpus from the sampled data
corpus <- VCorpus(VectorSource(data_sample))
print(corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 42695

Clean the data

Before removing any unwanted words, we convert the documents to lowercase using tm_map() with content_transformer(tolower). After that, we remove stopwords, punctuation, and numbers, and strip extra whitespace.

corpus <- tm_map(corpus, content_transformer(tolower))  # lowercase (tolower must be wrapped in content_transformer)
corpus <- tm_map(corpus, removeWords, stopwords("en"))  # drop common English stopwords
corpus <- tm_map(corpus, removePunctuation)             # remove punctuation
corpus <- tm_map(corpus, removeNumbers)                 # remove digits
corpus <- tm_map(corpus, stripWhitespace)               # collapse repeated whitespace

print(corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 42695
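
To verify that the transformations behaved as expected, we can eyeball a single cleaned document; a quick sketch:

# Inspect the first cleaned document (it should be lowercase, with stopwords,
# punctuation, and numbers removed)
writeLines(content(corpus[[1]]))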

Use n-grams (n = 1 to 3) to group words into ordered sequences

# Keep tokenization single-threaded to avoid RWeka/parallel issues
options(mc.cores = 1)

# Total frequency of each term across all documents, sorted in decreasing order
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}

# Tokenizers for two- and three-word sequences
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
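
As a toy illustration (not part of the analysis), the bigram tokenizer splits a short sentence into overlapping two-word sequences:

# Example: bigram tokenization of a toy sentence
bigram("thanks for the follow")
# expected output: "thanks for" "for the" "the follow"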

Bar plot of the most frequent unigrams (n = 1)

corpus_dtm1 <- TermDocumentMatrix(corpus)
print(corpus_dtm1)
## <<TermDocumentMatrix (terms: 58101, documents: 42695)>>
## Non-/sparse entries: 509117/2480113078
## Sparsity           : 100%
## Maximal term length: 74
## Weighting          : term frequency (tf)
freq1 <- getFreq(removeSparseTerms(corpus_dtm1, 0.9999))
ggplot(freq1[1:10, ], aes(reorder(word, -freq), freq)) +
  geom_bar(stat="identity", fill="lightyellow", colour="yellow") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) + 
  ggtitle("10 Most Common Unigrams") +
  xlab("Words") +
  ylab("Frequency")

Bar plot of the most frequent bigrams (n = 2)

corpus_dtm2 <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
print(corpus_dtm2)
## <<TermDocumentMatrix (terms: 434381, documents: 42695)>>
## Non-/sparse entries: 515330/18545381465
## Sparsity           : 100%
## Maximal term length: 84
## Weighting          : term frequency (tf)
freq2 <- getFreq(removeSparseTerms(corpus_dtm2, 0.9999))
ggplot(freq2[1:10, ], aes(reorder(word, -freq), freq)) +
  geom_bar(stat="identity", fill="tan", colour="orange") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) + 
  ggtitle("10 Most Common Bigrams") +
  xlab("Words") +
  ylab("Frequency")

Bar plot of the most frequent trigrams (n = 3)

corpus_dtm3 <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
print(corpus_dtm3)
## <<TermDocumentMatrix (terms: 472734, documents: 42695)>>
## Non-/sparse entries: 476883/20182901247
## Sparsity           : 100%
## Maximal term length: 97
## Weighting          : term frequency (tf)
freq3 <- getFreq(removeSparseTerms(corpus_dtm3, 0.9999))
ggplot(freq3[1:10, ], aes(reorder(word, -freq), freq)) +
  geom_bar(stat="identity", fill="pink", colour="red") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) + 
  ggtitle("10 Most Common Trigrams") +
  xlab("Words") +
  ylab("Frequency")