Introduction

This is the Week 2 milestone report for the Data Science Specialization Capstone Project. It is the first step toward the final goal: building a prediction algorithm and a Shiny app around a predictive text model. The model will be trained on a collection of English text compiled from three sources: news articles, blogs, and tweets.

The main parts are loading and cleaning the data, and applying NLP (natural language processing) techniques in R, as groundwork for building the model.

The data can be downloaded from:

https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
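
If the archive is not already on disk, it can be fetched and extracted directly from R. A minimal sketch, assuming the default working directory and that the zip unpacks into a "final" folder:

# Download and unzip the training data (paths are assumptions about the local layout)
zipURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(zipURL, destfile = "Coursera-SwiftKey.zip", mode = "wb")
}
if (!dir.exists("final")) {
  unzip("Coursera-SwiftKey.zip")
}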

1. Load Data

I have chosen to use only the en_US dataset for this project.

# Read the three English (US) source files
Twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
New <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
Blog <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)

2. Data Summary

I calculated summary statistics for each of the three datasets (Twitter, news, and blogs): file size, number of lines, and number of characters.

Files <- c("Twitter", "New", "Blog")

FileInfoTwitter <- file.info("en_US.twitter.txt")
sizeTwitter <- FileInfoTwitter$size
sizeTwitterMB <-  round(sizeTwitter/1024^2, 4)

FileInfoNew <- file.info("en_US.news.txt")
sizeNew <- FileInfoNew$size
sizeNewMB <-  round(sizeNew/1024^2, 4)

FileInfoBlog <- file.info("en_US.blogs.txt")
sizeBlog <- FileInfoBlog$size
sizeBlogMB <-  round(sizeBlog/1024^2, 4)

SizeMB <- c(sizeTwitterMB, sizeNewMB,  sizeBlogMB)

Lines <- c(length(Twitter), length(New), length(Blog))

Chars <- c(sum(nchar(Twitter)), sum(nchar(New)), sum(nchar(Blog)))

DataSummary <- cbind.data.frame(Files, SizeMB, Lines, Chars)

DataSummary
##     Files   SizeMB   Lines     Chars
## 1 Twitter 159.3641 2360148 162096241
## 2     New 196.2775   77259  15639408
## 3    Blog 200.4242  899288 206824505
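
The same table can be produced more compactly by looping over the three files and text vectors; an equivalent sketch, shown only as an alternative to the explicit code above:

# Compact alternative: one data frame built with vectorised file.info() and sapply()
paths <- c(Twitter = "en_US.twitter.txt", New = "en_US.news.txt", Blog = "en_US.blogs.txt")
texts <- list(Twitter = Twitter, New = New, Blog = Blog)
data.frame(
  Files  = names(paths),
  SizeMB = round(file.info(paths)$size / 1024^2, 4),
  Lines  = sapply(texts, length),
  Chars  = sapply(texts, function(x) sum(nchar(x)))
)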

3. Sampling Data and Cleaning

Because the files are very large, we take a 1% random sample from each one to build the model. The sampled data is then cleaned.

set.seed(2009)

sTwitter <- sample(Twitter, length(Twitter)*.01)
sNew <- sample(New, length(New)*.01)
sBlog <- sample(Blog, length(Blog)*.01)


sampleData <- c(sTwitter, sNew, sBlog)

# Remove non-ASCII characters
sampleData <- iconv(sampleData, "UTF-8", "ASCII", sub = "")

4. Build Corpus

Using the tm package, I clean the data by converting all characters to lowercase, stripping extra white space, removing numbers, punctuation, profanity, and English stopwords, and converting the documents to plain text.

library(tm)
library(NLP)

corpus <- VCorpus(VectorSource(sampleData))
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)

#Read in Profanity List
profanity <- readLines('badWords.txt')

corpus <- tm_map(corpus, removeWords, profanity)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, PlainTextDocument)   # re-wrap as PlainTextDocuments (tolower above returns bare character vectors)
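
Before tokenizing, it can be worth spot-checking that the cleaning behaved as intended; a quick, optional check:

# Optional spot-check of the cleaned corpus
content(corpus[[1]])   # text of the first cleaned document
inspect(corpus[1:2])   # print the first two documents with their metadata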

5. Tokenize and Calculate Frequencies of N-Grams

I use the RWeka package to define tokenizer functions and build term-document matrices of unigrams, bigrams, and trigrams from the sample.

library(RWeka)

unigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

unigrams <- TermDocumentMatrix(corpus, control = list(tokenize = unigramTokenizer))
bigrams <- TermDocumentMatrix(corpus, control = list(tokenize = bigramTokenizer))
trigrams <- TermDocumentMatrix(corpus, control = list(tokenize = trigramTokenizer))

unigrams
## <<TermDocumentMatrix (terms: 44095, documents: 33365)>>
## Non-/sparse entries: 341663/1470888012
## Sparsity           : 100%
## Maximal term length: 119
## Weighting          : term frequency (tf)
bigrams
## <<TermDocumentMatrix (terms: 289055, documents: 33365)>>
## Non-/sparse entries: 342134/9643977941
## Sparsity           : 100%
## Maximal term length: 125
## Weighting          : term frequency (tf)
trigrams
## <<TermDocumentMatrix (terms: 309238, documents: 33365)>>
## Non-/sparse entries: 312001/10317413869
## Sparsity           : 100%
## Maximal term length: 131
## Weighting          : term frequency (tf)

We can find the frequency for each n-gram and plot it.

unigrams_freqTerm <- findFreqTerms(unigrams,lowfreq = 50)
bigrams_freqTerm <- findFreqTerms(bigrams,lowfreq=50)
trigrams_freqTerm <- findFreqTerms(trigrams,lowfreq=8)

## Unigram frequency data frame
unigrams_freq <- rowSums(as.matrix(unigrams[unigrams_freqTerm,]))
unigrams_freq <- data.frame(word=names(unigrams_freq), frequency=unigrams_freq)
head(unigrams_freq)
##                  word frequency
## able             able       186
## absolutely absolutely        69
## accept         accept        52
## access         access        62
## according   according        86
## account       account        80
## Bigram frequency data frame
bigrams_freq <- rowSums(as.matrix(bigrams[bigrams_freqTerm,]))
bigrams_freq <- data.frame(word=names(bigrams_freq), frequency=bigrams_freq)
head(bigrams_freq)
##                word frequency
## can get     can get       101
## can see     can see        62
## cant wait cant wait       172
## come back come back        50
## dont get   dont get        59
## dont know dont know       154
## Trigram frequency data frame
trigrams_freq <- rowSums(as.matrix(trigrams[trigrams_freqTerm,]))
trigrams_freq <- data.frame(word=names(trigrams_freq), frequency=trigrams_freq)
head(trigrams_freq)
##                          word frequency
## cake cake cake cake cake cake         8
## cant wait get   cant wait get         8
## cant wait see   cant wait see        39
## cinco de mayo   cinco de mayo         9
## come see us       come see us        11
## dont even know dont even know        15

6. Top n-grams Frequency Visualization

library(ggplot2)
# Plot the top `num` most frequent n-grams from a frequency data frame
plot_n_grams <- function(df_gram, title, num, barC) {
  df_sort <- df_gram[order(-df_gram$frequency), ][1:num, ]
  ggplot(data = df_sort, aes(x = reorder(word, -frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = barC, colour = "black") +
    coord_cartesian(xlim = c(0, num + 1)) +
    labs(title = title) +
    xlab("Words") +
    ylab("Count") +
    theme(axis.text.x = element_text(angle = 90))
}

Unigram Plot

plot_n_grams(unigrams_freq,"Top 40 Unigrams",40,"blue")

Bigram Plot

plot_n_grams(bigrams_freq,"Top 40 Bigrams",40,"orange")

Trigram Plot

plot_n_grams(trigrams_freq,"Top 40 Trigrams",40,"green")

7. Next Steps

Once the exploratory analysis is complete, we can move on to building the predictive model from the n-gram frequency data, roughly along the lines of the sketch below.
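
As a preview, a minimal, hypothetical lookup based on the frequency tables above might work as follows: take the last two words typed, look for the most frequent trigram starting with them, and back off to the bigram table when nothing matches. The function name predictNextWord and the simple frequency-based back-off are assumptions for illustration only; the final model will need full (unthresholded) n-gram tables and proper smoothing.

# Illustrative sketch only: frequency-based next-word lookup with simple back-off
predictNextWord <- function(input, tri = trigrams_freq, bi = bigrams_freq) {
  words <- tail(unlist(strsplit(tolower(input), "\\s+")), 2)

  # Try trigrams first: "w1 w2 ?"
  if (length(words) == 2) {
    prefix <- paste(words, collapse = " ")
    hits <- tri[startsWith(as.character(tri$word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) {
      best <- as.character(hits$word[which.max(hits$frequency)])
      return(tail(unlist(strsplit(best, " ")), 1))
    }
  }

  # Back off to bigrams: "w2 ?"
  prefix <- tail(words, 1)
  hits <- bi[startsWith(as.character(bi$word), paste0(prefix, " ")), ]
  if (nrow(hits) > 0) {
    best <- as.character(hits$word[which.max(hits$frequency)])
    return(tail(unlist(strsplit(best, " ")), 1))
  }
  NA_character_
}

predictNextWord("cant wait")   # with the sample counts above this would likely return "see"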

The app will be built with the Shiny package and should allow the user to enter text and receive a suggestion for the next word; a skeletal version of such an interface follows.
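
For orientation only, a rough sketch of how the Shiny app might be wired up; the widget names and the predictNextWord() call refer to the hypothetical sketch above, not to finished code:

library(shiny)

# Minimal prototype UI: a text box and a reactive suggestion
ui <- fluidPage(
  titlePanel("Next Word Prediction (prototype)"),
  textInput("userText", "Enter a phrase:"),
  textOutput("suggestion")
)

server <- function(input, output) {
  output$suggestion <- renderText({
    if (nchar(input$userText) == 0) return("")
    predictNextWord(input$userText)   # hypothetical prediction function sketched earlier
  })
}

shinyApp(ui = ui, server = server)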