Data Science Capstone Week 2 Milestone Report

Pandatas December 2019

Introduction

The Data Science Specialization Capstone Project consists of developing a predictive text model from several large text datasets. This milestone report describes how the provided datasets were downloaded, loaded, cleaned, and sampled. It also contains some basic exploratory analysis of the data and a short summary of the plans for the predictive model and Shiny App.

Downloading and reading the data

In this section, the required R libraries are loaded and the provided data is downloaded and read into R.

Loading the libraries:

# Loading libraries
library(tm)
## Loading required package: NLP
library(ngram)
library(RColorBrewer)
library(readtext)
library(pryr)
## Registered S3 method overwritten by 'pryr':
##   method      from
##   print.bytes Rcpp
## 
## Attaching package: 'pryr'
## The following object is masked from 'package:tm':
## 
##     inspect
library(rJava)
library(RWeka)
library(wordcloud)
library(quanteda)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## Package version: 1.5.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
## 
##     View
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
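
The start-up messages above are harmless; if a cleaner rendered report is preferred, each library call could optionally be wrapped in suppressPackageStartupMessages(), for example:

# Optional: silence package start-up messages in the rendered report
suppressPackageStartupMessages(library(quanteda))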

Downloading and unzipping the data:

# Downloading the provided data
fileURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(fileURL, "final.zip")

# Unzip the data
unzip("final.zip")

Reading the data in R

The data was read into R and some basic statistics (file size, line count, and word count) were computed for each file:

# Get file sizes
size_blogs <- file.info("./final/en_US/en_US.blogs.txt")$size / 1024^2
size_news <- file.info("./final/en_US/en_US.news.txt")$size / 1024^2
size_twitter <- file.info("./final/en_US/en_US.twitter.txt")$size / 1024^2

# Word and line count blogs file
blogs <- file("./final/en_US/en_US.blogs.txt", "r")
blogs_lines <- readLines(blogs)
close(blogs)
wordcount(blogs_lines)
## [1] 37334131
length(blogs_lines)
## [1] 899288
# Word and line count news file
news <- file("./final/en_US/en_US.news.txt", "r")
news_lines <- readLines(news)
close(news)
wordcount(news_lines)
## [1] 34372530
length(news_lines)
## [1] 1010242
# Word and line count twitter file
twitter <- file("./final/en_US/en_US.twitter.txt", "r")
twitter_lines <- readLines(twitter)
close(twitter)
wordcount(twitter_lines)
## [1] 30373543
length(twitter_lines)
## [1] 2360148
# Summary of the data sets
data.frame(Dataset = c("Blogs", "News", "Twitter"),
           filesize.in.MB = c(size_blogs, size_news, size_twitter),
           Linecount = c(length(blogs_lines), length(news_lines), length(twitter_lines)),
           wordscount = c(wordcount(blogs_lines), wordcount(news_lines), wordcount(twitter_lines))
           )
##   Dataset filesize.in.MB Linecount wordscount
## 1   Blogs       200.4242    899288   37334131
## 2    News       196.2775   1010242   34372530
## 3 Twitter       159.3641   2360148   30373543
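
Note that readLines() may warn about embedded NUL characters (particularly for the Twitter file) and about an incomplete final line. A slightly more defensive variant, shown here for illustration only and not used for the counts above, is:

# Illustrative: skip embedded NULs, set the encoding, and suppress warnings
twitter_lines <- readLines("./final/en_US/en_US.twitter.txt",
                           encoding = "UTF-8", skipNul = TRUE, warn = FALSE)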

Data sampling and cleaning

The dataset was then sampled and cleaned by removing the following:

* White space
* Punctuation
* Upper case letters (converted to lower case)
* Numbers
* Stopwords

# Subsample and prepare corpus
set.seed(12345) # Ensuring a reproducible sample set
samplesize <- 0.01 # Subsample to 1%

corpussample <- c(sample(blogs_lines, length(blogs_lines) * samplesize),
                  sample(news_lines, length(news_lines) * samplesize),
                  sample(twitter_lines, length(twitter_lines) * samplesize))

corpussample <- VCorpus(VectorSource(corpussample))

corpussample <- tm_map(corpussample, stripWhitespace) # Remove unnecessary white spaces
corpussample <- tm_map(corpussample, removePunctuation) # Remove punctuation
corpussample <- tm_map(corpussample, content_transformer(tolower)) # Convert to lowercase
corpussample <- tm_map(corpussample, removeNumbers) # Remove numbers
corpus <- tm_map(corpussample, removeWords, stopwords("english")) # Remove stopwords
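
Further cleaning is likely to be needed for the final model (see the plans at the end of this report); for example, URLs and Twitter handles could be stripped with custom transformers. A minimal sketch, not applied in the analysis below:

# Illustrative extra cleaning (not applied here): strip URLs and Twitter handles
removeURL <- content_transformer(function(x) gsub("(f|ht)tp\\S+|www\\.\\S+", " ", x))
removeHandle <- content_transformer(function(x) gsub("@\\w+", " ", x))
corpus_extra <- tm_map(corpus, removeURL)
corpus_extra <- tm_map(corpus_extra, removeHandle)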

Exploratory analysis

To explore the data, a wordcloud was produced to get an impression of frequently used words in the corpus:

wordcloud(corpus, scale = c(3, 0.5), min.freq = 5, max.words = 100,
          random.order = TRUE, rot.per = 0.5,
          colors = brewer.pal(5, "Set1"), use.r.layout = FALSE)
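
For a more quantitative view than the wordcloud, the most frequent terms can also be listed directly from a term-document matrix (an additional check using the tm functions already loaded):

# Illustrative: list all terms occurring at least 50 times in the sampled corpus
tdm <- TermDocumentMatrix(corpus)
findFreqTerms(tdm, lowfreq = 50)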

The corpus was analyzed to determine the word frequencies of unigrams, bigrams and trigrams:

corpus.dataframe <- data.frame(text = unlist(sapply(corpus, '[', 'content')), stringsAsFactors = FALSE)

# Unigram
unigramtoken <- data.frame(table(NGramTokenizer(corpus.dataframe$text, Weka_control(min = 1, max = 1))))
unigram <- unigramtoken[order(unigramtoken$Freq, decreasing = TRUE),]
par(mfrow = c(1, 1))
par(mar=c(5,4,2,0))
barplot(unigram[1:20,2], 
        names.arg=unigram[1:20,1], 
        col = "blue", 
        main="Top 20 of unigrams", 
        las=2, 
        ylab = "Frequency")

# Bigram
bigramtoken <- data.frame(table(NGramTokenizer(corpus.dataframe$text, Weka_control(min = 2, max = 2))))
bigram <- bigramtoken[order(bigramtoken$Freq, decreasing = TRUE),]
par(mfrow = c(1, 1))
par(mar=c(5,4,2,1))
barplot(bigram[1:20,2], 
        names.arg=bigram[1:20,1], 
        col = "blue", 
        main="Top 20 of bigrams", 
        las=2, 
        ylab = "Frequency")

# Trigram
trigramtoken <- data.frame(table(NGramTokenizer(corpus.dataframe$text, Weka_control(min = 3, max = 3))))
trigram <- trigramtoken[order(trigramtoken$Freq, decreasing = TRUE),]
par(mfrow = c(1, 1))
par(mar=c(9,4,2,1))
barplot(trigram[1:20,2], 
        names.arg=trigram[1:20,1], 
        col = "blue", 
        main="Top 20 of trigrams", 
        las=2, 
        ylab = "Frequency")

Plans for Prediction Algorithm and Shiny App

The following items will need to be addressed in the development of the prediction algorithm and Shiny App:

* further optimize the data cleaning
* develop training and test sets from the data
* look at the trade-off between the size and runtime of the prediction algorithm
* develop the prediction algorithm (a first sketch of the idea is given below)
* develop the Shiny App
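
As a first illustration of the planned prediction algorithm, the n-gram frequency tables built above (trigram and bigram, each with columns Var1 and Freq and sorted by frequency) can already drive a simple frequency-based backoff lookup. This is only a sketch of the idea, not the final model, which will need proper smoothing, profanity filtering, and performance tuning:

# Minimal backoff sketch (illustrative only): given the last one or two words,
# return the most frequent continuation found in the trigram table, falling
# back to the bigram table when no matching trigram exists.
predict_next <- function(input, trigram, bigram) {
  words <- unlist(strsplit(tolower(input), "\\s+"))
  n <- length(words)
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- trigram[startsWith(as.character(trigram$Var1), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) {
      # last word of the most frequent matching trigram
      return(tail(strsplit(as.character(hits$Var1[1]), " ")[[1]], 1))
    }
  }
  hits <- bigram[startsWith(as.character(bigram$Var1), paste0(words[n], " ")), ]
  if (nrow(hits) > 0) {
    # last word of the most frequent matching bigram
    return(tail(strsplit(as.character(hits$Var1[1]), " ")[[1]], 1))
  }
  NA_character_  # no match in the sampled n-grams
}

# Example call (the result depends on the sampled data)
predict_next("happy new", trigram, bigram)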