Coursera Data Science Capstone

Introduction

The goal of this milestone report is to demonstrate my progress in working with the data and to show that I am on track to build the prediction algorithm. I look forward to hearing your feedback!

Loading the Raw Data

library(pryr) # Size Checking 'object_size()'

# Read the three English source files into character vectors, one element
# per line. `n` caps how many lines are read, `skipNul = TRUE` skips
# embedded NUL bytes, and `encoding = "UTF-8"` marks the strings as UTF-8.
blogs <- readLines(
  con = "./final/en_US/en_US.blogs.txt",
  n = 899288, # 899288 total lines
  encoding = "UTF-8",
  skipNul = TRUE
)
# NOTE(review): the cap below is one less than the stated total of
# 1010241 lines -- confirm that dropping the last line is intentional.
news <- readLines(
  con = "./final/en_US/en_US.news.txt",
  n = 1010240, # 1010241 total lines
  encoding = "UTF-8",
  skipNul = TRUE
)
twitter <- readLines(
  con = "./final/en_US/en_US.twitter.txt",
  n = 2360147, # 2360147 total lines
  encoding = "UTF-8",
  skipNul = TRUE
)

Basic File Information

library(qdap) # Word/Line Count

# For each source, print three figures in this order: in-memory size,
# word count, and line count.
# The word count splits every line on runs of non-word characters, so it is
# an approximation: contractions such as "don't" count as two words, and a
# line that starts with a non-word character contributes one empty string.

# Blogs
object_size(blogs)
sum(lengths(strsplit(x = blogs, split = "\\W+")))
length(blogs)

# News
object_size(news)
sum(lengths(strsplit(x = news, split = "\\W+")))
length(news)

# Twitter
object_size(twitter)
sum(lengths(strsplit(x = twitter, split = "\\W+")))
length(twitter)

The table below gives the results of the Basic file information commands:

File Name	Size Loaded in R (MB)	Number of Lines	Number of Words
en_US.blogs.txt	261	899288	38371704
en_US.news.txt	262	1010240	35783000
en_US.twitter.txt	316	2360147	31149377

Sample 4% of each data set

Sampling 4% of the corpus keeps the computation manageable.

# Fraction of each source to keep, in decimal form.
# Rationale given by the author: 20% of 20% (= 4%) should contain
# 80% of 80% (= 64%) of the information.
sample_percent <- 0.04

# Fixed seed (picked from 1 - 10^6 on random.org) so the sample is
# repeatable. The three draws below share one RNG stream, so their order
# must stay blogs -> news -> twitter to reproduce the same sample.
set.seed(72570)
blog_s <- sample(blogs, ceiling(length(blogs) * sample_percent))
news_s <- sample(news, ceiling(length(news) * sample_percent))
twitter_s <- sample(twitter, ceiling(length(twitter) * sample_percent))

# In-memory size of each sample
object_size(blog_s)

## 10.4 MB

object_size(news_s)

## 10.4 MB

object_size(twitter_s)

## 12.8 MB

# Drop the full data sets and the sampling fraction to save RAM
rm(list = c("blogs", "news", "twitter", "sample_percent"))

Concatenate all data sets into a single set (training)

# Pool the three samples (blogs, then news, then twitter) and shuffle the
# pooled lines into a random order. The seed makes the shuffle repeatable.
set.seed(848441)
training <- sample(c(blog_s, news_s, twitter_s))

# In-memory size of the combined training vector
object_size(training)

## 33.6 MB

Export Data for Later Analysis

# Persist the shuffled training vector so later steps can reload it.
save(training, file = "training-f.RData")

# Free the in-memory copies now that the data is on disk. Each object is
# named exactly once: naming an object that has already been removed makes
# rm() warn, and the per-source samples must all be listed to be released.
rm(training, blog_s, news_s, twitter_s)

Clear Workspace and Load ‘quanteda’ Package

Chose ‘quanteda’ package for corpus analysis
I chose this package as opposed to the more popular ‘tm’ and ‘tidytext’ packages
Higher memory and computational efficiency
More features
To read more about ‘quanteda’, see these links: Performance Analysis, Features Comparison

# Remove every object from the workspace to save RAM (attached packages
# are not affected), then restore `training` from the file saved earlier.
rm(list = ls())

load(file = "training-f.RData")

library(quanteda)

## Package version: 1.3.0

## Parallel computing: 2 of 4 threads used.

## See https://quanteda.io for tutorials and examples.

## 
## Attaching package: 'quanteda'

## The following object is masked from 'package:utils':
## 
##     View

library(readtext)
library(ggplot2)

# Set the number of threads quanteda uses to 4
quanteda_options(threads = 4)

Create the ‘training’ corpus ‘c_training’

# Build a quanteda corpus from the training lines and store it on disk
c_training <- corpus(x = training)
save(c_training, file = "c_training-f.RData")

Tokenize, clean data as well

Remove the following from the data while tokenizing it:
Numbers
Punctuation
Symbols
Twitter characters (@ and #)
Hyphens

# Tokenize the corpus, asking tokens() to remove numbers, punctuation,
# symbols, Twitter characters and hyphens in the same pass.
t_training <- tokens(
  x = c_training,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_twitter = TRUE,
  remove_hyphens = TRUE
)

# Store the tokens on disk
save(t_training, file = "t_training-f.RData")

Generate n-grams (2, 3, and 4)

# Build each n-gram token set from the cleaned tokens and write it to its
# own .RData file straight away.

# Bigrams
n2 <- tokens_ngrams(x = t_training, n = 2)
save(n2, file = "n2-f.RData")

# Trigrams
n3 <- tokens_ngrams(x = t_training, n = 3)
save(n3, file = "n3-f.RData")

# Quadgrams
n4 <- tokens_ngrams(x = t_training, n = 4)
save(n4, file = "n4-f.RData")

Generate DFM (Document-Feature Matrix)

For uni-grams

# Document-feature matrix of the unigram tokens, using dfm()'s defaults
d_training <- dfm(x = t_training)

Plot Frequency of Unigrams, Bigrams, Trigrams, and Quadgrams

Multiple histograms with the frequencies
Top 20 unigrams, top 10 bigrams and trigrams, and top 5 quadgrams

# Bar chart of the `n` most frequent features of a document-feature matrix.
#
# Arguments:
#   x        document-feature matrix passed to textstat_frequency()
#   n        number of top-ranked features to plot
#   x_label  title for the x axis
#   y_label  title for the y axis
#
# Returns the ggplot object, so calling this at top level prints the chart.
#
# The frequency table is handed to ggplot() directly rather than through a
# pipe, so the chart does not rely on a pipe operator being attached.
plot_top_features <- function(x, n, x_label, y_label) {
  top_features <- textstat_frequency(x, n = n)

  # reorder() sorts the bars by -rank, which places the least frequent of
  # the selected features on the left and the most frequent on the right.
  ggplot(top_features, aes(x = reorder(feature, -rank), y = frequency)) +
    geom_bar(stat = "identity") +
    labs(x = x_label, y = y_label)
}

# Top 20 unigrams
plot_top_features(d_training, 20, "", "Unigram Term Frequency")

# Top 10 bigrams
plot_top_features(dfm(n2), 10, "Text", "Bigram Term Frequency")

# Top 10 trigrams
plot_top_features(dfm(n3), 10, "Text", "Trigram Term Frequency")

# Top 5 quadgrams
plot_top_features(dfm(n4), 5, "Text", "Quadgram Term Frequency")

Next Steps

See if there is a way to further cut down the data set, for example by removing n-grams that occur only once (or a few times)
Build model (Katz’s Back-Off)
Further streamline efficiency of code for the Shiny App
Build Shiny App (end goal)

Coursera Data Science Capstone - Milestone Report

Hari Ravichandran

June 10, 2018