Data Science Capstone

Jinkwan Hong
Saturday, Feb 30, 2019

Milstone Report

Synopsis

This report was prepared as a part of Data Scinece Capstone. The final goal is to create word prediction algorithm and the Shiny app that allows the public to use easily.

As for this documents, I am going to illustrate the data summaries to grasp the data profile.

Data Source

Getting and Cleaning the Data

Getting Data

#library(testthat)
#library(tokenizers)
library(stopwords)
library(sentimentr)
library(dplyr)
library(ggplot2)
library(NLP)
library(openNLP)
library(RWeka)
library(tm)

set.seed(43)

Sys.setlocale("LC_ALL", "American")
## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
destinyDirectory = "./data/"
destinyFile = "Coursera_SwiftKey.zip"
destiny = paste0(destinyDirectory, destinyFile)
dataURL = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

blogsFile <- unz(destiny, 'final/en_US/en_US.blogs.txt')
blogs <- readLines(blogsFile, skipNul = TRUE)
close(blogsFile)

newsFile <- unz(destiny, 'final/en_US/en_US.news.txt')
news <- readLines(newsFile, skipNul = TRUE)
close(newsFile)

twitterFile <- unz(destiny, 'final/en_US/en_US.twitter.txt')
twitter <- readLines(twitterFile, skipNul = TRUE)
close(twitterFile)

profanityList <- lexicon::profanity_banned # Banned words

Original Data Summary

#docsOrg <- Corpus(DirSource(unz(destiny, 'final/en_US/')), readerControl = list(readPlain,language="en_US", load = TRUE))
#blogsOrg <- Corpus(VectorSource(blogs), readerControl = list(readPlain,language="en_US", load = TRUE))
#newsOrg <- Corpus(VectorSource(news), readerControl = list(readPlain,language="en_US", load = TRUE))
#twitterOrg <- Corpus(VectorSource(twitter), readerControl = list(readPlain,language="en_US", load = TRUE))
docsOrg <- Corpus(VectorSource(c(blogs, news, twitter)), readerControl = list(readPlain,language="en_US", load = TRUE))
#docsOrg <- Corpus(DirSource('./data/final/en_US'), readerControl = list(readPlain,language="en_US", load = TRUE))
#tdmOrg <- TermDocumentMatrix(docsOrg)
#dtmBlogs <- DocumentTermMatrix(blogsOrg)
#dtmNews <- DocumentTermMatrix(newsOrg)
#dtmTwitter <- DocumentTermMatrix(twitterOrg)
dtmOrg <- DocumentTermMatrix(docsOrg)
#length(docsOrg)
#object.size(dtmOrg)

The summary plots says that there are 800,000 lines in blogs, 1 million lines in news, and 2 million lines in twitter. However number of words goes opposite way. Blogs 37 million, News 34 million, and Twitter 30 million words. It’s probably because twitter limits the number of character on each twit by 140 bytes.

The size of docsOrg corpus is 1.5 giga bytes which is quite big to work with. I am going to randomly sample then analyze.

Sampling

Here I am randomly sampling 1% of the data in order to perform explarotory analysis and turn samples into files to avoid unnecessary computing and drop the original data from the memory. Finally I am writing the text into files so I do not have to go through the same process all over again.

blogsSample <- blogs[rbinom(length(blogs)*.01, length(blogs), 0.5)]
newsSample <- news[rbinom(length(news)*.01, length(news), 0.5)]
twitterSample <- twitter[rbinom(length(twitter)*.01, length(twitter), 0.5)]

rm(blogs) # Remove from memory to save the memory.
rm(news)
rm(twitter)
rm(docsOrg)
rm(dtmOrg)

write.table(blogsSample, file = './data/sample/en_US.blogsSample.csv', row.names = FALSE, col.names = FALSE, sep = ",", quote = FALSE)
write.table(newsSample, file = './data/sample/en_US.newsSample.csv', row.names = FALSE, col.names = FALSE, sep = ",", quote = FALSE)
write.table(twitterSample, file = './data/sample/en_US.twitterSample.csv', row.names = FALSE, col.names = FALSE, sep = ",", quote = FALSE)
write.table(profanityList, file = './data/profanityList.csv', row.names = FALSE, col.names = FALSE, sep = ",", quote = FALSE)

After sampling and cleanup word counts for the whole text went down to around 450,000 and the lines counts

# Making Corpus 
#docsSamples <- Corpus(VectorSource(c(blogsSample, newsSample, twitterSample)), readerControl = list(readPlain,language="en_US", load = TRUE))
docsSamples <- VCorpus(DirSource('./data/sample'), readerControl = list(readPlain,language="en_US", load = TRUE))

Cleaning Data

There are lots of irregularity in the data since they are from different sources. Here I am removing the followings utilizing tm package.

  • Whitespace
  • Punctuation
  • Numbers
  • URL
  • Hashtags
  • Twitter Handles
  • HTML Tags
  • Stopwords (common grammartical word with little to no added meaning)
removeURL <- function(x) gsub("http:[[:alnum:]]*", "", x)
removeHashTags <- function(x) gsub("#\\S+", "", x)
removeTwitterHandles <- function(x) gsub("@\\S+", "", x)
removeHTMLTags <- function(x) gsub("<(/)?([a-zA-Z]*)(\\s[a-zA-Z]*=[^>]*)?(\\s)*(/)?>","",x)

docsSamples <- tm_map(docsSamples, content_transformer(tolower))
docsSamples <- tm_map(docsSamples, removeWords, stopwords("english"))
docsSamples <- tm_map(docsSamples, removePunctuation)
docsSamples <- tm_map(docsSamples, removeNumbers)
docsSamples <- tm_map(docsSamples, stripWhitespace)
#docsSamples <- tm_map(docsSamples, stemDocument)
docsSamples <- tm_map(docsSamples, content_transformer(removeURL))
docsSamples <- tm_map(docsSamples, content_transformer(removeHashTags))
docsSamples <- tm_map(docsSamples, content_transformer(removeTwitterHandles))
docsSamples <- tm_map(docsSamples, content_transformer(removeHTMLTags))
dtmSamples <- DocumentTermMatrix(docsSamples)
#rowSums(as.matrix(dtmSamples))
tdmSamples <- TermDocumentMatrix(docsSamples, control = list(tokenize = RWeka::WordTokenizer))
tdmMatrix <- as.matrix(tdmSamples)
tdmMatrixSorted <- sort(rowSums(tdmMatrix), decreasing = TRUE)

#tdmSamples <- TermDocumentMatrix(docsSamples, control = list(tokenize = NLP::wordpunct_tokenizer))

Analysis

Corpus is now ready for analysis.

sum(tdmMatrix)
## [1] 536193
length(tdmMatrix[,1])
## [1] 23239
freqWords <- findFreqTerms(tdmSamples, 2)
length(freqWords)
## [1] 20502

There are total of 540,627 words in the corpus and 23,419 are distinctive. 20,547 of them are used more than 2 times.

Top Frequent Words

n <- 25L # variable to set top n words
dfTop <- data.frame(words = names(tdmMatrixSorted[1:n]), frequency = tdmMatrixSorted[1:n])
dfTop$words <- reorder(dfTop$words, dfTop$frequency)
g.top <- ggplot(dfTop, aes(x = words, y = frequency))
g.top <- g.top + geom_bar(stat = "identity") + coord_flip() + labs(title = "Most Frequent")
g.top

NGram Tokenize

I reduced the sparse terms to reduce computation.

#BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2)) #RWeka
#TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3)) #RWeka
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = TRUE) # OpenNLP
TrigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE) # OpenNLP
#tdmSamplesSingle <- TermDocumentMatrix(docsSamples[], control = list(tokenize = RWeka::WordTokenizer))
tdmSamplesBigram <- TermDocumentMatrix(docsSamples, control = list(tokenize = BigramTokenizer))
tdmSamplesBigram <- removeSparseTerms(tdmSamplesBigram, .99)
tdmSamplesBigramMatrix <- as.matrix(tdmSamplesBigram)
tdmSamplesBigramMatrixSorted <- sort(rowSums(tdmSamplesBigramMatrix), decreasing = TRUE)
tdmSamplesTrigram <- TermDocumentMatrix(docsSamples, control = list(tokenize = TrigramTokenizer))
tdmSamplesTrigram <- removeSparseTerms(tdmSamplesTrigram, .99)
tdmSamplesTrigramMatrix <- as.matrix(tdmSamplesTrigram)
tdmSamplesTrigramMatrixSorted <- sort(rowSums(tdmSamplesTrigramMatrix), decreasing = TRUE)
n <- 25L # variable to set top n words
dfBigramTop <- data.frame(words = names(tdmSamplesBigramMatrixSorted[1:n]), frequency = tdmSamplesBigramMatrixSorted[1:n])
dfBigramTop$words <- reorder(dfBigramTop$words, dfBigramTop$frequency)
g.BigramTop <- ggplot(dfBigramTop, aes(x = words, y = frequency))
g.BigramTop <- g.BigramTop + geom_bar(stat = "identity") + coord_flip() + labs(title = "Bigram Most Frequent")
g.BigramTop

dftdmTrigram <- data.frame(words = names(tdmSamplesTrigramMatrixSorted[1:n]), frequency = tdmSamplesTrigramMatrixSorted[1:n])
dftdmTrigram$words <- reorder(dftdmTrigram$words, dftdmTrigram$frequency)
g.TrigramTop <- ggplot(dftdmTrigram, aes(x = words, y = frequency))
g.TrigramTop <- g.TrigramTop + geom_bar(stat = "identity") + coord_flip() + labs(title = "Trigram Most Frequent")
g.TrigramTop