Getting and Cleaning the Data

Getting Data

#library(testthat)
#library(tokenizers)
library(stopwords)
library(sentimentr)
library(dplyr)
library(ggplot2)
library(NLP)
library(openNLP)
library(RWeka)
library(tm)

set.seed(43)

Sys.setlocale("LC_ALL", "American")

## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"

destinyDirectory = "./data/"
destinyFile = "Coursera_SwiftKey.zip"
destiny = paste0(destinyDirectory, destinyFile)
dataURL = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

blogsFile <- unz(destiny, 'final/en_US/en_US.blogs.txt')
blogs <- readLines(blogsFile, skipNul = TRUE)
close(blogsFile)

newsFile <- unz(destiny, 'final/en_US/en_US.news.txt')
news <- readLines(newsFile, skipNul = TRUE)
close(newsFile)

twitterFile <- unz(destiny, 'final/en_US/en_US.twitter.txt')
twitter <- readLines(twitterFile, skipNul = TRUE)
close(twitterFile)

profanityList <- lexicon::profanity_banned # Banned words

Original Data Summary

#docsOrg <- Corpus(DirSource(unz(destiny, 'final/en_US/')), readerControl = list(readPlain,language="en_US", load = TRUE))
#blogsOrg <- Corpus(VectorSource(blogs), readerControl = list(readPlain,language="en_US", load = TRUE))
#newsOrg <- Corpus(VectorSource(news), readerControl = list(readPlain,language="en_US", load = TRUE))
#twitterOrg <- Corpus(VectorSource(twitter), readerControl = list(readPlain,language="en_US", load = TRUE))
docsOrg <- Corpus(VectorSource(c(blogs, news, twitter)), readerControl = list(readPlain,language="en_US", load = TRUE))
#docsOrg <- Corpus(DirSource('./data/final/en_US'), readerControl = list(readPlain,language="en_US", load = TRUE))
#tdmOrg <- TermDocumentMatrix(docsOrg)
#dtmBlogs <- DocumentTermMatrix(blogsOrg)
#dtmNews <- DocumentTermMatrix(newsOrg)
#dtmTwitter <- DocumentTermMatrix(twitterOrg)
dtmOrg <- DocumentTermMatrix(docsOrg)
#length(docsOrg)
#object.size(dtmOrg)

The summary plots says that there are 800,000 lines in blogs, 1 million lines in news, and 2 million lines in twitter. However number of words goes opposite way. Blogs 37 million, News 34 million, and Twitter 30 million words. It’s probably because twitter limits the number of character on each twit by 140 bytes.

The size of docsOrg corpus is 1.5 giga bytes which is quite big to work with. I am going to randomly sample then analyze.

Sampling

Here I am randomly sampling 1% of the data in order to perform explarotory analysis and turn samples into files to avoid unnecessary computing and drop the original data from the memory. Finally I am writing the text into files so I do not have to go through the same process all over again.

blogsSample <- blogs[rbinom(length(blogs)*.01, length(blogs), 0.5)]
newsSample <- news[rbinom(length(news)*.01, length(news), 0.5)]
twitterSample <- twitter[rbinom(length(twitter)*.01, length(twitter), 0.5)]

rm(blogs) # Remove from memory to save the memory.
rm(news)
rm(twitter)
rm(docsOrg)
rm(dtmOrg)

write.table(blogsSample, file = './data/sample/en_US.blogsSample.csv', row.names = FALSE, col.names = FALSE, sep = ",", quote = FALSE)
write.table(newsSample, file = './data/sample/en_US.newsSample.csv', row.names = FALSE, col.names = FALSE, sep = ",", quote = FALSE)
write.table(twitterSample, file = './data/sample/en_US.twitterSample.csv', row.names = FALSE, col.names = FALSE, sep = ",", quote = FALSE)
write.table(profanityList, file = './data/profanityList.csv', row.names = FALSE, col.names = FALSE, sep = ",", quote = FALSE)

After sampling and cleanup word counts for the whole text went down to around 450,000 and the lines counts

# Making Corpus 
#docsSamples <- Corpus(VectorSource(c(blogsSample, newsSample, twitterSample)), readerControl = list(readPlain,language="en_US", load = TRUE))
docsSamples <- VCorpus(DirSource('./data/sample'), readerControl = list(readPlain,language="en_US", load = TRUE))

Cleaning Data

There are lots of irregularity in the data since they are from different sources. Here I am removing the followings utilizing tm package.

Whitespace
Punctuation
Numbers
URL
Hashtags
Twitter Handles
HTML Tags
Stopwords (common grammartical word with little to no added meaning)

removeURL <- function(x) gsub("http:[[:alnum:]]*", "", x)
removeHashTags <- function(x) gsub("#\\S+", "", x)
removeTwitterHandles <- function(x) gsub("@\\S+", "", x)
removeHTMLTags <- function(x) gsub("<(/)?([a-zA-Z]*)(\\s[a-zA-Z]*=[^>]*)?(\\s)*(/)?>","",x)

docsSamples <- tm_map(docsSamples, content_transformer(tolower))
docsSamples <- tm_map(docsSamples, removeWords, stopwords("english"))
docsSamples <- tm_map(docsSamples, removePunctuation)
docsSamples <- tm_map(docsSamples, removeNumbers)
docsSamples <- tm_map(docsSamples, stripWhitespace)
#docsSamples <- tm_map(docsSamples, stemDocument)
docsSamples <- tm_map(docsSamples, content_transformer(removeURL))
docsSamples <- tm_map(docsSamples, content_transformer(removeHashTags))
docsSamples <- tm_map(docsSamples, content_transformer(removeTwitterHandles))
docsSamples <- tm_map(docsSamples, content_transformer(removeHTMLTags))
dtmSamples <- DocumentTermMatrix(docsSamples)
#rowSums(as.matrix(dtmSamples))
tdmSamples <- TermDocumentMatrix(docsSamples, control = list(tokenize = RWeka::WordTokenizer))
tdmMatrix <- as.matrix(tdmSamples)
tdmMatrixSorted <- sort(rowSums(tdmMatrix), decreasing = TRUE)

#tdmSamples <- TermDocumentMatrix(docsSamples, control = list(tokenize = NLP::wordpunct_tokenizer))

Analysis

Corpus is now ready for analysis.

sum(tdmMatrix)

## [1] 536193

length(tdmMatrix[,1])

## [1] 23239

freqWords <- findFreqTerms(tdmSamples, 2)
length(freqWords)

## [1] 20502

There are total of 540,627 words in the corpus and 23,419 are distinctive. 20,547 of them are used more than 2 times.

Top Frequent Words

n <- 25L # variable to set top n words
dfTop <- data.frame(words = names(tdmMatrixSorted[1:n]), frequency = tdmMatrixSorted[1:n])
dfTop$words <- reorder(dfTop$words, dfTop$frequency)
g.top <- ggplot(dfTop, aes(x = words, y = frequency))
g.top <- g.top + geom_bar(stat = "identity") + coord_flip() + labs(title = "Most Frequent")
g.top

NGram Tokenize

I reduced the sparse terms to reduce computation.

#BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2)) #RWeka
#TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3)) #RWeka
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = TRUE) # OpenNLP
TrigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE) # OpenNLP
#tdmSamplesSingle <- TermDocumentMatrix(docsSamples[], control = list(tokenize = RWeka::WordTokenizer))
tdmSamplesBigram <- TermDocumentMatrix(docsSamples, control = list(tokenize = BigramTokenizer))
tdmSamplesBigram <- removeSparseTerms(tdmSamplesBigram, .99)
tdmSamplesBigramMatrix <- as.matrix(tdmSamplesBigram)
tdmSamplesBigramMatrixSorted <- sort(rowSums(tdmSamplesBigramMatrix), decreasing = TRUE)
tdmSamplesTrigram <- TermDocumentMatrix(docsSamples, control = list(tokenize = TrigramTokenizer))
tdmSamplesTrigram <- removeSparseTerms(tdmSamplesTrigram, .99)
tdmSamplesTrigramMatrix <- as.matrix(tdmSamplesTrigram)
tdmSamplesTrigramMatrixSorted <- sort(rowSums(tdmSamplesTrigramMatrix), decreasing = TRUE)

n <- 25L # variable to set top n words
dfBigramTop <- data.frame(words = names(tdmSamplesBigramMatrixSorted[1:n]), frequency = tdmSamplesBigramMatrixSorted[1:n])
dfBigramTop$words <- reorder(dfBigramTop$words, dfBigramTop$frequency)
g.BigramTop <- ggplot(dfBigramTop, aes(x = words, y = frequency))
g.BigramTop <- g.BigramTop + geom_bar(stat = "identity") + coord_flip() + labs(title = "Bigram Most Frequent")
g.BigramTop

dftdmTrigram <- data.frame(words = names(tdmSamplesTrigramMatrixSorted[1:n]), frequency = tdmSamplesTrigramMatrixSorted[1:n])
dftdmTrigram$words <- reorder(dftdmTrigram$words, dftdmTrigram$frequency)
g.TrigramTop <- ggplot(dftdmTrigram, aes(x = words, y = frequency))
g.TrigramTop <- g.TrigramTop + geom_bar(stat = "identity") + coord_flip() + labs(title = "Trigram Most Frequent")
g.TrigramTop

Week2_Assignment

Jinkwan Hong

2019년 2월 21일

Data Science Capstone

Milstone Report

Synopsis