Jinkwan Hong
Saturday, Feb 30, 2019
This report was prepared as a part of Data Scinece Capstone. The final goal is to create word prediction algorithm and the Shiny app that allows the public to use easily.
As for this documents, I am going to illustrate the data summaries to grasp the data profile.
#library(testthat)
#library(tokenizers)
library(stopwords)
library(sentimentr)
library(dplyr)
library(ggplot2)
library(NLP)
library(openNLP)
library(RWeka)
library(tm)
set.seed(43)
Sys.setlocale("LC_ALL", "American")
## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
destinyDirectory = "./data/"
destinyFile = "Coursera_SwiftKey.zip"
destiny = paste0(destinyDirectory, destinyFile)
dataURL = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
blogsFile <- unz(destiny, 'final/en_US/en_US.blogs.txt')
blogs <- readLines(blogsFile, skipNul = TRUE)
close(blogsFile)
newsFile <- unz(destiny, 'final/en_US/en_US.news.txt')
news <- readLines(newsFile, skipNul = TRUE)
close(newsFile)
twitterFile <- unz(destiny, 'final/en_US/en_US.twitter.txt')
twitter <- readLines(twitterFile, skipNul = TRUE)
close(twitterFile)
profanityList <- lexicon::profanity_banned # Banned words
#docsOrg <- Corpus(DirSource(unz(destiny, 'final/en_US/')), readerControl = list(readPlain,language="en_US", load = TRUE))
#blogsOrg <- Corpus(VectorSource(blogs), readerControl = list(readPlain,language="en_US", load = TRUE))
#newsOrg <- Corpus(VectorSource(news), readerControl = list(readPlain,language="en_US", load = TRUE))
#twitterOrg <- Corpus(VectorSource(twitter), readerControl = list(readPlain,language="en_US", load = TRUE))
docsOrg <- Corpus(VectorSource(c(blogs, news, twitter)), readerControl = list(readPlain,language="en_US", load = TRUE))
#docsOrg <- Corpus(DirSource('./data/final/en_US'), readerControl = list(readPlain,language="en_US", load = TRUE))
#tdmOrg <- TermDocumentMatrix(docsOrg)
#dtmBlogs <- DocumentTermMatrix(blogsOrg)
#dtmNews <- DocumentTermMatrix(newsOrg)
#dtmTwitter <- DocumentTermMatrix(twitterOrg)
dtmOrg <- DocumentTermMatrix(docsOrg)
#length(docsOrg)
#object.size(dtmOrg)
The summary plots says that there are 800,000 lines in blogs, 1 million lines in news, and 2 million lines in twitter. However number of words goes opposite way. Blogs 37 million, News 34 million, and Twitter 30 million words. It’s probably because twitter limits the number of character on each twit by 140 bytes.
The size of docsOrg corpus is 1.5 giga bytes which is quite big to work with. I am going to randomly sample then analyze.
Here I am randomly sampling 1% of the data in order to perform explarotory analysis and turn samples into files to avoid unnecessary computing and drop the original data from the memory. Finally I am writing the text into files so I do not have to go through the same process all over again.
blogsSample <- blogs[rbinom(length(blogs)*.01, length(blogs), 0.5)]
newsSample <- news[rbinom(length(news)*.01, length(news), 0.5)]
twitterSample <- twitter[rbinom(length(twitter)*.01, length(twitter), 0.5)]
rm(blogs) # Remove from memory to save the memory.
rm(news)
rm(twitter)
rm(docsOrg)
rm(dtmOrg)
write.table(blogsSample, file = './data/sample/en_US.blogsSample.csv', row.names = FALSE, col.names = FALSE, sep = ",", quote = FALSE)
write.table(newsSample, file = './data/sample/en_US.newsSample.csv', row.names = FALSE, col.names = FALSE, sep = ",", quote = FALSE)
write.table(twitterSample, file = './data/sample/en_US.twitterSample.csv', row.names = FALSE, col.names = FALSE, sep = ",", quote = FALSE)
write.table(profanityList, file = './data/profanityList.csv', row.names = FALSE, col.names = FALSE, sep = ",", quote = FALSE)
After sampling and cleanup word counts for the whole text went down to around 450,000 and the lines counts
# Making Corpus
#docsSamples <- Corpus(VectorSource(c(blogsSample, newsSample, twitterSample)), readerControl = list(readPlain,language="en_US", load = TRUE))
docsSamples <- VCorpus(DirSource('./data/sample'), readerControl = list(readPlain,language="en_US", load = TRUE))
There are lots of irregularity in the data since they are from different sources. Here I am removing the followings utilizing tm package.
removeURL <- function(x) gsub("http:[[:alnum:]]*", "", x)
removeHashTags <- function(x) gsub("#\\S+", "", x)
removeTwitterHandles <- function(x) gsub("@\\S+", "", x)
removeHTMLTags <- function(x) gsub("<(/)?([a-zA-Z]*)(\\s[a-zA-Z]*=[^>]*)?(\\s)*(/)?>","",x)
docsSamples <- tm_map(docsSamples, content_transformer(tolower))
docsSamples <- tm_map(docsSamples, removeWords, stopwords("english"))
docsSamples <- tm_map(docsSamples, removePunctuation)
docsSamples <- tm_map(docsSamples, removeNumbers)
docsSamples <- tm_map(docsSamples, stripWhitespace)
#docsSamples <- tm_map(docsSamples, stemDocument)
docsSamples <- tm_map(docsSamples, content_transformer(removeURL))
docsSamples <- tm_map(docsSamples, content_transformer(removeHashTags))
docsSamples <- tm_map(docsSamples, content_transformer(removeTwitterHandles))
docsSamples <- tm_map(docsSamples, content_transformer(removeHTMLTags))
dtmSamples <- DocumentTermMatrix(docsSamples)
#rowSums(as.matrix(dtmSamples))
tdmSamples <- TermDocumentMatrix(docsSamples, control = list(tokenize = RWeka::WordTokenizer))
tdmMatrix <- as.matrix(tdmSamples)
tdmMatrixSorted <- sort(rowSums(tdmMatrix), decreasing = TRUE)
#tdmSamples <- TermDocumentMatrix(docsSamples, control = list(tokenize = NLP::wordpunct_tokenizer))
Corpus is now ready for analysis.
sum(tdmMatrix)
## [1] 536193
length(tdmMatrix[,1])
## [1] 23239
freqWords <- findFreqTerms(tdmSamples, 2)
length(freqWords)
## [1] 20502
There are total of 540,627 words in the corpus and 23,419 are distinctive. 20,547 of them are used more than 2 times.
n <- 25L # variable to set top n words
dfTop <- data.frame(words = names(tdmMatrixSorted[1:n]), frequency = tdmMatrixSorted[1:n])
dfTop$words <- reorder(dfTop$words, dfTop$frequency)
g.top <- ggplot(dfTop, aes(x = words, y = frequency))
g.top <- g.top + geom_bar(stat = "identity") + coord_flip() + labs(title = "Most Frequent")
g.top
I reduced the sparse terms to reduce computation.
#BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2)) #RWeka
#TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3)) #RWeka
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = TRUE) # OpenNLP
TrigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE) # OpenNLP
#tdmSamplesSingle <- TermDocumentMatrix(docsSamples[], control = list(tokenize = RWeka::WordTokenizer))
tdmSamplesBigram <- TermDocumentMatrix(docsSamples, control = list(tokenize = BigramTokenizer))
tdmSamplesBigram <- removeSparseTerms(tdmSamplesBigram, .99)
tdmSamplesBigramMatrix <- as.matrix(tdmSamplesBigram)
tdmSamplesBigramMatrixSorted <- sort(rowSums(tdmSamplesBigramMatrix), decreasing = TRUE)
tdmSamplesTrigram <- TermDocumentMatrix(docsSamples, control = list(tokenize = TrigramTokenizer))
tdmSamplesTrigram <- removeSparseTerms(tdmSamplesTrigram, .99)
tdmSamplesTrigramMatrix <- as.matrix(tdmSamplesTrigram)
tdmSamplesTrigramMatrixSorted <- sort(rowSums(tdmSamplesTrigramMatrix), decreasing = TRUE)
n <- 25L # variable to set top n words
dfBigramTop <- data.frame(words = names(tdmSamplesBigramMatrixSorted[1:n]), frequency = tdmSamplesBigramMatrixSorted[1:n])
dfBigramTop$words <- reorder(dfBigramTop$words, dfBigramTop$frequency)
g.BigramTop <- ggplot(dfBigramTop, aes(x = words, y = frequency))
g.BigramTop <- g.BigramTop + geom_bar(stat = "identity") + coord_flip() + labs(title = "Bigram Most Frequent")
g.BigramTop
dftdmTrigram <- data.frame(words = names(tdmSamplesTrigramMatrixSorted[1:n]), frequency = tdmSamplesTrigramMatrixSorted[1:n])
dftdmTrigram$words <- reorder(dftdmTrigram$words, dftdmTrigram$frequency)
g.TrigramTop <- ggplot(dftdmTrigram, aes(x = words, y = frequency))
g.TrigramTop <- g.TrigramTop + geom_bar(stat = "identity") + coord_flip() + labs(title = "Trigram Most Frequent")
g.TrigramTop