Data Science Capstone: Milestone Report

Synopsis:

This report describes the exploratory analysis of the data downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip and outlines the plan for building the predictive text model and the Shiny application. For this report, I have used only 1% of the data, because processing the entire data set takes considerably more time and may require additional hardware resources.

Positioning Datasets for analysis:

The compressed dataset was downloaded manually and unpacked into ~/coursera/DS_Capstone/final. I created the mydocs corpus from the original datasets and used it as the source for the sample datasets in the next step. Because knitr takes too long to build this corpus, I have disabled the code chunk that contains the corpus-creation command. The sample datasets, holding 1% of the data, are read into the mydocs1 corpus in the next step.

library(tm)
library(tidyverse)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(RWeka)
library(parallel)
library(doParallel)
library(tools)

# Start a local worker cluster and register it as the foreach backend.
# One core is left for the OS. The worker count is clamped to at least 1
# because detectCores() can return 1 (which would give 0 workers) or NA,
# and makeCluster() fails on either value.
# NOTE(review): nothing visible in this report calls foreach/%dopar% --
# confirm the cluster is actually needed.
worker_count <- max(1L, detectCores() - 1L, na.rm = TRUE)
cluster <- makeCluster(worker_count)
registerDoParallel(cluster)

# Show the working directory, then have the OS `wc` utility report the
# line / word / character totals of the raw text files. The wc path is
# relative, so it is resolved against the directory printed by getwd().
getwd()
print(x = "Counts from OS:")
print(noquote("lines  words characters   filename"))
system(command = "wc final/en_US/*.txt")

# Folder holding the unpacked en_US text files.
files_folder <- "~/coursera/DS_Capstone/final/en_US"

# Build the full corpus, one document per file in files_folder.
# * VCorpus() is called explicitly: depending on the tm version, Corpus()
#   may return a SimpleCorpus for a DirSource read with readPlain, whereas
#   the rest of this report works with (and prints) a VCorpus.
# * encoding = "UTF-8" matches how the sample corpus is read further down,
#   so both corpora decode the text the same way on every platform.
mydocs <- VCorpus(DirSource(files_folder, encoding = "UTF-8"),
                  readerControl = list(reader = readPlain,
                                       language = "en_US",
                                       load = TRUE))

# Corpus overview: per-document summary, then metadata / size listing
summary(mydocs)
inspect(mydocs)

# number of content elements per document (one per line read from the
# file, as far as the reader used here is concerned)
for (doc_index in 1:3) {
  print(length(content(mydocs[[doc_index]])))
}

# word counts from corpus
for (doc_index in 1:3) {
  print(length(words(mydocs[[doc_index]])))
}

Create Sample Corpus:

In this step, I created the mydocs1 corpus from the 1% random sample files stored in ~/coursera/DS_Capstone/en_US.1/*.txt.

# work on 1% random sample first
# (disabled: each line below picks about 1% of a document's lines by index)
# NOTE(review): rbinom(n, size, prob) draws n values from Binomial(size, prob).
# With prob = .5 the drawn indices concentrate around the middle of each
# file and can repeat, so this is not a uniform 1% sample of the lines.
# Something like
#   idx <- sample(length(x), size = round(.01 * length(x)))
# (after a set.seed() call) would draw distinct, evenly spread line indices.
#blogs_text1 <- content(mydocs[[1]])[rbinom(.01*length(content(mydocs[[1]])),length(content(mydocs[[1]])),.5)]
#news_text1 <- content(mydocs[[2]])[rbinom(.01*length(content(mydocs[[2]])),length(content(mydocs[[2]])),.5)]
#twitter_text1 <- content(mydocs[[3]])[rbinom(.01*length(content(mydocs[[3]])),length(content(mydocs[[3]])),.5)]

# Locations of the 1% sample files (kept apart from the raw data) and of
# the profanity word list
files_folder1 <- "~/coursera/DS_Capstone/en_US.1"
blogs_file1 <- file.path(files_folder1, "en_US.blogs.1.txt")
news_file1 <- file.path(files_folder1, "en_US.news.1.txt")
twitter_file1 <- file.path(files_folder1, "en_US.twitter.1.txt")
profanity_file1 <- "~/coursera/DS_Capstone/profanity.txt"

# Load the profanity word list (assumed to hold one entry per line).
# readLines() on the path opens and closes the file itself, so no
# connection is left open if the read fails part-way.
# Entries are trimmed, lower-cased and de-duplicated, and blank lines are
# dropped: the corpus is lower-cased before removeWords() is applied, so
# mixed-case or space-padded entries would otherwise never match.
profanity_txt <- readLines(profanity_file1, warn = FALSE)
profanity_txt <- unique(tolower(trimws(profanity_txt)))
profanity_txt <- profanity_txt[nzchar(profanity_txt)]

# (disabled) Write each sampled text vector to its sample file, opening
# and closing one write connection per file.
#con <- file(blogs_file1, "w") 
#writeLines(blogs_text1, con)
#close(con)
#con <- file(news_file1, "w") 
#writeLines(news_text1, con)
#close(con)
#con <- file(twitter_file1, "w") 
#writeLines(twitter_text1, con)
#close(con)

# Read the 1% sample files into their own corpus, one document per file,
# decoding the text as UTF-8.
# VCorpus() is called explicitly rather than Corpus(): depending on the tm
# version, Corpus() may return a SimpleCorpus for a DirSource read with
# readPlain, and the n-gram steps later in this report pass custom
# tokenizers to TermDocumentMatrix().
# NOTE(review): tm reports custom control functions as ignored for a
# SimpleCorpus -- confirm against the installed tm version.
mydocs1 <- VCorpus(DirSource(files_folder1, encoding = "UTF-8"),
                   readerControl = list(reader = readPlain,
                                        language = "en_US",
                                        load = TRUE))

# Counts for 1% sample sets from OS (the wc path below is relative to the
# working directory printed first)
getwd()

## [1] "/Users/RamanaSonti/coursera/DS_Capstone"

print(x = "Counts for 1% sample sets from OS:")

## [1] "Counts for 1% sample sets from OS:"

print(noquote("lines  words characters   filename"))

## [1] lines  words characters   filename

system(command = "wc en_US.1/*.txt")

# Summaries and Inspect
# summary() lists one row per document in the sample corpus.
summary(mydocs1)

##                     Length Class             Mode
## en_US.blogs.1.txt   2      PlainTextDocument list
## en_US.news.1.txt    2      PlainTextDocument list
## en_US.twitter.1.txt 2      PlainTextDocument list

# inspect() prints the corpus-level metadata followed by the metadata
# count and character count of each document.
inspect(mydocs1)

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 2111606
## 
## [[2]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 2008709
## 
## [[3]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 1619154

# number of content elements (lines) in each sample document
length(content(mydocs1[[1]]))

## [1] 8992

length(content(mydocs1[[2]]))

## [1] 10102

length(content(mydocs1[[3]]))

## [1] 23601

# word counts per sample document, before any cleanup
length(words(mydocs1[[1]]))

## [1] 380297

length(words(mydocs1[[2]]))

## [1] 339873

length(words(mydocs1[[3]]))

## [1] 304552

Cleanup mydocs1 corpus:

In this step, I used tm functions to remove numbers, punctuation, stop words, profanity, and extra white space from the sample corpus. I also converted all alphabetic characters to lower case and removed all non-ASCII characters. A bar plot is used to visualize the frequency distribution of the words in the sample corpus.

# Normalise the sample corpus text before building term matrices.
# The order of the steps matters:
#   * lower-casing comes first so the stop-word and profanity lists can
#     match regardless of the original capitalisation;
#   * stop words are removed before punctuation, so entries that contain
#     an apostrophe can still match the text;
#   * whitespace is collapsed last, because every removal step (including
#     the non-ASCII removal) can leave runs of blanks behind.

# convert to lower case
mydocs1 <- tm_map(mydocs1, content_transformer(tolower))

# remove numbers
mydocs1 <- tm_map(mydocs1, removeNumbers)

# remove stopwords
mydocs1 <- tm_map(mydocs1, removeWords, stopwords("english"))

# remove words from profanity list
mydocs1 <- tm_map(mydocs1, removeWords, profanity_txt)

# remove punctuation
mydocs1 <- tm_map(mydocs1, removePunctuation)

# Text stemming -- runs longer
# mydocs1 <- tm_map(mydocs1, stemDocument, language = "english")

# remove non-ascii: bytes that cannot be converted to ASCII are replaced
# by the empty string (sub = ""), i.e. dropped
removeNonASCII <- content_transformer(
  function(x) iconv(x, from = "latin1", to = "ASCII", sub = "")
)
mydocs1 <- tm_map(mydocs1, removeNonASCII)

# remove whitespace (kept as the final step, see above)
mydocs1 <- tm_map(mydocs1, stripWhitespace)

# Term-document matrix of the cleaned corpus, reduced to one total
# frequency per term (summed over all documents), most frequent first
tdm1 <- TermDocumentMatrix(mydocs1)
m1 <- as.matrix(tdm1)
v1 <- rowSums(m1)
v1 <- sort(v1, decreasing = TRUE)
d1 <- data.frame(
  word = names(v1),
  freq = v1
)
head(d1)

##      word freq
## will will 3379
## just just 3002
## said said 2968
## one   one 2822
## like like 2741
## can   can 2328

# word cloud of up to 100 terms; the seed is fixed before drawing
set.seed(1234)
wordcloud(
  words = d1$word,
  freq = d1$freq,
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(6, "Dark2")
)

# terms with a total frequency of at least 1000
findFreqTerms(tdm1, lowfreq = 1000)

##  [1] "also"   "back"   "best"   "can"    "day"    "even"   "first" 
##  [8] "get"    "going"  "good"   "got"    "great"  "just"   "know"  
## [15] "last"   "like"   "love"   "make"   "much"   "new"    "now"   
## [22] "one"    "people" "really" "said"   "see"    "thanks" "think" 
## [29] "time"   "today"  "two"    "want"   "way"    "well"   "will"  
## [36] "work"   "year"

# First few terms associated with "universe" at a correlation of 0.3 or more.
# NOTE(review): the sample corpus holds only three documents, so each
# correlation is computed over three points and is not informative (every
# value listed in the output is 1).
head(as.data.frame(findAssocs(tdm1, terms = "universe", corlimit = 0.3)))

##             universe
## actual             1
## airwaves           1
## alive              1
## arrangement        1
## asap               1
## award              1

# most frequent terms again, ahead of the bar chart
head(d1)

##      word freq
## will will 3379
## just just 3002
## said said 2968
## one   one 2822
## like like 2741
## can   can 2328

# bar chart of the first ten rows of d1 (the ten most frequent terms)
top_terms <- d1[1:10, ]
barplot(
  height = top_terms$freq,
  names.arg = top_terms$word,
  las = 2,
  col = "green",
  main = "Most Frequent Words",
  ylab = "Word Frequencies"
)

## Tokenization and generating n-grams:

In this step, I have generated unigrams, bigrams, trigrams, and quadrigrams for examination. This sets the stage for computing the probability of each word that appears in a bigram or trigram.

# word counts per sample document, after the cleanup steps
mydocs1[[1]] %>% words() %>% length()

## [1] 196275

mydocs1[[2]] %>% words() %>% length()

## [1] 192144

mydocs1[[3]] %>% words() %>% length()

## [1] 167667

# generate unigrams
# Tokenizer: split a document into words and return every 1-word n-gram
# as a single string. vapply() always yields a character vector --
# character(0) when the document has no tokens -- whereas
# unlist(lapply(...)) would return NULL in that case.
UnigramTokenizer <- function(x) {
  vapply(ngrams(words(x), 1), paste, character(1),
         collapse = " ", USE.NAMES = FALSE)
}
tdm0 <- TermDocumentMatrix(mydocs1, control = list(tokenize = UnigramTokenizer))
# NOTE(review): with three documents the 0.7 sparsity cap appears to keep
# every term (terms seen in a single document are still present in the
# output) -- confirm the threshold is intended.
tdm10 <- removeSparseTerms(tdm0[, 1:3], 0.7)
head(as.matrix(tdm10))

##        Docs
## Terms   en_US.blogs.1.txt en_US.news.1.txt en_US.twitter.1.txt
##   aaa                   0                1                   0
##   aaaah                 0                8                   0
##   aacc                  4                0                   0
##   aaja                  2                0                   0
##   aam                   0                0                  12
##   aamir                 4                0                   0

# generate bigrams
# Tokenizer: split a document into words and return every run of two
# consecutive words as one space-separated string. vapply() always yields
# a character vector -- character(0) when the document has fewer than two
# tokens -- whereas unlist(lapply(...)) would return NULL in that case.
BigramTokenizer <- function(x) {
  vapply(ngrams(words(x), 2), paste, character(1),
         collapse = " ", USE.NAMES = FALSE)
}
tdm2 <- TermDocumentMatrix(mydocs1, control = list(tokenize = BigramTokenizer))
# NOTE(review): with three documents the 0.7 sparsity cap appears to keep
# every term (bigrams seen in a single document are still present in the
# output) -- confirm the threshold is intended.
tdm20 <- removeSparseTerms(tdm2[, 1:3], 0.7)
head(as.matrix(tdm20))

##                  Docs
## Terms             en_US.blogs.1.txt en_US.news.1.txt en_US.twitter.1.txt
##   aa bunch                        0                0                   1
##   aa member                       0                0                   7
##   aaa midatlantic                 0                1                   0
##   aaaah swing                     0                8                   0
##   aacc challenged                 4                0                   0
##   aaja ni                         1                0                   0

# generate trigrams
# Tokenizer: split a document into words and return every run of three
# consecutive words as one space-separated string. vapply() always yields
# a character vector -- character(0) when the document has fewer than
# three tokens -- whereas unlist(lapply(...)) would return NULL in that case.
TrigramTokenizer <- function(x) {
  vapply(ngrams(words(x), 3), paste, character(1),
         collapse = " ", USE.NAMES = FALSE)
}
tdm3 <- TermDocumentMatrix(mydocs1, control = list(tokenize = TrigramTokenizer))
# NOTE(review): with three documents the 0.7 sparsity cap appears to keep
# every term (trigrams seen in a single document are still present in the
# output) -- confirm the threshold is intended.
tdm30 <- removeSparseTerms(tdm3[, 1:3], 0.7)
head(as.matrix(tdm30))

##                           Docs
## Terms                      en_US.blogs.1.txt en_US.news.1.txt
##   aa bunch matchup                         0                0
##   aa member walks                          0                0
##   aaa midatlantic maryland                 0                1
##   aaaah swing miss                         0                8
##   aacc challenged founder                  4                0
##   aaja ni aaja                             1                0
##                           Docs
## Terms                      en_US.twitter.1.txt
##   aa bunch matchup                           1
##   aa member walks                            7
##   aaa midatlantic maryland                   0
##   aaaah swing miss                           0
##   aacc challenged founder                    0
##   aaja ni aaja                               0

# generate quadrigrams
# Tokenizer: split a document into words and return every run of four
# consecutive words as one space-separated string. vapply() always yields
# a character vector -- character(0) when the document has fewer than
# four tokens -- whereas unlist(lapply(...)) would return NULL in that case.
QuadrigramTokenizer <- function(x) {
  vapply(ngrams(words(x), 4), paste, character(1),
         collapse = " ", USE.NAMES = FALSE)
}
tdm4 <- TermDocumentMatrix(mydocs1, control = list(tokenize = QuadrigramTokenizer))
# NOTE(review): with three documents the 0.7 sparsity cap appears to keep
# every term (quadrigrams seen in a single document are still present in
# the output) -- confirm the threshold is intended.
tdm40 <- removeSparseTerms(tdm4[, 1:3], 0.7)
head(as.matrix(tdm40))

##                                      Docs
## Terms                                 en_US.blogs.1.txt en_US.news.1.txt
##   aa bunch matchup tonight                            0                0
##   aa member walks exchange                            0                0
##   aaa midatlantic maryland department                 0                1
##   aaaah swing miss actually                           0                8
##   aacc challenged founder dr                          4                0
##   aaja ni aaja now                                    1                0
##                                      Docs
## Terms                                 en_US.twitter.1.txt
##   aa bunch matchup tonight                              1
##   aa member walks exchange                              7
##   aaa midatlantic maryland department                   0
##   aaaah swing miss actually                             0
##   aacc challenged founder dr                            0
##   aaja ni aaja now                                      0

# Shut down the worker cluster created at the top of the report and switch
# foreach back to sequential execution, so that no worker processes are
# left running once the report has finished.
stopCluster(cluster)
registerDoSEQ()

Conclusion:

At a high level, I have been looking into the following steps to complete the project.

Stemming as part of the cleanup
Assign probabilities to bigrams
Apply Smoothing for the words that are not seen in the corpus
Building model that works
Split the original dataset into training and held-out datasets
Train the model on the training dataset
Evaluate the model against held-out dataset
Fine tune the model against training dataset
Build Shiny App
Prepare slide deck for presentation

References:

http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know
http://tm.r-forge.r-project.org/faq.html
NLP Stanford University - Lecture Videos