This report describes the exploratory analysis of the data downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip and the plan for building the predictive text model and the Shiny application. For this report, I have considered only 1% of the data, because processing the entire data set takes more time and may require additional hardware resources.
The compressed dataset was downloaded manually and unpacked into ~/coursera/DS_Capstone/final. I was able to create the mydocs corpus from the original datasets and use it as the source for the sample datasets in the next step. Because knitr takes too long to create the corpus, I have disabled the code chunk that contains the corpus creation command. The sample datasets with 1% of the data will be read into the mydocs1 corpus in the next step.
library(tm)
library(tidyverse)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(RWeka)
library(parallel)
library(doParallel)
library(tools)
# Start a local worker cluster and register it as the foreach backend.
# By convention one core is left for the OS, but at least one worker is always
# requested: detectCores() returns 1 on a single-core host and NA when the
# core count cannot be determined, and makeCluster() fails for both 0 and NA.
cluster <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
registerDoParallel(cluster)
# Folder that holds the unpacked en_US text files.
files_folder <- "~/coursera/DS_Capstone/final/en_US"
getwd()
print("Counts from OS:")
print("lines words characters filename", quote=FALSE)
# Line/word/character counts from the OS `wc` tool.
# The glob is built from files_folder so the command does not depend on the
# current working directory; path.expand() resolves "~" because the shell does
# not expand it inside quotes. intern = TRUE returns the output to R so that
# cat() places it in the knitted report (plain system() output is not captured).
wc_cmd <- paste0("wc ", shQuote(path.expand(files_folder)), "/*.txt")
cat(system(wc_cmd, intern = TRUE), sep = "\n")
# Read every file in files_folder into a corpus, one document per file.
# - encoding = "UTF-8" matches how the sample corpus (mydocs1) is read.
# - VCorpus() is called explicitly: in newer tm releases Corpus() may return a
#   SimpleCorpus for a DirSource read with readPlain, which stores each file as
#   a single string and would break the per-line counts taken from
#   content(mydocs[[i]]).
mydocs <- VCorpus(
  DirSource(files_folder, encoding = "UTF-8"),
  readerControl = list(
    reader = readPlain,
    language = "en_US",
    load = TRUE
  )
)
# Summaries and Inspect
summary(mydocs)
inspect(mydocs)
# number of content elements (rows) in each of the three documents
for (doc_idx in seq_len(3)) {
  print(NROW(content(mydocs[[doc_idx]])))
}
# word counts from corpus
for (doc_idx in seq_len(3)) {
  print(length(words(mydocs[[doc_idx]])))
}
In this step, I have created the mydocs1 corpus from the 1% random sample files in ~/coursera/DS_Capstone/en_US.1/*.txt.
# work on 1% random sample first
# (Disabled: this needs the full mydocs corpus. Enable it once to regenerate
# the sample files.)
# sample.int() draws 1% of the line indices uniformly and without replacement.
# The previous rbinom(n, size, .5) call produced indices that cluster around
# the middle of each file and repeat, so it was not a uniform 1% sample.
#set.seed(1234)
#sample_lines <- function(doc, frac = .01) {
#  lines <- content(doc)
#  lines[sort(sample.int(length(lines), ceiling(frac * length(lines))))]
#}
#blogs_text1 <- sample_lines(mydocs[[1]])
#news_text1 <- sample_lines(mydocs[[2]])
#twitter_text1 <- sample_lines(mydocs[[3]])
# write files to a separate location
files_folder1 <- "~/coursera/DS_Capstone/en_US.1"
blogs_file1 <- file.path(files_folder1, "en_US.blogs.1.txt")
news_file1 <- file.path(files_folder1, "en_US.news.1.txt")
twitter_file1 <- file.path(files_folder1, "en_US.twitter.1.txt")
# Profanity word list, one entry per line. readLines() opens the path itself
# and closes the connection again, even when reading fails.
profanity_file1 <- "~/coursera/DS_Capstone/profanity.txt"
profanity_txt <- readLines(profanity_file1)
# (Disabled together with the sampling above.)
#writeLines(blogs_text1, blogs_file1)
#writeLines(news_text1, news_file1)
#writeLines(twitter_text1, twitter_file1)
# Read the 1% sample files into a corpus, one document per file.
# VCorpus() is called explicitly: in newer tm releases Corpus() may return a
# SimpleCorpus for a DirSource read with readPlain, which stores each file as
# a single string and would break the per-line counts taken from
# content(mydocs1[[i]]). The recorded inspect() output reports a VCorpus.
mydocs1 <- VCorpus(
  DirSource(files_folder1, encoding = "UTF-8"),
  readerControl = list(
    reader = readPlain,
    language = "en_US",
    load = TRUE
  )
)
# Counts for 1% sample sets from OS:
getwd()
## [1] "/Users/RamanaSonti/coursera/DS_Capstone"
print("Counts for 1% sample sets from OS:")
## [1] "Counts for 1% sample sets from OS:"
print("lines words characters filename", quote=FALSE)
## [1] lines words characters filename
# Line/word/character counts from the OS `wc` tool.
# The glob is built from files_folder1 so the command does not depend on the
# current working directory; path.expand() resolves "~" because the shell does
# not expand it inside quotes. intern = TRUE returns the output to R so that
# cat() places it in the knitted report (plain system() output is not captured,
# which is why no counts follow this call in the recorded output).
wc_cmd1 <- paste0("wc ", shQuote(path.expand(files_folder1)), "/*.txt")
cat(system(wc_cmd1, intern = TRUE), sep = "\n")
# Summaries and Inspect
summary(mydocs1)
## Length Class Mode
## en_US.blogs.1.txt 2 PlainTextDocument list
## en_US.news.1.txt 2 PlainTextDocument list
## en_US.twitter.1.txt 2 PlainTextDocument list
inspect(mydocs1)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 2111606
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 2008709
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 1619154
# number of content elements (rows) in each of the three sample documents
for (doc_idx in seq_len(3)) {
  print(NROW(content(mydocs1[[doc_idx]])))
}
## [1] 8992
## [1] 10102
## [1] 23601
# word counts from mydocs1 corpus
for (doc_idx in seq_len(3)) {
  print(length(words(mydocs1[[doc_idx]])))
}
## [1] 380297
## [1] 339873
## [1] 304552
In this step, I have used tm functions to remove numbers, punctuation, stop words, profanity, and extra white space from the sample corpus. I have also converted all alphabetic characters to lower case and removed all non-ASCII characters. I have used a bar plot to visualize the frequency distribution of the words in the sample corpus.
# Transformer that deletes every character iconv() cannot convert to ASCII
# (sub = "" replaces non-convertible bytes with nothing).
removeNonASCII <- content_transformer(
  function(x) iconv(x, from = "latin1", to = "ASCII", sub = "")
)

# Cleaning pipeline for the sample corpus; the steps run top to bottom.
mydocs1 <- mydocs1 %>%
  # convert to lower case
  tm_map(content_transformer(tolower)) %>%
  # remove numbers
  tm_map(removeNumbers) %>%
  # remove stopwords
  tm_map(removeWords, stopwords("english")) %>%
  # remove words from profanity list
  tm_map(removeWords, profanity_txt) %>%
  # remove punctuation
  tm_map(removePunctuation) %>%
  # remove whitespace
  tm_map(stripWhitespace) %>%
  # Text stemming -- runs longer
  # tm_map(stemDocument, language = "english") %>%
  # remove non-ascii
  tm_map(removeNonASCII)
# build tdm
tdm1 <- TermDocumentMatrix(mydocs1)
m1 <- as.matrix(tdm1)
# total count of each term across the documents, most frequent first
v1 <- sort(rowSums(m1), decreasing = TRUE)
d1 <- data.frame(word = names(v1), freq = v1)
head(d1)
## word freq
## will will 3379
## just just 3002
## said said 2968
## one one 2822
## like like 2741
## can can 2328
# generate word cloud (seed fixed so the layout is reproducible)
set.seed(1234)
wordcloud(
  words = d1$word,
  freq = d1$freq,
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(6, "Dark2")
)
# terms that occur at least 1000 times
findFreqTerms(tdm1, lowfreq = 1000)
## [1] "also" "back" "best" "can" "day" "even" "first"
## [8] "get" "going" "good" "got" "great" "just" "know"
## [15] "last" "like" "love" "make" "much" "new" "now"
## [22] "one" "people" "really" "said" "see" "thanks" "think"
## [29] "time" "today" "two" "want" "way" "well" "will"
## [36] "work" "year"
# terms associated with "universe" (correlation limit 0.3)
head(as.data.frame(findAssocs(tdm1, terms = "universe", corlimit = 0.3)))
## universe
## actual 1
## airwaves 1
## alive 1
## arrangement 1
## asap 1
## award 1
head(d1)
## word freq
## will will 3379
## just just 3002
## said said 2968
## one one 2822
## like like 2741
## can can 2328
# bar plot of the ten most frequent words
barplot(
  d1$freq[1:10],
  las = 2,
  names.arg = d1$word[1:10],
  col = "green",
  main = "Most Frequent Words",
  ylab = "Word Frequencies"
)
## Tokenization and generating n-grams: In this step, I have generated unigrams, bigrams, trigrams, and quadrigrams for examination. This sets the stage for computing the probabilities of each word that appears in a bigram or trigram.
# word counts from corpus after cleanup
for (doc_idx in seq_len(3)) {
  print(length(words(mydocs1[[doc_idx]])))
}
## [1] 196275
## [1] 192144
## [1] 167667
# generate unigrams
# Tokenizer: split document x into words and return every 1-word sequence as
# a single space-joined string.
UnigramTokenizer <- function(x) {
  grams <- ngrams(words(x), 1)
  unlist(lapply(grams, paste, collapse = " "), use.names = FALSE)
}
tdm0 <- TermDocumentMatrix(
  mydocs1,
  control = list(tokenize = UnigramTokenizer)
)
# NOTE(review): with three documents the 0.7 sparsity threshold appears to keep
# every term (terms found in a single document are still listed below);
# confirm this is intended.
tdm10 <- removeSparseTerms(tdm0[, 1:3], sparse = 0.7)
head(as.matrix(tdm10))
## Docs
## Terms en_US.blogs.1.txt en_US.news.1.txt en_US.twitter.1.txt
## aaa 0 1 0
## aaaah 0 8 0
## aacc 4 0 0
## aaja 2 0 0
## aam 0 0 12
## aamir 4 0 0
# generate bigrams
# Tokenizer: split document x into words and return every run of 2 consecutive
# words as a single space-joined string.
BigramTokenizer <- function(x) {
  grams <- ngrams(words(x), 2)
  unlist(lapply(grams, paste, collapse = " "), use.names = FALSE)
}
tdm2 <- TermDocumentMatrix(
  mydocs1,
  control = list(tokenize = BigramTokenizer)
)
# NOTE(review): with three documents the 0.7 sparsity threshold appears to keep
# every term (bigrams found in a single document are still listed below);
# confirm this is intended.
tdm20 <- removeSparseTerms(tdm2[, 1:3], sparse = 0.7)
head(as.matrix(tdm20))
## Docs
## Terms en_US.blogs.1.txt en_US.news.1.txt en_US.twitter.1.txt
## aa bunch 0 0 1
## aa member 0 0 7
## aaa midatlantic 0 1 0
## aaaah swing 0 8 0
## aacc challenged 4 0 0
## aaja ni 1 0 0
# generate trigrams
# Tokenizer: split document x into words and return every run of 3 consecutive
# words as a single space-joined string.
TrigramTokenizer <- function(x) {
  grams <- ngrams(words(x), 3)
  unlist(lapply(grams, paste, collapse = " "), use.names = FALSE)
}
tdm3 <- TermDocumentMatrix(
  mydocs1,
  control = list(tokenize = TrigramTokenizer)
)
# NOTE(review): with three documents the 0.7 sparsity threshold appears to keep
# every term (trigrams found in a single document are still listed below);
# confirm this is intended.
tdm30 <- removeSparseTerms(tdm3[, 1:3], sparse = 0.7)
head(as.matrix(tdm30))
## Docs
## Terms en_US.blogs.1.txt en_US.news.1.txt
## aa bunch matchup 0 0
## aa member walks 0 0
## aaa midatlantic maryland 0 1
## aaaah swing miss 0 8
## aacc challenged founder 4 0
## aaja ni aaja 1 0
## Docs
## Terms en_US.twitter.1.txt
## aa bunch matchup 1
## aa member walks 7
## aaa midatlantic maryland 0
## aaaah swing miss 0
## aacc challenged founder 0
## aaja ni aaja 0
# generate quadrigrams
# Tokenizer: split document x into words and return every run of 4 consecutive
# words as a single space-joined string.
QuadrigramTokenizer <- function(x) {
  grams <- ngrams(words(x), 4)
  unlist(lapply(grams, paste, collapse = " "), use.names = FALSE)
}
tdm4 <- TermDocumentMatrix(
  mydocs1,
  control = list(tokenize = QuadrigramTokenizer)
)
# NOTE(review): with three documents the 0.7 sparsity threshold appears to keep
# every term (quadrigrams found in a single document are still listed below);
# confirm this is intended.
tdm40 <- removeSparseTerms(tdm4[, 1:3], sparse = 0.7)
head(as.matrix(tdm40))
## Docs
## Terms en_US.blogs.1.txt en_US.news.1.txt
## aa bunch matchup tonight 0 0
## aa member walks exchange 0 0
## aaa midatlantic maryland department 0 1
## aaaah swing miss actually 0 8
## aacc challenged founder dr 4 0
## aaja ni aaja now 1 0
## Docs
## Terms en_US.twitter.1.txt
## aa bunch matchup tonight 1
## aa member walks exchange 7
## aaa midatlantic maryland department 0
## aaaah swing miss actually 0
## aacc challenged founder dr 0
## aaja ni aaja now 0
# Shut down the worker processes started by makeCluster() and switch foreach
# back to its sequential backend, so no workers are left running once the
# analysis is done.
stopCluster(cluster)
registerDoSEQ()
At a high level, I have been looking into the following steps to complete the project.