This Milestone Report covers the exploratory data analysis for the capstone project of the Coursera Data Science Specialization. In this project, we intend to create a Shiny application that uses a large text corpus to predict the next word from the preceding input.
The goal of this Milestone Report is to demonstrate that we have become familiar with the data and that we are on track to create the prediction algorithm. The motivation for this Milestone Report is to:
1. Demonstrate that the data has been downloaded and successfully loaded.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings amassed so far.
4. Outline the plan for creating the prediction algorithm and Shiny app.
SwiftKey_folder <- "../Data/final"
if (!file.exists(SwiftKey_folder)) {
SwiftKey_file <- "../Data/Coursera-SwiftKey.zip"
if (!file.exists(SwiftKey_file)) {
link <- "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(link, SwiftKey_file, method="curl") #Download the data
rm(link)
}
setwd("../Data/")
unzip("./Coursera-SwiftKey.zip") # Unzip the data
unlink("./final/de_DE/",recursive = TRUE)
unlink("./final/fi_FI/",recursive = TRUE)
unlink("./final/ru_RU/",recursive = TRUE)
unlink("./Coursera-SwiftKey.zip")
setwd("../MiestoneReport/")
}
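After unzipping and removing the non-English folders, we can quickly confirm that the English files are in place (a quick check, using the paths referenced below):
list.files("../Data/final/en_US")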
blog_file <- "../Data/final/en_US/en_US.blogs.txt"
twitter_file <- "../Data/final/en_US/en_US.twitter.txt"
news_file <- "../Data/final/en_US/en_US.news.txt"
blogs <- readLines(blog_file, encoding="UTF-8")
twitter <- readLines(twitter_file, encoding="UTF-8")
news <- readLines(news_file, encoding="UTF-8")
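Note that readLines may warn about an incomplete final line or embedded nulls (notably in the news file). If that happens, reading through a binary connection with skipNul = TRUE is a common workaround; a minimal sketch:
con <- file(news_file, open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)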
Blog
The size of the blogs file is:
utils:::format.object_size(file.info(blog_file)$size,"auto")
## [1] "200.4 Mb"
The total number of lines in the blogs file is:
length(blogs)
## [1] 899288
The total number of words in the blogs file is:
sum(sapply(gregexpr("\\S+", blogs), length))
## [1] 37334147
Twitter
The size of the twitter file is:
utils:::format.object_size(file.info(twitter_file)$size,"auto")
## [1] "159.4 Mb"
The total number of lines in the twitter file is:
length(twitter)
## [1] 2360148
The total number of words in the twitter file is:
sum(sapply(gregexpr("\\S+", twitter), length))
## [1] 30373563
News
The size of the news file is:
utils:::format.object_size(file.info(news_file)$size,"auto")
## [1] "196.3 Mb"
The total number of lines in the news file is:
length(news)
## [1] 1010242
The total number of words in the news file is:
sum(sapply(gregexpr("\\S+", news), length))
## [1] 34372530
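For convenience, the same statistics can be combined into a single summary table; a minimal sketch, using the objects and file paths defined above:
# Sketch: per-file size, line count and word count in one data frame
count_words <- function(x) sum(sapply(gregexpr("\\S+", x), length))
data.frame(
file = c("blogs", "twitter", "news"),
size = unname(sapply(c(blog_file, twitter_file, news_file),
function(f) utils:::format.object_size(file.info(f)$size, "auto"))),
lines = c(length(blogs), length(twitter), length(news)),
words = c(count_words(blogs), count_words(twitter), count_words(news)))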
# text mining
library(NLP)
library(tm)
library(rJava)
library(RWeka)
# matrix
library('Matrix')
# dataframe process
library(plyr)
# visualization
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(RColorBrewer)
library(wordcloud)
During the testing stage, we can use a smaller sample of the data to test our workflow and model. Once the workflow and model prove usable, we can go back and process the full data set.
# Draw size_sample lines (with replacement) from each of the blogs, twitter
# and news vectors loaded above, and write the combined sample to `file`.
sampling_data <- function(size_sample,seed,file) {
set.seed(seed)
sample_blogs <- sample(blogs, size = size_sample, replace = TRUE)
sample_twitter <- sample(twitter, size = size_sample, replace = TRUE)
sample_news <- sample(news, size = size_sample, replace = TRUE)
sample_data <- c(sample_blogs, sample_twitter, sample_news)
writeLines(sample_data, file)
}
sampling_data(size_sample = 1000 ,seed=1234,file="sample3000_data.txt")
sampling_data(size_sample = 10000 ,seed=1234,file="sample30000_data.txt")
sampling_data(size_sample = 50000 ,seed=1234,file="sample150000_data.txt")
This creates three sample files of different sizes: sample3000_data.txt, sample30000_data.txt, and sample150000_data.txt.
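As a quick sanity check (a sketch), the middle sample should contain 10,000 lines from each source, i.e. 30,000 lines in total:
length(readLines("sample30000_data.txt", encoding = "UTF-8"))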
Next we transform the raw lines into a clean corpus: corpus_train for training and corpus_valid for validation.
gen_corpus <- function(list){
corpus <- tm::Corpus(tm::VectorSource(list))
# transform to lowercase
corpus <- tm::tm_map(corpus, content_transformer(tolower))
# remove numbers
corpus <- tm::tm_map(corpus, removeNumbers)
# remove punctuation
corpus <- tm::tm_map(corpus, removePunctuation)
# strip whitespace
corpus <- tm::tm_map(corpus, stripWhitespace)
# remove stop words
corpus <- tm::tm_map(corpus, removeWords, stopwords("english"))
# remove URL
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
corpus <- tm::tm_map(corpus, content_transformer(removeURL))
return(corpus)
}
sampleData <- readLines("sample30000_data.txt", encoding="UTF-8")
num_valid <- 100
trainData <- head(sampleData,round(length(sampleData) - num_valid))
validData <- tail(sampleData,num_valid)
corpus_train <- gen_corpus(trainData)
corpus_valid <- gen_corpus(validData)
saveRDS(corpus_train, file = "sample30000_train_corpus.RData")
saveRDS(corpus_valid, file = "sample30000_valid_corpus.RData")
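The saved corpora can be reloaded later (for example, in the prediction application) with readRDS:
corpus_train <- readRDS("sample30000_train_corpus.RData")
corpus_valid <- readRDS("sample30000_valid_corpus.RData")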
In the fields of computational linguistics and probability, an n-gram is a contiguous sequence of n items from a given sequence of text or speech. An n-gram of size 1 is referred to as a “unigram”, size 2 as a “bigram”, and size 3 as a “trigram”. Larger sizes are sometimes referred to by the value of n, e.g., “four-gram”, “five-gram”, and so on.
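For example, a minimal illustration with the same RWeka tokenizer used below (the sentence is made up):
# Illustrative only: the bigrams of a made-up sentence,
# e.g. "the quick" "quick brown" "brown fox" "fox jumps"
RWeka::NGramTokenizer("the quick brown fox jumps",
RWeka::Weka_control(min = 2, max = 2))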
# Tokenize the corpus into n-grams with RWeka and return a data frame of
# n-grams sorted by decreasing frequency.
gen_n_gram <- function(corpus,n) {
tokenizer <- RWeka::NGramTokenizer(corpus,
RWeka::Weka_control(min=n,max=n,delimiters =" \\r\\n\\t.,;:\"()?!"))
freq_df <- data.frame(table(tokenizer))
names(freq_df) <- c("word","freq")
freq_df$word <- as.character(freq_df$word)
freq_df <- freq_df[order(freq_df$freq,decreasing = TRUE),]
rownames(freq_df) <- NULL # reset row index
return(freq_df)
}
freq_uni <- gen_n_gram(corpus=corpus_train,n=1)
freq_bi <- gen_n_gram(corpus=corpus_train,n=2)
freq_tri <- gen_n_gram(corpus=corpus_train,n=3)
# Plot the 30 most frequent n-grams as a horizontal bar chart.
makeHistogram <- function(df) {
ggplot(df[30:1,], aes(reorder(word,freq), freq)) +
labs(x = "30 Most Frequent Words", y = "Frequency") +
geom_bar(stat = "identity", fill = I("grey50")) +
coord_flip()
}
# Plot a word cloud of up to 250 of the most frequent n-grams.
makeWordCloud <- function(df) {
wordcloud(df$word,df$freq, scale=c(3,0.5), max.words=250, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, "Accent"))
}
makeHistogram(freq_uni)
makeWordCloud(freq_uni)
makeHistogram(freq_bi)
makeWordCloud(freq_bi)
makeHistogram(freq_tri)
makeWordCloud(freq_tri)
How many unique words do you need in a frequency sorted dictionary to cover 50% of all word instances in the language?
tot_word <- length(freq_uni$word) # number of unique words
tot <- sum(freq_uni$freq) # total number of word instances
ratio <- 0.5
accu <- 0
# Walk down the frequency-sorted unigram table, accumulating counts until
# the running total covers the target ratio of all word instances.
for (i in 1:nrow(freq_uni)){
if (accu > tot*ratio) {
top_word <- i
break
}
accu <- accu + freq_uni$freq[i]
}
top_word / tot_word * 100
## [1] 2.048226
How many unique words do you need in a frequency sorted dictionary to cover 90% of all word instances in the language?
# Repeat with a 90% coverage target.
ratio <- 0.9
accu <- 0
for (i in 1:nrow(freq_uni)){
if (accu > tot*ratio) {
top_word <- i
break
}
accu <- accu + freq_uni$freq[i]
}
top_word / tot_word * 100
## [1] 29.23235
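The same coverage figures can also be computed without the explicit loop; a minimal sketch using cumulative sums (the boundary condition differs slightly from the loop above, so the result may differ by a word or two):
coverage <- cumsum(freq_uni$freq) / sum(freq_uni$freq)
which(coverage >= 0.5)[1] / nrow(freq_uni) * 100 # ~50% coverage
which(coverage >= 0.9)[1] / nrow(freq_uni) * 100 # ~90% coverage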
We use a simple Katz back-off model to construct our prediction model. Katz back-off is a generative n-gram language model that estimates the conditional probability of a word given its history in the n-gram. It accomplishes this estimation by “backing off” to models with smaller histories under certain conditions, so that the model with the most reliable information about a given history is used to produce the prediction.
# Return the rows of df whose n-gram matches the regular expression.
subset_find_regex <- function(df,regex) {
df[grep(regex,df$word),]
}
# Return the row(s) with the highest frequency.
subset_find_max_freq <- function(df) {
df[df$freq==max(df$freq),]
}
# Split a string into a vector of words on whitespace.
split_string <- function(string) {
unlist(strsplit(string,split=" +"))
}
# Predict the next word: with at least two input words, look up the last two
# words in the trigram table first, then back off to the bigram table, and
# finally to the most frequent unigram.
Katz_backoff_model <- function(string) {
word_list <- split_string(string)
leng <- length(word_list)
if (leng>=2) {
prefix <- paste(tail(word_list,2), collapse = " ")
pred_from_3_gram <-
subset_find_regex(freq_tri,paste(prefix,".*",sep = " "))
if (nrow(pred_from_3_gram)==0) {
prefix <- paste(tail(word_list,1), collapse = " ")
pred_from_2_gram <-
subset_find_regex(freq_bi,paste(prefix,".*",sep = " "))
if (nrow(pred_from_2_gram)==0) {
pred_from_1_gram <- subset_find_regex(freq_uni,".*")
pred_string <- subset_find_max_freq(pred_from_1_gram)$word[1]
} else {
pred_string <- subset_find_max_freq(pred_from_2_gram)$word[1]
}
} else {
pred_string <- subset_find_max_freq(pred_from_3_gram)$word[1]
}
} else if (leng==1) {
prefix <- paste(tail(word_list,1), collapse = " ")
pred_from_2_gram <-
subset_find_regex(freq_bi,paste(prefix,".*",sep = " "))
if (nrow(pred_from_2_gram)==0) {
pred_from_1_gram <- subset_find_regex(freq_uni,".*")
pred_string <- subset_find_max_freq(pred_from_1_gram)$word[1]
} else {
pred_string <- subset_find_max_freq(pred_from_2_gram)$word[1]
}
}
pred_word_list <- split_string(pred_string)
return(tail(pred_word_list,1))
}
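Illustrative usage (the returned word depends on the sampled training data, so no fixed output is shown here):
Katz_backoff_model("thank you for")
Katz_backoff_model("happy new")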
We validate our model on the held-out validation corpus and measure both accuracy and the time spent per prediction.
tot <- 0 # number of predictions attempted
corr <- 0 # number of correct predictions
ptm <- proc.time()
# Feed each validation line to the model one growing prefix at a time and
# check whether the predicted word matches the actual next word.
for (line in sapply(corpus_valid,identity)) {
word_list <- split_string(line)
if (length(word_list)<3) {next}
test_word_list <- word_list[-length(word_list)]
test_string <- word_list[1]
i <- 3
for (word in test_word_list[-1]) {
test_string <- paste(test_string,word,sep=" ")
if (Katz_backoff_model(test_string)==word_list[i]) {
print(paste0(test_string," => ",word_list[i]))
corr <- corr + 1
}
tot <- tot + 1
i <- i + 1
}
}
## [1] " due respect monte butes opinion regarding minnesota state => colleges"
## [1] " due respect monte butes opinion regarding minnesota state colleges => universities"
## [1] " due respect monte butes opinion regarding minnesota state colleges universities chancellor steven rosenstones workforce assessment initiative find hard => believe"
## [1] " due respect monte butes opinion regarding minnesota state colleges universities chancellor steven rosenstones workforce assessment initiative find hard believe views collaboration mnscu minnesota => department"
## [1] "thats ok seely => said"
## [1] "shooting magic city wrapped last fall ms ramsey spent hold rented costumes seasontwo production starts later => year"
## [1] " workout group exercises collection workout dvds including firm series weights cardio biggest => loser"
## [1] "thanks switch buildings use fossil fuels expected drop gallonsperyear conservancy => president"
## [1] " event now attracts thousands people makes thousands => dollars"
## [1] "according indictment col james e teare sr police chief knew allegations leopold took action teare declined => comment"
## [1] "last year banner year solar costs solar panels dropped => percent"
## [1] " getting tip law => enforcement"
## [1] " getting tip law enforcement officers sent search densely wooded => area"
## [1] " getting tip law enforcement officers sent search densely wooded area west mayes home guntown miss said aaron t ford special agent => charge"
## [1] " seen unfriendly business state lewis quoted => saying"
## [1] " monitor robert warshaw praised new police chief howard jordan effort => bring"
## [1] " pentagon recently sent american military trainers yemen washington spent hundreds millions => dollars"
## [1] " worth every penny collector paid said tobias meyer served auctioneer called one great icons fine => art"
## [1] " extra time weigh likely yet => another"
## [1] "option halfcent => sales"
## [1] "option halfcent sales => tax"
## [1] "option halfcent sales tax increase sunsetting five => years"
## [1] "option halfcent sales tax increase sunsetting five years option b contracting police services san mateo => county"
## [1] "option halfcent sales tax increase sunsetting five years option b contracting police services san mateo county sheriff option c collection cuts social programs raising fees etc note three options include million annual wage reductions option half cent sales => tax"
## [1] "one tweet response dawkins retirement announcement came elway wrote congratulations hall => fame"
## [1] " twostory traditional built sits => acres"
## [1] " twostory traditional built sits acres squarefoot house features dormer bay windows white columns french doors five bedrooms bathrooms outdoor amenities include swimming => pool"
## [1] "cardinals starter jake westbrook able rely sinker needed tuesday pitched two scoreless => innings"
## [1] " pm ncaa tourney => masslowell"
## [1] " pm ncaa tourney masslowell => vs"
## [1] " pm ncaa tourney masslowell vs => miami"
## [1] " pm ncaa tourney masslowell vs miami => ohio"
## [1] " pm ncaa tourney masslowell vs miami ohio => espnu"
## [1] "mike dayton acting secretary emergency => management"
## [1] "mike dayton acting secretary emergency management => agency"
## [1] "mike dayton acting secretary emergency management agency gave estimate today touring santa => cruz"
## [1] "added pukala weve meetings since last => year"
## [1] " definitely support teachers going strike bentatou => said"
## [1] " new album smooth departure original janes => addiction"
## [1] " operations department one largest crucial areas school system also generated large amount overtime staffing cut past four => years"
## [1] "wesley matthews one blazers best players preseason played four minutes leave sore => right"
## [1] "wesley matthews one blazers best players preseason played four minutes leave sore right => achilles"
## [1] "wesley matthews one blazers best players preseason played four minutes leave sore right achilles tendon first tweaked thursdays scrimmage seattle pulled game immediately took shoe sock ice placed heel come locker => room"
## [1] "wesley matthews one blazers best players preseason played four minutes leave sore right achilles tendon first tweaked thursdays scrimmage seattle pulled game immediately took shoe sock ice placed heel come locker room second => half"
## [1] " blazers saw streaks four wins overall seven road wins snapped charlotte ended sixgame losing => streak"
## [1] "judy moorman larry rose among first customers moorman => said"
## [1] "judy moorman larry rose among first customers moorman said waitress old restaurant high => school"
## [1] "judy moorman larry rose among first customers moorman said waitress old restaurant high school counting days new big boys opened got allyoucaneat fried => chicken"
## [1] "turchi double mastectomy followed chemotherapy radiation gearing breast reconstruction surgery next => week"
## [1] "portantino said matter time confrontation law => enforcement"
## [1] "mitt romney presumptive republican nominee proposed broad largely unspecified cuts spending opposes obamas tax increases also => said"
## [1] " lots opportunities decide games cuba said cespedes represented country world baseball classic defecting last => year"
## [1] " degrees knot wind fished walleyes river couple => days"
## [1] " degrees knot wind fished walleyes river couple days => ago"
## [1] "cole emphasized speed needed decisionmaking process worries increasingly future race planned labor => day"
## [1] "cole emphasized speed needed decisionmaking process worries increasingly future race planned labor day => weekend"
## [1] " pools close time => get"
## [1] " pools close time get back serious matters like matter checking christmas displays mall next four months watching nonstop marathons real housewives big explosion movies finally make way weepy dramas academy => award"
## [1] "coconut cream pour coconut => milk"
## [1] "coconut cream pour coconut milk small saucepan place mediumhigh => heat"
## [1] "teamed ben maddow sidney meyers strick colleagues spent four years making savage eye lowbudget => film"
## [1] "teamed ben maddow sidney meyers strick colleagues spent four years making savage eye lowbudget film incorporated documentary footage shot gritty los => angeles"
## [1] "teamed ben maddow sidney meyers strick colleagues spent four years making savage eye lowbudget film incorporated documentary footage shot gritty los angeles locations story young divorced woman attempting start => new"
## [1] "proposition serves purpose effect lessen status human dignity gays => lesbians"
## [1] " edina city => council"
## [1] " edina city council endorsed concept last week allowing planning project proceed called hornets nest twostory addition cost => million"
## [1] " edina city council endorsed concept last week allowing planning project proceed called hornets nest twostory addition cost million million built north side braemar west arena include four locker rooms two => bathrooms"
pred_tot_time <- proc.time() - ptm
The overall prediction accuracy (in percent) is:
accu <- corr/tot*100
accu
## [1] 4.016787
The average time per prediction (in seconds) is:
pred_avg_time <- pred_tot_time / tot
pred_avg_time
## user system elapsed
## 0.966660072 0.006184053 0.975443645