This is the week 2 milestone report for the Coursera Data Science Capstone project. The overall goal of the project is to build a prediction model using Natural Language Processing (NLP) and to create an application that predicts the next word based on user input.
The objective of this milestone report is to conduct an exploratory data analysis describing the characteristics of the given data with a data summary. The report also explains how the prediction model will be built and embedded in a Shiny web app, in a way that is understandable to a general audience.
Exploratory analysis - perform a thorough exploratory analysis of the data, understanding the distribution of words and the relationship between words in the corpora.
Understand frequencies of words and word pairs - build figures and tables to understand variation in the frequencies of words and word pairs in the data.
Build basic n-gram model - using the exploratory analysis you performed, build a basic n-gram model for predicting the next word based on the previous 1, 2, or 3 words.
Build a model to handle unseen n-grams - in some cases people will want to type a combination of words that does not appear in the corpora. Build a model to handle cases where a particular n-gram isn’t observed.
library(dplyr)       # data manipulation
library(tidyr)       # data tidying
library(ggplot2)     # plotting
library(stringr)     # string helpers
library(RWeka)       # n-gram tokenizers
library(tm)          # text mining / corpus handling
library(wordcloud)   # word clouds
library(ngram)       # n-gram utilities
library(data.table)  # fast data tables
USblogs_txt <- "en_US.blogs.txt"
USnews_txt <- "en_US.news.txt"
UStwitter_txt <- "en_US.twitter.txt"
USblogs <- readLines(USblogs_txt, encoding = "UTF-8", skipNul = TRUE)
USnews <- readLines(USnews_txt, encoding = "UTF-8", skipNul = TRUE)
UStwitter <- readLines(UStwitter_txt, encoding = "UTF-8", skipNul = TRUE)
file_size <- sapply(c(USblogs_txt, USnews_txt, UStwitter_txt), function(x){file.size(x)/1024^2})
data_summary <- as.data.frame(sapply(list(USblogs, USnews, UStwitter), function(x){ c(length(x) , sum(str_count(x,'\\S+')) )}))
data_summary <- rbind(file_size, data_summary)
colnames(data_summary) <- c("USblogs", "USnews", "UStwitter")
row.names(data_summary) <- c("filesize(MB)", "Lines", "word_count")
data_summary
## USblogs USnews UStwitter
## filesize(MB) 2.004242e+02 196.2775 1.593641e+02
## Lines 8.992880e+05 77259.0000 2.360148e+06
## word_count 3.733413e+07 2643969.0000 3.037358e+07
The table above shows the file size, number of lines, and word count of each data set.
set.seed(777)
# sample 1% of each data set to keep the exploratory analysis manageable
data <- c(sample(USblogs, length(USblogs)*0.01),
          sample(USnews, length(USnews)*0.01),
          sample(UStwitter, length(UStwitter)*0.01))
data_source <- VectorSource(data)
data_corpus <- VCorpus(data_source)
cleaning <- function(words){
  words <- tm_map(words, content_transformer(tolower))  # convert to lower case
  words <- tm_map(words, stripWhitespace)               # collapse repeated white space
  words <- tm_map(words, removePunctuation)             # remove punctuation
  words <- tm_map(words, removeNumbers)                 # remove numbers
  words <- tm_map(words, content_transformer(function(x) gsub("http[[:alnum:]]*", "", x)))        # remove URLs
  words <- tm_map(words, content_transformer(function(x) iconv(x, "latin1", "ASCII", sub = "")))  # remove non-ASCII characters
  words <- tm_map(words, PlainTextDocument)
  return(words)
}
clean_corpus <- cleaning(data_corpus)
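As a quick, optional sanity check (not part of the original report), the first cleaned document can be inspected:
content(clean_corpus[[1]])  # show the text of the first document after cleaning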
Tokenize the cleaned corpus and create a term-document matrix for each n-gram order (unigram to quadgram).
Given the example sentence “i have a dream”, the illustration after the tokenizer definitions below shows what each tokenizer produces.
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
quadgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
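As a small illustration (an addition, not part of the original analysis), applying these tokenizers to the example sentence should yield the following n-grams:
unigram("i have a dream")   # "i" "have" "a" "dream"
bigram("i have a dream")    # "i have" "have a" "a dream"
trigram("i have a dream")   # "i have a" "have a dream"
quadgram("i have a dream")  # "i have a dream"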
uni_matrix <- TermDocumentMatrix(clean_corpus, control = list(tokenize = unigram))
bi_matrix <- TermDocumentMatrix(clean_corpus, control = list(tokenize = bigram))
tri_matrix <- TermDocumentMatrix(clean_corpus, control = list(tokenize = trigram))
quad_matrix <- TermDocumentMatrix(clean_corpus, control = list(tokenize = quadgram))
uni_ft <- findFreqTerms(uni_matrix, lowfreq = 1)    # keep unigrams appearing at least once (all terms)
bi_ft <- findFreqTerms(bi_matrix, lowfreq = 2)      # keep bigrams appearing at least twice
tri_ft <- findFreqTerms(tri_matrix, lowfreq = 3)    # keep trigrams appearing at least 3 times
quad_ft <- findFreqTerms(quad_matrix, lowfreq = 4)  # keep quadgrams appearing at least 4 times
Create a data frame of term frequencies from each n-gram matrix so the most frequent patterns can be examined.
uni_df <- rowSums(as.matrix(uni_matrix[uni_ft,]))
uni_df <- data.frame(word = names(uni_df), frequency = uni_df)
bi_df <- rowSums(as.matrix(bi_matrix[bi_ft,]))
bi_df <- data.frame(word = names(bi_df),frequency = bi_df)
tri_df <- rowSums(as.matrix(tri_matrix[tri_ft,]))
tri_df <- data.frame(word = names(tri_df),frequency = tri_df)
quad_df <- rowSums(as.matrix(quad_matrix[quad_ft,]))
quad_df <- data.frame(word = names(quad_df),frequency = quad_df)
uni_data <- uni_df %>%
arrange(desc(frequency))%>%
mutate(prob = frequency/sum(frequency), cumumalative_prob = cumsum(prob))
row.names(uni_data) <- 1:nrow(uni_data)
save(uni_data, file = "unigram.Rds")
head(uni_data)
## word frequency prob cumumalative_prob
## 1 the 29606 0.055280970 0.05528097
## 2 and 15966 0.029812064 0.08509303
## 3 you 8466 0.015807900 0.10090093
## 4 for 7848 0.014653957 0.11555489
## 5 that 7151 0.013352503 0.12890740
## 6 with 4663 0.008706856 0.13761425
Plot the top 10 most frequent words from the unigram model.
uni_data_top10 <- uni_data[1:10,]
barplot(uni_data_top10$frequency, names.arg = uni_data_top10$word, ylab = "Frequency", xlab = "Top 10 words", col = "Blue", las = 2, main = "Word Frequency of unigram")
barplot(uni_data_top10$prob, names.arg = uni_data_top10$word, ylab = "Probability", xlab = "Top 10 words", col = "yellow", main = "Probability of words from unigram model")
plot(uni_data$cumumalative_prob, ylab = "Cumulative probability" ,type = "l", main = "Coverage of words from unigram model")
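As an aside (this helper is an addition for illustration, not part of the original analysis), the coverage curve can also be read numerically: how many of the top-ranked words are needed to cover a given share of all word instances.
coverage <- function(ngram_data, target) which(ngram_data$cumumalative_prob >= target)[1]
coverage(uni_data, 0.5)  # number of top words covering 50% of all word instances
coverage(uni_data, 0.9)  # number of top words covering 90% of all word instances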
bi_data <- bi_df %>%
arrange(desc(frequency))%>%
mutate(prob = frequency/sum(frequency), cumumalative_prob = cumsum(prob))
row.names(bi_data) <- 1:nrow(bi_data)
save(bi_data, file = "bigram.Rds")
head(bi_data)
## word frequency prob cumumalative_prob
## 1 of the 2605 0.006444303 0.006444303
## 2 in the 2499 0.006182078 0.012626381
## 3 for the 1424 0.003522721 0.016149102
## 4 to the 1371 0.003391608 0.019540710
## 5 on the 1282 0.003171438 0.022712149
## 6 to be 1202 0.002973533 0.025685681
bi_data_top10 <- bi_data[1:10,]
barplot(bi_data_top10$frequency, names.arg = bi_data_top10$word, ylab = "Frequency", xlab = "Top 10 word patterns", col = "Blue", las = 2, main = "Word Frequency of bigram")
barplot(bi_data_top10$prob, names.arg = bi_data_top10$word, ylab = "Probability", xlab = "Top 10 word patterns", col = "yellow", main = "Probability of words from bigram model")
plot(bi_data$cumumalative_prob, ylab = "Cumulative probability" ,type = "l", main = "Coverage of words from bigram model")
tri_data <- tri_df %>%
arrange(desc(frequency))%>%
mutate(prob = frequency/sum(frequency), cumumalative_prob = cumsum(prob))
row.names(tri_data) <- 1:nrow(tri_data)
save(tri_data, file = "trigram.Rds")
head(tri_data, 10)
## word frequency prob cumumalative_prob
## 1 thanks for the 247 0.002816580 0.002816580
## 2 a lot of 211 0.002406066 0.005222647
## 3 one of the 202 0.002303438 0.007526085
## 4 i want to 143 0.001630652 0.009156736
## 5 to be a 131 0.001493814 0.010650550
## 6 looking forward to 130 0.001482411 0.012132961
## 7 going to be 122 0.001391185 0.013524146
## 8 i have a 110 0.001254347 0.014778494
## 9 i have to 110 0.001254347 0.016032841
## 10 it was a 106 0.001208735 0.017241576
tri_data_top10 <- tri_data[1:10,]
barplot(tri_data_top10$frequency, names.arg = tri_data_top10$word, ylab = "Frequency", col = "Blue", las = 2, main = "Word Frequency of trigram")
barplot(tri_data_top10$prob, names.arg = tri_data_top10$word, ylab = "Probability", col = "yellow", main = "Probability of words from trigram model")
plot(tri_data$cumumalative_prob, ylab = "Cumulative probability" ,type = "l", main = "Coverage of words from trigram model")
quad_data <- quad_df %>%
arrange(desc(frequency))%>%
mutate(prob = frequency/sum(frequency), cumumalative_prob = cumsum(prob))
row.names(quad_data) <- 1:nrow(quad_data)
save(quad_data, file = "quadgram.Rds")
head(quad_data, 10)
## word frequency prob cumumalative_prob
## 1 thanks for the follow 58 0.007463647 0.007463647
## 2 the end of the 56 0.007206280 0.014669927
## 3 thanks for the rt 43 0.005533393 0.020203320
## 4 for the first time 42 0.005404710 0.025608030
## 5 at the end of 41 0.005276026 0.030884056
## 6 the rest of the 41 0.005276026 0.036160082
## 7 thank you for the 40 0.005147343 0.041307425
## 8 i dont want to 38 0.004889976 0.046197401
## 9 is going to be 34 0.004375241 0.050572642
## 10 at the same time 33 0.004246558 0.054819200
quad_data_top10 <- quad_data[1:10,]
barplot(quad_data_top10$frequency, names.arg = quad_data_top10$word, ylab = "Frequency", col = "Blue", las = 2, main = "Word Frequency of quadgram")
barplot(quad_data_top10$prob, names.arg = quad_data_top10$word, ylab = "Probability", col = "yellow", main = "Probability of words from quadgram model")
plot(quad_data$cumumalative_prob, ylab = "Cumulative probability" ,type = "l", main = "Coverage of words from quadgram model")
Word clouds are another way to present the n-gram frequencies: words or phrases with a higher number of occurrences stand out more clearly from the rest.
wordcloud(uni_data$word, uni_data$frequency, max.words=100, random.order=FALSE, use.r.layout = FALSE, rot.per=0.15, colors = brewer.pal(7, "Dark2"), scale=c(4, 0.5))
wordcloud(bi_data$word, bi_data$frequency, scale = c(4, 0.5), max.words = 100, random.order = FALSE, rot.per = 0.15, use.r.layout = FALSE, colors = brewer.pal(7, "Dark2"))
wordcloud(tri_data$word, tri_data$frequency, scale = c(2, 0.5), max.words = 100, random.order = FALSE, rot.per = 0.15, use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
wordcloud(quad_data$word, quad_data$frequency, max.words=100, random.order=FALSE, use.r.layout = FALSE, rot.per=0.15, colors = brewer.pal(8, "Dark2"), scale=c(3, 0.2))
Although only 1% of the data was sampled here, memory issues may still arise when running the prediction model.
Different sample sizes will be tried to strike a balance between having enough data for a decent model and keeping memory consumption and computation time acceptable.
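As a rough sketch (the 5% rate below is only an assumed trial value, not part of the original analysis), the sampling rate can be parameterised and the memory footprint of the sample checked before rebuilding the corpus:
sample_rate <- 0.05  # assumed trial value; adjust based on available memory
data <- c(sample(USblogs, length(USblogs) * sample_rate),
          sample(USnews, length(USnews) * sample_rate),
          sample(UStwitter, length(UStwitter) * sample_rate))
format(object.size(data), units = "MB")  # rough memory footprint of the sampled text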
Build a predictive model based on the previous data modeling steps (a rough sketch of the intended back-off lookup is given below).
Evaluate the model for efficiency and accuracy - use timing software to evaluate the computational complexity of the model, and evaluate its accuracy using metrics such as perplexity and accuracy at the first, second, and third word.
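The following is only a minimal sketch of how such a back-off lookup might work, assuming the saved n-gram tables (uni_data, bi_data, tri_data, quad_data) are loaded; predict_next and its internals are hypothetical names for illustration, and the final app may use a different smoothing or back-off scheme.
# hypothetical back-off sketch: try the quadgram table with the last 3 words,
# then the trigram table with the last 2, then the bigram table with the last 1;
# fall back to the most frequent unigram if nothing matches.
predict_next <- function(phrase) {
  words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 3)
  tables <- list(bi_data, tri_data, quad_data)  # table to search, indexed by prefix length
  for (n in rev(seq_along(words))) {
    prefix <- paste(tail(words, n), collapse = " ")
    tbl <- tables[[n]]
    hits <- tbl[startsWith(as.character(tbl$word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) {
      return(word(as.character(hits$word[1]), -1))  # last token of the most frequent match
    }
  }
  as.character(uni_data$word[1])  # no match at any order: back off to the top unigram
}
predict_next("i have a")               # next-word prediction for the example phrase
system.time(predict_next("i have a"))  # simple timing check, as suggested above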