suppressMessages(library(tm))
suppressMessages(library(knitr))
suppressMessages(library(ggplot2))
suppressMessages(library(wordcloud))
# Working directory and path to the saved workspace (.RData)
directory <- getwd()
workspace <- file.path(directory, ".RData")
The goal of this project is to show that we have become comfortable working with the data and are on track to create the prediction algorithm. The report is published on RPubs (http://rpubs.com/) and explains the exploratory analysis and the goals for the eventual app and algorithm. It is meant to be concise, describing only the major features of the data and briefly summarizing the plan for the prediction algorithm and Shiny app in a way that a non-data-scientist manager can follow, using tables and plots to illustrate important summaries of the data set. The motivation for this project is to:

1. Demonstrate that the data has been downloaded and successfully loaded.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings amassed so far.
4. Get feedback on the plans for creating a prediction algorithm and Shiny app.
This is the training data that will serve as the basis for most of the capstone. The data must be downloaded from the link below, not from external websites.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
destfile <- paste0(getwd(),"/Coursera-SwiftKey.zip")
download.file(url,destfile)
unzip(destfile)
blogs <- readLines("final/en_US/en_US.blogs.txt")
news <- readLines("final/en_US/en_US.news.txt")
twitter <- readLines("final/en_US/en_US.twitter.txt")
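On some platforms readLines() warns about an incomplete final line or may stop early on the news file because it contains embedded control characters. A minimal workaround, assuming the same file path, is to read the file through a binary connection and skip embedded nulls:
# Alternative read for the news file: open a binary connection so embedded
# control characters do not truncate the file, and skip embedded nulls.
con <- file("final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)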
blogs:
# File size (Mb), longest line (characters) and word count for the blogs file
blogs_size <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2
blogs_nchars <- max(nchar(blogs))
blogs_wordcount <- sum(sapply(strsplit(blogs, "\\s+"), length))
blogs_summary <- c("final/en_US/en_US.blogs.txt",
                   format(round(blogs_size, 2), nsmall = 2),
                   length(blogs), blogs_nchars, blogs_wordcount)
news:
# File size (Mb), longest line (characters) and word count for the news file
news_size <- file.info("final/en_US/en_US.news.txt")$size / 1024^2
news_nchars <- max(nchar(news))
news_wordcount <- sum(sapply(strsplit(news, "\\s+"), length))
news_summary <- c("final/en_US/en_US.news.txt",
                  format(round(news_size, 2), nsmall = 2),
                  length(news), news_nchars, news_wordcount)
twitter:
# File size (Mb), longest line (characters) and word count for the twitter file
twitter_size <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2
twitter_nchars <- max(nchar(twitter))
twitter_wordcount <- sum(sapply(strsplit(twitter, "\\s+"), length))
twitter_summary <- c("final/en_US/en_US.twitter.txt",
                     format(round(twitter_size, 2), nsmall = 2),
                     length(twitter), twitter_nchars, twitter_wordcount)
Complete summary
# Combine the per-file summaries into a single table
summary <- data.frame(matrix(c(blogs_summary, news_summary, twitter_summary),
                             nrow = 3, byrow = TRUE))
colnames(summary) <- c("FILE", "SIZE (Mb)", "LINE COUNT", "MAX LINE LENGTH", "WORD COUNT")
saveRDS(summary, 'summary.rds')
summary <- readRDS('summary.rds')
print(kable(summary, "rst"))
##
##
## ============================= ========= ========== =============== ==========
## FILE                          SIZE (Mb) LINE COUNT MAX LINE LENGTH WORD COUNT
## ============================= ========= ========== =============== ==========
## final/en_US/en_US.blogs.txt   200.42    899288     483415          899347
## final/en_US/en_US.news.txt    196.28    77259      14556           77260
## final/en_US/en_US.twitter.txt 159.36    2360148    1105776         2360169
## ============================= ========= ========== =============== ==========
Sample the data and create the corpus for each text file. A 1% random sample of each file keeps the corpus small enough to process while still giving a representative picture of word frequencies.
blogs: High frequency words in blogs
blogs_sample <- sample(blogs, round(0.01 * length(blogs)), replace = FALSE)
# Create the corpus from the sampled lines
gen_blogs_corp <- paste(blogs_sample, collapse = " ")
gen_blogs_corp <- VectorSource(gen_blogs_corp)
gen_blogs_corp <- Corpus(gen_blogs_corp)
# Clean the data: drop numbers, lower-case, remove stop words and punctuation
gen_blogs_corp <- tm_map(gen_blogs_corp, removeNumbers)
gen_blogs_corp <- tm_map(gen_blogs_corp, content_transformer(tolower))
gen_blogs_corp <- tm_map(gen_blogs_corp, removeWords, stopwords("english"))
gen_blogs_corp <- tm_map(gen_blogs_corp, removePunctuation)
gen_blogs_corp <- tm_map(gen_blogs_corp, stripWhitespace)
# Replace remaining special characters (slashes, symbols, curly quotes, dashes) with spaces
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
gen_blogs_corp <- tm_map(gen_blogs_corp, toSpace, "[/@$:)*&!?_#’“–-]")
# Count high frequency words
freq_words_blogs <- DocumentTermMatrix(gen_blogs_corp)
freq_words_blogs <- as.matrix(freq_words_blogs)
freq_words_blogs <- colSums(freq_words_blogs)
freq_words_blogs <- as.data.frame(sort(freq_words_blogs, decreasing = TRUE))
freq_words_blogs$words <- rownames(freq_words_blogs)
colnames(freq_words_blogs) <- c("frequency", "word")
# Top 15 high frequency words
high_freq_words_blogs <- freq_words_blogs[1:15, ]
saveRDS(freq_words_blogs,'freq_words_blogs.rds')
saveRDS(high_freq_words_blogs,'high_freq_words_blogs.rds')
news: High frequency words in news
news_sample <- sample(news, round(0.01 * length(news)), replace = FALSE)
# Create the corpus from the sampled lines
gen_news_corp <- paste(news_sample, collapse = " ")
gen_news_corp <- VectorSource(gen_news_corp)
gen_news_corp <- Corpus(gen_news_corp)
# Clean the data: drop numbers, lower-case, remove stop words and punctuation
gen_news_corp <- tm_map(gen_news_corp, removeNumbers)
gen_news_corp <- tm_map(gen_news_corp, content_transformer(tolower))
gen_news_corp <- tm_map(gen_news_corp, removeWords, stopwords("english"))
gen_news_corp <- tm_map(gen_news_corp, removePunctuation)
gen_news_corp <- tm_map(gen_news_corp, stripWhitespace)
# Replace remaining special characters with spaces
gen_news_corp <- tm_map(gen_news_corp, toSpace, "[/@$:)*&!?_#’“–-]")
# Count high frequency words
freq_words_news <- DocumentTermMatrix(gen_news_corp)
freq_words_news <- as.matrix(freq_words_news)
freq_words_news <- colSums(freq_words_news)
freq_words_news <- as.data.frame(sort(freq_words_news, decreasing = TRUE))
freq_words_news$words <- rownames(freq_words_news)
colnames(freq_words_news) <- c("frequency", "word")
# Top 15 high frequency words
high_freq_words_news <- freq_words_news[1:15, ]
saveRDS(freq_words_news,'freq_words_news.rds')
saveRDS(high_freq_words_news,'high_freq_words_news.rds')
twitter: High frequency words in twitter
twitter_sample <- sample(twitter, round(0.01 * length(twitter)), replace = FALSE)
# Create the corpus from the sampled lines
gen_twitter_corp <- paste(twitter_sample, collapse = " ")
gen_twitter_corp <- VectorSource(gen_twitter_corp)
gen_twitter_corp <- Corpus(gen_twitter_corp)
# Clean the data: drop numbers, lower-case, remove stop words and punctuation
gen_twitter_corp <- tm_map(gen_twitter_corp, removeNumbers)
gen_twitter_corp <- tm_map(gen_twitter_corp, content_transformer(tolower))
gen_twitter_corp <- tm_map(gen_twitter_corp, removeWords, stopwords("english"))
gen_twitter_corp <- tm_map(gen_twitter_corp, removePunctuation)
gen_twitter_corp <- tm_map(gen_twitter_corp, stripWhitespace)
# Replace remaining special characters with spaces
gen_twitter_corp <- tm_map(gen_twitter_corp, toSpace, "[/@$:)*&!?_#’“–-]")
# Count high frequency words
freq_words_twitter <- DocumentTermMatrix(gen_twitter_corp)
freq_words_twitter <- as.matrix(freq_words_twitter)
freq_words_twitter <- colSums(freq_words_twitter)
freq_words_twitter <- as.data.frame(sort(freq_words_twitter, decreasing = TRUE))
freq_words_twitter$words <- rownames(freq_words_twitter)
colnames(freq_words_twitter) <- c("frequency", "word")
# Top 15 high frequency words
high_freq_words_twitter <- freq_words_twitter[1:15, ]
saveRDS(freq_words_twitter,'freq_words_twitter.rds')
saveRDS(high_freq_words_twitter,'high_freq_words_twitter.rds')
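The sampling, cleaning and counting steps above are identical for the three sources, so they could be wrapped in a single helper. The sketch below is illustrative only; the function name build_freq_table and its default 1% sample fraction are placeholders, not part of the code used for the results in this report.
# Hypothetical helper: sample a character vector, clean it with tm and return
# a word-frequency data frame sorted in decreasing order of frequency.
build_freq_table <- function(lines, fraction = 0.01) {
  sampled <- sample(lines, round(fraction * length(lines)), replace = FALSE)
  corp <- Corpus(VectorSource(paste(sampled, collapse = " ")))
  corp <- tm_map(corp, removeNumbers)
  corp <- tm_map(corp, content_transformer(tolower))
  corp <- tm_map(corp, removeWords, stopwords("english"))
  corp <- tm_map(corp, removePunctuation)
  corp <- tm_map(corp, stripWhitespace)
  counts <- sort(colSums(as.matrix(DocumentTermMatrix(corp))), decreasing = TRUE)
  data.frame(frequency = counts, word = names(counts), row.names = names(counts))
}
# Example usage: freq_words_blogs <- build_freq_table(blogs); head(freq_words_blogs, 15)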
high_freq_words_blogs <- readRDS('high_freq_words_blogs.rds')
freq_words_blogs <- readRDS('freq_words_blogs.rds')
print(kable(high_freq_words_blogs, "rst"))
##
##
## ====== ========= ======
## \      frequency word
## ====== ========= ======
## one         1328 one
## can         1158 can
## will        1137 will
## just        1067 just
## like        1046 like
## time         941 time
## get          707 get
## now          633 now
## people       617 people
## know         597 know
## also         564 also
## first        560 first
## really       518 really
## even         517 even
## much         508 much
## ====== ========= ======
ggplot(data = high_freq_words_blogs,
       aes(x = reorder(word, frequency), y = frequency,
           fill = factor(reorder(word, -frequency)))) +
  geom_bar(stat = "identity") +
  labs(x = "words", title = "Most frequent words in blogs") +
  theme(legend.title = element_blank()) +
  coord_flip()
blogs word cloud
wordcloud(freq_words_blogs$word[1:100], freq_words_blogs$frequency[1:100],
colors = brewer.pal(8, "Dark2"))
high_freq_words_news <- readRDS('high_freq_words_news.rds')
freq_words_news <- readRDS('freq_words_news.rds')
print(kable(high_freq_words_news, "rst"))
##
##
## ====== ========= ======
## \      frequency word
## ====== ========= ======
## said         176 said
## will          84 will
## year          70 year
## one           59 one
## also          54 also
## new           53 new
## state         43 state
## years         43 years
## people        41 people
## time          41 time
## last          40 last
## just          39 just
## two           38 two
## first         37 first
## can           36 can
## ====== ========= ======
ggplot(data = high_freq_words_news,
       aes(x = reorder(word, frequency), y = frequency,
           fill = factor(reorder(word, -frequency)))) +
  geom_bar(stat = "identity") +
  labs(x = "words", title = "Most frequent words in news") +
  theme(legend.title = element_blank()) +
  coord_flip()
news word cloud
wordcloud(freq_words_news$word[1:100], freq_words_news$frequency[1:100],
colors = brewer.pal(8, "Dark2"))
high_freq_words_twitter <- readRDS('high_freq_words_twitter.rds')
freq_words_twitter <- readRDS('freq_words_twitter.rds')
print(kable(high_freq_words_twitter, "rst"))
##
##
## ====== ========= ======
## \      frequency word
## ====== ========= ======
## just        1456 just
## like        1226 like
## get         1083 get
## love        1012 love
## good        1002 good
## will         971 will
## thanks       917 thanks
## can          883 can
## day          868 day
## one          839 one
## know         825 know
## now          821 now
## great        772 great
## time         769 time
## today        702 today
## ====== ========= ======
ggplot(data = high_freq_words_twitter,
       aes(x = reorder(word, frequency), y = frequency,
           fill = factor(reorder(word, -frequency)))) +
  geom_bar(stat = "identity") +
  labs(x = "words", title = "Most frequent words in twitter") +
  theme(legend.title = element_blank()) +
  coord_flip()
twitter word cloud
wordcloud(freq_words_twitter$word[1:100], freq_words_twitter$frequency[1:100],
colors = brewer.pal(8, "Dark2"))
Now that we have the observations needed for the n-gram model, we can proceed to build a Shiny application that predicts the next word using an n-gram model.
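As a rough sketch of the planned approach rather than the final implementation, the example below builds bigram counts from a cleaned sample and predicts the most likely next word; the function names build_bigrams and predict_next are hypothetical placeholders.
# Hypothetical sketch of the planned n-gram predictor (bigrams only, for brevity)
build_bigrams <- function(lines) {
  words <- unlist(strsplit(tolower(lines), "[^a-z']+"))
  words <- words[words != ""]
  bigrams <- paste(head(words, -1), tail(words, -1))
  sort(table(bigrams), decreasing = TRUE)   # counts of each word pair
}

predict_next <- function(bigram_counts, word) {
  # Keep bigrams that start with the given word and return the most frequent follower
  prefix <- paste0(tolower(word), " ")
  hits <- bigram_counts[startsWith(names(bigram_counts), prefix)]
  if (length(hits) == 0) return(NA_character_)
  substring(names(hits)[1], nchar(prefix) + 1)
}

# Example usage on the blogs sample:
# bi <- build_bigrams(blogs_sample)
# predict_next(bi, "thank")   # likely "you"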