This is a milestone report for the capstone project. It provides a concise summary of my exploratory data analysis and the high-level goals for the project.
The motivation for this report is to:

- Demonstrate that I have downloaded the data and have successfully loaded it in.
- Create a basic report of summary statistics about the data sets.
- Report any interesting findings that I have amassed so far.
- Get feedback on my plans for creating a prediction algorithm and Shiny app.
# download and unzip the Coursera-SwiftKey data set if it is not already present
fileURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
    download.file(fileURL, destfile = "Coursera-SwiftKey.zip", method = "curl")
}
if (!dir.exists("final")) {
    unzip("Coursera-SwiftKey.zip")
}
# load the three English-language corpora
path <- file.path(getwd(), "final", "en_US")
twitter <- readLines(file.path(path, "en_US.twitter.txt"), encoding = "UTF-8", skipNul = TRUE)
blogs <- readLines(file.path(path, "en_US.blogs.txt"), encoding = "UTF-8", skipNul = TRUE)
news <- readLines(file.path(path, "en_US.news.txt"), encoding = "UTF-8", skipNul = TRUE)
# helper: file size, word count, line count, and longest line for one corpus
summarizeFile <- function(lines, filename) {
    data.frame(
        "File Size (MB)"      = paste(round(file.size(file.path(path, filename)) / 10^6, 2), "MB"),
        "Word Count"          = format(sum(lengths(strsplit(lines, " "))), big.mark = ","),
        "Line Count"          = format(length(lines), big.mark = ","),
        "Longest Line Length" = format(max(nchar(lines)), big.mark = ","),
        check.names = FALSE
    )
}

summaryInfo <- rbind(
    summarizeFile(blogs,   "en_US.blogs.txt"),
    summarizeFile(news,    "en_US.news.txt"),
    summarizeFile(twitter, "en_US.twitter.txt")
)
row.names(summaryInfo) <- c("Blogs", "News", "Twitter")
summaryInfo
## File Size (MB) Word Count Line Count Longest Line Length
## Blogs 210.16 MB 37,334,131 899,288 40,833
## News 205.81 MB 2,643,969 77,259 5,760
## Twitter 167.11 MB 30,373,583 2,360,148 140
To reduce the computational load, we will randomly take a 20% sample of each data set and then combine the samples.
# take a reproducible 20% sample of each data set
set.seed(1234)
sampleBlogs   <- sample(blogs,   floor(length(blogs) * 0.2),   replace = FALSE)
sampleNews    <- sample(news,    floor(length(news) * 0.2),    replace = FALSE)
sampleTwitter <- sample(twitter, floor(length(twitter) * 0.2), replace = FALSE)
sampleWords <- c(sampleBlogs, sampleNews, sampleTwitter)
# replace non-ASCII characters with byte escapes so the tm transformations behave
sampleWords <- iconv(sampleWords, "UTF-8", "ASCII", sub = "byte")
We will now clean the corpus with the `tm` package: removing numbers, punctuation, and extra whitespace, lower-casing the text, and dropping English stop words.
library(tm)

# build a corpus and apply the cleaning transformations
sampleCorpus <- Corpus(VectorSource(sampleWords))
sampleCorpus <- tm_map(sampleCorpus, removeNumbers)
sampleCorpus <- tm_map(sampleCorpus, removePunctuation)
sampleCorpus <- tm_map(sampleCorpus, stripWhitespace)
sampleCorpus <- tm_map(sampleCorpus, content_transformer(tolower))
sampleCorpus <- tm_map(sampleCorpus, removeWords, stopwords("english"))
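As a quick sanity check (the exact output will vary with the random sample), we can peek at a couple of cleaned documents:

# inspect a few cleaned documents to verify the transformations took effect
inspect(sampleCorpus[1:2])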
We now compute the highest-frequency unigrams using the `tau` package.
library(tau)

# textcnt expects character data, so flatten the corpus to a character vector first
cleanText <- sapply(sampleCorpus, as.character)
unigram <- textcnt(cleanText, n = 1, method = "string")
unigramSorted <- unigram[order(unigram, decreasing = TRUE)]
barplot(head(unigramSorted, 10), main = "Top 10 Unigrams", col = "yellow", ylab = "Frequency")
We can also visualize the unigram frequencies as a word cloud.
library(wordcloud)
library(RColorBrewer)

# word cloud of the most frequent terms in the cleaned corpus
wordcloud(sampleCorpus, scale = c(3, 0.5), min.freq = 5, max.words = 200,
          random.order = TRUE, rot.per = 0.5, colors = brewer.pal(6, "Paired"),
          use.r.layout = FALSE)
Next, the highest-frequency bigrams:
bigram <- textcnt(cleanText, n = 2, method = "string")
bigramSorted <- bigram[order(bigram, decreasing = TRUE)]
barplot(head(bigramSorted, 10), main = "Top 10 Bigrams", col = "green", ylab = "Frequency")
Finally, the highest-frequency trigrams:
trigram <- textcnt(cleanText, n = 3, method = "string")
trigramSorted <- trigram[order(trigram, decreasing = TRUE)]
trigramSorted[1:10]
## cant wait see happy mothers day let us know
## 699 693 465
## happy new year im pretty sure new york city
## 375 308 271
## looking forward seeing cinco de mayo dont even know
## 222 213 202
## cant wait get
## 181
barplot(head(trigramSorted, 5), main = "Top 5 Trigrams", col = "blue", ylab = "Frequency")
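These n-gram tables feed directly into the planned prediction algorithm. As a rough preview, here is a minimal sketch of a stupid-backoff-style lookup; `predictNext` is a hypothetical helper written against the `trigramSorted` and `bigramSorted` tables above, not the final model:

# Hypothetical next-word lookup (a sketch, not the final model):
# try trigrams keyed on the last two words, then back off to bigrams.
predictNext <- function(phrase, trigrams, bigrams, k = 3) {
    words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)
    # trigram candidates whose first words match the end of the input
    hits <- trigrams[grepl(paste0("^", paste(words, collapse = " "), " "),
                           names(trigrams))]
    if (length(hits) == 0) {
        # back off: bigram candidates keyed on the final word only
        hits <- bigrams[grepl(paste0("^", tail(words, 1), " "), names(bigrams))]
    }
    if (length(hits) == 0) return(character(0))
    # report the last word of the k most frequent matching n-grams
    top <- head(sort(hits, decreasing = TRUE), k)
    sapply(strsplit(names(top), " "), tail, 1)
}

Given the counts above, `predictNext("happy mothers", trigramSorted, bigramSorted)` should return "day" as its top candidate. The real implementation will need smoothing and a compact storage format, which is part of the next steps below.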
At a high level, the next steps can be categorized into three buckets.