The goal of this report is to briefly describe a plan for handling the data and for building our own prediction algorithm. By applying exploratory analysis and explaining only the major features of the data, a plan shall be summarized for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data-scientist manager. Analysis tools such as tables and plots are used to illustrate important features of the data. The motivation for this project is to: 1. Demonstrate that you’ve downloaded the data and have successfully loaded it in. 2. Create a basic report of summary statistics about the data sets. 3. Report any interesting findings that you amassed so far. 4. Get feedback on your plans for creating a prediction algorithm and Shiny app.
### setup
library(ggplot2)
# Directory whose presence is taken to mean the archive is already unpacked.
filePath <- "./data/final"
# Number of lines sampled from each source for the histograms.
N <- 100
### download the data
# Single location for the archive, so the existence check, the download target
# and the unzip source all refer to the same file.
zipPath <- "./data/Coursera-SwiftKey.zip"
if (!file.exists(filePath)) {
  if (!file.exists(zipPath)) {
    # download.file() does not create missing directories.
    dir.create(dirname(zipPath), showWarnings = FALSE, recursive = TRUE)
    download.file(
      "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
      destfile = zipPath,
      mode = "wb", # the archive is binary; never let it be written in text mode
      quiet = TRUE
    )
  }
  unzip(zipPath, exdir = "./data")
}
### load downloaded data, en_US blogs, twitter, and news files only
# Read one file from the en_US folder as a character vector of UTF-8 lines.
# Every file goes through a binary connection (the original already needed
# this for the news file), so all three sources are read the same way.
readCorpusFile <- function(fileName) {
  conn <- file(file.path(filePath, "en_US", fileName), open = "rb")
  # Close the connection even when readLines() stops with an error.
  on.exit(close(conn), add = TRUE)
  readLines(conn, encoding = "UTF-8", skipNul = TRUE)
}
src.blogs <- readCorpusFile("en_US.blogs.txt")
src.twitter <- readCorpusFile("en_US.twitter.txt")
src.news <- readCorpusFile("en_US.news.txt")
Blogs file summary view
# For a character vector, summary() reports only its Length, Class and Mode.
summary(src.blogs)
## Length Class Mode
## 899288 character character
News file summary view
# For a character vector, summary() reports only its Length, Class and Mode.
summary(src.news)
## Length Class Mode
## 1010242 character character
Twitter file summary view
# For a character vector, summary() reports only its Length, Class and Mode.
summary(src.twitter)
## Length Class Mode
## 2360148 character character
Compare files - Count of Lines
# One row per source: the number of lines read and the source label.
fileLength <- data.frame(
  fileLength = c(length(src.blogs), length(src.news), length(src.twitter)),
  names = c("blogs", "news", "twitter"),
  stringsAsFactors = FALSE
)
# Bar chart with one bar per source; bar height is the line count.
ggplot(data = fileLength, mapping = aes(x = names, y = fileLength)) +
  geom_bar(stat = "identity", fill = "blue", color = "blue") +
  labs(
    x = "File source",
    y = "Count of Lines",
    title = "Number of Lines in File by Source"
  )
Compare files - Distribution of the number of words in a sentence
# Token count of every line, concatenated in blogs, news, twitter order.
# Each line is split on runs of non-word characters and the pieces are counted.
WordsInSentance <- unlist(
  lapply(
    list(src.blogs, src.news, src.twitter),
    function(sourceLines) lengths(strsplit(sourceLines, "\\W+"))
  ),
  use.names = FALSE
)
WordsDistribution <- data.frame(WordsInSentance = WordsInSentance)
## counting words in a sentence, source - http://stackoverflow.com/a/8920256
# Token counts for a random sample of N lines from each source. The sampling
# order (news, blogs, twitter) is kept so the random number stream is used
# exactly as before.
newsWinS <- vapply(strsplit(sample(src.news, N), "\\W+"), length, integer(1))
blogsWinS <- vapply(strsplit(sample(src.blogs, N), "\\W+"), length, integer(1))
twitterWinS <- vapply(strsplit(sample(src.twitter, N), "\\W+"), length, integer(1))
WordsDistNews <- data.frame(newsWinS)
WordsDistBlog <- data.frame(blogsWinS)
wordsDistTwtr <- data.frame(twitterWinS)
# Each histogram maps the count column to x explicitly. Passing the whole data
# frame as x (as qplot() was given before) is what produced the message
# "Don't know how to automatically pick scale for object of type data.frame".
ggplot(WordsDistBlog, aes(x = blogsWinS)) +
  geom_histogram(binwidth = 1) +
  xlab("Number of Words") +
  ggtitle(paste0("Blogs: Distribution of Number of Words in a Sentance (Sample of ", N, ")"))
ggplot(WordsDistNews, aes(x = newsWinS)) +
  geom_histogram(binwidth = 1) +
  xlab("Number of Words") +
  ggtitle(paste0("News: Distribution of Number of Words in a Sentance (Sample of ", N, ")"))
ggplot(wordsDistTwtr, aes(x = twitterWinS)) +
  geom_histogram(binwidth = 1) +
  xlab("Number of Words") +
  ggtitle(paste0("Twitter: Distribution of Number of Words in a Sentance (Sample of ", N, ")"))
The next steps of the plan will be: