# ==============================================================
## load packages and prepare the data directory
# All paths below are relative to the current working directory, so start R in
# (or open the project at) the folder where the data should live. A hard-coded
# setwd() to one user's Desktop would make the script fail on any other machine.
#
# dplyr is attached BEFORE plyr on purpose, despite the plyr start-up warning:
# code further down uses unqualified plyr-style verbs, and those only resolve
# to the plyr versions when plyr is attached last.
library(dplyr)
library(plyr)
library(ggplot2)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
if (!dir.exists("data")) {
  dir.create("data")
}
## download data from the website to "data"
path <- getwd()
zipPath <- file.path(path, "data", "files.zip")
download.file(
  url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip",
  destfile = zipPath,
  mode = "wb"  # binary transfer: text mode corrupts zip archives on Windows
)
list.files("./data")
# the archive is extracted into the working directory, next to "data"
unzip(zipfile = zipPath)
# document the date downloaded
dataDownloaded <- date()
activity <- read.csv("activity.csv")
str(activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
head(activity)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
# ===========================================================================
# Total number of steps per day.
# sum() is deliberately called without na.rm: a day with no recorded steps
# stays NA instead of being reported as 0 steps, and is dropped from the
# histogram and from the summary statistics below. (Any extra argument given
# to ddply() after `summarise` becomes an output column, so na.rm must not be
# passed there.)
totalSteps <- ddply(activity, "date", summarise, tot = sum(steps))
head(totalSteps)
## date tot
## 1 2012-10-01 NA
## 2 2012-10-02 126
## 3 2012-10-03 11352
## 4 2012-10-04 12116
## 5 2012-10-05 13294
## 6 2012-10-06 15420
ggplot(totalSteps, aes(x = tot)) +
  geom_histogram(fill = "red", binwidth = 1000) +
  labs(title = "Histogram of Total Number of Daily Steps",
       x = "Steps", y = "Frequency")
## Warning: Removed 8 rows containing non-finite values (stat_bin).
# mean and median of the daily totals (NA days are reported separately)
summary(totalSteps$tot)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 41 8841 10765 10766 13294 21194 8
# ===========================================================================
# recode interval to factor variable
activity$interval <- factor(activity$interval)
# calculate average steps per interval across days, ignoring missing values
avePattern <- ddply(activity, "interval", summarise,
                    average = mean(steps, na.rm = TRUE))
head(avePattern, 10)
## interval average
## 1 0 1.7169811
## 2 5 0.3396226
## 3 10 0.1320755
## 4 15 0.1509434
## 5 20 0.0754717
## 6 25 2.0943396
## 7 30 0.5283019
## 8 35 0.8679245
## 9 40 0.0000000
## 10 45 1.4716981
str(avePattern)
## 'data.frame': 288 obs. of 2 variables:
## $ interval: Factor w/ 288 levels "0","5","10","15",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ average : num 1.717 0.3396 0.1321 0.1509 0.0755 ...
# Plot against the numeric interval identifier. Passing the factor itself as x
# makes plot() draw a box-plot style display and ignore type = "l".
intervalId <- as.numeric(as.character(avePattern$interval))
plot(intervalId, avePattern$average,
     xlab = "interval", ylab = "average of steps", type = "l",
     main = "Average Daily Activity Pattern")
# Intervals ordered from highest to lowest average; the first row is the
# interval with the maximum average number of steps. Base order() is used so
# the result does not depend on which package's arrange() is on the search path.
# NOTE(review): the name `max` shadows base::max as a variable; it is kept
# because other code may refer to it.
max <- avePattern[order(avePattern$average, decreasing = TRUE), ]
rownames(max) <- NULL
head(max)
## interval average
## 1 835 206.1698
## 2 840 195.9245
## 3 850 183.3962
## 4 845 179.5660
## 5 830 177.3019
## 6 820 171.1509
# ===========================================================================
summary(activity$steps)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 0.00 37.38 12.00 806.00 2304
# Check the column names containing missing observations
# (column-wise vapply keeps each column's own type; apply() would first
# coerce the whole data frame to a matrix)
list_na <- names(activity)[vapply(activity, anyNA, logical(1))]
list_na
## [1] "steps"
# Replace missing step counts with the median of the observed step counts.
# Only the steps column is modified, so date and interval keep their original
# values and types instead of being collapsed to integer codes.
activity_impute_median <- activity
stepsMissing <- is.na(activity_impute_median$steps)
activity_impute_median$steps[stepsMissing] <-
  median(activity$steps, na.rm = TRUE)
summary(activity_impute_median$steps)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 32.48 0.00 806.00
# daily totals after imputation
totalSteps2 <- ddply(activity_impute_median, "date", summarise,
                     tot = sum(steps))
head(totalSteps2)
## date tot
## 1 2012-10-01 0
## 2 2012-10-02 126
## 3 2012-10-03 11352
## 4 2012-10-04 12116
## 5 2012-10-05 13294
## 6 2012-10-06 15420
ggplot(totalSteps2, aes(x = tot)) +
  geom_histogram(fill = "blue", binwidth = 1000) +
  labs(title = "Histogram of Total Number of Daily Steps, Imputed",
       x = "Steps", y = "Frequency")
summary(totalSteps2$tot)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 6778 10395 9354 12811 21194
# ===========================================================================
# change date to date format
activity_impute_median$date <- as.Date(activity$date, format = "%Y-%m-%d")
# binary weekday/weekend indicator
# POSIXlt's wday is 0 (Sunday) .. 6 (Saturday) regardless of locale, whereas
# weekdays() returns day names in the session language.
dayOfWeek <- as.POSIXlt(activity_impute_median$date)$wday
activity_impute_median$day <- ifelse(dayOfWeek %in% c(0, 6),
                                     "weekend", "weekday")
table(activity_impute_median$day)
##
## weekday weekend
## 12960 4608
str(activity_impute_median)
## 'data.frame': 17568 obs. of 4 variables:
## $ steps : num 0 0 0 0 0 0 0 0 0 0 ...
## $ date : Date, format: "2012-10-01" "2012-10-01" ...
## $ interval: num 1 2 3 4 5 6 7 8 9 10 ...
## $ day : chr "weekday" "weekday" "weekday" "weekday" ...
activity_impute_median$interval <- factor(activity_impute_median$interval)
activity_impute_median$day <- factor(activity_impute_median$day)
# summarize by weekday/weekend
act1 <- dplyr::filter(activity_impute_median, day == "weekday")
aveP1 <- ddply(act1, "interval", summarise, average = mean(steps))
head(aveP1, 10)
## interval average
## 1 1 2.02222222
## 2 2 0.40000000
## 3 3 0.15555556
## 4 4 0.17777778
## 5 5 0.08888889
## 6 6 1.31111111
## 7 7 0.62222222
## 8 8 1.02222222
## 9 9 0.00000000
## 10 10 1.60000000
act2 <- dplyr::filter(activity_impute_median, day == "weekend")
aveP2 <- ddply(act2, "interval", summarise, average = mean(steps))
# show the weekend averages (printed output not reproduced here)
head(aveP2, 10)
# Two stacked panels: weekdays on top, weekends below. The x positions are the
# numeric values of the interval labels; using the factor itself as x would
# make plot() ignore type = "l". The previous graphics settings are restored
# afterwards so later plots are not forced into the 2 x 1 layout.
oldPar <- par(mfrow = c(2, 1))
plot(as.numeric(as.character(aveP1$interval)), aveP1$average,
     xlab = "interval", ylab = "average of steps", type = "l", col = "blue",
     main = "Average Daily Activity Pattern - Weekdays")
plot(as.numeric(as.character(aveP2$interval)), aveP2$average,
     xlab = "interval", ylab = "average of steps", type = "l", col = "red",
     main = "Average Daily Activity Pattern - Weekends")
par(oldPar)