Show any code that is needed to load and preprocess the data.
# Fetch the zipped activity data (only if it is not already on disk), extract
# it (only if the CSV is not already present), and load it into `act`.
filename <- "activity.zip"
if (!file.exists(filename)) {
  fileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
  # The archive is binary: request a binary transfer explicitly so the file is
  # never subjected to text-mode line-ending translation on Windows.
  download.file(fileURL, filename, mode = "wb")
}
if (!file.exists("activity.csv")) {
  unzip(filename)
}
# skipNul = TRUE tells read.csv to skip any embedded NUL bytes in the file.
act <- read.csv("activity.csv", skipNul = TRUE)
# Total number of steps per calendar day. Because na.rm = TRUE is passed to
# sum(), a day whose readings are all NA gets a total of 0 rather than NA.
ttlStepsEachDay <- with(act, tapply(steps, date, sum, na.rm = TRUE))
print(ttlStepsEachDay)
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06
## 0 126 11352 12116 13294 15420
## 2012-10-07 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12
## 11015 0 12811 9900 10304 17382
## 2012-10-13 2012-10-14 2012-10-15 2012-10-16 2012-10-17 2012-10-18
## 12426 15098 10139 15084 13452 10056
## 2012-10-19 2012-10-20 2012-10-21 2012-10-22 2012-10-23 2012-10-24
## 11829 10395 8821 13460 8918 8355
## 2012-10-25 2012-10-26 2012-10-27 2012-10-28 2012-10-29 2012-10-30
## 2492 6778 10119 11458 5018 9819
## 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04 2012-11-05
## 15414 0 10600 10571 0 10439
## 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11
## 8334 12883 3219 0 0 12608
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17
## 10765 7336 0 41 5441 14339
## 2012-11-18 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23
## 15110 8841 4472 12787 20427 21194
## 2012-11-24 2012-11-25 2012-11-26 2012-11-27 2012-11-28 2012-11-29
## 14478 11834 11162 13646 10183 7047
## 2012-11-30
## 0
# Distribution of the daily step totals, then their mean and median.
hist(ttlStepsEachDay)
print(mean(ttlStepsEachDay, na.rm = TRUE))
## [1] 9354.23
print(median(ttlStepsEachDay, na.rm = TRUE))
## [1] 10395
# Average number of steps in each 5-minute interval across all days
# (missing readings are ignored), drawn as a line plot.
#
# tapply() returns its groups in sorted order, whereas unique() keeps
# first-appearance order; sorting x guarantees that x[i] is the interval id
# belonging to y[i] even if the rows of `act` are not ordered by interval.
x <- sort(unique(act$interval))
y <- tapply(act$steps, act$interval, mean, na.rm = TRUE)
plot(x, y, type = "l", xlab = "5-minute interval",
     ylab = "average number of steps taken")
# Show the row(s) of `act` holding the largest single step count recorded;
# which() drops the NA comparisons so only real matches are returned.
act[which(act$steps == max(act$steps, na.rm = TRUE)), ]
## steps date interval
## 16492 806 2012-11-27 615
# Count recorded (FALSE) versus missing (TRUE) step readings.
table(is.na(act[["steps"]]))
##
## FALSE TRUE
## 15264 2304
Devise a strategy for filling in all of the missing values in the dataset. I’ll replace each missing value with the mean number of steps for that 5-minute interval across all days, which is already saved as “y”.
Create a new dataset that is equal to the original dataset but with the missing data filled in.
# Build `act2`, a copy of `act` in which every missing step count is replaced
# by the mean for its 5-minute interval. `y` holds those means and is named by
# interval id, so it is indexed by the interval converted to character.
#
# The replacement is done in one vectorised assignment instead of a row-by-row
# loop; it addresses columns by name and is a no-op when there are no rows or
# no missing values.
act2 <- act
stepsMissing <- is.na(act2$steps)
act2$steps[stepsMissing] <-
  as.vector(y[as.character(act2$interval[stepsMissing])])
4.a. Make a histogram of the total number of steps taken each day, and calculate and report the mean and median total number of steps taken per day.
# Daily step totals recomputed from the imputed data set, with their
# histogram, mean and median.
ttlStepsEachDay2 <- with(act2, tapply(steps, date, sum))
hist(ttlStepsEachDay2)
print(mean(ttlStepsEachDay2, na.rm = TRUE))
## [1] 10766.19
print(median(ttlStepsEachDay2, na.rm = TRUE))
## [1] 10766.19
4.b. Do these values differ from the estimates from the first part of the assignment? Yes. The mean rises from 9354.23 to 10766.19 and the median rises from 10395 to 10766.19.
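4.c. What is the impact of imputing missing data on the estimates of the total daily number of steps? Days whose total was 0 in the first part because all of their values were missing now receive the sum of the interval means, so the estimated daily totals increase, and both the mean and the median move up to 10766.19.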
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.3.3
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# Label every observation as "Weekday" or "Weekend" and plot the average
# number of steps per 5-minute interval for each group in its own panel.
dates <- act[, 2]
dates2 <- ymd(dates)
# POSIXlt$wday is numeric (0 = Sunday, ..., 6 = Saturday), so unlike matching
# the day names returned by weekdays() this classification does not depend on
# the session locale. An NA date yields an NA label.
wdayNumber <- as.POSIXlt(dates2)$wday
dayofweek <- ifelse(wdayNumber == 0 | wdayNumber == 6, "Weekend", "Weekday")
act <- cbind(act, dayofweek)
avgStepsEachDayofWeek <- tapply(act$steps, act$dayofweek, mean, na.rm = TRUE)
# Mean steps for every (interval, day type) pair. The formula interface drops
# rows with missing steps before averaging. Each plotted vector then has one
# entry per pair, so steps, interval and panel factor all have equal length.
avgStepsByIntervalAndDay <- aggregate(steps ~ interval + dayofweek,
                                      data = act, FUN = mean, na.rm = TRUE)
# lattice is referenced explicitly because it is not attached by the visible
# library() calls; print() makes the trellis object draw even when the script
# is run through source().
print(lattice::xyplot(steps ~ interval | dayofweek,
                      data = avgStepsByIntervalAndDay,
                      layout = c(1, 2), type = "l",
                      ylab = "Number of Steps", xlab = "interval"))